def testBlockExtractorV2ForSimpleTableWithTwoColumns(self):
    """Run BlockExtractorV2 on a 4x2 table (META row above DATE/DATA columns).

    The extracted blocks are compared with the expected ones without relying
    on the order in which the extractor returns them.
    """
    values = np.array([['date', 'value'],
                       ['2001', '10.0'],
                       ['2002', '11.0'],
                       ['2003', '12.0']])
    sheet = Sheet(values, None)
    tags = np.array([
        [CellTypePMF({cell_type.META: 1}), CellTypePMF({cell_type.META: 1})],
        [CellTypePMF({cell_type.DATE: 1}), CellTypePMF({cell_type.DATA: 1})],
        [CellTypePMF({cell_type.DATE: 1}), CellTypePMF({cell_type.DATA: 1})],
        [CellTypePMF({cell_type.DATE: 1}), CellTypePMF({cell_type.DATA: 1})],
    ])

    sbe = BlockExtractorV2()
    blocks = sbe.extract_blocks(sheet, tags)

    HEADER = BlockTypePMF({BasicBlockType.HEADER: 1.0})
    VALUE = BlockTypePMF({BasicBlockType.VALUE: 1.0})

    for block in blocks:
        print(block)

    expected = [
        SimpleBlock(HEADER, 0, 1, 0, 0),
        SimpleBlock(HEADER, 0, 0, 1, 3),  # Todo: This is not correct
        SimpleBlock(VALUE, 1, 1, 1, 3),
    ]

    # Order of blocks in the list does not matter: same count, and every
    # expected block must be present (membership uses SimpleBlock equality).
    assert len(blocks) == len(expected)
    for expected_block in expected:
        assert expected_block in blocks
def get_cell_distribution_of_block(tags: 'np.array[CellTypePMF]', block: SimpleBlock):
    """Count how often each best cell type occurs inside ``block``.

    :param tags: Grid indexed as ``tags[row][col]``; each entry must provide
        ``get_best_type()``.
    :param block: Block whose inclusive row and column bounds select the cells.
    :return: dict mapping every encountered best type to its occurrence count.
    """
    histogram = {}
    for row_idx in range(block.get_top_row(), block.get_bottom_row() + 1):
        for col_idx in range(block.get_left_col(), block.get_right_col() + 1):
            best = tags[row_idx][col_idx].get_best_type()
            histogram[best] = histogram.get(best, 0) + 1
    return histogram
def merge_row_left_to_right(self, row_id, row, tags):
    """Split one row into maximal runs of consecutive cells with equal tags.

    :param row_id: Index of the row; used as both top and bottom row of every
        block that is produced.
    :param row: The cells of the row; only its length is used.
    :param tags: One tag per cell. Neighbouring tags are compared with ``!=``
        and ``get_best_type()`` of a run's last tag becomes the block type.
    :return: List of ``SimpleBlock`` objects ordered left to right; an empty
        list when the row has no cells.
    """
    cols = len(row)
    if cols == 0:
        # Without this guard the trailing append would index tags[-1].
        return []

    # NOTE(review): tags are compared as whole objects here, which relies on
    # the tag type defining equality -- confirm this is intended.
    curr_block_start = 0
    row_blocks = []
    for i in range(1, cols):
        if tags[i] != tags[i - 1]:
            row_blocks.append(SimpleBlock(tags[i - 1].get_best_type(),
                                          curr_block_start, i - 1, row_id, row_id))
            curr_block_start = i

    # Close the run that reaches the end of the row.
    row_blocks.append(SimpleBlock(tags[cols - 1].get_best_type(),
                                  curr_block_start, cols - 1, row_id, row_id))
    return row_blocks
def get_header(self, sheet: Sheet, header_block: SimpleBlock, idx):
    """Return the column label to use for sheet column ``idx``.

    :param sheet: Sheet whose ``values`` grid holds the header text.
    :param header_block: Block holding header cells, or ``None``.
    :param idx: Column index to label.
    :return: The text of the header cell(s) in column ``idx`` when the column
        lies inside a header block of height 1 or 2 (two rows are joined with
        a newline); otherwise the placeholder ``"_<idx>"``.
    """
    if header_block is None:
        return "_" + str(idx)

    inside = header_block.left_col <= idx <= header_block.right_col
    if inside:
        height = header_block.get_height()
        if height == 1:
            return str(sheet.values[header_block.top_row][idx])
        if height == 2:
            upper = str(sheet.values[header_block.top_row][idx])
            lower = str(sheet.values[header_block.bottom_row][idx])
            return upper + "\n" + lower

    # Column outside the header block, or a header of unsupported height.
    return "_" + str(idx)
def merge_row_left_to_right(self, row_id, row, tags: 'List[CellTypePMF]'):
    """Split one row into maximal runs of cells sharing the same best cell type.

    :param row_id: Index of the row; used as both top and bottom row of every
        block that is produced.
    :param row: The cells of the row; only its length is used.
    :param tags: One tag per cell; ``get_best_type()`` decides run boundaries.
    :return: List of ``(cell_type, SimpleBlock)`` tuples ordered left to
        right; an empty list when the row has no cells.
    """
    cols = len(row)
    if cols == 0:
        # Without this guard the trailing append would index tags[-1].
        return []

    curr_block_start = 0
    row_blocks = []
    for i in range(1, cols):
        previous_type = tags[i - 1].get_best_type()
        if tags[i].get_best_type() != previous_type:
            # Appending a tuple (CellType, SimpleBlock), since block type is
            # undetermined at this point
            row_blocks.append((previous_type,
                               SimpleBlock(None, curr_block_start, i - 1, row_id, row_id)))
            curr_block_start = i

    # Close the run that reaches the end of the row.
    row_blocks.append((tags[cols - 1].get_best_type(),
                       SimpleBlock(None, curr_block_start, cols - 1, row_id, row_id)))
    return row_blocks
def testBlockExtractorForSimpleTableWithTwoColumns(self):
    """ExampleBlockExtractor: the first block must span columns 0-1, rows 0-3."""

    def pmf(kind):
        return CellTypePMF({kind: 1})

    values = np.array([['date', 'value'],
                       ['2001', '10.0'],
                       ['2002', '11.0'],
                       ['2003', '12.0']])
    sheet = Sheet(values, None)

    meta_row = [pmf(cell_type.META), pmf(cell_type.META)]
    data_rows = [[pmf(cell_type.DATE), pmf(cell_type.DATA)] for _ in range(3)]
    tags = np.array([meta_row] + data_rows)

    extractor = ExampleBlockExtractor()
    blocks = extractor.extract_blocks(sheet, tags)

    # Order of blocks in the list shouldn't actually matter. Write a better
    # test to compare without any known order
    expected_type = BlockTypePMF({
        BasicBlockType.ATTRIBUTE: 0.9,
        BasicBlockType.HEADER: 0.1,
        # block_type.EMPTY: 0
    })
    expected_block = SimpleBlock(expected_type, 0, 1, 0, 3)

    assert blocks[0] == expected_block
def get_blocks(self) -> List[SimpleBlock]:
    """Build one ``SimpleBlock`` per entry of ``self.layout['layout']``.

    The entry name selects the block type ("value", "title"/"comments" and
    "header" are special; every other name is an attribute block). Each
    entry's ``location`` string has the form ``top..bottom:left..right``.

    Side effects: records each name's position in ``self.block_idx`` and
    stores the resulting list in ``self.blocks``.

    :return: The list of blocks, in layout iteration order.
    """
    named_types = {
        "value": BasicBlockType.VALUE,
        "title": BasicBlockType.GLOBAL_ATTRIBUTE,
        "comments": BasicBlockType.GLOBAL_ATTRIBUTE,
        "header": BasicBlockType.HEADER,
    }

    parsed = []
    for position, name in enumerate(self.layout['layout']):
        kind = named_types.get(name, BasicBlockType.ATTRIBUTE)

        rows, cols = self.layout['layout'][name]['location'].split(":")
        top, bottom = rows.split("..")
        left, right = cols.split("..")

        parsed.append(SimpleBlock(BlockTypePMF({kind: 1}),
                                  int(left), int(right), int(top), int(bottom)))
        self.block_idx[name] = position

    self.blocks = parsed
    return parsed
def add_mapping(self, label1, block1: SimpleBlock, label2, block2: SimpleBlock):
    """Append a "dimension_mapping" relationship between two labelled blocks.

    The mapped dimension is 1 when ``block1.are_blocks_vertical(block2)``,
    0 when ``block1.are_blocks_horizontal(block2)`` and -1 otherwise. The
    entry is appended to ``self.annotation['relationships']['mappings']``.
    """
    if block1.are_blocks_vertical(block2):
        dimension = 1
    elif block1.are_blocks_horizontal(block2):
        dimension = 0
    else:
        dimension = -1

    entry = {
        'type': "dimension_mapping",
        'value': "{}:{} <-> {}:{}".format(label1, dimension, label2, dimension),
    }
    self.annotation['relationships']['mappings'].append(entry)
def get_block_relation_features(self, block1: SimpleBlock, block2: SimpleBlock):
    """Return the feature vector describing the relation of two blocks.

    Layout of the returned list: a one-hot encoding of ``block1``'s best block
    type, a one-hot encoding of ``block2``'s best block type, then five
    relational predicates evaluated on ``block1`` with respect to ``block2``.
    """
    type_count = BasicBlockType.block_type_count()

    # Add block 1 type
    vector = [0] * type_count
    vector[block1.get_block_type().get_best_type().id()] = 1

    # Add block 2 type (offset past the first one-hot segment)
    vector += [0] * type_count
    vector[block2.get_block_type().get_best_type().id() + type_count] = 1

    vector.extend([
        # Are 2 blocks adjacent
        block1.is_adjacent(block2),
        # Are 2 blocks separated by 1 row/column
        block1.are_blocks_within_x_row_or_column(2, block2),
        # Are 2 blocks separated by 4 rows/columns
        block1.are_blocks_within_x_row_or_column(5, block2),
        # Are 2 blocks horizontal
        block1.are_blocks_horizontal(block2),
        # Are 2 blocks vertical
        block1.are_blocks_vertical(block2),
    ])

    # Do the blocks have a block in between: cannot compute with this input
    # TODO: Does the block have any adjacent blocks? Important?
    return vector
def extract_dataframe(self):
    """Very simple dataframe extractor over ``self.blocks`` and ``self.sheet``.

    Works only when exactly one VALUE block is present; otherwise returns
    ``None``. The value block is widened to the left by the first ATTRIBUTE
    block that ends directly left of it (rows within 10 of each other), and a
    header is taken from the first ATTRIBUTE block ending directly above the
    value block (columns within 5 of each other) if it is at most 2 rows high.

    :return: A ``pandas.DataFrame`` with one column per sheet column of the
        merged block, or ``None``.
    """
    def best_type(candidate):
        return candidate.get_block_type().get_best_type()

    # Check if only one value block is present
    value_blocks = [blk for blk in self.blocks if best_type(blk) == BasicBlockType.VALUE]
    if len(value_blocks) != 1:
        return None
    value_block = value_blocks[0]

    # Find left adjacent attribute block.
    attribute_block = next(
        (blk for blk in self.blocks
         if blk.right_col + 1 == value_block.left_col
         and best_type(blk) == BasicBlockType.ATTRIBUTE
         and abs(blk.top_row - value_block.top_row) < 10
         and abs(blk.bottom_row - value_block.bottom_row) < 10),
        None)

    if attribute_block:
        # Merge two blocks together, keeping only the rows they share
        merged_block = SimpleBlock(None,
                                   attribute_block.left_col,
                                   value_block.right_col,
                                   max(attribute_block.top_row, value_block.top_row),
                                   min(attribute_block.bottom_row, value_block.bottom_row))
    else:
        merged_block = value_block

    # Find header block
    header_block = next(
        (blk for blk in self.blocks
         if blk.bottom_row + 1 == value_block.top_row
         and best_type(blk) == BasicBlockType.ATTRIBUTE
         and abs(blk.left_col - value_block.left_col) < 5
         and abs(blk.right_col - value_block.right_col) < 5),
        None)
    if header_block is not None and header_block.get_height() > 2:
        header_block = None

    dataframe = pd.DataFrame()
    row_span = range(merged_block.top_row, merged_block.bottom_row + 1)
    for col in range(merged_block.left_col, merged_block.right_col + 1):
        header = self.get_header(self.sheet, header_block, col)
        dataframe.loc[:, header] = [self.sheet.values[row][col] for row in row_span]

    return dataframe
def testFaocommodityAnnotation(self):
    """Annotate the FAOSTAT commodity CSV with hand-built blocks and write YAML."""
    csv_reader = CsvReader('../../data/FAOSTAT_commodity.csv')
    sheet = csv_reader.get_sheet_by_index(0)

    yml = YAMLAnnotator()

    blocks = [
        SimpleBlock("META", 0, 14, 0, 0),
        SimpleBlock("META", 1, 1, 1, 232),      # domain
        SimpleBlock("META", 3, 3, 1, 232),      # area
        SimpleBlock("DATE", 5, 5, 1, 232),      # year
        SimpleBlock("META", 7, 7, 1, 232),      # item
        SimpleBlock("_DATA_", 11, 11, 1, 232),  # value
    ]

    layout = LayoutGraph(blocks)
    # Block 0 is linked to every other block with a "header" edge.
    for target in range(1, 6):
        layout.add_edge("header", 0, target)
    # Blocks 1..4 are linked to the value block with a "meta" edge.
    for source in range(1, 5):
        layout.add_edge("meta", source, 5)

    annotation = yml.get_annotation(0, sheet, None, blocks, layout)
    yml.write_yaml(annotation, "../../data/FAOSTAT_commodity.yaml")
def testSplit7Shape(self):
    """Smoke test: call ``split_7_shape`` on two blocks and print the result."""
    bpmf = BlockTypePMF({BasicBlockType.ATTRIBUTE: 1})

    # (left_col, right_col, top_row, bottom_row) for b1 .. b7
    bounds = [
        (0, 1, 0, 0),
        (2, 3, 0, 0),
        (0, 0, 1, 1),
        (1, 2, 1, 1),
        (3, 3, 1, 2),
        (0, 2, 2, 2),
        (4, 5, 1, 1),
    ]
    b1, b2, b3, b4, b5, b6, b7 = [SimpleBlock(bpmf, *bound) for bound in bounds]

    # Only b1 and b6 are exercised; the result is printed, not asserted.
    a, b, c = split_7_shape(b1, b6)
    print(a, b, c)
def split_7_shape(block_a: SimpleBlock, block_b: SimpleBlock):
    """Split two adjacent blocks that line up on exactly one side into three.

    Three alignments are checked one after another (independent ``if``s, so a
    later match overwrites the result of an earlier one):

    * same right column, different left column;
    * same left column, different right column;
    * same top row, different bottom row.

    In each case the overlapping part becomes a new block typed
    ``BasicBlockType.HEADER`` (returned in the middle position).

    :return: ``(block_a, None, block_b)`` when the blocks are not adjacent;
        otherwise a triple ``(b1, b2, b3)``, which is ``(None, None, None)``
        when the blocks are adjacent but none of the alignments matches.
    """
    if not block_a.is_adjacent(block_b):
        return block_a, None, block_b

    b1, b2, b3 = None, None, None

    # Right edges aligned: cut the part of block_a left of block_b's left column.
    if block_a.right_col == block_b.right_col and block_a.left_col != block_b.left_col:
        # Ensure block_a refers to the block that is_above reports as the upper one.
        if block_b.is_above(block_a):
            block_a, block_b = block_b, block_a
        b1 = SimpleBlock(block_a.block_type, block_a.left_col, block_b.left_col - 1, block_a.top_row, block_a.bottom_row)
        b2 = SimpleBlock(BlockTypePMF({BasicBlockType.HEADER: 1.0}), block_b.left_col, block_b.right_col, block_a.top_row, block_a.bottom_row)
        b3 = block_b

    # Left edges aligned: cut the part of block_a right of block_b's right column.
    if block_a.left_col == block_b.left_col and block_a.right_col != block_b.right_col:
        if block_b.is_above(block_a):
            block_a, block_b = block_b, block_a
        b1 = SimpleBlock(block_a.block_type, block_b.right_col + 1, block_a.right_col, block_a.top_row, block_a.bottom_row)
        b2 = SimpleBlock(BlockTypePMF({BasicBlockType.HEADER: 1.0}), block_b.left_col, block_b.right_col, block_a.top_row, block_a.bottom_row)
        b3 = block_b

    # Top edges aligned: block_a becomes the shorter block; block_b is split
    # at block_a's bottom row.
    if block_a.top_row == block_b.top_row and block_a.bottom_row != block_b.bottom_row:
        if block_a.bottom_row > block_b.bottom_row:
            block_a, block_b = block_b, block_a
        b1 = block_a
        b2 = SimpleBlock(BlockTypePMF({BasicBlockType.HEADER: 1.0}), block_b.left_col, block_b.right_col, block_a.top_row, block_a.bottom_row)
        b3 = SimpleBlock(block_b.block_type, block_b.left_col, block_b.right_col, block_a.bottom_row + 1, block_b.bottom_row)

    # NOTE(review): adjacent blocks matching none of the cases yield
    # (None, None, None), i.e. both inputs are dropped -- confirm callers expect this.
    return b1, b2, b3
def testLayoutDetectionForSimpleTableWithTwoColumns(self):
    """ExampleLayoutDetector on a 4x2 table with three hand-built blocks."""

    def pmf(kind):
        return CellTypePMF({kind: 1})

    values = np.array([['date', 'value'],
                       ['2001', '10.0'],
                       ['2002', '11.0'],
                       ['2003', '12.0']])
    sheet = Sheet(values, None)

    tags = np.array(
        [[pmf(BasicCellType.META), pmf(BasicCellType.META)]]
        + [[pmf(BasicCellType.DATE), pmf(BasicCellType.DATA)] for _ in range(3)]
    )

    ATTRIBUTE = BlockTypePMF({BasicBlockType.ATTRIBUTE: 1.0})
    VALUE = BlockTypePMF({BasicBlockType.VALUE: 1.0})

    blocks = [
        SimpleBlock(ATTRIBUTE, 0, 1, 0, 0),
        SimpleBlock(ATTRIBUTE, 0, 0, 1, 3),  # Todo: This is not correct
        SimpleBlock(VALUE, 1, 1, 1, 3),
    ]

    detector = ExampleLayoutDetector()
    layout = detector.detect_layout(sheet, tags, blocks)

    # TODO: The labels assigned to the edges here are actually wrong. Labels
    # from block b1 should be headers.
    expected_in = [[], [], [(BasicEdgeType.ATTRIBUTE, 0), (BasicEdgeType.ATTRIBUTE, 1)]]
    expected_out = [[(BasicEdgeType.ATTRIBUTE, 2)], [(BasicEdgeType.ATTRIBUTE, 2)], []]

    assert layout.inEdges == expected_in
    assert layout.outEdges == expected_out
def testBlockExtractorForSimpleTableWithTwoColumns(self):
    """Run SimpleBlockExtractor on a 4x2 table (META row above DATE/DATA columns).

    The extracted blocks are compared with the expected ones without relying
    on the order in which the extractor returns them.
    """
    values = np.array([['date', 'value'],
                       ['2001', '10.0'],
                       ['2002', '11.0'],
                       ['2003', '12.0']])
    sheet = Sheet(values, None)
    tags = np.array([
        [CellTypePMF({BasicCellType.META: 1}), CellTypePMF({BasicCellType.META: 1})],
        [CellTypePMF({BasicCellType.DATE: 1}), CellTypePMF({BasicCellType.DATA: 1})],
        [CellTypePMF({BasicCellType.DATE: 1}), CellTypePMF({BasicCellType.DATA: 1})],
        [CellTypePMF({BasicCellType.DATE: 1}), CellTypePMF({BasicCellType.DATA: 1})],
    ])

    sbe = SimpleBlockExtractor()
    blocks = sbe.extract_blocks(sheet, tags)

    # NOTE(review): the expected blocks are typed with plain strings; an unused
    # BlockTypePMF({BasicBlockType.ATTRIBUTE: 1.0}) local was removed here --
    # confirm which type representation the extractor is meant to return.
    expected = [
        SimpleBlock("META", 0, 1, 0, 0),
        SimpleBlock("DATE", 0, 0, 1, 3),
        SimpleBlock("_DATA_", 1, 1, 1, 3),
    ]

    # Order of blocks in the list does not matter: same count, and every
    # expected block must be present (membership uses SimpleBlock equality).
    assert len(blocks) == len(expected)
    for expected_block in expected:
        assert expected_block in blocks
def testFeaturizerForSimpleTableWithTwoColumns(self):
    """Featurize a single 4x2 table and check the label map of its layout graph.

    The input features are only printed; the assertion on them is disabled.
    """

    def pmf(kind):
        return CellTypePMF({kind: 1})

    sheet = np.array([['date', 'value'],
                      ['2001', '10.0'],
                      ['2002', '11.0'],
                      ['2003', '12.0']])
    sheet = Sheet(sheet, None)

    tags = np.array(
        [[pmf(cell_type.META), pmf(cell_type.META)]]
        + [[pmf(cell_type.DATE), pmf(cell_type.DATA)] for _ in range(3)]
    )

    ATTRIBUTE = BlockTypePMF({block_type.ATTRIBUTE: 1.0})
    VALUE = BlockTypePMF({block_type.VALUE: 1.0})

    blocks = [
        SimpleBlock(ATTRIBUTE, 0, 1, 0, 0),
        SimpleBlock(ATTRIBUTE, 0, 0, 1, 3),  # Todo: This is not correct
        SimpleBlock(VALUE, 1, 1, 1, 3),
    ]

    featurizer = Featurize([sheet], [tags], [blocks])
    input_features, _ = featurizer.get_input_features()
    print(input_features)

    # TODO: FIX THIS? Disabled expectation for input_features, kept in compact
    # form. Two variants were commented out: [(nodes, edges)] and
    # [(nodes, edges, edge_features)].
    # nodes = [[0, 0, 0, 1, 0, 0, 1, 0, True, True, True, False, False],
    #          [0, 0, 0, 1, 0, 1, 0, 0, True, True, True, False, False],
    #          [0, 0, 1, 0, 0, 0, 0, 1, True, True, True, False, False],
    #          [0, 0, 1, 0, 0, 1, 0, 0, True, True, True, True, False],
    #          [0, 1, 0, 0, 0, 0, 0, 1, True, True, True, False, False],
    #          [0, 1, 0, 0, 0, 0, 1, 0, True, True, True, True, False]]
    # edges = [[0, 1], [0, 3], [0, 4], [0, 5], [1, 0], [1, 2], [1, 3], [1, 5],
    #          [2, 1], [2, 3], [2, 4], [2, 5], [3, 0], [3, 1], [3, 2], [3, 4],
    #          [4, 0], [4, 2], [4, 3], [4, 5], [5, 0], [5, 1], [5, 2], [5, 4]]
    # edge_features = [nodes[a] + nodes[b] for a, b in edges]
    # assert input_features == [(nodes, edges, edge_features)]

    layoutGraph = LayoutGraph(blocks)
    for kind, source, target in [(edge_type.HEADER, 0, 1),
                                 (edge_type.HEADER, 0, 2),
                                 (edge_type.ATTRIBUTE, 1, 2)]:
        layoutGraph.add_edge(kind, source, target)

    labels = featurizer.get_label_map([layoutGraph])
    assert np.array_equal(labels, [[1, 1, 0, 2, 0, 0]])
def get_blocks(self, sheet) -> List[SimpleBlock]:
    """Convert the ``blocks`` mapping of an annotated sheet into SimpleBlocks.

    Every value of ``sheet['blocks']`` is a string ``"<range>-<type>"``: the
    range is handed to ``excel_range2bbox`` and ``"<type>_block"`` is looked
    up in ``BasicBlockType.str_to_block_type``.

    :return: One ``SimpleBlock`` per entry, in iteration order of the mapping.
    """
    specs = sheet['blocks']

    def to_block(spec):
        cell_range, kind = spec.split("-")
        pmf = BlockTypePMF({BasicBlockType.str_to_block_type[kind + "_block"]: 1})
        top, left, bottom, right = excel_range2bbox(cell_range)
        # SimpleBlock takes columns first, then rows.
        return SimpleBlock(pmf, left, right, top, bottom)

    return [to_block(specs[key]) for key in specs]
def extract_blocks(self, sheet: Sheet, tags: 'np.array[CellTypePMF]') -> List[SimpleBlock]:
    """Split the whole sheet into blocks, starting from one sheet-sized block.

    Maximal same-type blocks from ``BlockExtractorV2`` provide the split
    hypotheses. Blocks are taken from a FIFO queue; a block is split into two
    when ``split_needed`` agrees and the best split's gain reaches
    ``self.threshold``, otherwise it is final. ``postprocess`` is applied to
    the final blocks before they are returned.
    """
    # Get simple set of blocks from block extractor v2
    v2_extractor = BlockExtractorV2()
    maximal_blocks = v2_extractor.merge_sheet_top_to_bottom(
        v2_extractor.merge_sheet_left_to_right(sheet, tags))

    print("Maximal blocks extracted.")
    for kind, maximal in maximal_blocks:
        print(kind, maximal)

    row_h, col_h = self.get_hypotheses(maximal_blocks)
    print("Row hypotheses ", row_h)
    print("Column hypotheses ", col_h)

    n_rows, n_cols = sheet.values.shape
    whole_sheet = SimpleBlock(None, 0, n_cols - 1, 0, n_rows - 1)  # TODO: Check if -1 is correct

    final_blocks = []
    pending = Queue()
    pending.put(whole_sheet)

    while not pending.empty():
        candidate = pending.get()

        ## One more check : If only data and empty cells are in both blocks, then the split is not useful.
        ## TODO: Find a neater way to incorporate this into the system
        if not self.split_needed(candidate, maximal_blocks):
            final_blocks.append(candidate)
            continue

        (first, second), gain = self.get_best_split(candidate, maximal_blocks, row_h, col_h)

        if (first and second) and gain >= self.threshold:
            # Block was split into 2 blocks
            pending.put(first)
            pending.put(second)
        else:
            # Block could not be split
            final_blocks.append(candidate)

    postprocess(tags, final_blocks)
    return final_blocks
def split_needed(self, block: SimpleBlock, maximal_blocks: List[Tuple[CellType, SimpleBlock]]):
    """Tell whether ``block`` holds enough non-data, non-empty cells to split.

    :return: ``False`` when fewer than 3 cells of the block are something
        other than DATA or EMPTY (per ``get_cell_distribution_of_split``),
        ``True`` otherwise.
    """
    dist = self.get_cell_distribution_of_split(block, maximal_blocks)

    filler_kinds = (BasicCellType.DATA, BasicCellType.EMPTY)
    value_and_empty = sum(dist[kind] for kind in filler_kinds if kind in dist)

    # Do not split blocks with only value and empty cells
    # Yet another hyperparameter
    remaining = block.get_area() - value_and_empty
    return not (remaining < 3)
def extract_blocks(self, sheet: Sheet, tags: 'np.array[CellTypePMF]') -> List[SimpleBlock]:
    """Extract maximal same-type blocks and assign each a block type.

    Cells are merged left-to-right, then top-to-bottom. Each merged block's
    cell type is mapped through ``cell_type_to_block_type_map``; EMPTY blocks
    are dropped. An ATTRIBUTE block becomes GLOBAL_ATTRIBUTE when no other
    block is adjacent to it, HEADER when it covers at most 5 cells, and stays
    ATTRIBUTE otherwise.
    """
    merged = self.merge_sheet_top_to_bottom(self.merge_sheet_left_to_right(sheet, tags))

    # Remove empty blocks
    candidates = [(kind, blk) for kind, blk in merged if kind != "EMPTY"]

    # Convert old block types to new block types
    typed_blocks = []
    for cell_kind, blk in candidates:
        block_kind = cell_type_to_block_type_map[cell_kind]
        if block_kind == BasicBlockType.EMPTY:
            continue

        # Attribute can be global, non-global or headers
        if block_kind == BasicBlockType.ATTRIBUTE:
            has_neighbour = any(blk != other and blk.is_adjacent(other)
                                for _, other in candidates)
            if not has_neighbour:
                block_kind = BasicBlockType.GLOBAL_ATTRIBUTE
            else:
                # TODO: Block size should not be the only indicator for classifying a block as header
                width = blk.get_right_col() - blk.get_left_col() + 1
                height = blk.get_bottom_row() - blk.get_top_row() + 1
                if width * height <= 5:
                    block_kind = BasicBlockType.HEADER
                else:
                    block_kind = BasicBlockType.ATTRIBUTE  ## same as before

        typed_blocks.append(SimpleBlock(BlockTypePMF({block_kind: 1.0}),
                                        blk.get_left_col(), blk.get_right_col(),
                                        blk.get_top_row(), blk.get_bottom_row()))

    return typed_blocks
def merge_sheet_top_to_bottom(self, row_blocks: List) -> List:
    """Merge per-row blocks vertically into maximal blocks.

    :param row_blocks: One list per sheet row, each holding
        ``(cell_type, SimpleBlock)`` tuples ordered left to right.
    :return: List of ``(cell_type, SimpleBlock)`` tuples. Blocks of consecutive
        rows are merged when they have the same left column, right column and
        cell type. Returns an empty list for an empty input.
    """
    if len(row_blocks) == 0:
        return []

    blocks = []
    up = row_blocks[0]  # Blocks which might be merged with rows below

    for down in row_blocks[1:]:
        j, k = 0, 0
        new_up = []

        # Walk both rows in lockstep, ordered by right column.
        while j < len(up) and k < len(down):
            up_type, up_block = up[j][0], up[j][1]
            down_type, down_block = down[k][0], down[k][1]
            up_right = up_block.get_right_col()
            down_right = down_block.get_right_col()

            if up_block.get_left_col() == down_block.get_left_col() \
                    and up_right == down_right \
                    and up_type == down_type:  # Same block type
                # Merge two blocks
                new_up.append((up_type,
                               SimpleBlock(None,
                                           up_block.get_left_col(), up_right,
                                           up_block.get_top_row(),
                                           down_block.get_bottom_row())))
                j += 1
                k += 1
            elif up_right < down_right:
                # Upper block ends first and cannot grow any further.
                blocks.append(up[j])
                j += 1
            elif down_right < up_right:
                # Lower block ends first; it may still merge with later rows.
                new_up.append(down[k])
                k += 1
            elif up_right == down_right:
                blocks.append(up[j])
                new_up.append(down[k])
                j += 1
                k += 1

        # Rows of different extent: keep whatever one side still holds
        # instead of silently dropping it.
        blocks.extend(up[j:])
        new_up.extend(down[k:])

        up = new_up

    blocks.extend(up)  # Add whatevers left
    return blocks
def extract_blocks(self, sheet: Sheet, tags: 'np.array[CellTypePMF]') -> List[SimpleBlock]:
    """Return a single block that covers the entire sheet.

    The block carries a fixed type distribution (ATTRIBUTE 0.9, HEADER 0.1);
    ``tags`` is not consulted.
    """
    # Probability distribution of block type
    # (a pure VALUE: 1.0 distribution was an earlier, disabled alternative)
    type_pmf = BlockTypePMF({
        BasicBlockType.ATTRIBUTE: 0.9,
        BasicBlockType.HEADER: 0.1,
        # block_type.EMPTY: 0
    })

    n_rows, n_cols = sheet.values.shape
    return [SimpleBlock(type_pmf, 0, n_cols - 1, 0, n_rows - 1)]
def testFeaturizerForMultiplesTables(self):
    """Featurize two 4x2 tables and check input features and label maps.

    For each table the expected input features are a tuple
    ``(node_features, edges, edge_features)`` where every edge-feature row is
    the concatenation of the node-feature rows of the edge's two endpoints.
    """
    # Table 1
    sheet1 = np.array([['date', 'value'], ['2001', '10.0'], ['2002', '11.0'], ['2003', '12.0']])
    sheet1 = Sheet(sheet1, None)
    # Fix: this was bound to `tags` while `tags1` is what is passed below.
    tags1 = np.array([[CellTypePMF({cell_type.META: 1}), CellTypePMF({cell_type.META: 1})],
                      [CellTypePMF({cell_type.DATE: 1}), CellTypePMF({cell_type.DATA: 1})],
                      [CellTypePMF({cell_type.DATE: 1}), CellTypePMF({cell_type.DATA: 1})],
                      [CellTypePMF({cell_type.DATE: 1}), CellTypePMF({cell_type.DATA: 1})]])
    b1_1 = SimpleBlock("META", 0, 1, 0, 0)
    b1_2 = SimpleBlock("DATE", 0, 0, 1, 3)
    b1_3 = SimpleBlock("_DATA_", 1, 1, 1, 3)
    blocks1 = [b1_1, b1_2, b1_3]

    # Table 2
    # NOTE(review): unlike sheet1, sheet2 is not wrapped in Sheet -- confirm.
    sheet2 = np.array([['date', 'value'], ['10.0', '2001'], ['11.0', '2002'], ['12.0', '2003']])
    tags2 = np.array([[CellTypePMF('META'), CellTypePMF('META')],
                      [CellTypePMF('_DATA_'), CellTypePMF('DATE')],
                      [CellTypePMF('_DATA_'), CellTypePMF('DATE')],
                      [CellTypePMF('_DATA_'), CellTypePMF('DATE')]])
    b2_1 = SimpleBlock("META", 0, 1, 0, 0)
    b2_2 = SimpleBlock("_DATA_", 0, 0, 1, 3)
    b2_3 = SimpleBlock("DATE", 1, 1, 1, 3)
    blocks2 = [b2_1, b2_2, b2_3]

    featurizer = Featurize([sheet1, sheet2], [tags1, tags2], [blocks1, blocks2])
    input_features, _ = featurizer.get_input_features()
    print(input_features)

    # Edge list shared by both tables.
    edges = [[0, 1], [0, 3], [0, 4], [0, 5],
             [1, 0], [1, 2], [1, 3], [1, 5],
             [2, 1], [2, 3], [2, 4], [2, 5],
             [3, 0], [3, 1], [3, 2], [3, 4],
             [4, 0], [4, 2], [4, 3], [4, 5],
             [5, 0], [5, 1], [5, 2], [5, 4]]

    nodes1 = [[0, 0, 0, 1, 0, 0, 1, 0, True, True, True, False, False],
              [0, 0, 0, 1, 0, 1, 0, 0, True, True, True, False, False],
              [0, 0, 1, 0, 0, 0, 0, 1, True, True, True, False, False],
              [0, 0, 1, 0, 0, 1, 0, 0, True, True, True, True, False],
              [0, 1, 0, 0, 0, 0, 0, 1, True, True, True, False, False],
              [0, 1, 0, 0, 0, 0, 1, 0, True, True, True, True, False]]

    nodes2 = [[0, 0, 0, 1, 0, 1, 0, 0, True, True, True, False, False],
              [0, 0, 0, 1, 0, 0, 1, 0, True, True, True, False, False],
              [0, 1, 0, 0, 0, 0, 0, 1, True, True, True, False, False],
              [0, 1, 0, 0, 0, 0, 1, 0, True, True, True, True, False],
              [0, 0, 1, 0, 0, 0, 0, 1, True, True, True, False, False],
              [0, 0, 1, 0, 0, 1, 0, 0, True, True, True, True, False]]

    def expected_for(nodes):
        # Each edge-feature row is source-node features followed by target-node features.
        return (nodes, edges, [nodes[src] + nodes[dst] for src, dst in edges])

    # (An older, disabled expectation compared against (nodes, edges) pairs only.)
    assert input_features == [expected_for(nodes1), expected_for(nodes2)]

    layoutGraph1 = LayoutGraph(blocks1)
    layoutGraph1.add_edge("header", 0, 1)
    layoutGraph1.add_edge("header", 0, 2)
    layoutGraph1.add_edge("meta", 1, 2)

    # NOTE(review): the second graph is also built from blocks1 -- confirm
    # whether blocks2 was intended.
    layoutGraph2 = LayoutGraph(blocks1)
    layoutGraph2.add_edge("header", 0, 1)
    layoutGraph2.add_edge("header", 0, 2)
    layoutGraph2.add_edge("meta", 2, 1)

    labels = featurizer.get_label_map([layoutGraph1, layoutGraph2])
    assert np.array_equal(labels, [[1, 1, 0, 2, 0, 0], [1, 1, 0, 0, 0, 2]])
def merge_sheet_top_to_bottom(self, row_blocks: List) -> List:
    """Merge per-row blocks vertically into maximal blocks.

    :param row_blocks: One list per sheet row, each holding ``SimpleBlock``
        objects ordered left to right.
    :return: List of blocks. Blocks of consecutive rows are merged when they
        have the same left column, right column and block type. Returns an
        empty list for an empty input.
    """
    if len(row_blocks) == 0:
        return []

    blocks = []
    up = row_blocks[0]  # Blocks which might be merged with rows below

    for down in row_blocks[1:]:
        j, k = 0, 0
        new_up = []

        # TODO: Verify correctness
        # TODO: Handle empty cells
        # Walk both rows in lockstep, ordered by right column.
        while j < len(up) and k < len(down):
            upper, lower = up[j], down[k]
            upper_right = upper.get_right_col()
            lower_right = lower.get_right_col()

            if upper.get_left_col() == lower.get_left_col() \
                    and upper_right == lower_right \
                    and upper.get_block_type() == lower.get_block_type():
                # Merge two blocks
                new_up.append(SimpleBlock(upper.get_block_type(),
                                          upper.get_left_col(), upper_right,
                                          upper.get_top_row(),
                                          lower.get_bottom_row()))
                j += 1
                k += 1
            elif upper_right < lower_right:
                # Upper block ends first and cannot grow any further.
                blocks.append(upper)
                j += 1
            elif lower_right < upper_right:
                # Lower block ends first; it may still merge with later rows.
                new_up.append(lower)
                k += 1
            elif upper_right == lower_right:
                blocks.append(upper)
                new_up.append(lower)
                j += 1
                k += 1

        # Rows of different extent: keep whatever one side still holds
        # instead of silently dropping it.
        blocks.extend(up[j:])
        new_up.extend(down[k:])

        up = new_up

    blocks.extend(up)  # Add whatevers left
    return blocks
def testSimpleBlockFunctions(self):
    """Exercise ``SimpleBlock.is_adjacent`` and ``SimpleBlock.is_above``."""
    b1 = SimpleBlock(BasicBlockType.HEADER, 0, 1, 0, 0)
    b2 = SimpleBlock(BasicBlockType.ATTRIBUTE, 0, 0, 1, 3)
    b3 = SimpleBlock(BasicBlockType.VALUE, 1, 1, 1, 3)
    b4 = SimpleBlock(BasicBlockType.VALUE, 0, 1, 4, 4)
    b5 = SimpleBlock(BasicBlockType.VALUE, 2, 3, 1, 1)
    b6 = SimpleBlock(BasicBlockType.EMPTY, 0, 1, 1, 1)

    # Adjacency must hold in both directions.
    for first, second in [(b1, b2), (b2, b3), (b1, b3)]:
        assert first.is_adjacent(second)
        assert second.is_adjacent(first)

    assert b1.is_above(b6)
    assert not b2.is_above(b3)
    assert not b3.is_above(b2)

    # Non-adjacency must hold in both directions as well.
    for first, second in [(b1, b4), (b1, b5)]:
        assert not first.is_adjacent(second)
        assert not second.is_adjacent(first)
def testCRFEstimator(self):
    """Fit the CRF layout estimator on two small tables, each supplied twice."""

    def pmf(kind):
        return CellTypePMF({kind: 1})

    ATTRIBUTE = BlockTypePMF({block_type.ATTRIBUTE: 1.0})
    VALUE = BlockTypePMF({block_type.VALUE: 1.0})

    # Table 1
    sheet1 = np.array([['date', 'value'], ['2001', '10.0'], ['2002', '11.0'], ['2003', '12.0']])
    sheet1 = Sheet(sheet1, None)
    tags1 = np.array(
        [[pmf(cell_type.META), pmf(cell_type.META)]]
        + [[pmf(cell_type.DATE), pmf(cell_type.DATA)] for _ in range(3)]
    )
    blocks1 = [
        SimpleBlock(ATTRIBUTE, 0, 1, 0, 0),
        SimpleBlock(ATTRIBUTE, 0, 0, 1, 3),  # Todo: This is not correct
        SimpleBlock(VALUE, 1, 1, 1, 3),
    ]

    # Table 2
    # NOTE(review): unlike sheet1, sheet2 is not wrapped in Sheet -- confirm.
    sheet2 = np.array([['date', 'value'], ['10.0', '2001'], ['11.0', '2002'], ['12.0', '2003']])
    tags2 = np.array(
        [[pmf(cell_type.META), pmf(cell_type.META)]]
        + [[pmf(cell_type.DATA), pmf(cell_type.DATE)] for _ in range(3)]
    )
    blocks2 = [
        SimpleBlock(ATTRIBUTE, 0, 1, 0, 0),
        SimpleBlock(VALUE, 0, 0, 1, 3),
        SimpleBlock(ATTRIBUTE, 1, 1, 1, 3),
    ]

    layoutGraph1 = LayoutGraph(blocks1)
    for kind, source, target in [(edge_type.HEADER, 0, 1),
                                 (edge_type.HEADER, 0, 2),
                                 (edge_type.ATTRIBUTE, 1, 2)]:
        layoutGraph1.add_edge(kind, source, target)

    # NOTE(review): the second graph is also built from blocks1 -- confirm
    # whether blocks2 was intended.
    layoutGraph2 = LayoutGraph(blocks1)
    for kind, source, target in [(edge_type.HEADER, 0, 1),
                                 (edge_type.HEADER, 0, 2),
                                 (edge_type.ATTRIBUTE, 2, 1)]:
        layoutGraph2.add_edge(kind, source, target)

    estimator = CRFLayoutEstimator()
    estimator.set_input(
        [sheet1, sheet2, sheet1, sheet2],
        [tags1, tags2, tags1, tags2],
        [blocks1, blocks2, blocks1, blocks2],
        [layoutGraph1, layoutGraph2, layoutGraph1, layoutGraph2])

    crf_layout_detector = estimator.fit_crf()
def get_splits(self, block: SimpleBlock, row_h, col_h):
    """Yield every two-way split of ``block`` allowed by the hypotheses.

    :param block: Block to split.
    :param row_h: Candidate rows; a row ``r`` with ``top <= r < bottom``
        splits the block into rows ``top..r`` and ``r+1..bottom``.
    :param col_h: Candidate columns; a column ``c`` with ``left <= c < right``
        splits the block into columns ``left..c`` and ``c+1..right``.
    :return: Generator of ``(first, second)`` pairs of untyped SimpleBlocks;
        all row splits are produced before any column split.
    """
    for cut_row in row_h:
        if not (block.get_top_row() <= cut_row < block.get_bottom_row()):
            continue
        upper = SimpleBlock(None, block.get_left_col(), block.get_right_col(),
                            block.get_top_row(), cut_row)
        lower = SimpleBlock(None, block.get_left_col(), block.get_right_col(),
                            cut_row + 1, block.get_bottom_row())
        yield upper, lower

    for cut_col in col_h:
        if not (block.get_left_col() <= cut_col < block.get_right_col()):
            continue
        left_part = SimpleBlock(None, block.get_left_col(), cut_col,
                                block.get_top_row(), block.get_bottom_row())
        right_part = SimpleBlock(None, cut_col + 1, block.get_right_col(),
                                 block.get_top_row(), block.get_bottom_row())
        yield left_part, right_part