コード例 #1
0
    def testBlockExtractorV2ForSimpleTableWithTwoColumns(self):
        """Run BlockExtractorV2 on a 4x2 sheet and compare against three expected blocks.

        Row 0 is tagged META/META and rows 1-3 are tagged DATE/DATA. The
        extracted blocks are printed, then the first three are compared
        position by position with the expected blocks.
        """
        cells = np.array([['date', 'value'], ['2001', '10.0'],
                          ['2002', '11.0'], ['2003', '12.0']])
        sheet = Sheet(cells, None)

        # One META/META row followed by three DATE/DATA rows.
        tag_rows = [[CellTypePMF({cell_type.META: 1}),
                     CellTypePMF({cell_type.META: 1})]]
        for _ in range(3):
            tag_rows.append([CellTypePMF({cell_type.DATE: 1}),
                             CellTypePMF({cell_type.DATA: 1})])
        tags = np.array(tag_rows)

        extractor = BlockExtractorV2()
        blocks = extractor.extract_blocks(sheet, tags)

        header_pmf = BlockTypePMF({BasicBlockType.HEADER: 1.0})
        value_pmf = BlockTypePMF({BasicBlockType.VALUE: 1.0})

        for extracted in blocks:
            print(extracted)

        # Order of blocks in the list shouldn't actually matter.
        # TODO: write a better test that compares without assuming an order.
        # SimpleBlock arguments: type, left col, right col, top row, bottom row.
        expected = [
            SimpleBlock(header_pmf, 0, 1, 0, 0),
            SimpleBlock(header_pmf, 0, 0, 1, 3),  # TODO: This is not correct
            SimpleBlock(value_pmf, 1, 1, 1, 3),
        ]
        for position, wanted in enumerate(expected):
            assert blocks[position] == wanted
コード例 #2
0
def get_cell_distribution_of_block(tags: 'np.array[CellTypePMF]', block: SimpleBlock):
    """Count the cells of `block` per best cell type.

    Every cell between the block's top/bottom rows and left/right columns
    (both ends inclusive) is looked up as ``tags[row][col]`` and tallied
    under the value returned by its ``get_best_type()``.

    Returns:
        A dict mapping each best type seen to its number of cells.
    """
    histogram = {}
    for row in range(block.get_top_row(), block.get_bottom_row() + 1):
        for col in range(block.get_left_col(), block.get_right_col() + 1):
            best = tags[row][col].get_best_type()
            histogram[best] = histogram.get(best, 0) + 1

    return histogram
コード例 #3
0
    def merge_row_left_to_right(self, row_id, row, tags):
        """Split one row into runs of consecutive cells whose tags compare equal.

        Args:
            row_id: Index of the row; used as both top and bottom row of each block.
            row: The row's cells; only its length is used.
            tags: Per-cell tags for the row, indexed by column.

        Returns:
            A list of SimpleBlock objects, one per run, left to right. Each block
            is typed with the best type of the run's last tag. An empty row
            yields an empty list.
        """
        cols = len(row)
        if cols == 0:
            # Without this guard an empty row would fall through to tags[-1],
            # which raises IndexError or produces a block ending at column -1.
            return []

        curr_block_start = 0
        row_blocks = []
        for i in range(1, cols):
            # A change of tag closes the run that ends at column i - 1.
            if tags[i] != tags[i-1]:
                row_blocks.append(SimpleBlock(tags[i-1].get_best_type(), curr_block_start, i - 1, row_id, row_id))
                curr_block_start = i

        # The last run always extends to the final column.
        row_blocks.append(SimpleBlock(tags[cols-1].get_best_type(), curr_block_start, cols - 1, row_id, row_id))
        return row_blocks
コード例 #4
0
    def get_header(self, sheet: Sheet, header_block: SimpleBlock, idx):
        """Return the header text for column `idx`.

        When `header_block` is None, does not cover column `idx`, or has a
        height other than 1 or 2, the placeholder ``"_" + str(idx)`` is
        returned. For a height of 1 the cell in the block's top row is used;
        for a height of 2 the top and bottom cells are joined by a newline.
        """
        placeholder = "_" + str(idx)

        if header_block is None:
            return placeholder
        if not (header_block.left_col <= idx <= header_block.right_col):
            return placeholder

        height = header_block.get_height()
        if height == 2:
            upper = str(sheet.values[header_block.top_row][idx])
            lower = str(sheet.values[header_block.bottom_row][idx])
            return upper + "\n" + lower
        if height == 1:
            return str(sheet.values[header_block.top_row][idx])

        return placeholder
コード例 #5
0
    def merge_row_left_to_right(self, row_id, row, tags: List[CellTypePMF]):
        """Split one row into runs of consecutive cells sharing the same best type.

        Args:
            row_id: Index of the row; used as both top and bottom row of each block.
            row: The row's cells; only its length is used.
            tags: Per-cell tags for the row, indexed by column.

        Returns:
            A list of ``(cell type, SimpleBlock)`` tuples, left to right. The
            SimpleBlock is created with block type None because the block type
            is undetermined at this point. An empty row yields an empty list.
        """
        cols = len(row)
        if cols == 0:
            # Without this guard an empty row would fall through to tags[-1],
            # which raises IndexError or produces a block ending at column -1.
            return []

        curr_block_start = 0
        row_blocks = []
        for i in range(1, cols):
            prev_type = tags[i - 1].get_best_type()
            if tags[i].get_best_type() != prev_type:
                # A change of best type closes the run ending at column i - 1.
                row_blocks.append((prev_type,
                                   SimpleBlock(None, curr_block_start, i - 1,
                                               row_id, row_id)))
                curr_block_start = i

        # The last run always extends to the final column.
        row_blocks.append((tags[cols - 1].get_best_type(),
                           SimpleBlock(None, curr_block_start, cols - 1,
                                       row_id, row_id)))
        return row_blocks
コード例 #6
0
    def testBlockExtractorForSimpleTableWithTwoColumns(self):
        """Run ExampleBlockExtractor on a 4x2 sheet and check its first block.

        The first extracted block is expected to equal a block spanning
        columns 0-1 and rows 0-3 with a 0.9 ATTRIBUTE / 0.1 HEADER type PMF.
        """
        cells = np.array([['date', 'value'], ['2001', '10.0'],
                          ['2002', '11.0'], ['2003', '12.0']])
        sheet = Sheet(cells, None)

        # One META/META row followed by three DATE/DATA rows.
        tag_rows = [[CellTypePMF({cell_type.META: 1}),
                     CellTypePMF({cell_type.META: 1})]]
        for _ in range(3):
            tag_rows.append([CellTypePMF({cell_type.DATE: 1}),
                             CellTypePMF({cell_type.DATA: 1})])
        tags = np.array(tag_rows)

        extractor = ExampleBlockExtractor()
        blocks = extractor.extract_blocks(sheet, tags)

        # Order of blocks in the list shouldn't actually matter.
        # TODO: write a better test that compares without assuming an order.
        expected_pmf = BlockTypePMF({
            BasicBlockType.ATTRIBUTE: 0.9,
            BasicBlockType.HEADER: 0.1,
        })
        # SimpleBlock arguments: type, left col, right col, top row, bottom row.
        whole_sheet = SimpleBlock(expected_pmf, 0, 1, 0, 3)

        assert blocks[0] == whole_sheet
コード例 #7
0
    def get_blocks(self) -> List[SimpleBlock]:
        """Build SimpleBlock objects from the entries of ``self.layout['layout']``.

        Block names map to block types as follows: "value" -> VALUE, "title"
        and "comments" -> GLOBAL_ATTRIBUTE, "header" -> HEADER, anything else
        -> ATTRIBUTE. Each entry's 'location' string is parsed as
        ``"<top>..<bottom>:<left>..<right>"``.

        Side effects:
            Records each block name's position in ``self.block_idx`` and stores
            the resulting list in ``self.blocks``.

        Returns:
            The list of blocks, in iteration order of the layout entries.
        """
        type_by_name = {
            "value": BasicBlockType.VALUE,
            "title": BasicBlockType.GLOBAL_ATTRIBUTE,
            "comments": BasicBlockType.GLOBAL_ATTRIBUTE,
            "header": BasicBlockType.HEADER,
        }

        collected = []
        for position, block_name in enumerate(self.layout['layout']):
            kind = type_by_name.get(block_name, BasicBlockType.ATTRIBUTE)

            location = self.layout['layout'][block_name]['location']
            rows, cols = location.split(":")
            top_row, bottom_row = rows.split("..")
            left_col, right_col = cols.split("..")

            collected.append(
                SimpleBlock(BlockTypePMF({kind: 1}), int(left_col),
                            int(right_col), int(top_row), int(bottom_row)))
            self.block_idx[block_name] = position

        self.blocks = collected
        return collected
コード例 #8
0
    def add_mapping(self, label1, block1: SimpleBlock, label2,
                    block2: SimpleBlock):
        """Append a "dimension_mapping" entry relating two labelled blocks.

        The mapped dimension is 1 when ``block1.are_blocks_vertical(block2)``
        holds, otherwise 0 when ``block1.are_blocks_horizontal(block2)`` holds,
        otherwise -1. The entry is appended to
        ``self.annotation['relationships']['mappings']``.
        """
        if block1.are_blocks_vertical(block2):
            axis = 1
        elif block1.are_blocks_horizontal(block2):
            axis = 0
        else:
            axis = -1

        entry = {
            'type': "dimension_mapping",
            'value': "{}:{} <-> {}:{}".format(label1, axis, label2, axis),
        }
        self.annotation['relationships']['mappings'].append(entry)
コード例 #9
0
    def get_block_relation_features(self, block1: SimpleBlock, block2: SimpleBlock):
        """Build the feature vector describing the relation between two blocks.

        Layout of the returned list:
            * one slot per block type, with a 1 at the id of block1's best type;
            * one slot per block type, with a 1 at the id of block2's best type;
            * ``block1.is_adjacent(block2)``;
            * ``block1.are_blocks_within_x_row_or_column(2, block2)``;
            * ``block1.are_blocks_within_x_row_or_column(5, block2)``;
            * ``block1.are_blocks_horizontal(block2)``;
            * ``block1.are_blocks_vertical(block2)``.
        """
        type_count = BasicBlockType.block_type_count()

        # Slots for block1's type.
        features = [0] * type_count
        features[block1.get_block_type().get_best_type().id()] = 1

        # Slots for block2's type, offset past block1's slots.
        features += [0] * type_count
        features[block2.get_block_type().get_best_type().id() + type_count] = 1

        # Pairwise geometric relations, evaluated in this order.
        features += [
            block1.is_adjacent(block2),
            block1.are_blocks_within_x_row_or_column(2, block2),
            block1.are_blocks_within_x_row_or_column(5, block2),
            block1.are_blocks_horizontal(block2),
            block1.are_blocks_vertical(block2),
        ]

        # Whether another block lies in between cannot be computed from this input.
        # TODO: Does the block have any adjacent blocks? Important?

        return features
コード例 #10
0
    def extract_dataframe(self):
        """Very simple dataframe extractor built around a single VALUE block.

        Returns None unless exactly one block in ``self.blocks`` has VALUE as
        its best type. Otherwise:

        * the first ATTRIBUTE block ending in the column just left of the value
          block, with top and bottom rows each less than 10 away, is merged
          with it (columns are joined, rows are intersected);
        * the first ATTRIBUTE block ending in the row just above the value
          block, with left and right columns each less than 5 away, is used as
          the header block, unless it is more than 2 rows high;
        * one DataFrame column is produced per column of the merged block,
          named via ``self.get_header``.
        """
        value_blocks = [
            candidate for candidate in self.blocks
            if candidate.get_block_type().get_best_type() == BasicBlockType.VALUE
        ]
        if len(value_blocks) != 1:
            return None
        value_block = value_blocks[0]

        # Left-adjacent attribute block, if any.
        attribute_block = next(
            (candidate for candidate in self.blocks
             if candidate.right_col + 1 == value_block.left_col
             and candidate.get_block_type().get_best_type() == BasicBlockType.ATTRIBUTE
             and abs(candidate.top_row - value_block.top_row) < 10
             and abs(candidate.bottom_row - value_block.bottom_row) < 10),
            None)

        if attribute_block:
            # Keep only the rows both blocks share.
            merged_block = SimpleBlock(
                None,
                attribute_block.left_col,
                value_block.right_col,
                max(attribute_block.top_row, value_block.top_row),
                min(attribute_block.bottom_row, value_block.bottom_row))
        else:
            merged_block = value_block

        # Block directly above the value block that serves as the header.
        header_block = next(
            (candidate for candidate in self.blocks
             if candidate.bottom_row + 1 == value_block.top_row
             and candidate.get_block_type().get_best_type() == BasicBlockType.ATTRIBUTE
             and abs(candidate.left_col - value_block.left_col) < 5
             and abs(candidate.right_col - value_block.right_col) < 5),
            None)
        if header_block is not None and header_block.get_height() > 2:
            # Headers taller than two rows are not supported by get_header.
            header_block = None

        dataframe = pd.DataFrame()
        row_span = range(merged_block.top_row, merged_block.bottom_row + 1)
        for col in range(merged_block.left_col, merged_block.right_col + 1):
            header = self.get_header(self.sheet, header_block, col)
            dataframe.loc[:, header] = [self.sheet.values[row][col]
                                        for row in row_span]

        return dataframe
コード例 #11
0
    def testFaocommodityAnnotation(self):
        """Annotate the FAOSTAT commodity CSV by hand and write the result as YAML.

        Six blocks are declared (a header row plus five column blocks); block 0
        gets a "header" edge to every other block and blocks 1-4 each get a
        "meta" edge to block 5. The test makes no assertions; it only writes
        the annotation file.
        """
        csv_reader = CsvReader('../../data/FAOSTAT_commodity.csv')
        sheet = csv_reader.get_sheet_by_index(0)

        yml = YAMLAnnotator()

        # (type, left col, right col, top row, bottom row)
        block_specs = [
            ("META", 0, 14, 0, 0),
            ("META", 1, 1, 1, 232),      # domain
            ("META", 3, 3, 1, 232),      # area
            ("DATE", 5, 5, 1, 232),      # year
            ("META", 7, 7, 1, 232),      # item
            ("_DATA_", 11, 11, 1, 232),  # value
        ]
        blocks = [SimpleBlock(*spec) for spec in block_specs]

        layout = LayoutGraph(blocks)

        value_index = len(blocks) - 1
        for target in range(1, value_index + 1):
            layout.add_edge("header", 0, target)
        for source in range(1, value_index):
            layout.add_edge("meta", source, value_index)

        annotation = yml.get_annotation(0, sheet, None, blocks, layout)

        yml.write_yaml(annotation, "../../data/FAOSTAT_commodity.yaml")
コード例 #12
0
    def testSplit7Shape(self):
        """Call split_7_shape on two of seven ATTRIBUTE blocks and print the result.

        Only the first and sixth blocks are passed to split_7_shape; the others
        are constructed but unused. No assertions are made.
        """
        attribute_pmf = BlockTypePMF({BasicBlockType.ATTRIBUTE: 1})

        # (left col, right col, top row, bottom row)
        coordinates = [
            (0, 1, 0, 0),
            (2, 3, 0, 0),
            (0, 0, 1, 1),
            (1, 2, 1, 1),
            (3, 3, 1, 2),
            (0, 2, 2, 2),
            (4, 5, 1, 1),
        ]
        grid = [SimpleBlock(attribute_pmf, *coords) for coords in coordinates]

        first, middle, last = split_7_shape(grid[0], grid[5])
        print(first, middle, last)
コード例 #13
0
def split_7_shape(block_a: SimpleBlock, block_b: SimpleBlock):
    """Split two adjacent, partially aligned blocks into three pieces.

    Non-adjacent blocks are returned unchanged as ``(block_a, None, block_b)``.
    Otherwise three alignment cases are tested one after another (each is an
    independent ``if``, so a later match overwrites an earlier one):

    * right edges aligned, left edges differ;
    * left edges aligned, right edges differ;
    * top edges aligned, bottom edges differ.

    In every matching case the middle piece is a new HEADER-typed block. When
    no case matches the result is ``(None, None, None)``.
    """
    if not block_a.is_adjacent(block_b):
        return block_a, None, block_b

    first, second = block_a, block_b
    pieces = (None, None, None)

    # Case 1: shared right edge.
    if first.right_col == second.right_col and first.left_col != second.left_col:
        if second.is_above(first):
            first, second = second, first

        # Part of `first` to the left of `second`'s column span.
        remainder = SimpleBlock(first.block_type, first.left_col,
                                second.left_col - 1, first.top_row,
                                first.bottom_row)
        # `first`'s rows restricted to `second`'s columns.
        overlap = SimpleBlock(BlockTypePMF({BasicBlockType.HEADER: 1.0}),
                              second.left_col, second.right_col,
                              first.top_row, first.bottom_row)
        pieces = (remainder, overlap, second)

    # Case 2: shared left edge.
    if first.left_col == second.left_col and first.right_col != second.right_col:
        if second.is_above(first):
            first, second = second, first

        # Part of `first` to the right of `second`'s column span.
        remainder = SimpleBlock(first.block_type, second.right_col + 1,
                                first.right_col, first.top_row,
                                first.bottom_row)
        overlap = SimpleBlock(BlockTypePMF({BasicBlockType.HEADER: 1.0}),
                              second.left_col, second.right_col,
                              first.top_row, first.bottom_row)
        pieces = (remainder, overlap, second)

    # Case 3: shared top edge; make `first` the block that ends higher up.
    if first.top_row == second.top_row and first.bottom_row != second.bottom_row:
        if first.bottom_row > second.bottom_row:
            first, second = second, first

        overlap = SimpleBlock(BlockTypePMF({BasicBlockType.HEADER: 1.0}),
                              second.left_col, second.right_col,
                              first.top_row, first.bottom_row)
        # Rows of `second` below the bottom of `first`.
        remainder = SimpleBlock(second.block_type, second.left_col,
                                second.right_col, first.bottom_row + 1,
                                second.bottom_row)
        pieces = (first, overlap, remainder)

    return pieces
コード例 #14
0
    def testLayoutDetectionForSimpleTableWithTwoColumns(self):
        """Run ExampleLayoutDetector on three hand-built blocks and check its edges.

        The expected layout has ATTRIBUTE edges from blocks 0 and 1 into block 2.
        """
        cells = np.array([['date', 'value'], ['2001', '10.0'], ['2002', '11.0'], ['2003', '12.0']])
        sheet = Sheet(cells, None)

        # One META/META row followed by three DATE/DATA rows.
        tag_rows = [[CellTypePMF({BasicCellType.META: 1}), CellTypePMF({BasicCellType.META: 1})]]
        for _ in range(3):
            tag_rows.append([CellTypePMF({BasicCellType.DATE: 1}), CellTypePMF({BasicCellType.DATA: 1})])
        tags = np.array(tag_rows)

        attribute_pmf = BlockTypePMF({BasicBlockType.ATTRIBUTE: 1.0})
        value_pmf = BlockTypePMF({BasicBlockType.VALUE: 1.0})

        # SimpleBlock arguments: type, left col, right col, top row, bottom row.
        blocks = [
            SimpleBlock(attribute_pmf, 0, 1, 0, 0),
            SimpleBlock(attribute_pmf, 0, 0, 1, 3),  # TODO: This is not correct
            SimpleBlock(value_pmf, 1, 1, 1, 3),
        ]

        detector = ExampleLayoutDetector()
        layout = detector.detect_layout(sheet, tags, blocks)

        # TODO: The labels assigned to the edges here are actually wrong.
        # Labels from the first block should be headers.
        expected_in = [[], [], [(BasicEdgeType.ATTRIBUTE, 0), (BasicEdgeType.ATTRIBUTE, 1)]]
        expected_out = [[(BasicEdgeType.ATTRIBUTE, 2)], [(BasicEdgeType.ATTRIBUTE, 2)], []]
        assert layout.inEdges == expected_in
        assert layout.outEdges == expected_out
コード例 #15
0
    def testBlockExtractorForSimpleTableWithTwoColumns(self):
        """Run SimpleBlockExtractor on a 4x2 sheet and compare against three blocks.

        The expected blocks are typed with the strings "META", "DATE" and
        "_DATA_", and are compared position by position with the result.
        """
        cells = np.array([['date', 'value'], ['2001', '10.0'],
                          ['2002', '11.0'], ['2003', '12.0']])
        sheet = Sheet(cells, None)

        # One META/META row followed by three DATE/DATA rows.
        tag_rows = [[CellTypePMF({BasicCellType.META: 1}),
                     CellTypePMF({BasicCellType.META: 1})]]
        for _ in range(3):
            tag_rows.append([CellTypePMF({BasicCellType.DATE: 1}),
                             CellTypePMF({BasicCellType.DATA: 1})])
        tags = np.array(tag_rows)

        extractor = SimpleBlockExtractor()
        blocks = extractor.extract_blocks(sheet, tags)

        # Order of blocks in the list shouldn't actually matter.
        # TODO: write a better test that compares without assuming an order.
        # This PMF is built but not used by the assertions below.
        attribute_pmf = BlockTypePMF({
            BasicBlockType.ATTRIBUTE: 1.0,
        })

        # SimpleBlock arguments: type, left col, right col, top row, bottom row.
        expected = [
            SimpleBlock("META", 0, 1, 0, 0),
            SimpleBlock("DATE", 0, 0, 1, 3),
            SimpleBlock("_DATA_", 1, 1, 1, 3),
        ]
        for position, wanted in enumerate(expected):
            assert blocks[position] == wanted
コード例 #16
0
    def testFeaturizerForSimpleTableWithTwoColumns(self):
        """Featurize one 4x2 sheet with three blocks and check the edge label map.

        The input features are only printed. The label map produced for a
        layout with two HEADER edges (0->1, 0->2) and one ATTRIBUTE edge (1->2)
        is asserted to equal ``[[1, 1, 0, 2, 0, 0]]``.
        """
        values = np.array([['date', 'value'], ['2001', '10.0'], ['2002', '11.0'], ['2003', '12.0']])
        sheet = Sheet(values, None)
        tags = np.array([[CellTypePMF({cell_type.META: 1}), CellTypePMF({cell_type.META: 1})],
                         [CellTypePMF({cell_type.DATE: 1}), CellTypePMF({cell_type.DATA: 1})],
                         [CellTypePMF({cell_type.DATE: 1}), CellTypePMF({cell_type.DATA: 1})],
                         [CellTypePMF({cell_type.DATE: 1}), CellTypePMF({cell_type.DATA: 1})]])

        ATTRIBUTE = BlockTypePMF({block_type.ATTRIBUTE: 1.0})
        VALUE = BlockTypePMF({block_type.VALUE: 1.0})

        # SimpleBlock arguments: type, left col, right col, top row, bottom row.
        b1 = SimpleBlock(ATTRIBUTE, 0, 1, 0, 0)
        b2 = SimpleBlock(ATTRIBUTE, 0, 0, 1, 3)  # Todo: This is not correct
        b3 = SimpleBlock(VALUE, 1, 1, 1, 3)

        blocks = [b1, b2, b3]

        featurizer = Featurize([sheet], [tags], [blocks])
        input_features, _ = featurizer.get_input_features()

        print(input_features)

        # TODO: FIX THIS? The exact-match assertion on input_features is
        # disabled. Its stale expected literal was removed because the
        # commented-out text wrapped onto an uncommented line, which made
        # this method a syntax error.

        layoutGraph = LayoutGraph(blocks)
        layoutGraph.add_edge(edge_type.HEADER, 0, 1)
        layoutGraph.add_edge(edge_type.HEADER, 0, 2)
        layoutGraph.add_edge(edge_type.ATTRIBUTE, 1, 2)

        labels = featurizer.get_label_map([layoutGraph])

        assert np.array_equal(labels, [[1, 1, 0, 2, 0, 0]])
コード例 #17
0
    def get_blocks(self, sheet) -> List[SimpleBlock]:
        """Parse the entries of ``sheet['blocks']`` into SimpleBlock objects.

        Each entry is a string of the form ``"<range>-<type>"``. The type name
        with "_block" appended is looked up in
        ``BasicBlockType.str_to_block_type``, and the range is converted by
        ``excel_range2bbox`` into (top row, left col, bottom row, right col).

        Returns:
            One SimpleBlock per entry, in iteration order.
        """
        block_specs = sheet['blocks']
        parsed = []

        for key in block_specs:
            cell_range, type_name = block_specs[key].split("-")
            pmf = BlockTypePMF(
                {BasicBlockType.str_to_block_type[type_name + "_block"]: 1})
            top, left, bottom, right = excel_range2bbox(cell_range)
            # SimpleBlock takes columns first, then rows.
            parsed.append(SimpleBlock(pmf, left, right, top, bottom))

        return parsed
コード例 #18
0
    def extract_blocks(self, sheet: Sheet,
                       tags: 'np.array[CellTypePMF]') -> List[SimpleBlock]:
        """Extract blocks by repeatedly splitting the whole sheet.

        Maximal same-type blocks are first obtained from BlockExtractorV2 and
        used to derive row and column split hypotheses. Starting from one block
        covering the full sheet, blocks are taken from a FIFO queue: a block is
        kept as final when ``self.split_needed`` rejects it or when the best
        split yields fewer than two blocks or a gain below ``self.threshold``;
        otherwise both halves are queued. ``postprocess`` is then called on the
        final blocks before they are returned.
        """
        # Get simple set of blocks from block extractor v2
        v2_extractor = BlockExtractorV2()
        maximal_blocks = v2_extractor.merge_sheet_top_to_bottom(
            v2_extractor.merge_sheet_left_to_right(sheet, tags))

        print("Maximal blocks extracted.")
        for kind, maximal in maximal_blocks:
            print(kind, maximal)

        row_h, col_h = self.get_hypotheses(maximal_blocks)
        print("Row hypotheses ", row_h)
        print("Column hypotheses ", col_h)

        max_row, max_col = sheet.values.shape
        pending = Queue()
        # TODO: Check if -1 is correct
        pending.put(SimpleBlock(None, 0, max_col - 1, 0, max_row - 1))

        final_blocks = []
        while not pending.empty():
            candidate = pending.get()

            # One more check: if only data and empty cells are present, the
            # split is not useful.
            # TODO: Find a neater way to incorporate this into the system
            if not self.split_needed(candidate, maximal_blocks):
                final_blocks.append(candidate)
                continue

            (b1, b2), gain = self.get_best_split(candidate, maximal_blocks,
                                                 row_h, col_h)

            if not ((b1 and b2) and gain >= self.threshold):
                # Block could not be split
                final_blocks.append(candidate)
                continue

            # Block was split into 2 blocks
            pending.put(b1)
            pending.put(b2)

        postprocess(tags, final_blocks)

        return final_blocks
コード例 #19
0
    def split_needed(self, block: SimpleBlock,
                     maximal_blocks: List[Tuple[CellType, SimpleBlock]]):
        """Tell whether `block` has enough non-DATA, non-EMPTY cells to be worth splitting.

        The DATA and EMPTY counts from ``self.get_cell_distribution_of_split``
        are subtracted from the block's area. Returns False when fewer than 3
        cells remain, True otherwise.
        """
        dist = self.get_cell_distribution_of_split(block, maximal_blocks)

        value_and_empty = sum(
            dist[kind]
            for kind in (BasicCellType.DATA, BasicCellType.EMPTY)
            if kind in dist)

        # Do not split blocks with only value and empty cells.
        # The cut-off of 3 is yet another hyperparameter.
        return not (block.get_area() - value_and_empty < 3)
コード例 #20
0
    def extract_blocks(self, sheet: Sheet,
                       tags: 'np.array[CellTypePMF]') -> List[SimpleBlock]:
        """Merge same-type cells into blocks and assign each a block type.

        Cells are merged left to right within rows, then top to bottom across
        rows. Each merged block's cell type is mapped through
        ``cell_type_to_block_type_map``; blocks mapping to EMPTY are dropped.
        An ATTRIBUTE block is refined to GLOBAL_ATTRIBUTE when no other block
        is adjacent to it, to HEADER when its area is at most 5 cells, and is
        left as ATTRIBUTE otherwise.

        Returns:
            SimpleBlock objects whose type PMF puts probability 1.0 on the
            chosen block type.
        """
        row_blocks = self.merge_sheet_left_to_right(sheet, tags)
        merged = self.merge_sheet_top_to_bottom(row_blocks)

        # Remove empty blocks.
        # NOTE(review): this compares the cell type with the string "EMPTY";
        # confirm the cell type supports that comparison.
        candidates = [(kind, blk) for kind, blk in merged if kind != "EMPTY"]

        # Convert old block types to new block types
        result = []
        for old_type, block in candidates:
            new_type = cell_type_to_block_type_map[old_type]
            if new_type == BasicBlockType.EMPTY:
                continue

            # Attribute can be global, non-global or headers
            if new_type == BasicBlockType.ATTRIBUTE:
                has_neighbour = any(
                    block != other and block.is_adjacent(other)
                    for _, other in candidates)
                if not has_neighbour:
                    new_type = BasicBlockType.GLOBAL_ATTRIBUTE
                else:
                    # TODO: Block size should not be the only indicator for
                    # classifying a block as header
                    width = block.get_right_col() - block.get_left_col() + 1
                    height = block.get_bottom_row() - block.get_top_row() + 1
                    if width * height <= 5:
                        new_type = BasicBlockType.HEADER

            result.append(
                SimpleBlock(BlockTypePMF({new_type: 1.0}),
                            block.get_left_col(), block.get_right_col(),
                            block.get_top_row(), block.get_bottom_row()))
        return result
コード例 #21
0
    def merge_sheet_top_to_bottom(self, row_blocks: List) -> List:
        """Merge vertically stacked row blocks of equal type and column span.

        Args:
            row_blocks: One list per sheet row, each holding ``(type, block)``
                tuples ordered left to right.

        Returns:
            A flat list of ``(type, block)`` tuples. A block from one row is
            merged with the block below it when both have the same left
            column, right column and type; the merged block keeps the upper
            block's top row and takes the lower block's bottom row. An empty
            `row_blocks` yields an empty list.
        """
        if len(row_blocks) == 0:
            # Nothing to merge; indexing row_blocks[0] would raise IndexError.
            return []

        finished = []
        up = row_blocks[0]  # Blocks which might be merged with rows below
        for i in range(1, len(row_blocks)):
            down = row_blocks[i]

            j, k = 0, 0
            new_up = []
            while j < len(up) and k < len(down):
                up_type, up_block = up[j]
                down_type, down_block = down[k]
                up_right = up_block.get_right_col()
                down_right = down_block.get_right_col()

                if up_block.get_left_col() == down_block.get_left_col()\
                        and up_right == down_right\
                        and up_type == down_type:  # Same block type
                    # Merge two blocks
                    new_up.append((up_type,
                                   SimpleBlock(None, up_block.get_left_col(),
                                               up_right,
                                               up_block.get_top_row(),
                                               down_block.get_bottom_row())))
                    j += 1
                    k += 1

                elif up_right < down_right:
                    # Upper block ends first: it cannot grow any further.
                    finished.append(up[j])
                    j += 1

                elif down_right < up_right:
                    # Lower block ends first: it becomes a candidate for the next row.
                    new_up.append(down[k])
                    k += 1

                else:
                    # Same right edge but no merge: close the upper block and
                    # carry the lower one forward.
                    finished.append(up[j])
                    new_up.append(down[k])
                    j += 1
                    k += 1

            # Rows of different extent leave unconsumed entries on one side;
            # keep them instead of silently dropping them.
            finished.extend(up[j:])
            new_up.extend(down[k:])
            up = new_up

        finished.extend(up)  # Add whatever is left
        return finished
コード例 #22
0
    def extract_blocks(self, sheet: Sheet,
                       tags: 'np.array[CellTypePMF]') -> List[SimpleBlock]:
        """Return a single block that covers the entire sheet.

        The block's type PMF assigns 0.9 to ATTRIBUTE and 0.1 to HEADER. The
        `tags` argument is not used.
        """
        # Probability distribution of block type
        whole_sheet_pmf = BlockTypePMF({
            BasicBlockType.ATTRIBUTE: 0.9,
            BasicBlockType.HEADER: 0.1,
        })

        row_count, col_count = sheet.values.shape

        # SimpleBlock arguments: type, left col, right col, top row, bottom row.
        return [SimpleBlock(whole_sheet_pmf, 0, col_count - 1, 0, row_count - 1)]
コード例 #23
0
    def testFeaturizerForMultiplesTables(self):

        # Table 1
        sheet1 = np.array([['date', 'value'], ['2001', '10.0'], ['2002', '11.0'], ['2003', '12.0']])
        sheet1 = Sheet(sheet1, None)
        tags = np.array([[CellTypePMF({cell_type.META: 1}), CellTypePMF({cell_type.META: 1})],
                         [CellTypePMF({cell_type.DATE: 1}), CellTypePMF({cell_type.DATA: 1})],
                         [CellTypePMF({cell_type.DATE: 1}), CellTypePMF({cell_type.DATA: 1})],
                         [CellTypePMF({cell_type.DATE: 1}), CellTypePMF({cell_type.DATA: 1})]])

        b1_1 = SimpleBlock("META", 0, 1, 0, 0)
        b1_2 = SimpleBlock("DATE", 0, 0, 1, 3)
        b1_3 = SimpleBlock("_DATA_", 1, 1, 1, 3)
        blocks1 = [b1_1, b1_2, b1_3]

        # Table 2
        sheet2 = np.array([['date', 'value'], ['10.0', '2001'], ['11.0', '2002'], ['12.0', '2003']])
        tags2 = np.array([[CellTypePMF('META'), CellTypePMF('META')], [CellTypePMF('_DATA_'), CellTypePMF('DATE')],
                          [CellTypePMF('_DATA_'), CellTypePMF('DATE')], [CellTypePMF('_DATA_'), CellTypePMF('DATE')]])
        b2_1 = SimpleBlock("META", 0, 1, 0, 0)
        b2_2 = SimpleBlock("_DATA_", 0, 0, 1, 3)
        b2_3 = SimpleBlock("DATE", 1, 1, 1, 3)
        blocks2 = [b2_1, b2_2, b2_3]

        featurizer = Featurize([sheet1, sheet2], [tags1, tags2], [blocks1, blocks2])
        input_features, _ = featurizer.get_input_features()

        print(input_features)

        # assert input_features == [([[0, 0, 0, 1, 0, 0, 1, 0, True, True, True, False, False], [0, 0, 0, 1, 0, 1, 0, 0, True, True, True, False, False], [0, 0, 1, 0, 0, 0, 0, 1, True, True, True, False, False], [0, 0, 1, 0, 0, 1, 0, 0, True, True, True, True, False], [0, 1, 0, 0, 0, 0, 0, 1, True, True, True, False, False], [0, 1, 0, 0, 0, 0, 1, 0, True, True, True, True, False]], [[0, 1], [0, 3], [0, 4], [0, 5], [1, 0], [1, 2], [1, 3], [1, 5], [2, 1], [2, 3], [2, 4], [2, 5], [3, 0], [3, 1], [3, 2], [3, 4], [4, 0], [4, 2], [4, 3], [4, 5], [5, 0], [5, 1], [5, 2], [5, 4]]), ([[0, 0, 0, 1, 0, 0, 1, 0, True, True, True, False, False], [0, 0, 0, 1, 0, 1, 0, 0, True, True, True, False, False], [0, 0, 1, 0, 0, 0, 0, 1, True, True, True, False, False], [0, 0, 1, 0, 0, 1, 0, 0, True, True, True, True, False], [0, 1, 0, 0, 0, 0, 0, 1, True, True, True, False, False], [0, 1, 0, 0, 0, 0, 1, 0, True, True, True, True, False]], [[0, 1], [0, 3], [0, 4], [0, 5], [1, 0], [1, 2], [1, 3], [1, 5], [2, 1], [2, 3], [2, 4], [2, 5], [3, 0], [3, 1], [3, 2], [3, 4], [4, 0], [4, 2], [4, 3], [4, 5], [5, 0], [5, 1], [5, 2], [5, 4]])]
        assert input_features == [([[0, 0, 0, 1, 0, 0, 1, 0, True, True, True, False, False], [0, 0, 0, 1, 0, 1, 0, 0, True, True, True, False, False], [0, 0, 1, 0, 0, 0, 0, 1, True, True, True, False, False], [0, 0, 1, 0, 0, 1, 0, 0, True, True, True, True, False], [0, 1, 0, 0, 0, 0, 0, 1, True, True, True, False, False], [0, 1, 0, 0, 0, 0, 1, 0, True, True, True, True, False]], [[0, 1], [0, 3], [0, 4], [0, 5], [1, 0], [1, 2], [1, 3], [1, 5], [2, 1], [2, 3], [2, 4], [2, 5], [3, 0], [3, 1], [3, 2], [3, 4], [4, 0], [4, 2], [4, 3], [4, 5], [5, 0], [5, 1], [5, 2], [5, 4]], [[0, 0, 0, 1, 0, 0, 1, 0, True, True, True, False, False, 0, 0, 0, 1, 0, 1, 0, 0, True, True, True, False, False], [0, 0, 0, 1, 0, 0, 1, 0, True, True, True, False, False, 0, 0, 1, 0, 0, 1, 0, 0, True, True, True, True, False], [0, 0, 0, 1, 0, 0, 1, 0, True, True, True, False, False, 0, 1, 0, 0, 0, 0, 0, 1, True, True, True, False, False], [0, 0, 0, 1, 0, 0, 1, 0, True, True, True, False, False, 0, 1, 0, 0, 0, 0, 1, 0, True, True, True, True, False], [0, 0, 0, 1, 0, 1, 0, 0, True, True, True, False, False, 0, 0, 0, 1, 0, 0, 1, 0, True, True, True, False, False], [0, 0, 0, 1, 0, 1, 0, 0, True, True, True, False, False, 0, 0, 1, 0, 0, 0, 0, 1, True, True, True, False, False], [0, 0, 0, 1, 0, 1, 0, 0, True, True, True, False, False, 0, 0, 1, 0, 0, 1, 0, 0, True, True, True, True, False], [0, 0, 0, 1, 0, 1, 0, 0, True, True, True, False, False, 0, 1, 0, 0, 0, 0, 1, 0, True, True, True, True, False], [0, 0, 1, 0, 0, 0, 0, 1, True, True, True, False, False, 0, 0, 0, 1, 0, 1, 0, 0, True, True, True, False, False], [0, 0, 1, 0, 0, 0, 0, 1, True, True, True, False, False, 0, 0, 1, 0, 0, 1, 0, 0, True, True, True, True, False], [0, 0, 1, 0, 0, 0, 0, 1, True, True, True, False, False, 0, 1, 0, 0, 0, 0, 0, 1, True, True, True, False, False], [0, 0, 1, 0, 0, 0, 0, 1, True, True, True, False, False, 0, 1, 0, 0, 0, 0, 1, 0, True, True, True, True, False], [0, 0, 1, 0, 0, 1, 0, 0, True, True, True, True, False, 0, 
0, 0, 1, 0, 0, 1, 0, True, True, True, False, False], [0, 0, 1, 0, 0, 1, 0, 0, True, True, True, True, False, 0, 0, 0, 1, 0, 1, 0, 0, True, True, True, False, False], [0, 0, 1, 0, 0, 1, 0, 0, True, True, True, True, False, 0, 0, 1, 0, 0, 0, 0, 1, True, True, True, False, False], [0, 0, 1, 0, 0, 1, 0, 0, True, True, True, True, False, 0, 1, 0, 0, 0, 0, 0, 1, True, True, True, False, False], [0, 1, 0, 0, 0, 0, 0, 1, True, True, True, False, False, 0, 0, 0, 1, 0, 0, 1, 0, True, True, True, False, False], [0, 1, 0, 0, 0, 0, 0, 1, True, True, True, False, False, 0, 0, 1, 0, 0, 0, 0, 1, True, True, True, False, False], [0, 1, 0, 0, 0, 0, 0, 1, True, True, True, False, False, 0, 0, 1, 0, 0, 1, 0, 0, True, True, True, True, False], [0, 1, 0, 0, 0, 0, 0, 1, True, True, True, False, False, 0, 1, 0, 0, 0, 0, 1, 0, True, True, True, True, False], [0, 1, 0, 0, 0, 0, 1, 0, True, True, True, True, False, 0, 0, 0, 1, 0, 0, 1, 0, True, True, True, False, False], [0, 1, 0, 0, 0, 0, 1, 0, True, True, True, True, False, 0, 0, 0, 1, 0, 1, 0, 0, True, True, True, False, False], [0, 1, 0, 0, 0, 0, 1, 0, True, True, True, True, False, 0, 0, 1, 0, 0, 0, 0, 1, True, True, True, False, False], [0, 1, 0, 0, 0, 0, 1, 0, True, True, True, True, False, 0, 1, 0, 0, 0, 0, 0, 1, True, True, True, False, False]]), ([[0, 0, 0, 1, 0, 1, 0, 0, True, True, True, False, False], [0, 0, 0, 1, 0, 0, 1, 0, True, True, True, False, False], [0, 1, 0, 0, 0, 0, 0, 1, True, True, True, False, False], [0, 1, 0, 0, 0, 0, 1, 0, True, True, True, True, False], [0, 0, 1, 0, 0, 0, 0, 1, True, True, True, False, False], [0, 0, 1, 0, 0, 1, 0, 0, True, True, True, True, False]], [[0, 1], [0, 3], [0, 4], [0, 5], [1, 0], [1, 2], [1, 3], [1, 5], [2, 1], [2, 3], [2, 4], [2, 5], [3, 0], [3, 1], [3, 2], [3, 4], [4, 0], [4, 2], [4, 3], [4, 5], [5, 0], [5, 1], [5, 2], [5, 4]], [[0, 0, 0, 1, 0, 1, 0, 0, True, True, True, False, False, 0, 0, 0, 1, 0, 0, 1, 0, True, True, True, False, False], [0, 0, 0, 1, 0, 1, 0, 0, True, True, 
True, False, False, 0, 1, 0, 0, 0, 0, 1, 0, True, True, True, True, False], [0, 0, 0, 1, 0, 1, 0, 0, True, True, True, False, False, 0, 0, 1, 0, 0, 0, 0, 1, True, True, True, False, False], [0, 0, 0, 1, 0, 1, 0, 0, True, True, True, False, False, 0, 0, 1, 0, 0, 1, 0, 0, True, True, True, True, False], [0, 0, 0, 1, 0, 0, 1, 0, True, True, True, False, False, 0, 0, 0, 1, 0, 1, 0, 0, True, True, True, False, False], [0, 0, 0, 1, 0, 0, 1, 0, True, True, True, False, False, 0, 1, 0, 0, 0, 0, 0, 1, True, True, True, False, False], [0, 0, 0, 1, 0, 0, 1, 0, True, True, True, False, False, 0, 1, 0, 0, 0, 0, 1, 0, True, True, True, True, False], [0, 0, 0, 1, 0, 0, 1, 0, True, True, True, False, False, 0, 0, 1, 0, 0, 1, 0, 0, True, True, True, True, False], [0, 1, 0, 0, 0, 0, 0, 1, True, True, True, False, False, 0, 0, 0, 1, 0, 0, 1, 0, True, True, True, False, False], [0, 1, 0, 0, 0, 0, 0, 1, True, True, True, False, False, 0, 1, 0, 0, 0, 0, 1, 0, True, True, True, True, False], [0, 1, 0, 0, 0, 0, 0, 1, True, True, True, False, False, 0, 0, 1, 0, 0, 0, 0, 1, True, True, True, False, False], [0, 1, 0, 0, 0, 0, 0, 1, True, True, True, False, False, 0, 0, 1, 0, 0, 1, 0, 0, True, True, True, True, False], [0, 1, 0, 0, 0, 0, 1, 0, True, True, True, True, False, 0, 0, 0, 1, 0, 1, 0, 0, True, True, True, False, False], [0, 1, 0, 0, 0, 0, 1, 0, True, True, True, True, False, 0, 0, 0, 1, 0, 0, 1, 0, True, True, True, False, False], [0, 1, 0, 0, 0, 0, 1, 0, True, True, True, True, False, 0, 1, 0, 0, 0, 0, 0, 1, True, True, True, False, False], [0, 1, 0, 0, 0, 0, 1, 0, True, True, True, True, False, 0, 0, 1, 0, 0, 0, 0, 1, True, True, True, False, False], [0, 0, 1, 0, 0, 0, 0, 1, True, True, True, False, False, 0, 0, 0, 1, 0, 1, 0, 0, True, True, True, False, False], [0, 0, 1, 0, 0, 0, 0, 1, True, True, True, False, False, 0, 1, 0, 0, 0, 0, 0, 1, True, True, True, False, False], [0, 0, 1, 0, 0, 0, 0, 1, True, True, True, False, False, 0, 1, 0, 0, 0, 0, 1, 0, True, True, True, True, 
False], [0, 0, 1, 0, 0, 0, 0, 1, True, True, True, False, False, 0, 0, 1, 0, 0, 1, 0, 0, True, True, True, True, False], [0, 0, 1, 0, 0, 1, 0, 0, True, True, True, True, False, 0, 0, 0, 1, 0, 1, 0, 0, True, True, True, False, False], [0, 0, 1, 0, 0, 1, 0, 0, True, True, True, True, False, 0, 0, 0, 1, 0, 0, 1, 0, True, True, True, False, False], [0, 0, 1, 0, 0, 1, 0, 0, True, True, True, True, False, 0, 1, 0, 0, 0, 0, 0, 1, True, True, True, False, False], [0, 0, 1, 0, 0, 1, 0, 0, True, True, True, True, False, 0, 0, 1, 0, 0, 0, 0, 1, True, True, True, False, False]])]

        layoutGraph1 = LayoutGraph(blocks1)
        layoutGraph1.add_edge("header", 0, 1)
        layoutGraph1.add_edge("header", 0, 2)
        layoutGraph1.add_edge("meta", 1, 2)

        layoutGraph2 = LayoutGraph(blocks1)
        layoutGraph2.add_edge("header", 0, 1)
        layoutGraph2.add_edge("header", 0, 2)
        layoutGraph2.add_edge("meta", 2, 1)

        labels = featurizer.get_label_map([layoutGraph1, layoutGraph2])

        assert np.array_equal(labels, [[1, 1, 0, 2, 0, 0], [1, 1, 0, 0, 0, 2]])
コード例 #24
0
    def merge_sheet_top_to_bottom(self, row_blocks: List) -> List:
        """Merge per-row blocks vertically into blocks spanning several rows.

        The rows are walked from top to bottom. A candidate block from the
        rows above is merged with a block of the current row when both cover
        the same column span and report the same block type. Otherwise the
        upper block is finalised and the lower block becomes a new candidate
        for merging with the rows that follow.

        Args:
            row_blocks: One list of blocks per row, top row first. Each
                per-row list is presumably ordered left to right (the walk
                below advances whichever side ends at the smaller right
                column) -- TODO confirm against the caller.

        Returns:
            The list of resulting blocks. Empty when ``row_blocks`` is empty.
        """
        # len() instead of truthiness so non-list sequences keep working.
        if len(row_blocks) == 0:
            return []

        blocks = []
        up = row_blocks[0]  # Blocks which might be merged with rows below
        for down in row_blocks[1:]:
            j, k = 0, 0
            new_up = []
            # TODO: Verify correctness
            # TODO: Handle empty cells
            while j < len(up) and k < len(down):
                upper, lower = up[j], down[k]
                up_right = upper.get_right_col()
                down_right = lower.get_right_col()

                if (upper.get_left_col() == lower.get_left_col()
                        and up_right == down_right
                        and upper.get_block_type() == lower.get_block_type()):
                    # Same span and type: grow the upper block down to the
                    # bottom of the lower one.
                    new_up.append(SimpleBlock(upper.get_block_type(), upper.get_left_col(), up_right,
                                              upper.get_top_row(), lower.get_bottom_row()))
                    j += 1
                    k += 1

                elif up_right < down_right:
                    # Upper block ends first: nothing below can extend it.
                    blocks.append(upper)
                    j += 1

                elif down_right < up_right:
                    # Lower block ends first: it starts a new candidate.
                    new_up.append(lower)
                    k += 1

                else:
                    # Same right edge but different left edge or type:
                    # close the upper block and start over with the lower.
                    blocks.append(upper)
                    new_up.append(lower)
                    j += 1
                    k += 1

            # When the two rows do not end on the same column one side still
            # has blocks left; keep them instead of silently dropping them.
            blocks.extend(up[j:])
            new_up.extend(down[k:])
            up = new_up

        blocks.extend(up)  # Add whatever is left
        return blocks
コード例 #25
0
    def testSimpleBlockFunctions(self):
        """Exercise ``is_adjacent`` and ``is_above`` on hand-built blocks."""
        header = SimpleBlock(BasicBlockType.HEADER, 0, 1, 0, 0)
        attribute = SimpleBlock(BasicBlockType.ATTRIBUTE, 0, 0, 1, 3)
        value = SimpleBlock(BasicBlockType.VALUE, 1, 1, 1, 3)
        far_below = SimpleBlock(BasicBlockType.VALUE, 0, 1, 4, 4)
        far_right = SimpleBlock(BasicBlockType.VALUE, 2, 3, 1, 1)
        empty_below = SimpleBlock(BasicBlockType.EMPTY, 0, 1, 1, 1)

        # Adjacency is expected to hold in both directions for these pairs.
        touching_pairs = (
            (header, attribute),
            (attribute, value),
            (header, value),
        )
        for first, second in touching_pairs:
            assert first.is_adjacent(second)
            assert second.is_adjacent(first)
        assert header.is_above(empty_below)

        # Blocks sitting side by side are not "above" one another.
        assert not attribute.is_above(value)
        assert not value.is_above(attribute)

        # Neither of these blocks is expected to be adjacent to the header,
        # in either direction.
        for distant in (far_below, far_right):
            assert not header.is_adjacent(distant)
            assert not distant.is_adjacent(header)
コード例 #26
0
    def testCRFEstimator(self):
        """Smoke-test ``CRFLayoutEstimator`` on two mirrored two-column tables.

        Table 1 has dates in the first column and values in the second;
        table 2 swaps the two columns. Each table gets its own blocks and
        layout graph, and both are fed twice to the estimator. The test makes
        no assertion on the fitted model: it only checks that ``fit_crf``
        runs to completion on this input.
        """
        ATTRIBUTE = BlockTypePMF({block_type.ATTRIBUTE: 1.0})
        VALUE = BlockTypePMF({block_type.VALUE: 1.0})

        # Table 1
        sheet1 = Sheet(
            np.array([['date', 'value'], ['2001', '10.0'],
                      ['2002', '11.0'], ['2003', '12.0']]),
            None)
        tags1 = np.array(
            [[CellTypePMF({cell_type.META: 1}),
              CellTypePMF({cell_type.META: 1})]]
            + [[CellTypePMF({cell_type.DATE: 1}),
                CellTypePMF({cell_type.DATA: 1})] for _ in range(3)])

        b1_1 = SimpleBlock(ATTRIBUTE, 0, 1, 0, 0)
        b1_2 = SimpleBlock(ATTRIBUTE, 0, 0, 1, 3)  # TODO: This is not correct
        b1_3 = SimpleBlock(VALUE, 1, 1, 1, 3)
        blocks1 = [b1_1, b1_2, b1_3]

        # Table 2
        # Wrapped in Sheet like table 1 so the estimator receives a
        # homogeneous list of sheets (previously a raw array was passed).
        sheet2 = Sheet(
            np.array([['date', 'value'], ['10.0', '2001'],
                      ['11.0', '2002'], ['12.0', '2003']]),
            None)
        tags2 = np.array(
            [[CellTypePMF({cell_type.META: 1}),
              CellTypePMF({cell_type.META: 1})]]
            + [[CellTypePMF({cell_type.DATA: 1}),
                CellTypePMF({cell_type.DATE: 1})] for _ in range(3)])

        b2_1 = SimpleBlock(ATTRIBUTE, 0, 1, 0, 0)
        b2_2 = SimpleBlock(VALUE, 0, 0, 1, 3)
        b2_3 = SimpleBlock(ATTRIBUTE, 1, 1, 1, 3)
        blocks2 = [b2_1, b2_2, b2_3]

        layoutGraph1 = LayoutGraph(blocks1)
        layoutGraph1.add_edge(edge_type.HEADER, 0, 1)
        layoutGraph1.add_edge(edge_type.HEADER, 0, 2)
        layoutGraph1.add_edge(edge_type.ATTRIBUTE, 1, 2)

        # The second graph describes table 2, so it is built over blocks2
        # (it used to be built over blocks1, most likely a copy-paste slip).
        layoutGraph2 = LayoutGraph(blocks2)
        layoutGraph2.add_edge(edge_type.HEADER, 0, 1)
        layoutGraph2.add_edge(edge_type.HEADER, 0, 2)
        layoutGraph2.add_edge(edge_type.ATTRIBUTE, 2, 1)

        estimator = CRFLayoutEstimator()
        # The four argument lists are parallel: entry i of each list belongs
        # to the same table.
        estimator.set_input(
            [sheet1, sheet2, sheet1, sheet2],
            [tags1, tags2, tags1, tags2],
            [blocks1, blocks2, blocks1, blocks2],
            [layoutGraph1, layoutGraph2, layoutGraph1, layoutGraph2])

        # No assertion on the result: fitting must simply not raise.
        estimator.fit_crf()
コード例 #27
0
    def get_splits(self, block: SimpleBlock, row_h, col_h):
        """Yield every two-way split of ``block`` at the given rows and columns.

        Horizontal cuts are produced first, then vertical ones. A cut index
        is used only when it lies inside the block and leaves a non-empty
        part on both sides, i.e. ``first <= index < last`` along that axis.

        Args:
            block: The block to split.
            row_h: Candidate row indices; a cut is made just below each one.
            col_h: Candidate column indices; a cut is made just right of each.

        Yields:
            Pairs ``(b1, b2)`` of new ``SimpleBlock`` objects created with a
            block type of ``None``. ``b1`` ends at the cut index and ``b2``
            starts one past it.
        """
        for cut_row in row_h:
            if not (block.get_top_row() <= cut_row < block.get_bottom_row()):
                continue
            left, right = block.get_left_col(), block.get_right_col()
            upper = SimpleBlock(None, left, right, block.get_top_row(), cut_row)
            lower = SimpleBlock(None, left, right, cut_row + 1,
                                block.get_bottom_row())
            yield upper, lower

        for cut_col in col_h:
            if not (block.get_left_col() <= cut_col < block.get_right_col()):
                continue
            top, bottom = block.get_top_row(), block.get_bottom_row()
            left_part = SimpleBlock(None, block.get_left_col(), cut_col,
                                    top, bottom)
            right_part = SimpleBlock(None, cut_col + 1, block.get_right_col(),
                                     top, bottom)
            yield left_part, right_part