def testBlockExtractorV2ForSimpleTableWithTwoColumns(self):
        """Run BlockExtractorV2 on a 4x2 table and check the first three blocks.

        The table has one META/META header row followed by three DATE/DATA
        rows. The extracted blocks are compared positionally, so the test
        depends on the order in which ``extract_blocks`` returns them.
        """
        cell_values = np.array([
            ['date', 'value'],
            ['2001', '10.0'],
            ['2002', '11.0'],
            ['2003', '12.0'],
        ])
        sheet = Sheet(cell_values, None)

        # Every cell gets its own CellTypePMF instance, one per table cell.
        header_row = [CellTypePMF({cell_type.META: 1}) for _ in range(2)]
        body_rows = [
            [CellTypePMF({cell_type.DATE: 1}), CellTypePMF({cell_type.DATA: 1})]
            for _ in range(3)
        ]
        tags = np.array([header_row] + body_rows)

        extracted = BlockExtractorV2().extract_blocks(sheet, tags)

        header_pmf = BlockTypePMF({BasicBlockType.HEADER: 1.0})
        value_pmf = BlockTypePMF({BasicBlockType.VALUE: 1.0})

        # Dump what the extractor produced to ease debugging of failures.
        for found in extracted:
            print(found)

        # TODO: the order of blocks in the list shouldn't actually matter;
        # write a better comparison that does not assume a known order.
        expected = [
            SimpleBlock(header_pmf, 0, 1, 0, 0),
            SimpleBlock(header_pmf, 0, 0, 1, 3),  # Todo: This is not correct
            SimpleBlock(value_pmf, 1, 1, 1, 3),
        ]

        for position, wanted in enumerate(expected):
            assert extracted[position] == wanted
Ejemplo n.º 2
0
    def testCRFClassificationForSimpleTableWithTwoColumns(self):
        """Classify a 4x2 table with CRFCellClassifier and compare the tags.

        The expected result is a META/META header row followed by three
        DATE/DATA rows, each cell tagged with probability 1.
        """
        classifier = CRFCellClassifier()
        cell_values = np.array([
            ['date', 'value'],
            ['2001', '10.0'],
            ['2002', '11.0'],
            ['2003', '12.0'],
        ])
        predicted = classifier.classify_cells(Sheet(cell_values, None))

        header_row = [CellTypePMF({BasicCellType.META: 1}) for _ in range(2)]
        body_rows = [
            [
                CellTypePMF({BasicCellType.DATE: 1}),
                CellTypePMF({BasicCellType.DATA: 1}),
            ]
            for _ in range(3)
        ]
        expected = np.array([header_row] + body_rows)

        assert np.array_equal(predicted, expected)
Ejemplo n.º 3
0
    def testBlockExtractorForSimpleTableWithTwoColumns(self):
        """Run ExampleBlockExtractor on a 4x2 table and check its first block.

        Only ``blocks[0]`` is checked; it must equal a SimpleBlock built with
        the arguments (0, 1, 0, 3) and a 0.9 ATTRIBUTE / 0.1 HEADER block-type
        distribution.
        """
        cell_values = np.array([
            ['date', 'value'],
            ['2001', '10.0'],
            ['2002', '11.0'],
            ['2003', '12.0'],
        ])
        sheet = Sheet(cell_values, None)

        # One META/META header row followed by three DATE/DATA rows.
        header_row = [CellTypePMF({cell_type.META: 1}) for _ in range(2)]
        body_rows = [
            [CellTypePMF({cell_type.DATE: 1}), CellTypePMF({cell_type.DATA: 1})]
            for _ in range(3)
        ]
        tags = np.array([header_row] + body_rows)

        extracted = ExampleBlockExtractor().extract_blocks(sheet, tags)

        # TODO: the order of blocks in the list shouldn't actually matter;
        # write a better comparison that does not assume a known order.
        expected_type = BlockTypePMF({
            BasicBlockType.ATTRIBUTE: 0.9,
            BasicBlockType.HEADER: 0.1,
        })
        expected_block = SimpleBlock(expected_type, 0, 1, 0, 3)

        assert extracted[0] == expected_block
    def testExampleClassificationForSimpleTableWithTwoColumns(self):
        """Classify a 4x2 table with ExampleCellClassifier and compare the tags.

        Every one of the eight cells is expected to be tagged EMPTY with
        probability 1.
        """
        classifier = ExampleCellClassifier()
        cell_values = np.array([
            ['date', 'value'],
            ['2001', '10.0'],
            ['2002', '11.0'],
            ['2003', '12.0'],
        ])
        predicted = classifier.classify_cells(Sheet(cell_values, None))

        # Show the classifier output to ease debugging of failures.
        print(predicted)

        # 4 rows x 2 columns, each cell with its own EMPTY distribution.
        expected = np.array([
            [CellTypePMF({cell_type.EMPTY: 1}) for _ in range(2)]
            for _ in range(4)
        ])

        assert np.array_equal(predicted, expected)
Ejemplo n.º 5
0
    def testFeaturizerForMultiplesTables(self):

        # Table 1
        sheet1 = np.array([['date', 'value'], ['2001', '10.0'], ['2002', '11.0'], ['2003', '12.0']])
        sheet1 = Sheet(sheet1, None)
        tags = np.array([[CellTypePMF({cell_type.META: 1}), CellTypePMF({cell_type.META: 1})],
                         [CellTypePMF({cell_type.DATE: 1}), CellTypePMF({cell_type.DATA: 1})],
                         [CellTypePMF({cell_type.DATE: 1}), CellTypePMF({cell_type.DATA: 1})],
                         [CellTypePMF({cell_type.DATE: 1}), CellTypePMF({cell_type.DATA: 1})]])

        b1_1 = SimpleBlock("META", 0, 1, 0, 0)
        b1_2 = SimpleBlock("DATE", 0, 0, 1, 3)
        b1_3 = SimpleBlock("_DATA_", 1, 1, 1, 3)
        blocks1 = [b1_1, b1_2, b1_3]

        # Table 2
        sheet2 = np.array([['date', 'value'], ['10.0', '2001'], ['11.0', '2002'], ['12.0', '2003']])
        tags2 = np.array([[CellTypePMF('META'), CellTypePMF('META')], [CellTypePMF('_DATA_'), CellTypePMF('DATE')],
                          [CellTypePMF('_DATA_'), CellTypePMF('DATE')], [CellTypePMF('_DATA_'), CellTypePMF('DATE')]])
        b2_1 = SimpleBlock("META", 0, 1, 0, 0)
        b2_2 = SimpleBlock("_DATA_", 0, 0, 1, 3)
        b2_3 = SimpleBlock("DATE", 1, 1, 1, 3)
        blocks2 = [b2_1, b2_2, b2_3]

        featurizer = Featurize([sheet1, sheet2], [tags1, tags2], [blocks1, blocks2])
        input_features, _ = featurizer.get_input_features()

        print(input_features)

        # assert input_features == [([[0, 0, 0, 1, 0, 0, 1, 0, True, True, True, False, False], [0, 0, 0, 1, 0, 1, 0, 0, True, True, True, False, False], [0, 0, 1, 0, 0, 0, 0, 1, True, True, True, False, False], [0, 0, 1, 0, 0, 1, 0, 0, True, True, True, True, False], [0, 1, 0, 0, 0, 0, 0, 1, True, True, True, False, False], [0, 1, 0, 0, 0, 0, 1, 0, True, True, True, True, False]], [[0, 1], [0, 3], [0, 4], [0, 5], [1, 0], [1, 2], [1, 3], [1, 5], [2, 1], [2, 3], [2, 4], [2, 5], [3, 0], [3, 1], [3, 2], [3, 4], [4, 0], [4, 2], [4, 3], [4, 5], [5, 0], [5, 1], [5, 2], [5, 4]]), ([[0, 0, 0, 1, 0, 0, 1, 0, True, True, True, False, False], [0, 0, 0, 1, 0, 1, 0, 0, True, True, True, False, False], [0, 0, 1, 0, 0, 0, 0, 1, True, True, True, False, False], [0, 0, 1, 0, 0, 1, 0, 0, True, True, True, True, False], [0, 1, 0, 0, 0, 0, 0, 1, True, True, True, False, False], [0, 1, 0, 0, 0, 0, 1, 0, True, True, True, True, False]], [[0, 1], [0, 3], [0, 4], [0, 5], [1, 0], [1, 2], [1, 3], [1, 5], [2, 1], [2, 3], [2, 4], [2, 5], [3, 0], [3, 1], [3, 2], [3, 4], [4, 0], [4, 2], [4, 3], [4, 5], [5, 0], [5, 1], [5, 2], [5, 4]])]
        assert input_features == [([[0, 0, 0, 1, 0, 0, 1, 0, True, True, True, False, False], [0, 0, 0, 1, 0, 1, 0, 0, True, True, True, False, False], [0, 0, 1, 0, 0, 0, 0, 1, True, True, True, False, False], [0, 0, 1, 0, 0, 1, 0, 0, True, True, True, True, False], [0, 1, 0, 0, 0, 0, 0, 1, True, True, True, False, False], [0, 1, 0, 0, 0, 0, 1, 0, True, True, True, True, False]], [[0, 1], [0, 3], [0, 4], [0, 5], [1, 0], [1, 2], [1, 3], [1, 5], [2, 1], [2, 3], [2, 4], [2, 5], [3, 0], [3, 1], [3, 2], [3, 4], [4, 0], [4, 2], [4, 3], [4, 5], [5, 0], [5, 1], [5, 2], [5, 4]], [[0, 0, 0, 1, 0, 0, 1, 0, True, True, True, False, False, 0, 0, 0, 1, 0, 1, 0, 0, True, True, True, False, False], [0, 0, 0, 1, 0, 0, 1, 0, True, True, True, False, False, 0, 0, 1, 0, 0, 1, 0, 0, True, True, True, True, False], [0, 0, 0, 1, 0, 0, 1, 0, True, True, True, False, False, 0, 1, 0, 0, 0, 0, 0, 1, True, True, True, False, False], [0, 0, 0, 1, 0, 0, 1, 0, True, True, True, False, False, 0, 1, 0, 0, 0, 0, 1, 0, True, True, True, True, False], [0, 0, 0, 1, 0, 1, 0, 0, True, True, True, False, False, 0, 0, 0, 1, 0, 0, 1, 0, True, True, True, False, False], [0, 0, 0, 1, 0, 1, 0, 0, True, True, True, False, False, 0, 0, 1, 0, 0, 0, 0, 1, True, True, True, False, False], [0, 0, 0, 1, 0, 1, 0, 0, True, True, True, False, False, 0, 0, 1, 0, 0, 1, 0, 0, True, True, True, True, False], [0, 0, 0, 1, 0, 1, 0, 0, True, True, True, False, False, 0, 1, 0, 0, 0, 0, 1, 0, True, True, True, True, False], [0, 0, 1, 0, 0, 0, 0, 1, True, True, True, False, False, 0, 0, 0, 1, 0, 1, 0, 0, True, True, True, False, False], [0, 0, 1, 0, 0, 0, 0, 1, True, True, True, False, False, 0, 0, 1, 0, 0, 1, 0, 0, True, True, True, True, False], [0, 0, 1, 0, 0, 0, 0, 1, True, True, True, False, False, 0, 1, 0, 0, 0, 0, 0, 1, True, True, True, False, False], [0, 0, 1, 0, 0, 0, 0, 1, True, True, True, False, False, 0, 1, 0, 0, 0, 0, 1, 0, True, True, True, True, False], [0, 0, 1, 0, 0, 1, 0, 0, True, True, True, True, False, 0, 
0, 0, 1, 0, 0, 1, 0, True, True, True, False, False], [0, 0, 1, 0, 0, 1, 0, 0, True, True, True, True, False, 0, 0, 0, 1, 0, 1, 0, 0, True, True, True, False, False], [0, 0, 1, 0, 0, 1, 0, 0, True, True, True, True, False, 0, 0, 1, 0, 0, 0, 0, 1, True, True, True, False, False], [0, 0, 1, 0, 0, 1, 0, 0, True, True, True, True, False, 0, 1, 0, 0, 0, 0, 0, 1, True, True, True, False, False], [0, 1, 0, 0, 0, 0, 0, 1, True, True, True, False, False, 0, 0, 0, 1, 0, 0, 1, 0, True, True, True, False, False], [0, 1, 0, 0, 0, 0, 0, 1, True, True, True, False, False, 0, 0, 1, 0, 0, 0, 0, 1, True, True, True, False, False], [0, 1, 0, 0, 0, 0, 0, 1, True, True, True, False, False, 0, 0, 1, 0, 0, 1, 0, 0, True, True, True, True, False], [0, 1, 0, 0, 0, 0, 0, 1, True, True, True, False, False, 0, 1, 0, 0, 0, 0, 1, 0, True, True, True, True, False], [0, 1, 0, 0, 0, 0, 1, 0, True, True, True, True, False, 0, 0, 0, 1, 0, 0, 1, 0, True, True, True, False, False], [0, 1, 0, 0, 0, 0, 1, 0, True, True, True, True, False, 0, 0, 0, 1, 0, 1, 0, 0, True, True, True, False, False], [0, 1, 0, 0, 0, 0, 1, 0, True, True, True, True, False, 0, 0, 1, 0, 0, 0, 0, 1, True, True, True, False, False], [0, 1, 0, 0, 0, 0, 1, 0, True, True, True, True, False, 0, 1, 0, 0, 0, 0, 0, 1, True, True, True, False, False]]), ([[0, 0, 0, 1, 0, 1, 0, 0, True, True, True, False, False], [0, 0, 0, 1, 0, 0, 1, 0, True, True, True, False, False], [0, 1, 0, 0, 0, 0, 0, 1, True, True, True, False, False], [0, 1, 0, 0, 0, 0, 1, 0, True, True, True, True, False], [0, 0, 1, 0, 0, 0, 0, 1, True, True, True, False, False], [0, 0, 1, 0, 0, 1, 0, 0, True, True, True, True, False]], [[0, 1], [0, 3], [0, 4], [0, 5], [1, 0], [1, 2], [1, 3], [1, 5], [2, 1], [2, 3], [2, 4], [2, 5], [3, 0], [3, 1], [3, 2], [3, 4], [4, 0], [4, 2], [4, 3], [4, 5], [5, 0], [5, 1], [5, 2], [5, 4]], [[0, 0, 0, 1, 0, 1, 0, 0, True, True, True, False, False, 0, 0, 0, 1, 0, 0, 1, 0, True, True, True, False, False], [0, 0, 0, 1, 0, 1, 0, 0, True, True, 
True, False, False, 0, 1, 0, 0, 0, 0, 1, 0, True, True, True, True, False], [0, 0, 0, 1, 0, 1, 0, 0, True, True, True, False, False, 0, 0, 1, 0, 0, 0, 0, 1, True, True, True, False, False], [0, 0, 0, 1, 0, 1, 0, 0, True, True, True, False, False, 0, 0, 1, 0, 0, 1, 0, 0, True, True, True, True, False], [0, 0, 0, 1, 0, 0, 1, 0, True, True, True, False, False, 0, 0, 0, 1, 0, 1, 0, 0, True, True, True, False, False], [0, 0, 0, 1, 0, 0, 1, 0, True, True, True, False, False, 0, 1, 0, 0, 0, 0, 0, 1, True, True, True, False, False], [0, 0, 0, 1, 0, 0, 1, 0, True, True, True, False, False, 0, 1, 0, 0, 0, 0, 1, 0, True, True, True, True, False], [0, 0, 0, 1, 0, 0, 1, 0, True, True, True, False, False, 0, 0, 1, 0, 0, 1, 0, 0, True, True, True, True, False], [0, 1, 0, 0, 0, 0, 0, 1, True, True, True, False, False, 0, 0, 0, 1, 0, 0, 1, 0, True, True, True, False, False], [0, 1, 0, 0, 0, 0, 0, 1, True, True, True, False, False, 0, 1, 0, 0, 0, 0, 1, 0, True, True, True, True, False], [0, 1, 0, 0, 0, 0, 0, 1, True, True, True, False, False, 0, 0, 1, 0, 0, 0, 0, 1, True, True, True, False, False], [0, 1, 0, 0, 0, 0, 0, 1, True, True, True, False, False, 0, 0, 1, 0, 0, 1, 0, 0, True, True, True, True, False], [0, 1, 0, 0, 0, 0, 1, 0, True, True, True, True, False, 0, 0, 0, 1, 0, 1, 0, 0, True, True, True, False, False], [0, 1, 0, 0, 0, 0, 1, 0, True, True, True, True, False, 0, 0, 0, 1, 0, 0, 1, 0, True, True, True, False, False], [0, 1, 0, 0, 0, 0, 1, 0, True, True, True, True, False, 0, 1, 0, 0, 0, 0, 0, 1, True, True, True, False, False], [0, 1, 0, 0, 0, 0, 1, 0, True, True, True, True, False, 0, 0, 1, 0, 0, 0, 0, 1, True, True, True, False, False], [0, 0, 1, 0, 0, 0, 0, 1, True, True, True, False, False, 0, 0, 0, 1, 0, 1, 0, 0, True, True, True, False, False], [0, 0, 1, 0, 0, 0, 0, 1, True, True, True, False, False, 0, 1, 0, 0, 0, 0, 0, 1, True, True, True, False, False], [0, 0, 1, 0, 0, 0, 0, 1, True, True, True, False, False, 0, 1, 0, 0, 0, 0, 1, 0, True, True, True, True, 
False], [0, 0, 1, 0, 0, 0, 0, 1, True, True, True, False, False, 0, 0, 1, 0, 0, 1, 0, 0, True, True, True, True, False], [0, 0, 1, 0, 0, 1, 0, 0, True, True, True, True, False, 0, 0, 0, 1, 0, 1, 0, 0, True, True, True, False, False], [0, 0, 1, 0, 0, 1, 0, 0, True, True, True, True, False, 0, 0, 0, 1, 0, 0, 1, 0, True, True, True, False, False], [0, 0, 1, 0, 0, 1, 0, 0, True, True, True, True, False, 0, 1, 0, 0, 0, 0, 0, 1, True, True, True, False, False], [0, 0, 1, 0, 0, 1, 0, 0, True, True, True, True, False, 0, 0, 1, 0, 0, 0, 0, 1, True, True, True, False, False]])]

        layoutGraph1 = LayoutGraph(blocks1)
        layoutGraph1.add_edge("header", 0, 1)
        layoutGraph1.add_edge("header", 0, 2)
        layoutGraph1.add_edge("meta", 1, 2)

        layoutGraph2 = LayoutGraph(blocks1)
        layoutGraph2.add_edge("header", 0, 1)
        layoutGraph2.add_edge("header", 0, 2)
        layoutGraph2.add_edge("meta", 2, 1)

        labels = featurizer.get_label_map([layoutGraph1, layoutGraph2])

        assert np.array_equal(labels, [[1, 1, 0, 2, 0, 0], [1, 1, 0, 0, 0, 2]])
Ejemplo n.º 6
0
    def testBlockExtractorForSimpleTableWithTwoColumns(self):
        """Run SimpleBlockExtractor on a 4x2 table and check the first three blocks.

        The expected blocks carry the string types "META", "DATE" and
        "_DATA_" and are compared positionally against the extractor output.
        """
        cell_values = np.array([
            ['date', 'value'],
            ['2001', '10.0'],
            ['2002', '11.0'],
            ['2003', '12.0'],
        ])
        sheet = Sheet(cell_values, None)

        # One META/META header row followed by three DATE/DATA rows.
        header_row = [CellTypePMF({BasicCellType.META: 1}) for _ in range(2)]
        body_rows = [
            [
                CellTypePMF({BasicCellType.DATE: 1}),
                CellTypePMF({BasicCellType.DATA: 1}),
            ]
            for _ in range(3)
        ]
        tags = np.array([header_row] + body_rows)

        extracted = SimpleBlockExtractor().extract_blocks(sheet, tags)

        # TODO: the order of blocks in the list shouldn't actually matter;
        # write a better comparison that does not assume a known order.
        # NOTE(review): ``meta`` is constructed but never used below; the
        # expected blocks use string types instead. Confirm which is intended.
        meta = BlockTypePMF({BasicBlockType.ATTRIBUTE: 1.0})

        expected = [
            SimpleBlock("META", 0, 1, 0, 0),
            SimpleBlock("DATE", 0, 0, 1, 3),
            SimpleBlock("_DATA_", 1, 1, 1, 3),
        ]

        for position, wanted in enumerate(expected):
            assert extracted[position] == wanted
Ejemplo n.º 7
0
    def testFeaturizerForSimpleTableWithTwoColumns(self):
        """Featurize a single 4x2 table and check the resulting label map.

        The input features are only printed; the label map computed for one
        layout graph with three edges is asserted exactly.
        """
        sheet = Sheet(
            np.array([['date', 'value'], ['2001', '10.0'], ['2002', '11.0'], ['2003', '12.0']]),
            None,
        )

        # One META/META header row followed by three DATE/DATA rows.
        header_row = [CellTypePMF({cell_type.META: 1}) for _ in range(2)]
        body_rows = [
            [CellTypePMF({cell_type.DATE: 1}), CellTypePMF({cell_type.DATA: 1})]
            for _ in range(3)
        ]
        tags = np.array([header_row] + body_rows)

        attribute_pmf = BlockTypePMF({block_type.ATTRIBUTE: 1.0})
        value_pmf = BlockTypePMF({block_type.VALUE: 1.0})

        blocks = [
            SimpleBlock(attribute_pmf, 0, 1, 0, 0),
            SimpleBlock(attribute_pmf, 0, 0, 1, 3),  # Todo: This is not correct
            SimpleBlock(value_pmf, 1, 1, 1, 3),
        ]

        featurizer = Featurize([sheet], [tags], [blocks])
        input_features, _ignored = featurizer.get_input_features()

        print(input_features)

        # TODO: FIX THIS? The exact-value assertion on ``input_features`` is
        # disabled, so this test currently verifies only the label map below.

        graph = LayoutGraph(blocks)
        edges = (
            (edge_type.HEADER, 0, 1),
            (edge_type.HEADER, 0, 2),
            (edge_type.ATTRIBUTE, 1, 2),
        )
        for kind, source, target in edges:
            graph.add_edge(kind, source, target)

        labels = featurizer.get_label_map([graph])

        assert np.array_equal(labels, [[1, 1, 0, 2, 0, 0]])
Ejemplo n.º 8
0
    def testLayoutDetectionForSimpleTableWithTwoColumns(self):
        """Run ExampleLayoutDetector on three blocks and check the edge lists.

        Both ``inEdges`` and ``outEdges`` of the detected layout are asserted:
        blocks 0 and 1 each have one ATTRIBUTE edge to block 2.
        """
        cell_values = np.array([['date', 'value'], ['2001', '10.0'], ['2002', '11.0'], ['2003', '12.0']])
        sheet = Sheet(cell_values, None)

        # One META/META header row followed by three DATE/DATA rows.
        header_row = [CellTypePMF({BasicCellType.META: 1}) for _ in range(2)]
        body_rows = [
            [CellTypePMF({BasicCellType.DATE: 1}), CellTypePMF({BasicCellType.DATA: 1})]
            for _ in range(3)
        ]
        tags = np.array([header_row] + body_rows)

        attribute_pmf = BlockTypePMF({BasicBlockType.ATTRIBUTE: 1.0})
        value_pmf = BlockTypePMF({BasicBlockType.VALUE: 1.0})

        blocks = [
            SimpleBlock(attribute_pmf, 0, 1, 0, 0),
            SimpleBlock(attribute_pmf, 0, 0, 1, 3),  # Todo: This is not correct
            SimpleBlock(value_pmf, 1, 1, 1, 3),
        ]

        detected = ExampleLayoutDetector().detect_layout(sheet, tags, blocks)

        # TODO: The labels assigned to the edges here are actually wrong.
        # Labels from block b1 should be headers.
        expected_in = [
            [],
            [],
            [(BasicEdgeType.ATTRIBUTE, 0), (BasicEdgeType.ATTRIBUTE, 1)],
        ]
        expected_out = [
            [(BasicEdgeType.ATTRIBUTE, 2)],
            [(BasicEdgeType.ATTRIBUTE, 2)],
            [],
        ]
        assert detected.inEdges == expected_in
        assert detected.outEdges == expected_out
Ejemplo n.º 9
0
    def testCRFEstimator(self):
        """Fit CRFLayoutEstimator on two small tables, each supplied twice.

        There are no assertions: the test passes as long as ``set_input`` and
        ``fit_crf`` complete without raising.
        """
        attribute_pmf = BlockTypePMF({block_type.ATTRIBUTE: 1.0})
        value_pmf = BlockTypePMF({block_type.VALUE: 1.0})

        # Table 1: DATE in the first column, DATA in the second.
        sheet1 = Sheet(
            np.array([['date', 'value'], ['2001', '10.0'],
                      ['2002', '11.0'], ['2003', '12.0']]),
            None,
        )
        tags1 = np.array(
            [[CellTypePMF({cell_type.META: 1}), CellTypePMF({cell_type.META: 1})]]
            + [
                [CellTypePMF({cell_type.DATE: 1}), CellTypePMF({cell_type.DATA: 1})]
                for _ in range(3)
            ]
        )
        blocks1 = [
            SimpleBlock(attribute_pmf, 0, 1, 0, 0),
            SimpleBlock(attribute_pmf, 0, 0, 1, 3),  # Todo: This is not correct
            SimpleBlock(value_pmf, 1, 1, 1, 3),
        ]

        # Table 2: same table with the DATA and DATE columns swapped.
        # NOTE(review): sheet2 stays a raw array and is not wrapped in Sheet,
        # unlike sheet1 -- confirm the estimator accepts both.
        sheet2 = np.array([['date', 'value'], ['10.0', '2001'],
                           ['11.0', '2002'], ['12.0', '2003']])
        tags2 = np.array(
            [[CellTypePMF({cell_type.META: 1}), CellTypePMF({cell_type.META: 1})]]
            + [
                [CellTypePMF({cell_type.DATA: 1}), CellTypePMF({cell_type.DATE: 1})]
                for _ in range(3)
            ]
        )
        blocks2 = [
            SimpleBlock(attribute_pmf, 0, 1, 0, 0),
            SimpleBlock(value_pmf, 0, 0, 1, 3),
            SimpleBlock(attribute_pmf, 1, 1, 1, 3),
        ]

        layoutGraph1 = LayoutGraph(blocks1)
        layoutGraph1.add_edge(edge_type.HEADER, 0, 1)
        layoutGraph1.add_edge(edge_type.HEADER, 0, 2)
        layoutGraph1.add_edge(edge_type.ATTRIBUTE, 1, 2)

        # NOTE(review): built from blocks1, not blocks2 -- confirm intent.
        layoutGraph2 = LayoutGraph(blocks1)
        layoutGraph2.add_edge(edge_type.HEADER, 0, 1)
        layoutGraph2.add_edge(edge_type.HEADER, 0, 2)
        layoutGraph2.add_edge(edge_type.ATTRIBUTE, 2, 1)

        # Each table is passed twice, alternating table 1 and table 2.
        estimator = CRFLayoutEstimator()
        estimator.set_input(
            [sheet1, sheet2] * 2,
            [tags1, tags2] * 2,
            [blocks1, blocks2] * 2,
            [layoutGraph1, layoutGraph2] * 2,
        )

        # The fitted detector is not inspected further.
        estimator.fit_crf()
Ejemplo n.º 10
0
 def get_sheet_by_index(self, idx) -> Sheet:
     """Return the workbook sheet at ``idx`` as a ``Sheet``.

     The sheet contents are read with ``to_array()``, converted to a numpy
     array, and wrapped in a ``Sheet`` whose second argument is ``None``.
     """
     raw_cells = self.wb.sheet_by_index(idx).to_array()
     return Sheet(np.array(raw_cells), None)
Ejemplo n.º 11
0
 def get_sheets(self) -> List[Sheet]:
     """Yield one ``Sheet`` per entry of ``self.wb.to_dict()``.

     Despite the ``List`` annotation this is a generator: sheets are
     produced lazily, one at a time. Each sheet's contents are converted to
     a numpy array and wrapped in a ``Sheet`` whose second argument is
     ``None``.
     """
     for sheet_name in self.wb.to_dict():
         cells = np.array(self.wb[sheet_name].to_array())
         yield Sheet(cells, None)
Ejemplo n.º 12
0
 def get_sheet_by_index(self, idx) -> Sheet:
     """Return the workbook sheet at ``idx`` as a ``Sheet``.

     The cell array is passed to ``fill_merged_cells`` together with the
     ``merged_cells`` of the same sheet in ``self.wb_xlrd``. The helper's
     return value is ignored, so it presumably updates the array in place
     (verify against ``fill_merged_cells``).
     """
     cells = np.array(self.wb.sheet_by_index(idx).to_array())
     merged_ranges = self.wb_xlrd.sheet_by_index(idx).merged_cells
     self.fill_merged_cells(cells, merged_ranges)
     return Sheet(cells, None)
Ejemplo n.º 13
0
 def get_sheets(self) -> List[Sheet]:
     """Yield one named ``Sheet`` per entry of ``self.wb.to_dict()``.

     Despite the ``List`` annotation this is a generator: sheets are
     produced lazily, one at a time. Each cell array is passed to
     ``fill_merged_cells`` with the ``merged_cells`` of the same-named sheet
     in ``self.wb_xlrd`` (the helper's return value is ignored, so it
     presumably updates the array in place -- verify). The sheet name is
     attached as ``{'name': name}``.
     """
     for sheet_name in self.wb.to_dict():
         cells = np.array(self.wb[sheet_name].to_array())
         merged_ranges = self.wb_xlrd.sheet_by_name(sheet_name).merged_cells
         self.fill_merged_cells(cells, merged_ranges)
         yield Sheet(cells, {'name': sheet_name})