コード例 #1
0
 def setUp(self):
     """Build `self.cell`: a 1x1 cell holding two tokens with bounding boxes."""
     hi_token = Token(
         text='hi',
         bounding_box=Box(llx=-1.0, lly=-0.5, urx=1.0, ury=1.0))
     bye_token = Token(
         text='bye',
         bounding_box=Box(llx=1.5, lly=-0.5, urx=2.5, ury=1.5))
     self.cell = Cell(tokens=[hi_token, bye_token], rowspan=1, colspan=1)
コード例 #2
0
 def test_insert_column(self):
     """Check `insert_column` at index 1 and that a three-cell column raises."""
     top = Cell(tokens=[Token(text='x')], rowspan=1, colspan=1)
     bottom = Cell(tokens=[Token(text='y')], rowspan=1, colspan=1)

     # Call the method under test first, then build the grid it should equal.
     actual = self.easy_table.insert_column(index=1, column=[top, bottom])
     expected = Table.create_from_grid(
         grid=[[self.a, top, self.b, self.c],
               [self.d, bottom, self.e, self.f]])
     self.assertEqual(actual, expected)

     # A column with three cells is expected to be rejected.
     too_long = [top, bottom, bottom]
     with self.assertRaises(Exception):
         self.easy_table.insert_column(index=1, column=too_long)
コード例 #3
0
ファイル: xml_to_tables_parser.py プロジェクト: afcarl/corvid
    def _create_table_from_omnipage_xml(self, table_tag: Tag, caption: str,
                                        paper_id: str) -> Table:
        """Build a `Table` from one table tag of the XML being parsed.

        The grid size is taken from the number of `gridcol` / `gridrow` tags
        under `gridtable`. Each `cellzone` tag becomes one `Cell` whose tokens
        are the stripped texts of its `wd` tags and whose spans come from the
        `grid{row,col}from` / `grid{row,col}till` attributes (inclusive).

        :param table_tag: tag holding the `gridtable` and `cellzone` children
        :param caption: caption passed through to the table
        :param paper_id: paper identifier passed through to the table
        :return: the table built by `Table.create_from_cells`; `page_num` is
            always 0
        """

        def _inclusive_span(tag, till_attr, from_attr):
            # Both boundaries are inclusive grid indices, hence the +1.
            return int(tag.get(till_attr)) - int(tag.get(from_attr)) + 1

        ncol = len(table_tag.find('gridtable').find_all('gridcol'))
        nrow = len(table_tag.find('gridtable').find_all('gridrow'))

        # One cell per `cellzone`, one token per `wd` inside it.
        cells = [
            Cell(tokens=[
                Token(text=word_tag.get_text(strip=True))
                for word_tag in cell_tag.find_all('wd')
            ],
                 rowspan=_inclusive_span(cell_tag, 'gridrowtill',
                                         'gridrowfrom'),
                 colspan=_inclusive_span(cell_tag, 'gridcoltill',
                                         'gridcolfrom'))
            for cell_tag in table_tag.find_all('cellzone')
        ]

        return Table.create_from_cells(cells=cells,
                                       nrow=nrow,
                                       ncol=ncol,
                                       paper_id=paper_id,
                                       page_num=0,
                                       caption=caption)
コード例 #4
0
 def test_compute_bounding_box(self):
     """A 1x2 table's bounding box should enclose both of its cells' boxes."""
     left_cell = Cell(tokens=[Token(text='e')],
                      rowspan=1,
                      colspan=1,
                      bounding_box=Box(llx=-1.0, lly=-0.5, urx=1.0, ury=1.0))
     right_cell = Cell(tokens=[Token(text='e')],
                       rowspan=1,
                       colspan=1,
                       bounding_box=Box(llx=1.5, lly=-0.5, urx=2.5, ury=1.5))
     table = Table.create_from_cells(cells=[left_cell, right_cell],
                                     nrow=1,
                                     ncol=2,
                                     paper_id='abc',
                                     page_num=0,
                                     caption='hi this is caption')

     table_box = table.bounding_box
     # Lower-left corner: smallest llx / lly over the two cells.
     self.assertEqual(table_box.ll.x, -1.0)
     self.assertEqual(table_box.ll.y, -0.5)
     # Upper-right corner: largest urx / ury over the two cells.
     self.assertEqual(table_box.ur.x, 2.5)
     self.assertEqual(table_box.ur.y, 1.5)
コード例 #5
0
ファイル: xml_to_tables_parser.py プロジェクト: afcarl/corvid
    def _create_table_from_tetml(self, table_id: int, table_tag: Tag,
                                 paper_id: str, caption: str) -> Table:
        """Build a `Table` from one table tag made of `row` / `cell` tags.

        Every `cell` tag becomes a `Cell` with `rowspan=1` and a `colspan`
        read from its `colspan` attribute (1 when absent or empty). Each
        `word` tag inside a cell becomes a `Token` whose text, font and
        bounding box are read from the word's `box` tag.

        :param table_id: identifier used only in error messages
        :param table_tag: tag holding the `row` children
        :param paper_id: paper identifier passed through to the table
        :param caption: caption passed through to the table
        :return: the table built by `Table.create_from_cells`; `page_num` is
            always 0
        :raises TetmlXMLToTablesParserException: if the table has no rows, or
            if the rows do not all span the same number of columns
        """
        cells = []
        ncol_per_row = []
        for row_tag in table_tag.find_all('row'):

            ncol_in_row = 0
            for cell_tag in row_tag.find_all('cell'):

                # BUILD LIST OF TOKENS
                tokens = []
                for word_tag in cell_tag.find_all('word'):
                    word_box_tag = word_tag.find('box')
                    token = Token(
                        text=word_box_tag.get_text(strip=True),
                        # `find_all` gets font per character,
                        # but use `find` because assume font
                        # is constant within same word
                        font=word_box_tag.find('glyph').get('font'),
                        bounding_box=Box(llx=float(word_box_tag.get('llx')),
                                         lly=float(word_box_tag.get('lly')),
                                         urx=float(word_box_tag.get('urx')),
                                         ury=float(word_box_tag.get('ury'))))
                    tokens.append(token)

                # BUILD CELL FROM LIST OF TOKENS
                colspan_attr = cell_tag.get('colspan')
                cell = Cell(tokens=tokens,
                            rowspan=1,
                            colspan=int(colspan_attr) if colspan_attr else 1)
                cells.append(cell)
                ncol_in_row += cell.colspan

            ncol_per_row.append(ncol_in_row)

        # A table without rows would otherwise pass the equal-width check
        # vacuously and then fail with a bare IndexError on `ncol_per_row[0]`.
        if not ncol_per_row:
            raise TetmlXMLToTablesParserException(
                'Table {} has no rows. Skipping...'.format(table_id))

        # TODO: add more filters here if necessary
        if any(ncol != ncol_per_row[0] for ncol in ncol_per_row):
            raise TetmlXMLToTablesParserException(
                'Table {} has unequal columns per row. Skipping...'.format(
                    table_id))

        # TODO: `page_num` and `paper_id` fields
        # BUILD TABLE FROM LIST OF CELLS
        table = Table.create_from_cells(cells=cells,
                                        nrow=len(ncol_per_row),
                                        ncol=ncol_per_row[0],
                                        paper_id=paper_id,
                                        page_num=0,
                                        caption=caption)
        return table
コード例 #6
0
    def test_compute_metrics(self):
        """Check `compute_metrics` error cases and a zero-recall case."""

        def _unit_cells(texts):
            # One single-token 1x1 cell per text, in the given order.
            return [
                Cell(tokens=[Token(text=text)], rowspan=1, colspan=1)
                for text in texts
            ]

        # Prediction that has the data rows but no header row.
        pred_table_missing_header = Table.create_from_cells(
            cells=_unit_cells(['x', '1', '2',
                               'y', '3', '4',
                               'z', '5', '6']),
            nrow=3, ncol=3)

        with self.assertRaises(Exception):
            compute_metrics(gold_table=self.gold_table,
                            pred_table=pred_table_missing_header)

        recall_vs_empty = cell_level_recall(gold_table=self.gold_table,
                                            pred_table=self.pred_table_empty)
        self.assertEqual(recall_vs_empty, 0.0)

        # Prediction whose two value columns are swapped, header included.
        pred_table_permuted_header = Table.create_from_cells(
            cells=_unit_cells(['subject', 'header2', 'header1',
                               'x', '2', '1',
                               'y', '4', '3',
                               'z', '6', '5']),
            nrow=4, ncol=3)

        with self.assertRaises(Exception):
            compute_metrics(gold_table=self.gold_table,
                            pred_table=pred_table_permuted_header)
コード例 #7
0
    def setUp(self):
        """
        Build the gold table and the predicted-table variants used by the tests.

        gold:
            subject, header1, header2
            x, 1, 2
            y, 3, 4
            z, 5, 6
        """

        def _unit_cells(texts):
            # One single-token 1x1 cell per text, in the given order.
            return [
                Cell(tokens=[Token(text=text)], rowspan=1, colspan=1)
                for text in texts
            ]

        header = ['subject', 'header1', 'header2']

        self.gold_table = Table.create_from_cells(
            cells=_unit_cells(header + ['x', '1', '2',
                                        'y', '3', '4',
                                        'z', '5', '6']),
            nrow=4, ncol=3)

        # Header row only.
        self.gold_table_empty = Table.create_from_cells(
            cells=_unit_cells(header), nrow=1, ncol=3)

        # The perfect prediction is the very same object as the gold table.
        self.pred_table_perfect = self.gold_table

        # Header row only.
        self.pred_table_empty = Table.create_from_cells(
            cells=_unit_cells(header), nrow=1, ncol=3)

        # Same rows as gold, data rows in reverse order.
        self.pred_table_permute_rows = Table.create_from_cells(
            cells=_unit_cells(header + ['z', '5', '6',
                                        'y', '3', '4',
                                        'x', '1', '2']),
            nrow=4, ncol=3)

        # Gold rows plus one additional row ('w', '7', '8').
        self.pred_table_extra_rows = Table.create_from_cells(
            cells=_unit_cells(header + ['x', '1', '2',
                                        'y', '3', '4',
                                        'z', '5', '6',
                                        'w', '7', '8']),
            nrow=5, ncol=3)

        # Gold rows without the 'z' row.
        self.pred_table_missing_rows = Table.create_from_cells(
            cells=_unit_cells(header + ['x', '1', '2',
                                        'y', '3', '4']),
            nrow=3, ncol=3)

        # Same subjects as gold, but several values differ from gold.
        self.pred_table_partial_credit = Table.create_from_cells(
            cells=_unit_cells(header + ['x', '1', '1',
                                        'y', '4', '4',
                                        'z', '3', '3']),
            nrow=4, ncol=3)
コード例 #8
0
    def test_count_matching_cells(self):
        """Identical rows should give 3.0; a reversed row should give 1.0."""

        def _row(texts):
            # One single-token 1x1 cell per text, in the given order.
            return [
                Cell(tokens=[Token(text=text)], rowspan=1, colspan=1)
                for text in texts
            ]

        same_order = count_matching_cells(row1=_row(['1', '2', '3']),
                                          row2=_row(['1', '2', '3']))
        self.assertEqual(same_order, 3.0)

        # Only the middle cell lines up once the second row is reversed.
        reversed_order = count_matching_cells(row1=_row(['1', '2', '3']),
                                              row2=_row(['3', '2', '1']))
        self.assertEqual(reversed_order, 1.0)
コード例 #9
0
    def test_improper_table(self):
        """`Table.create_from_cells` should raise on inconsistent input."""

        def _unit_cells(texts):
            # One single-token 1x1 cell per text, in the given order.
            return [
                Cell(tokens=[Token(text=text)], rowspan=1, colspan=1)
                for text in texts
            ]

        bad_inputs = [
            # misspecified nrow or ncol
            (['a', 'b', 'c', 'd'], 2, 1),
            (['a', 'b', 'c', 'd'], 1, 2),
            # not enough cells to fill out table
            (['a', 'b', 'c'], 2, 2),
            (['a', 'b'], 2, 2),
        ]
        for texts, nrow, ncol in bad_inputs:
            with self.assertRaises(Exception):
                Table.create_from_cells(cells=_unit_cells(texts),
                                        nrow=nrow,
                                        ncol=ncol,
                                        paper_id='',
                                        page_num=0,
                                        caption='')

        # cell juts out of table boundaries
        with self.assertRaises(Exception):
            wide_cell = Cell(tokens=[Token(text='a')], rowspan=1, colspan=2)
            Table.create_from_cells(cells=[wide_cell],
                                    nrow=1,
                                    ncol=1,
                                    paper_id='',
                                    page_num=0,
                                    caption='')
コード例 #10
0
    def setUp(self):
        """Build a plain 2x3 table and a 5x4 table with multi-span cells."""

        def _cell(text, rowspan=1, colspan=1):
            # Single-token cell with the requested spans.
            return Cell(tokens=[Token(text=text)],
                        rowspan=rowspan,
                        colspan=colspan)

        self.a = _cell('a')
        self.b = _cell('b')
        self.c = _cell('c')
        self.d = _cell('d')
        self.e = _cell('e')
        self.f = _cell('f')

        # The easy table gets its grid assigned directly.
        self.easy_table = Table(caption='hi this is caption')
        top_row = [self.a, self.b, self.c]
        bottom_row = [self.d, self.e, self.f]
        self.easy_table.grid = np.array([top_row, bottom_row])

        # (text, rowspan, colspan) for every cell of the hard table, in order.
        hard_cell_specs = [
            ('', 2, 2),
            ('C', 1, 2),
            ('C:1', 1, 1),
            ('C:2', 1, 1),
            ('R', 3, 1),
            ('R:1', 1, 1),
            ('a', 1, 1),
            ('b', 1, 1),
            ('R:2', 1, 1),
            ('c', 1, 1),
            ('d', 1, 1),
            ('R:3', 1, 1),
            ('e', 1, 1),
            ('f', 1, 1),
        ]
        self.hard_table = Table.create_from_cells(
            cells=[
                _cell(text, rowspan=rowspan, colspan=colspan)
                for text, rowspan, colspan in hard_cell_specs
            ],
            nrow=5,
            ncol=4,
            paper_id='abc',
            page_num=0,
            caption='hi this is caption')
コード例 #11
0
ファイル: test_schema_matcher.py プロジェクト: afcarl/corvid
 def setUp(self):
     """Build `self.table_source`: a header row followed by rows x, y, z."""
     source_texts = [
         'subject', 'header1', 'header2',
         'x', '1', '2',
         'y', '3', '4',
         'z', '5', '6',
     ]
     # One single-token 1x1 cell per text, in the given order.
     source_cells = [
         Cell(tokens=[Token(text=text)], rowspan=1, colspan=1)
         for text in source_texts
     ]
     self.table_source = Table.create_from_cells(source_cells,
                                                 nrow=4,
                                                 ncol=3)
コード例 #12
0
ファイル: test_schema_matcher.py プロジェクト: afcarl/corvid
    def setUp(self):
        """Build the source table and its header variants used by the tests."""

        def _unit_cells(texts):
            # One single-token 1x1 cell per text, in the given order.
            return [
                Cell(tokens=[Token(text=text)], rowspan=1, colspan=1)
                for text in texts
            ]

        self.table_source = Table.create_from_cells(
            _unit_cells(['subject', 'header1', 'header2',
                         'x', '1', '2',
                         'y', '3', '4',
                         'z', '5', '6']),
            nrow=4,
            ncol=3)

        # Two columns only: subject and header2.
        self.table_less_header = Table.create_from_cells(
            _unit_cells(['subject', 'header2',
                         'x', '1',
                         'z', '5',
                         'y', '4']),
            nrow=4,
            ncol=2)

        # Four columns: an extra header3 column.
        self.table_more_header = Table.create_from_cells(
            _unit_cells(['subject', 'header2', 'header1', 'header3',
                         'x', '1', '1', '1',
                         'z', '5', '5', '5',
                         'y', '4', '4', '4']),
            nrow=4,
            ncol=4)

        # Header row lists header2 before header1.
        self.table_permute_header = Table.create_from_cells(
            _unit_cells(['subject', 'header2', 'header1',
                         'x', '1', '2',
                         'z', '5', '6',
                         'y', '3', '4']),
            nrow=4,
            ncol=3)

        # Data rows only.
        self.table_no_header = Table.create_from_cells(
            _unit_cells(['x', '1', '2',
                         'z', '5', '6',
                         'y', '3', '4']),
            nrow=3,
            ncol=3)

        # Header row only.
        self.table_only_header = Table.create_from_cells(
            cells=_unit_cells(['subject', 'header1', 'header2']),
            nrow=1,
            ncol=3)
コード例 #13
0
ファイル: test_schema_matcher.py プロジェクト: afcarl/corvid
    def test_aggregate_tables(self):
        """Check `SchemaMatcher.aggregate_tables` on one source/target mapping.

        The source table is mapped onto the target schema with column mappings
        `[(1, 2), (2, 1)]`. The expected aggregate keeps the target's header
        row and lists the source rows with their two value columns swapped;
        the target's 'not_copied' row is not part of the expected result.
        """

        def _unit_cells(texts):
            # One single-token 1x1 cell per text, in the given order.
            return [
                Cell(tokens=[Token(text=text)], rowspan=1, colspan=1)
                for text in texts
            ]

        schema_matcher = SchemaMatcher()

        target_schema = Table.create_from_cells(
            cells=_unit_cells(['subject', 'header1', 'header2',
                               'not_copied', 'not_copied', 'not_copied']),
            nrow=2,
            ncol=3)

        pred_aggregate_table = schema_matcher.aggregate_tables(
            pairwise_mappings=[
                PairwiseMapping(self.table_source,
                                target_schema,
                                score=-999,
                                column_mappings=[(1, 2), (2, 1)])
            ],
            target_schema=target_schema)

        gold_aggregate_table = Table.create_from_cells(
            _unit_cells(['subject', 'header1', 'header2',
                         'x', '2', '1',
                         'y', '4', '3',
                         'z', '6', '5']),
            nrow=4,
            ncol=3)

        # `assertEquals` is a deprecated alias that no longer exists in
        # Python 3.12+; `assertEqual` already reports both operands on failure,
        # so the former debug prints are not needed.
        self.assertEqual(pred_aggregate_table, gold_aggregate_table)
コード例 #14
0
ファイル: test_schema_matcher.py プロジェクト: afcarl/corvid
    def setUp(self):
        """Build three table fixtures: permuted rows, extra rows, missing rows."""

        def _unit_cells(texts):
            # One single-token 1x1 cell per text, in the given order.
            return [
                Cell(tokens=[Token(text=text)], rowspan=1, colspan=1)
                for text in texts
            ]

        header = ['subject', 'header1', 'header2']

        # Data rows in the order z, y, x.
        self.table_permute_rows = Table.create_from_cells(
            cells=_unit_cells(header + ['z', '5', '6',
                                        'y', '3', '4',
                                        'x', '1', '2']),
            nrow=4,
            ncol=3)

        # Rows x, y, z plus an additional 'w' row.
        self.table_extra_rows = Table.create_from_cells(
            cells=_unit_cells(header + ['x', '1', '2',
                                        'y', '3', '4',
                                        'z', '5', '6',
                                        'w', '7', '8']),
            nrow=5,
            ncol=3)

        # Rows x and y only.
        self.table_missing_rows = Table.create_from_cells(
            cells=_unit_cells(header + ['x', '1', '2',
                                        'y', '3', '4']),
            nrow=3,
            ncol=3)
コード例 #15
0
ファイル: test_schema_matcher.py プロジェクト: afcarl/corvid
    def test_map_tables(self):
        """Check the mappings `ColNameSchemaMatcher.map_tables` returns.

        The source table(s) are mapped onto header-only target schemas that
        are identical, smaller, larger or permuted relative to the source
        header, and the resulting `PairwiseMapping` lists are compared with
        the expected scores and column mappings.
        """

        def _header_only_schema(names):
            # Single-row table with one single-token 1x1 cell per header name.
            return Table.create_from_cells(
                cells=[
                    Cell(tokens=[Token(text=name)], rowspan=1, colspan=1)
                    for name in names
                ],
                nrow=1,
                ncol=len(names))

        target_schema_easy = _header_only_schema(
            ['subject', 'header1', 'header2'])
        target_schema_less = _header_only_schema(['subject', 'header2'])
        target_schema_more = _header_only_schema(
            ['subject', 'header0', 'header1', 'header2'])
        target_schema_permuted = _header_only_schema(
            ['subject', 'header2', 'header1'])

        schema_matcher = ColNameSchemaMatcher()

        # (target schema, expected score, expected column mappings) when only
        # `self.table_source` is mapped.
        single_source_cases = [
            (target_schema_easy, 2.0, [(1, 1), (2, 2)]),
            (target_schema_permuted, 2.0, [(1, 2), (2, 1)]),
            (target_schema_more, 2.0, [(1, 2), (2, 3)]),
            (target_schema_less, 1.0, [(2, 1)]),
        ]
        for target_schema, score, column_mappings in single_source_cases:
            actual = schema_matcher.map_tables(tables=[self.table_source],
                                               target_schema=target_schema)
            expected = [
                PairwiseMapping(self.table_source,
                                target_schema,
                                score=score,
                                column_mappings=column_mappings)
            ]
            self.assertListEqual(actual, expected)

        # Several source tables mapped onto the permuted schema at once.
        actual_multi = schema_matcher.map_tables(
            tables=[
                self.table_source, self.table_less_header,
                self.table_more_header
            ],
            target_schema=target_schema_permuted)
        expected_multi = [
            PairwiseMapping(self.table_source,
                            target_schema_permuted,
                            score=2.0,
                            column_mappings=[(1, 2), (2, 1)]),
            PairwiseMapping(self.table_less_header,
                            target_schema_permuted,
                            score=1.0,
                            column_mappings=[(1, 1)]),
            PairwiseMapping(self.table_more_header,
                            target_schema_permuted,
                            score=2.0,
                            column_mappings=[(1, 1), (2, 2)]),
        ]
        self.assertListEqual(actual_multi, expected_multi)