Ejemplo n.º 1
0
    def getConfiguredGraphClass(cls):
        """
        Configure the GraphGrid_H graph class for this task and return it.

        The grid steps and visibility settings of ``cls`` are copied onto the
        graph class, a ``BBoxDeltaFun`` dividing by ``cls.fDyRatio`` is
        installed on it, and two node types are registered: text lines
        ("row") and grid separators with orient="0" ("gh"). Only the
        text-line type is put in the "classic" node type list.

        NOTE(review): GraphGrid_H itself is modified (no copy is made), so the
        settings are shared by every user of that class -- confirm that
        calling this method more than once is safe.
        """
        graph_cls = GraphGrid_H

        # Hand the grid geometry / visibility parameters over to the graph.
        for setting in ("iGridStep_H", "iGridStep_V", "iGridVisibility",
                        "iBlockVisibility"):
            setattr(graph_cls, setting, getattr(cls, setting))

        # cls.fDyRatio is looked up each time the installed function runs.
        graph_cls.BBoxDeltaFun = staticmethod(lambda v: v / cls.fDyRatio)
        trace_msg = (" - grid computation: BBoxDeltaFun = lambda v: v / %f"
                     % cls.fDyRatio)
        traceln(trace_msg)

        # --- text lines: Begin / Inside / End / Single / Other
        row_labels = ['B', 'I', 'E', 'S', 'O']
        row_type = NodeType_PageXml_type_woText(
            "row",
            row_labels,
            None,
            False,
            # for positive v: v/3 while v < 15, then 5, then 6.6% of v
            # once v exceeds ~76
            BBoxDeltaFun=lambda v: max(v * 0.066, min(5, v / 3)),
        )
        row_type.setLabelAttribute("DU_row")
        row_xpaths = (
            ".//pc:TextLine",  # how to find the nodes
            "./pc:TextEquiv",  # how to get their text
        )
        row_type.setXpathExpr(row_xpaths)
        graph_cls.addNodeType(row_type)

        # --- grid lines: Border / Ignore / Separator / Outside
        grid_labels = ['B', 'I', 'S', 'O']
        grid_type = NodeType_PageXml_type_woText(
            "gh",
            grid_labels,
            None,
            False,
            None,  # equiv. to: BBoxDeltaFun=lambda _: 0
        )
        grid_type.setLabelAttribute("type")
        grid_xpaths = (
            './/pc:GridSeparator[@orient="0"]',  # how to find the nodes
            "./pc:TextEquiv",  # how to get their text
        )
        grid_type.setXpathExpr(grid_xpaths)
        graph_cls.addNodeType(grid_type)

        graph_cls.setClassicNodeTypeList([row_type])

        return graph_cls
Ejemplo n.º 2
0
    def getConfiguredGraphClass(cls):
        """
        Select the factorial multi-page graph class, configure it, return it.

        ``cls.bScaffold`` chooses between the scaffold variant and the plain
        one. Two node types are registered, both built from the text lines:
        "row" (label attribute DU_row) and "hdr" (label attribute DU_header).

        Raises:
            Exception: if ``cls.bScaffold`` is None.
        """
        # The scaffold flag must have been decided before we get here.
        if cls.bScaffold is None:
            raise Exception("Internal error")
        graph_cls = (FactorialGraph_MultiPageXml_Scaffold if cls.bScaffold
                     else FactorialGraph_MultiPageXml)

        # --- rows (NOTE(review): the original author questioned the 'O')
        row_labels = ['B', 'I', 'E', 'S', 'O']
        row_type = NodeType_PageXml_type_woText(
            "row",
            row_labels,
            None,
            False,  # original note: "no label means OTHER" -- TODO confirm
            # we reduce overlap in this way
            BBoxDeltaFun=lambda v: max(v * 0.066, min(5, v / 3)),
        )
        row_type.setLabelAttribute("DU_row")
        row_xpaths = (
            ".//pc:TextLine",  # how to find the nodes
            "./pc:TextEquiv",  # how to get their text
        )
        row_type.setXpathExpr(row_xpaths)
        graph_cls.addNodeType(row_type)

        # --- column headers
        header_labels = ['CH', 'D', 'O']
        header_type = NodeType_PageXml_type_woText(
            "hdr",
            header_labels,
            None,
            False,  # original note: "no label means OTHER" -- TODO confirm
            # we reduce overlap in this way
            BBoxDeltaFun=lambda v: max(v * 0.066, min(5, v / 3)),
        )
        header_type.setLabelAttribute("DU_header")
        header_xpaths = (
            ".//pc:TextLine",  # how to find the nodes
            "./pc:TextEquiv",  # how to get their text
        )
        header_type.setXpathExpr(header_xpaths)
        graph_cls.addNodeType(header_type)

        return graph_cls
Ejemplo n.º 3
0
    def getConfiguredGraphClass(cls):
        """
        Configure Graph_MultiSinglePageXml with one text-line node type.

        The node type is named "abp", reads its label from the DU_row
        attribute and uses the labels B, O, I. The graph class is returned
        after the node type has been added to it.
        """
        labels = ['B', 'O', 'I']
        ignored_labels = None

        # When playing with a toy collection that lacks some of the expected
        # classes, put the indices of the labels actually seen in training
        # here to reduce the label set. Left as None, nothing is reduced.
        seen_indices = None
        if seen_indices:
            print("REDUCING THE CLASSES TO THOSE SEEN IN TRAINING")
            ignored_labels = [
                label for idx, label in enumerate(labels)
                if idx not in seen_indices
            ]
            labels = [labels[idx] for idx in seen_indices]
            print(len(labels), labels)
            print(len(ignored_labels), ignored_labels)

        graph_cls = Graph_MultiSinglePageXml

        node_type = NodeType_PageXml_type_woText(
            "abp",  # some short prefix because labels are prefixed with it
            labels,
            ignored_labels,
            False,  # original note: "no label means OTHER" -- TODO confirm
            # we reduce overlap in this way
            BBoxDeltaFun=lambda v: max(v * 0.066, min(5, v / 3)),
        )
        node_type.setLabelAttribute("DU_row")
        xpaths = (
            ".//pc:TextLine",  # how to find the nodes
            "./pc:TextEquiv",  # how to get their text
        )
        node_type.setXpathExpr(xpaths)
        graph_cls.addNodeType(node_type)

        return graph_cls
Ejemplo n.º 4
0
    def getConfiguredGraphClass(cls):
        """
        Configure the GraphCut_H graph class for this task and return it.

        The block and line visibility settings of ``cls`` are copied onto the
        graph class, then two node types are registered: text lines ("row")
        and cut separators with orient="0" ("sepH"). Only the text-line type
        is put in the "classic" node type list.

        NOTE(review): GraphCut_H itself is modified (no copy is made) --
        confirm that calling this method more than once is safe.
        """
        graph_cls = GraphCut_H

        # Hand the visibility parameters over to the graph class.
        for setting in ("iBlockVisibility", "iLineVisibility"):
            setattr(graph_cls, setting, getattr(cls, setting))

        # --- text lines: Begin / Inside / End / Single / Other
        row_labels = ['B', 'I', 'E', 'S', 'O']
        row_type = NodeType_PageXml_type_woText(
            "row",
            row_labels,
            None,
            False,
            BBoxDeltaFun=lambda v: max(v * 0.066, min(5, v / 3)),
        )
        row_type.setLabelAttribute("DU_row")
        row_xpaths = (
            ".//pc:TextLine",  # how to find the nodes
            "./pc:TextEquiv",  # how to get their text
        )
        row_type.setXpathExpr(row_xpaths)
        graph_cls.addNodeType(row_type)

        # --- cut lines: presumably Separator / Ignore / Outside
        cut_labels = ['S', 'I', 'O']
        cut_type = NodeType_PageXml_type_woText(
            "sepH",
            cut_labels,
            None,
            False,
            None,  # equiv. to: BBoxDeltaFun=lambda _: 0
        )
        cut_type.setLabelAttribute("type")
        cut_xpaths = (
            './/pc:CutSeparator[@orient="0"]',  # how to find the nodes
            "./pc:TextEquiv",  # how to get their text
        )
        cut_type.setXpathExpr(cut_xpaths)
        graph_cls.addNodeType(cut_type)

        graph_cls.setClassicNodeTypeList([row_type])

        return graph_cls
Ejemplo n.º 5
0
    def getConfiguredGraphClass(cls):
        """
        Configure Graph_MultiContinousPageXml with a column-header node type.

        A single node type named "hdr" is built from the text lines, reads
        its label from the DU_header attribute and uses the labels CH, D, O.
        The graph class is returned once the node type has been added.
        """
        graph_cls = Graph_MultiContinousPageXml

        header_labels = ['CH', 'D', 'O']
        header_type = NodeType_PageXml_type_woText(
            "hdr",
            header_labels,
            None,
            False,  # original note: "no label means OTHER" -- TODO confirm
            # we reduce overlap in this way
            BBoxDeltaFun=lambda v: max(v * 0.066, min(5, v / 3)),
        )
        header_type.setLabelAttribute("DU_header")
        header_xpaths = (
            ".//pc:TextLine",  # how to find the nodes
            "./pc:TextEquiv",  # how to get their text
        )
        header_type.setXpathExpr(header_xpaths)
        graph_cls.addNodeType(header_type)

        return graph_cls
Ejemplo n.º 6
0
    def getConfiguredGraphClass(cls):
        """
        Configure Graph_MultiPageXml with a semantic text-region node type.

        A single node type named "sem" is built from the text regions, reads
        its label from the DU_sem attribute and uses seven semantic labels
        (heading, header, page number, resolution parts and 'other'). The
        graph class is returned once the node type has been added.
        """
        graph_cls = Graph_MultiPageXml

        semantic_labels = [
            'heading',
            'header',
            'page-number',
            'resolution-number',
            'resolution-marginalia',
            'resolution-paragraph',
            'other',
        ]

        # Original note: the converter changed to 'other' the unlabelled
        # TextRegions or 'marginalia' TRs, hence nothing is ignored.
        # With a toy collection lacking some classes, the labels can be
        # reduced here.
        ignored_labels = None

        region_type = NodeType_PageXml_type_woText(
            "sem",  # some short prefix because labels are prefixed with it
            semantic_labels,
            ignored_labels,
            False,  # original note: "no label means OTHER" -- TODO confirm
            # we reduce overlap in this way
            BBoxDeltaFun=lambda v: max(v * 0.066, min(5, v / 3)),
        )
        region_type.setLabelAttribute("DU_sem")
        region_xpaths = (
            ".//pc:TextRegion",  # how to find the nodes
            "./pc:TextEquiv",  # how to get their text
        )
        region_type.setXpathExpr(region_xpaths)
        graph_cls.addNodeType(region_type)

        return graph_cls
Ejemplo n.º 7
0
    def getConfiguredGraphClass(cls):
        """
        Configure the AdHocFactorialGraphCut_H graph class and return it.

        Per the original note, this is an ad-hoc class where type1 and type2
        are factorial while type3 is an artificial object. Three node types
        are registered: "row" and "hdr" (both built from the text lines) and
        "sepH" (cut separators with orient="0"). "row" is the classic type,
        "sepH" the special type, and "hdr" is declared a factorial of "row".

        NOTE(review): AdHocFactorialGraphCut_H itself is modified (no copy is
        made) -- confirm that calling this method more than once is safe.
        """
        graph_cls = AdHocFactorialGraphCut_H

        # Hand the visibility parameters over to the graph class.
        for setting in ("iBlockVisibility", "iLineVisibility"):
            setattr(graph_cls, setting, getattr(cls, setting))

        # --- text lines as rows: Begin / Inside / End / Single / Other
        row_labels = ['B', 'I', 'E', 'S', 'O']
        row_type = NodeType_PageXml_type_woText(
            "row",
            row_labels,
            None,
            False,
            BBoxDeltaFun=lambda v: max(v * 0.066, min(5, v / 3)),
        )
        row_type.setLabelAttribute("DU_row")
        row_xpaths = (
            ".//pc:TextLine",  # how to find the nodes
            "./pc:TextEquiv",  # how to get their text
        )
        row_type.setXpathExpr(row_xpaths)
        graph_cls.addNodeType(row_type)

        # --- text lines as column headers
        header_labels = ['CH', 'D', 'O']
        header_type = NodeType_PageXml_type_woText(
            "hdr",
            header_labels,
            None,
            False,  # original note: "no label means OTHER" -- TODO confirm
            # we reduce overlap in this way
            BBoxDeltaFun=lambda v: max(v * 0.066, min(5, v / 3)),
        )
        header_type.setLabelAttribute("DU_header")
        header_xpaths = (
            ".//pc:TextLine",  # how to find the nodes
            "./pc:TextEquiv",  # how to get their text
        )
        header_type.setXpathExpr(header_xpaths)
        graph_cls.addNodeType(header_type)

        # --- cut lines: presumably Separator / Ignore / Outside
        cut_labels = ['S', 'I', 'O']
        cut_type = NodeType_PageXml_type_woText(
            "sepH",
            cut_labels,
            None,
            False,
            None,  # equiv. to: BBoxDeltaFun=lambda _: 0
        )
        cut_type.setLabelAttribute("type")
        cut_xpaths = (
            './/pc:CutSeparator[@orient="0"]',  # how to find the nodes
            "./pc:TextEquiv",  # how to get their text
        )
        cut_type.setXpathExpr(cut_xpaths)
        graph_cls.addNodeType(cut_type)

        # The nodes of the "classic" type are directly extracted from the
        # XML; the other types of nodes are computed.
        graph_cls.setClassicNodeTypeList([row_type])
        graph_cls.setSpecialNodeTypeList([cut_type])
        # make the header type a factorial of the row type
        graph_cls.setFactoredClassicalType(row_type, header_type)

        return graph_cls
Ejemplo n.º 8
0
    def getConfiguredGraphClass(cls):
        """
        Configure FactorialGraph_MultiContinuousPageXml and return it.

        Two node types are registered, both built from the text lines:
        "abpr" (label attribute DU_row, labels RB/RI/RE/RS) and "abpc"
        (label attribute DU_col, labels CM/CS).
        """
        graph_cls = FactorialGraph_MultiContinuousPageXml

        # --- rows (NOTE(review): the original author wondered about an 'O')
        row_labels = ['RB', 'RI', 'RE', 'RS']
        row_type = NodeType_PageXml_type_woText(
            "abpr",  # some short prefix because labels are prefixed with it
            row_labels,
            None,
            True,  # original note: "no label means OTHER" -- TODO confirm
            # we reduce overlap in this way
            BBoxDeltaFun=lambda v: max(v * 0.066, min(5, v / 3)),
        )
        row_type.setLabelAttribute("DU_row")
        row_xpaths = (
            ".//pc:TextLine",  # how to find the nodes
            "./pc:TextEquiv",  # how to get their text
        )
        row_type.setXpathExpr(row_xpaths)
        graph_cls.addNodeType(row_type)

        # --- columns: original note says "single cell, multicells"
        col_labels = ['CM', 'CS']
        col_type = NodeType_PageXml_type_woText(
            "abpc",  # some short prefix because labels are prefixed with it
            col_labels,
            None,
            True,  # original note: "no label means OTHER" -- TODO confirm
            # we reduce overlap in this way
            BBoxDeltaFun=lambda v: max(v * 0.066, min(5, v / 3)),
        )
        col_type.setLabelAttribute("DU_col")
        col_xpaths = (
            ".//pc:TextLine",  # how to find the nodes
            "./pc:TextEquiv",  # how to get their text
        )
        col_type.setXpathExpr(col_xpaths)
        graph_cls.addNodeType(col_type)

        return graph_cls
Ejemplo n.º 9
0
    def getConfiguredGraphClass(cls):
        """
        Configure the GraphGrid_H graph class for this task and return it.

        The grid steps and visibility settings of ``cls`` are copied onto the
        graph class, then two node types are registered: text lines ("row")
        and grid separators with orient="0" ("gh"). Only the text-line type
        is put in the "classic" node type list.

        NOTE(review): GraphGrid_H itself is modified (no copy is made) --
        confirm that calling this method more than once is safe.
        """
        graph_cls = GraphGrid_H

        # Hand the grid geometry / visibility parameters over to the graph.
        graph_cls.iGridStep_H = cls.iGridStep_H
        graph_cls.iGridStep_V = cls.iGridStep_V
        graph_cls.iGridVisibility = cls.iGridVisibility
        graph_cls.iBlockVisibility = cls.iBlockVisibility

        # --- text lines: Begin / Inside / End / Single / Other
        #
        # Per the original notes, BBoxDeltaFun returns the amount by which
        # each border of a bounding box is "shifted toward its centre", to
        # reduce overlap:
        #     w,h = x2-x1, y2-y1
        #     dx = self.BBoxDeltaFun(w)
        #     dy = self.BBoxDeltaFun(h)
        #     x1,y1, x2,y2 = [ int(round(v)) for v in [x1+dx,y1+dy, x2-dx,y2-dy] ]
        # Applied that way, v / 5.0 per side keeps 3/5 of each dimension
        # (the original comment said 2/3 -- TODO confirm the intent).
        # The historical function was:
        #     lambda v: max(v * 0.066, min(5, v/3))
        row_labels = ['B', 'I', 'E', 'S', 'O']
        row_type = NodeType_PageXml_type_woText(
            "row",
            row_labels,
            None,
            False,
            BBoxDeltaFun=lambda v: v / 5.0,
        )
        row_type.setLabelAttribute("DU_row")
        row_xpaths = (
            ".//pc:TextLine",  # how to find the nodes
            "./pc:TextEquiv",  # how to get their text
        )
        row_type.setXpathExpr(row_xpaths)
        graph_cls.addNodeType(row_type)

        # --- grid lines: Border / Ignore / Separator / Outside
        grid_labels = ['B', 'I', 'S', 'O']
        grid_type = NodeType_PageXml_type_woText(
            "gh",
            grid_labels,
            None,
            False,
            None,  # equiv. to: BBoxDeltaFun=lambda _: 0
        )
        grid_type.setLabelAttribute("type")
        grid_xpaths = (
            './/pc:GridSeparator[@orient="0"]',  # how to find the nodes
            "./pc:TextEquiv",  # how to get their text
        )
        grid_type.setXpathExpr(grid_xpaths)
        graph_cls.addNodeType(grid_type)

        graph_cls.setClassicNodeTypeList([row_type])

        return graph_cls