def getConfiguredGraphClass(cls):
    """
    Configure GraphGrid_H for this task and return it.

    The grid step / visibility settings and the grid BBoxDeltaFun are copied
    from ``cls`` onto the GraphGrid_H class itself (not onto a copy), then two
    node types are registered:
      - "row": text lines, labelled B/I/E/S/O through the "DU_row" attribute;
      - "gh" : horizontal grid separators, labelled B/I/S/O through "type".
    Only the "row" type is declared as a classic node type.
    """
    # Text lines:  Begin  Inside  End  Single  Other
    row_labels = ['B', 'I', 'E', 'S', 'O']
    # Grid lines:  Border  Ignore  Separator  Outside
    grid_labels = ['B', 'I', 'S', 'O']

    graph_cls = GraphGrid_H

    # Mirror the task-level grid settings onto the graph class.
    for setting in ("iGridStep_H", "iGridStep_V",
                    "iGridVisibility", "iBlockVisibility"):
        setattr(graph_cls, setting, getattr(cls, setting))

    # cls.fDyRatio is read each time the function is called, not once here.
    graph_cls.BBoxDeltaFun = staticmethod(lambda v: v / cls.fDyRatio)
    traceln(" - grid computation: BBoxDeltaFun = lambda v: v / %f" % cls.fDyRatio)

    # --- text lines (rows) ---
    def row_box_delta(v):
        return max(v * 0.066, min(5, v / 3))

    row_type = NodeType_PageXml_type_woText(
        "row", row_labels, None, False, BBoxDeltaFun=row_box_delta)
    row_type.setLabelAttribute("DU_row")
    node_xpath = ".//pc:TextLine"      # how to find the nodes
    text_xpath = "./pc:TextEquiv"      # how to get their text
    row_type.setXpathExpr((node_xpath, text_xpath))
    graph_cls.addNodeType(row_type)

    # --- horizontal grid separators ---
    # The 5th positional argument is None; the original code notes this is
    # equivalent to BBoxDeltaFun=lambda _: 0.
    grid_type = NodeType_PageXml_type_woText(
        "gh", grid_labels, None, False, None)
    grid_type.setLabelAttribute("type")
    grid_type.setXpathExpr(('.//pc:GridSeparator[@orient="0"]',  # nodes
                            "./pc:TextEquiv"))                   # their text
    graph_cls.addNodeType(grid_type)

    graph_cls.setClassicNodeTypeList([row_type])

    return graph_cls
def getConfiguredGraphClass(cls):
    """
    Pick the factorial multi-page graph class (scaffold variant or not,
    according to cls.bScaffold), register the "row" and "hdr" node types on
    it, and return it.

    Both node types select the same text lines (.//pc:TextLine); they differ
    by their label set and by the attribute carrying the label
    ("DU_row" vs "DU_header").

    Raises Exception("Internal error") when cls.bScaffold is None.
    """
    # cls.bScaffold must have been set (True or False) before we get here.
    if cls.bScaffold is None:
        raise Exception("Internal error")

    graph_cls = (FactorialGraph_MultiPageXml_Scaffold if cls.bScaffold
                 else FactorialGraph_MultiPageXml)

    # (node type name, labels, label attribute), registered in this order.
    node_specs = [
        ("row", ['B', 'I', 'E', 'S', 'O'], "DU_row"),       # ROW
        ("hdr", ['CH', 'D', 'O'],          "DU_header"),    # HEADER
    ]

    for type_name, labels, label_attr in node_specs:
        node_type = NodeType_PageXml_type_woText(
            type_name,
            labels,
            None,
            False,
            # shrink the boxes to reduce overlap
            BBoxDeltaFun=lambda v: max(v * 0.066, min(5, v / 3)),
        )
        node_type.setLabelAttribute(label_attr)
        node_type.setXpathExpr((".//pc:TextLine",     # how to find the nodes
                                "./pc:TextEquiv"))    # how to get their text
        graph_cls.addNodeType(node_type)

    return graph_cls
def getConfiguredGraphClass(cls):
    """
    Register a single text-line node type ("abp", labels B/O/I read from the
    "DU_row" attribute) on Graph_MultiSinglePageXml and return that class.
    """
    label_list = ['B', 'O', 'I']
    ignored_list = None

    # Toy collections may not contain every expected class: put the indices of
    # the labels actually seen in training here to reduce the label set.
    # Left at None, the reduction below is not executed.
    seen_indices = None
    if seen_indices:
        print("REDUCING THE CLASSES TO THOSE SEEN IN TRAINING")
        ignored_list = [label for idx, label in enumerate(label_list)
                        if idx not in seen_indices]
        label_list = [label_list[idx] for idx in seen_indices]
        print(len(label_list), label_list)
        print(len(ignored_list), ignored_list)

    graph_cls = Graph_MultiSinglePageXml

    def box_delta(v):
        # shrink the boxes to reduce overlap
        return max(v * 0.066, min(5, v / 3))

    node_type = NodeType_PageXml_type_woText(
        "abp",  # short prefix: the labels are prefixed with it
        label_list,
        ignored_list,
        False,
        BBoxDeltaFun=box_delta,
    )
    node_type.setLabelAttribute("DU_row")

    find_nodes = ".//pc:TextLine"
    find_text = "./pc:TextEquiv"
    node_type.setXpathExpr((find_nodes, find_text))

    graph_cls.addNodeType(node_type)

    return graph_cls
def getConfiguredGraphClass(cls):
    """
    Configure GraphCut_H for this task and return it.

    The block and line visibility settings of ``cls`` are copied onto the
    GraphCut_H class itself, then two node types are registered:
      - "row" : text lines, labelled B/I/E/S/O through the "DU_row" attribute;
      - "sepH": cut separators with orient="0", labelled S/I/O through "type".
    Only the "row" type is declared as a classic node type.
    """
    # Text lines:  Begin  Inside  End  Single  Other
    row_labels = ['B', 'I', 'E', 'S', 'O']
    # Cut lines: three labels S / I / O
    cut_labels = ['S', 'I', 'O']

    graph_cls = GraphCut_H
    for setting in ("iBlockVisibility", "iLineVisibility"):
        setattr(graph_cls, setting, getattr(cls, setting))

    # --- text lines (rows) ---
    def row_box_delta(v):
        return max(v * 0.066, min(5, v / 3))

    row_type = NodeType_PageXml_type_woText(
        "row", row_labels, None, False, BBoxDeltaFun=row_box_delta)
    row_type.setLabelAttribute("DU_row")
    row_type.setXpathExpr((".//pc:TextLine",      # how to find the nodes
                           "./pc:TextEquiv"))     # how to get their text
    graph_cls.addNodeType(row_type)

    # --- cut separators ---
    # The 5th positional argument is None; the original code notes this is
    # equivalent to BBoxDeltaFun=lambda _: 0.
    cut_type = NodeType_PageXml_type_woText(
        "sepH", cut_labels, None, False, None)
    cut_type.setLabelAttribute("type")
    cut_type.setXpathExpr(('.//pc:CutSeparator[@orient="0"]',   # nodes
                           "./pc:TextEquiv"))                   # their text
    graph_cls.addNodeType(cut_type)

    graph_cls.setClassicNodeTypeList([row_type])

    return graph_cls
def getConfiguredGraphClass(cls):
    """
    Register the column-header node type ("hdr", labels CH/D/O read from the
    "DU_header" attribute of text lines) on Graph_MultiContinousPageXml and
    return that class.
    """
    graph_cls = Graph_MultiContinousPageXml

    header_labels = ['CH', 'D', 'O']

    def box_delta(v):
        # shrink the boxes to reduce overlap
        return max(v * 0.066, min(5, v / 3))

    # HEADER
    header_type = NodeType_PageXml_type_woText(
        "hdr",
        header_labels,
        None,
        False,
        BBoxDeltaFun=box_delta,
    )
    header_type.setLabelAttribute("DU_header")

    find_nodes = ".//pc:TextLine"
    find_text = "./pc:TextEquiv"
    header_type.setXpathExpr((find_nodes, find_text))

    graph_cls.addNodeType(header_type)

    return graph_cls
def getConfiguredGraphClass(cls):
    """
    Register the semantic-label node type ("sem", labels read from the
    "DU_sem" attribute of text regions) on Graph_MultiPageXml and return
    that class.
    """
    graph_cls = Graph_MultiPageXml

    # Original note: the converter changed to 'other' the unlabelled
    # TextRegions and the 'marginalia' ones.
    semantic_labels = [
        'heading',
        'header',
        'page-number',
        'resolution-number',
        'resolution-marginalia',
        'resolution-paragraph',
        'other',
    ]
    ignored_labels = None

    def box_delta(v):
        # shrink the boxes to reduce overlap
        return max(v * 0.066, min(5, v / 3))

    region_type = NodeType_PageXml_type_woText(
        "sem",  # short prefix: the labels are prefixed with it
        semantic_labels,
        ignored_labels,
        False,
        BBoxDeltaFun=box_delta,
    )
    region_type.setLabelAttribute("DU_sem")

    find_nodes = ".//pc:TextRegion"
    find_text = "./pc:TextEquiv"
    region_type.setXpathExpr((find_nodes, find_text))

    graph_cls.addNodeType(region_type)

    return graph_cls
# , lIgnoredLabels # , False #no label means OTHER # ) ntA = NodeType_PageXml_type_woText("abp" #some short prefix because labels below are prefixed with it , lLabels , lIgnoredLabels , False #no label means OTHER ) # nt.setXpathExpr( (".//pc:TextLine" #how to find the nodes # , "./pc:TextEquiv") #how to get their text # ) ntA.setXpathExpr( (".//pc:TextLine | .//pc:TextRegion | .//pc:SeparatorRegion" #how to find the nodes , "./pc:TextEquiv") #how to get their text ) # =============================================================================================================== class DU_ABPTableAnnotator(DU_CRF_Task): """ """ sXmlFilenamePattern = "*.mpxml" sLabeledXmlFilenamePattern = "*.mpxml"
def __init__(self, sModelName, sModelDir, sComment=None, C=None, tol=None,
             njobs=None, max_iter=None, inference_cache=None):
    """
    Build the task: register one text-line node type on
    Graph_MultiSinglePageXml, initialise the DU_CRF_Task base with that graph
    class and the learner configuration, then add a logistic-regression
    baseline (stored in self.bsln_mdl).

    sModelName, sModelDir and sComment are passed through to
    DU_CRF_Task.__init__. Each learner parameter left at None takes the
    default below:
        C               -> .1
        tol             -> .05
        njobs           -> 8
        max_iter        -> 1000
        inference_cache -> 50
    'save_every' is always 50.
    """
    lLabels = ['RB', 'RI', 'RE', 'RS', 'RO']
    lIgnoredLabels = None

    # Toy collections may not contain every expected class: put the indices of
    # the labels actually seen in training here to reduce the label set.
    # Left at None, the reduction below is not executed.
    lActuallySeen = None
    if lActuallySeen:
        # Single-argument print(...) calls give the same output under both
        # Python 2 and Python 3 (the former print statements are a syntax
        # error under Python 3).
        print("REDUCING THE CLASSES TO THOSE SEEN IN TRAINING")
        lIgnoredLabels = [sLabel for i, sLabel in enumerate(lLabels)
                          if i not in lActuallySeen]
        lLabels = [lLabels[i] for i in lActuallySeen]
        print("%d %s" % (len(lLabels), lLabels))
        print("%d %s" % (len(lIgnoredLabels), lIgnoredLabels))

    # DEFINING THE CLASS OF GRAPH WE USE
    # NOTE(review): the node type is added to the shared graph class each time
    # an instance is built - confirm that addNodeType tolerates repeated calls.
    DU_GRAPH = Graph_MultiSinglePageXml
    nt = NodeType_PageXml_type_woText(
        "abp",  # short prefix: the labels are prefixed with it
        lLabels,
        lIgnoredLabels,
        False,
        # shrink the boxes to reduce overlap
        BBoxDeltaFun=lambda v: max(v * 0.066, min(5, v / 3)),
    )
    nt.setXpathExpr((".//pc:TextLine",      # how to find the nodes
                     "./pc:TextEquiv"))     # how to get their text
    DU_GRAPH.addNodeType(nt)

    DU_CRF_Task.__init__(
        self,
        sModelName,
        sModelDir,
        DU_GRAPH,
        dFeatureConfig={},
        dLearnerConfig={
            'C': .1 if C is None else C,
            'njobs': 8 if njobs is None else njobs,
            'inference_cache': 50 if inference_cache is None else inference_cache,
            'tol': .05 if tol is None else tol,
            'save_every': 50,  # save every 50 iterations, for warm start
            'max_iter': 1000 if max_iter is None else max_iter,
        },
        sComment=sComment,
        cFeatureDefinition=FeatureDefinition_PageXml_NoNodeFeat_v3,
    )

    # use a LR model trained by GridSearch as baseline
    self.bsln_mdl = self.addBaseline_LogisticRegression()
def getConfiguredGraphClass(cls):
    """
    Configure AdHocFactorialGraphCut_H for this task and return it.

    The block and line visibility settings of ``cls`` are copied onto the
    graph class itself, then three node types are registered:
      - "row" : text lines, labels B/I/E/S/O, attribute "DU_row";
      - "hdr" : the same text lines, labels CH/D/O, attribute "DU_header";
      - "sepH": cut separators with orient="0", labels S/I/O, attribute "type".
    "row" is declared classic, "sepH" special, and "hdr" is declared as a
    factorial of "row".
    """
    # Text lines:  Begin  Inside  End  Single  Other
    row_labels = ['B', 'I', 'E', 'S', 'O']
    header_labels = ['CH', 'D', 'O']
    # Cut lines: three labels S / I / O
    cut_labels = ['S', 'I', 'O']

    # Ad-hoc class: per the original note, type1 and type2 are factorial while
    # type3 is an artificial object.
    graph_cls = AdHocFactorialGraphCut_H
    for setting in ("iBlockVisibility", "iLineVisibility"):
        setattr(graph_cls, setting, getattr(cls, setting))

    text_line_xpaths = (".//pc:TextLine",     # how to find the nodes
                        "./pc:TextEquiv")     # how to get their text

    # --- ROW ---
    row_type = NodeType_PageXml_type_woText(
        "row", row_labels, None, False,
        BBoxDeltaFun=lambda v: max(v * 0.066, min(5, v / 3)))
    row_type.setLabelAttribute("DU_row")
    row_type.setXpathExpr(text_line_xpaths)
    graph_cls.addNodeType(row_type)

    # --- HEADER ---
    header_type = NodeType_PageXml_type_woText(
        "hdr", header_labels, None, False,
        # shrink the boxes to reduce overlap
        BBoxDeltaFun=lambda v: max(v * 0.066, min(5, v / 3)))
    header_type.setLabelAttribute("DU_header")
    header_type.setXpathExpr(text_line_xpaths)
    graph_cls.addNodeType(header_type)

    # --- CUT SEPARATORS ---
    # The 5th positional argument is None; the original code notes this is
    # equivalent to BBoxDeltaFun=lambda _: 0.
    cut_type = NodeType_PageXml_type_woText(
        "sepH", cut_labels, None, False, None)
    cut_type.setLabelAttribute("type")
    cut_type.setXpathExpr(('.//pc:CutSeparator[@orient="0"]',   # nodes
                           "./pc:TextEquiv"))                   # their text
    graph_cls.addNodeType(cut_type)

    # Per the original note: "classic" nodes are extracted directly from the
    # XML, the other types of nodes are computed.
    graph_cls.setClassicNodeTypeList([row_type])
    graph_cls.setSpecialNodeTypeList([cut_type])
    graph_cls.setFactoredClassicalType(row_type, header_type)  # hdr is a factorial of row

    return graph_cls
def getConfiguredGraphClass(cls):
    """
    Register the row ("abpr") and column ("abpc") node types on
    FactorialGraph_MultiContinuousPageXml and return that class.

    Both node types select the same text lines (.//pc:TextLine); they differ
    by their label set and by the attribute carrying the label
    ("DU_row" vs "DU_col").
    """
    graph_cls = FactorialGraph_MultiContinuousPageXml

    # (short prefix of the labels, labels, label attribute), in registration
    # order.
    node_specs = [
        ("abpr", ['RB', 'RI', 'RE', 'RS'], "DU_row"),   # ROW
        ("abpc", ['CM', 'CS'],             "DU_col"),   # COLUMN: multi / single cell
    ]

    for prefix, labels, label_attr in node_specs:
        node_type = NodeType_PageXml_type_woText(
            prefix,
            labels,
            None,
            # NOTE(review): this 4th positional flag is True here; confirm its
            # meaning against NodeType_PageXml_type_woText.
            True,
            # shrink the boxes to reduce overlap
            BBoxDeltaFun=lambda v: max(v * 0.066, min(5, v / 3)),
        )
        node_type.setLabelAttribute(label_attr)
        node_type.setXpathExpr((".//pc:TextLine",     # how to find the nodes
                                "./pc:TextEquiv"))    # how to get their text
        graph_cls.addNodeType(node_type)

    return graph_cls
def getConfiguredGraphClass(cls):
    """
    Configure GraphGrid_H for this task and return it.

    The grid step and visibility settings of ``cls`` are copied onto the
    GraphGrid_H class itself, then two node types are registered:
      - "row": text lines, labelled B/I/E/S/O through the "DU_row" attribute;
      - "gh" : grid separators with orient="0", labelled B/I/S/O through "type".
    Only the "row" type is declared as a classic node type.
    """
    # Text lines:  Begin  Inside  End  Single  Other
    row_labels = ['B', 'I', 'E', 'S', 'O']
    # Grid lines:  Border  Ignore  Separator  Outside
    grid_labels = ['B', 'I', 'S', 'O']

    graph_cls = GraphGrid_H
    for setting in ("iGridStep_H", "iGridStep_V",
                    "iGridVisibility", "iBlockVisibility"):
        setattr(graph_cls, setting, getattr(cls, setting))

    # --- text lines (rows) ---
    # Original note: the function returns the amount by which each border of
    # a bounding box is shifted toward its centre:
    #     w, h = x2 - x1, y2 - y1
    #     dx, dy = BBoxDeltaFun(w), BBoxDeltaFun(h)
    #     x1, y1, x2, y2 = x1 + dx, y1 + dy, x2 - dx, y2 - dy   (then rounded)
    # With v / 5.0 removed on each side, 3/5 of each dimension is kept.
    # The historical function was: lambda v: max(v * 0.066, min(5, v/3))
    def row_box_delta(v):
        return v / 5.0

    row_type = NodeType_PageXml_type_woText(
        "row", row_labels, None, False, BBoxDeltaFun=row_box_delta)
    row_type.setLabelAttribute("DU_row")
    row_type.setXpathExpr((".//pc:TextLine",      # how to find the nodes
                           "./pc:TextEquiv"))     # how to get their text
    graph_cls.addNodeType(row_type)

    # --- horizontal grid separators ---
    # The 5th positional argument is None; the original code notes this is
    # equivalent to BBoxDeltaFun=lambda _: 0.
    grid_type = NodeType_PageXml_type_woText(
        "gh", grid_labels, None, False, None)
    grid_type.setLabelAttribute("type")
    grid_type.setXpathExpr(('.//pc:GridSeparator[@orient="0"]',  # nodes
                            "./pc:TextEquiv"))                   # their text
    graph_cls.addNodeType(grid_type)

    graph_cls.setClassicNodeTypeList([row_type])

    return graph_cls