Example #1
0
 def parseDomNodeLabel(self, domnode, defaultCls=None):
     """
     Parse the label of a DOM node and return the corresponding internal label.

     The XML label is read from the custom attribute of the node
     (self.sCustAttr_STRUCTURE / self.sCustAttr2_TYPE) and translated
     through self.dXmlLabel2Label.

     domnode    -- the DOM node to read the label from
     defaultCls -- not used by this method

     Return:
     - the mapped label when the XML label is a key of self.dXmlLabel2Label
     - self.sDefaultLabel when the node has no label and self.bOther is true,
       or when the XML label is unknown but accepted by self.checkIsIgnored

     Raise PageXmlException if the label is missing while self.bOther is not true.
     Raise ValueError if the XML label is neither a known one nor an ignored one.
     """
     sLabel = self.sDefaultLabel
     try:
         try:
             sXmlLabel = PageXml.getCustomAttr(domnode, self.sCustAttr_STRUCTURE, self.sCustAttr2_TYPE)
         except PageXmlException:
             if self.bOther:
                 return self.sDefaultLabel  #absence of label but bOther was True (I guess)
             raise  #re-raise as is, keeping the original traceback
         try:
             sLabel = self.dXmlLabel2Label[sXmlLabel]
         except KeyError:
             #not a label of interest: it must then be an ignored one
             try:
                 self.checkIsIgnored(sXmlLabel)
             except Exception:
                 #Exception rather than a bare except, so that KeyboardInterrupt
                 # and SystemExit are not turned into a ValueError
                 raise ValueError("Invalid label '%s' in node %s"%(sXmlLabel, etree.tostring(domnode)))
     except KeyError:
         #no label at all
         # NOTE(review): only reachable if getCustomAttr itself raises a KeyError - to be confirmed
         if not self.sDefaultLabel: raise ValueError("Missing label in node %s"%etree.tostring(domnode))

     return sLabel
Example #2
0
    def _convertPageAnnotation(self, pnum, page, domNdPage):
        """
        Convert the human annotation of the text regions of a page into DU attributes.

        For each node yielded by self._iter_TextRegionNodeTop2Bottom(domNdPage, page),
        the "structure"/"type" custom attribute is read and converted into:
        - a semantic label, stored in the attribute self.sSemAttr
        - a segmentation label, stored in the attribute self.sSgmAttr
        - a resolution number, stored in the attribute self.sNumAttr, only when
          the number is part of the human annotation

        A node without annotation (PageXmlException) gets self.sOther as semantic
        label and the current segmentation label.

        self.prevResolutionNumber is read and updated, so the outcome depends on
        the nodes processed before. pnum is not used by this method.

        Raise ValueError if a label is neither one of the fixed labels nor
        matched by self.creResolutionHumanLabel, or if it lacks a resolution number.
        """
        for nd in self._iter_TextRegionNodeTop2Bottom(domNdPage, page):

            try:
                sResoNum = None
                lbl = PageXml.getCustomAttr(nd, "structure", "type")

                if lbl == "heading":
                    semLabel = self.dAnnotMapping[lbl]
                    #heading may indicate a new resolution!
                    if self.prevResolutionNumber is None:
                        #for instance 2 consecutive headings
                        sgmLabel = self._getCurrentSegmentationLabel()
                    else:
                        sgmLabel = self._switchSegmentationLabel()
                        self.prevResolutionNumber = None  #so that next number does not switch Heigh/Ho label
                elif lbl in ["header", "page-number", "marginalia"]:
                    #continuation of a resolution
                    semLabel = self.dAnnotMapping[lbl]
                    sgmLabel = self._getCurrentSegmentationLabel()
                else:
                    o = self.creResolutionHumanLabel.match(lbl)
                    if not o:
                        raise ValueError("%s is not a valid human annotation" %
                                         lbl)
                    #group 1 is "" for the resolution number
                    semLabel = self.dAnnotMapping[o.group(1)]

                    #Here we have a resolution number!
                    sResoNum = o.group(2)
                    if not sResoNum:
                        raise ValueError(
                            "%s is not a valid human annotation - missing resolution number"
                            % lbl)

                    #now switch between heigh and ho !! :))
                    if self.prevResolutionNumber is not None and self.prevResolutionNumber != sResoNum:
                        #we got a new number, so switching segmentation label!
                        sgmLabel = self._switchSegmentationLabel()
                    else:
                        #either same number or switching already done due to a heading
                        sgmLabel = self._getCurrentSegmentationLabel()

                    self.prevResolutionNumber = sResoNum

            except PageXmlException:
                #this node has no annotation
                semLabel = self.sOther
                sgmLabel = self._getCurrentSegmentationLabel()

            nd.set(self.sSemAttr, semLabel)
            nd.set(self.sSgmAttr, sgmLabel)
            if sResoNum:
                #only when the number is part of the human annotation!
                nd.set(self.sNumAttr, sResoNum)
Example #3
0
    def _convertPageAnnotation(self, pnum, page, domNdPage):
        """
        Translate the human annotation of the text regions of a page into DU annotation.

        The segmentation label is re-initialised at the start of each page
        (self._initSegmentationLabel). Then, for each node yielded by
        self._iter_TextRegionNodeTop2Bottom(domNdPage, page):
        - a node without "structure"/"type" annotation gets self.sOther both as
          semantic and segmentation label
        - "heading", "header", "page-number" and "marginalia" nodes get their
          mapped semantic label and self.sOther as segmentation label
        - any other label must match self.creResolutionHumanLabel: group 1 is
          the semantic key, group 2 the resolution number. The segmentation
          label changes each time the resolution number changes.

        Attributes written on the nodes: self.sSemAttr, self.sSgmAttr and, for
        resolution parts, self.sNumAttr.
        State updated: self.prevResolutionNumber, self.prevSgmLbl, self.lSeenResoNum.
        pnum is not used by this method.

        Raise ValueError on an invalid human annotation or on a missing
        resolution number.
        """
        #change: on each page we start by Heigh
        bRestartAtEachPageWithHeigh = True
        if bRestartAtEachPageWithHeigh:
            self._initSegmentationLabel()

        #labels of the elements that are not part of a resolution
        tOutOfResolution = ("heading", "header", "page-number", "marginalia")

        for nd in self._iter_TextRegionNodeTop2Bottom(domNdPage, page):

            try:
                sHumanLbl = PageXml.getCustomAttr(nd, "structure", "type")
            except PageXmlException:
                #this node has no annotation whatsoever
                for sAttr in (self.sSemAttr, self.sSgmAttr):
                    nd.set(sAttr, self.sOther)
                continue

            sNum = None
            if sHumanLbl in tOutOfResolution:
                sSem = sHumanLbl
                sSgm = self.sOther
            else:
                oMatch = self.creResolutionHumanLabel.match(sHumanLbl)
                if not oMatch:
                    raise ValueError("%s is not a valid human annotation" % sHumanLbl)
                sSem = oMatch.group(1)  #"" for the resolution number
                sNum = oMatch.group(2)
                if not sNum:
                    raise ValueError("%s is not a valid human annotation - missing resolution number" % sHumanLbl)

                #switch between heigh and ho when the number changes
                if self.prevResolutionNumber == sNum:
                    sSgm = self.prevSgmLbl
                else:
                    sSgm = self._getNextSegmentationLabel(self.prevSgmLbl)
                    assert bRestartAtEachPageWithHeigh or sNum not in self.lSeenResoNum, "ERROR: the ordering of the block has not preserved resolution number contiguity"
                    self.lSeenResoNum.append(sNum)

                self.prevResolutionNumber = sNum
                self.prevSgmLbl = sSgm

            #every annotated node has a semantic label
            sMappedSem = self.dAnnotMapping[sSem]
            assert sMappedSem
            nd.set(self.sSemAttr, sMappedSem)  #DU annotation

            #... and a segmentation label
            assert sSgm
            nd.set(self.sSgmAttr, sSgm)  #DU annotation

            #resolution parts also carry their resolution number
            if sNum:
                nd.set(self.sNumAttr, sNum)
Example #4
0
 def parseDocNodeLabel(self, graph_node, defaultCls=None):
     """
     Compute the label of a graph node from the annotation of the parent of its DOM node.

     The label is "<self.sLabelAttr>_<type>", where <type> is the value of the
     "structure"/"type" custom attribute of the parent of graph_node.node.

     graph_node -- object whose .node attribute is the DOM node
     defaultCls -- not used by this method

     Return the label, or 'type_None' whenever the label cannot be built
     (best effort: any ordinary exception raised while building it is absorbed).
     """
     ndParent = graph_node.node.getparent()
     try:
         sLabel = "%s_%s" % ( self.sLabelAttr,
                         PageXml.getCustomAttr(ndParent, 'structure','type')
                         )
     except Exception:
         #Exception rather than a bare except: KeyboardInterrupt and SystemExit
         # must not be silently turned into a default label
         sLabel = 'type_None'
     return sLabel
Example #5
0
def test_getsetCustomAttr():
    """Check PageXml.getCustomAttr and PageXml.setCustomAttr on a TextRegion node."""
    xml_bytes = b"""
            <TextRegion type="page-number" id="p1_region_1471502505726_2" custom="readingOrder {index:9;} structure {type:page-number;}">
                <Coords points="972,43 1039,43 1039,104 972,104"/>
            </TextRegion>
            """
    region = etree.parse(BytesIO(xml_bytes)).getroot()

    # read an existing value, overwrite it, then read it back (as a string)
    assert PageXml.getCustomAttr(region, "readingOrder", "index") == "9"
    assert PageXml.setCustomAttr(region, "readingOrder", "index", 99) == 99
    assert PageXml.getCustomAttr(region, "readingOrder", "index") == "99"

    # without a sub-attribute name, all the values come back as a dictionary
    assert PageXml.getCustomAttr(region, "readingOrder") == {"index": "99"}

    # a new sub-attribute can be added to an existing attribute
    assert PageXml.setCustomAttr(region, "readingOrder", "toto", "zou") == "zou"
    assert PageXml.getCustomAttr(region, "readingOrder", "toto") == "zou"

    # an unknown sub-attribute or an unknown attribute raises a PageXmlException
    with pytest.raises(PageXmlException):
        PageXml.getCustomAttr(region, "readingOrder", "axiste_pas")
    with pytest.raises(PageXmlException):
        PageXml.getCustomAttr(region, "axiste_pas_non_plus", "axiste_pas")
 def parsePage(self, doc, ctxtNd, name):
     """
     Record the tags, and the labels of the tagged nodes, found under a context node.

     For each tag of self.lTag, the nodes ".//pg:<tag>" under ctxtNd are looked
     up (namespaces: self.dNS). Each node is reported with
     self.seenDocTag(name, tag) and, when it has a non-empty label, with
     self.seenTagLabel(tag, label), after applying each
     (compiled regex, replacement) pair of self.ltCRES to the label.

     The label is read from:
     - the "type" attribute of the node            if self.sCustom is None
     - the "structure"/"type" custom attribute     if self.sCustom == ""
       (no label if it cannot be read)
     - the attribute named self.sCustom            otherwise

     doc is not used by this method.
     """
     for tag in self.lTag:
         for nd in ctxtNd.xpath(".//pg:%s"%tag, namespaces=self.dNS):
             self.seenDocTag(name, tag)

             if self.sCustom is None:
                 lbl = nd.get("type")
             elif self.sCustom == "":
                 try:
                     lbl = PageXml.getCustomAttr(nd, "structure", "type")
                 except Exception:
                     #best effort: no label for this node. Exception rather than
                     # a bare except, so that KeyboardInterrupt is not swallowed
                     lbl = ''
             else:
                 lbl = nd.get(self.sCustom)

             if lbl:
                 #pattern processing
                 for cre, sRepl in self.ltCRES:
                     lbl = cre.sub(sRepl, lbl)
                 self.seenTagLabel(tag, lbl)
    def _convertPageAnnotation(self, pnum, page, domNdPage):
        """
        Convert the human annotation of the text regions of a page into DU
        annotation: a semantic label, a BIES segmentation label and, when it is
        part of the human annotation, the resolution number.

        For each node yielded by self._iter_TextRegionNodeTop2Bottom(domNdPage, page):
        - the semantic label is stored in the attribute self.sSemAttr
        - the resolution number, if any, is stored in the attribute self.sNumAttr
        - the segmentation label (self.B, self.I, self.E or self.S, attribute
          self.sSgmAttr) of the PREVIOUS node is decided, since it depends on
          whether the current node starts something new. The last node is
          labelled after the loop.

        State read and updated: self._prevNd, self._prevIsB, self._prevNum
        (self._prevNum is set to False after a heading that starts a
        resolution, so that the number that follows does not start another one).
        pnum is not used by this method.

        Raise ValueError if a label matched by self.creResolutionHumanLabel has
        no resolution number. A label that is not matched at all is only
        reported on the standard output (relaxed mode).
        """
        bStrict = False  #if True, an unexpected human annotation is an error

        for nd in self._iter_TextRegionNodeTop2Bottom(domNdPage, page):
            sResoNum = None
            bCurrentIsAStart = None
            try:
                lbl = PageXml.getCustomAttr(nd, "structure", "type")

                if lbl == "heading":
                    semLabel = self.dAnnotMapping[lbl]
                    #heading indicates the start of a new resolution, unless the previous is already a start!
                    if self._prevIsB:
                        bCurrentIsAStart = False
                    else:
                        bCurrentIsAStart = True
                        self._prevNum = False #to prevent starting again when finding the resolution number
                elif lbl in ["header", "page-number", "marginalia"]:
                    semLabel = self.dAnnotMapping[lbl]
                    #continuation of a resolution, except at very beginning (first node)
                    bCurrentIsAStart = self._prevNd is None
                else:
                    o = self.creResolutionHumanLabel.match(lbl)
                    if o:
                        semLabel = self.dAnnotMapping[o.group(1)]   #"" for the resolution number

                        #Here we have a resolution number!
                        sResoNum = o.group(2)
                        if not sResoNum: raise ValueError("%s is not a valid human annotation - missing resolution number" % lbl)
                    elif bStrict:
                        raise ValueError("%s is not a valid human annotation" % lbl)
                    else:
                        # relaxed: warn, and consider the node as part of the current resolution
                        print(" ** WARNING ** strange annotation on node id=%s : '%s'"%(nd.get("id"), lbl))
                        semLabel = self.dAnnotMapping[None]
                        sResoNum = self._prevNum

                    # NOTE(review): the original 'else:' had no indented body (syntax error).
                    # It is assumed here that what follows applies to both cases; in the
                    # relaxed case sResoNum equals self._prevNum, so it never starts a resolution.
                    if self._prevNum is not False and self._prevNum != sResoNum:
                        #we got a new number, so this node starts a new resolution!
                        bCurrentIsAStart = True
                    else:
                        #either same number or start already signalled by a heading
                        bCurrentIsAStart = False
                    self._prevNum = sResoNum

            except PageXmlException:
                #no annotation on this node
                semLabel = self.sOther
                bCurrentIsAStart = False

            #Now tagging!!
            #Semantic (easy)
            nd.set(self.sSemAttr, semLabel)

            # BIES, tough... we can now decide on the label of the previous node
            # NB: explicit test against None, because a DOM element without
            #  children is falsy and would otherwise never be labelled
            if self._prevNd is not None:
                if bCurrentIsAStart:
                    #the previous node is a singleton or an End
                    sPrevSgmLabel = self.S if self._prevIsB else self.E
                else:
                    #the previous node is confirmed as a B or as an I
                    sPrevSgmLabel = self.B if self._prevIsB else self.I
                self._prevNd.set(self.sSgmAttr, sPrevSgmLabel)
            self._prevIsB = bool(bCurrentIsAStart) #for next cycle!

            if sResoNum: nd.set(self.sNumAttr, sResoNum) #only when the number is part of the human annotation!
            self._prevNd = nd #for next cycle!
        # end for

        #the last node is either a singleton or an End
        if self._prevNd is not None:
            self._prevNd.set(self.sSgmAttr, self.S if self._prevIsB else self.E)