コード例 #1
0
    def convertDoc(self, sFilename):
        """
        Convert the human annotation of one document and save the result in a new file.

        The file is parsed, every page yielded by Graph_MultiPageXml._iter_Page_DocNode
        is passed to self._convertPageAnnotation, the document metadata is set, and the
        document is written under the same name with the human-annotation extension
        replaced by the machine-annotation one.

        sFilename: name of the input file, which must end with
                   self.sXml_HumanAnnotation_Extension
        Return the name of the file that was written.
        Raise AssertionError if sFilename does not have the expected extension.
        """
        assert sFilename.endswith(self.sXml_HumanAnnotation_Extension)

        g = Graph_MultiPageXml()

        # etree.parse() has no 'encoding' keyword (it raises TypeError):
        # the encoding is given to the parser instead.
        doc = etree.parse(sFilename, etree.XMLParser(encoding='utf-8'))

        # the Heigh/Ho annotation runs over consecutive pages, so we keep those values across pages
        self._initSegmentationLabel()
        self.lSeenResoNum = list()

        for pnum, page, domNdPage in g._iter_Page_DocNode(doc):
            self._convertPageAnnotation(pnum, page, domNdPage)

        MultiPageXml.setMetadata(doc, None, self.sMetadata_Creator, self.sMetadata_Comments)

        # swap the human-annotation extension for the machine-annotation one
        sDUFilename = sFilename[:-len(self.sXml_HumanAnnotation_Extension)] + self.sXml_MachineAnnotation_Extension
        doc.write(sDUFilename,
                  xml_declaration=True,
                  encoding="utf-8",
                  pretty_print=True
                  )

        return sDUFilename
コード例 #2
0
    def run(self):
        """
        Load the DOM, convert its table cells and stamp the metadata.

        The metadata is first set with PageXml.setMetadata; if that call raises
        ValueError, MultiPageXml.setMetadata is used instead with the same values.

        Return the processed DOM.
        """
        dom = self.loadDom()
        self.convertTableCells(dom)

        sCreator = 'NLE'
        sComments = 'TableCell 2 TextRegion'
        try:
            PageXml.setMetadata(dom, None, sCreator, Comments=sComments)
        except ValueError:
            MultiPageXml.setMetadata(dom, None, sCreator, Comments=sComments)

        return dom
コード例 #3
0
    def annotateDocument(self,lsTrnColDir):
        """
        From the cell elements (TextRegion nodes): annotate the TextLine and
        SeparatorRegion elements of each document, then save the labelled document.

        For each graph loaded from the files listed for lsTrnColDir:
        - each horizontal separator (wider than high) that overlaps a cell horizontally
          and lies within 'delta' of one of its horizontal borders gets lLabels[5]
        - each text line overlapping a cell is moved under that cell in the DOM; the
          lines of a cell get lLabels[3] when alone, otherwise lLabels[0] for the
          top-most one, lLabels[2] for the bottom-most one and lLabels[1] in between
        - a text line whose label cannot be parsed gets lLabels[4], such a separator
          gets lLabels[6]
        - the document is saved under the input name, with MultiPageXml.sEXT replaced
          by self.sLabeledXmlFilenameEXT, and then freed.

        lsTrnColDir: collection directories given to self.listMaxTimestampFile
        Return None
        """

        #load graphs
        _, lFilename_trn = self.listMaxTimestampFile(lsTrnColDir, self.sXmlFilenamePattern)

        DU_GRAPH.addNodeType(ntA)
        lGraph_trn = DU_GRAPH.loadGraphs( lFilename_trn, bNeighbourhood=True, bLabelled=False,iVerbose=True)

        # tolerance used to extend the separators and to compare the y coordinates
        delta=50

        for i,graph in enumerate(lGraph_trn):

            # real lists, not lazy filter objects: several of them are traversed more than once
            lCells = [nd for nd in graph.lNode if nd.node.name=='TextRegion']
            lTextLine = [nd for nd in graph.lNode if nd.node.name=='TextLine']
            lTextLine = list(self.unLinkBadBaselines(lTextLine))
            lSeparator = [nd for nd in graph.lNode if nd.node.name=='SeparatorRegion']

            #sort textLine per page
            lPTextLine={}
            for tl in lTextLine:
                lPTextLine.setdefault(tl.page, []).append(tl)

            # horizontal separators (wider than high), per page
            lPSep={}
            for sep in lSeparator:
                if (sep.x2 - sep.x1)  > (sep.y2-sep.y1):
                    lPSep.setdefault(sep.page, []).append(sep)

            for cell in lCells:
                ## SEP
                # a page without any horizontal separator is simply skipped
                for sep in lPSep.get(cell.page, []):
                    # extend height for the overlap computation and the border test
                    bcky2 =sep.y2
                    sep.y2=bcky2+delta
                    try:
                        dh,dv=  sep.getXYOverlap(cell)
                        # sep situated in the horizontal borders
                        if  (dh > 0 and (abs(cell.y1 - sep.y1) < delta or abs(cell.y1 - sep.y2) < delta  or abs(cell.y2 - sep.y2) < delta)):
                            sep.node.setProp(sep.type.sLabelAttr,lLabels[5])
                    finally:
                        # restore y2, whatever happened
                        sep.y2 = bcky2

                ## TEXT
                # a cell on a page without any text line has nothing to collect
                lText=[]
                for line in lPTextLine.get(cell.page, []):
                    dh,dv=  line.getXYOverlap(cell)
                    # at least 10% of the line width must overlap the cell horizontally
                    if dh > (line.x2 - line.x1)*0.1 and dv > 0:
                        line.node.unlinkNode()
                        cell.node.addChild(line.node)
                        lText.append(line)
                if len(lText) == 1:
                    lText[0].node.setProp(lText[0].type.sLabelAttr,lLabels[3])
                elif len(lText) >1:
                    lText.sort(key=lambda x:x.y1)
                    lText[0].node.setProp(lText[0].type.sLabelAttr,lLabels[0])
                    lText[-1].node.setProp(lText[-1].type.sLabelAttr,lLabels[2])
                    for x in lText[1:-1]:
                        x.node.setProp(x.type.sLabelAttr,lLabels[1])

            # check if all labelled
            # if not labelled: add other
            ## TEXT
            for tl in lTextLine:
                try:
                    # called only to detect an unlabelled node (presumably signalled by ValueError)
                    tl.type.parseDomNodeLabel(tl.node)
                except ValueError:
                    tl.node.setProp(tl.type.sLabelAttr,lLabels[4])
            ## SEP
            for sep in lSeparator:
                try:
                    sep.type.parseDomNodeLabel(sep.node)
                except ValueError:
                    sep.node.setProp(sep.type.sLabelAttr,lLabels[6])

            doc =graph.doc
            MultiPageXml.setMetadata(doc, None, self.sMetadata_Creator, self.sMetadata_Comments)
            # save under the labelled-file extension
            sDUFilename = lFilename_trn[i][:-len(MultiPageXml.sEXT)]+  self.sLabeledXmlFilenameEXT
            print (sDUFilename)
            doc.saveFormatFileEnc(sDUFilename, "utf-8", True)  #True to indent the XML
            doc.freeDoc()
コード例 #4
0
ファイル: DU_CRF_Task.py プロジェクト: liladude/TranskribusDU
    def runForExternalMLMethod(self,
                               lsColDir,
                               storeX,
                               applyY,
                               bRevertEdges=False):
        """
        HACK: to test new ML methods, not yet integrated in our SW.

        Two exclusive modes:
        - storeX: load each graph, compute its X with the model and dump the list of
          all X (one per graph) into the file named storeX
        - applyY: load the list of Y from the file named applyY, label each graph with
          the next Y of that list, and save each annotated document as a "_du" file
        When neither is given, the graphs are loaded but nothing is stored nor labelled.

        lsColDir: collection directories given to self.listMaxTimestampFile
        storeX: name of the file where to store the list of X, or None
        applyY: name of the file from which to load the list of Y, or None
        bRevertEdges: if True, revert the direction of the edges of each graph

        Return the list of produced files
        Raise ValueError if both storeX and applyY are given
        Raise Exception if no model is loaded
        """

        self.traceln("-" * 50)
        if storeX:
            self.traceln("Loading data and storing [X] (1 X per graph)")
        if applyY:
            self.traceln(
                "Loading data, loading Y, labelling data, storing annotated data"
            )
        self.traceln("-" * 50)

        if storeX and applyY:
            raise ValueError("Either store X or applyY, not both")

        if not self._mdl:
            raise Exception("The model must be loaded beforehand!")

        #list files
        _, lFilename = self.listMaxTimestampFile(lsColDir,
                                                 self.sXmlFilenamePattern)

        DU_GraphClass = self.getGraphClass()

        lPageConstraint = DU_GraphClass.getPageConstraint()
        if lPageConstraint:
            for dat in lPageConstraint:
                self.traceln("\t\t%s" % str(dat))

        if applyY:
            self.traceln("LOADING [Y] from %s" % applyY)
            # NOTE(review): judging by its name this unpickles the file - only
            # load a file coming from a trusted source
            lY = self._mdl.gzip_cPickle_load(applyY)
        if storeX: lX = []

        chronoOn("predict")
        self.traceln(
            "- loading collection as graphs, and processing each in turn. (%d files)"
            % len(lFilename))
        du_postfix = "_du" + MultiPageXml.sEXT
        lsOutputFilename = []
        for sFilename in lFilename:
            if sFilename.endswith(du_postfix): continue  # skip our own output files
            chronoOn("predict_1")
            lg = DU_GraphClass.loadGraphs([sFilename],
                                          bDetach=False,
                                          bLabelled=False,
                                          iVerbose=1)
            #normally, we get one graph per file, but in case we load one graph per page, for instance, we have a list
            if lg:
                for g in lg:
                    doc = g.doc
                    if bRevertEdges:
                        g.revertEdges()  #revert the directions of the edges
                    if lPageConstraint:
                        self.traceln(
                            "\t- prediction with logical constraints: %s" %
                            sFilename)
                    else:
                        self.traceln("\t- prediction : %s" % sFilename)
                    if storeX:
                        [X] = self._mdl.get_lX([g])
                        lX.append(X)
                    elif applyY:
                        # lY only exists in applyY mode: a plain 'else' would hit
                        # an unbound name when neither mode is selected
                        Y = lY.pop(0)
                        g.setDomLabels(Y)
                del lg

                if applyY:
                    MultiPageXml.setMetadata(doc, None, self.sMetadata_Creator,
                                             self.sMetadata_Comments)
                    sDUFilename = sFilename[:-len(MultiPageXml.sEXT
                                                  )] + du_postfix
                    doc.saveFormatFileEnc(sDUFilename, "utf-8",
                                          True)  #True to indent the XML
                    doc.freeDoc()
                    lsOutputFilename.append(sDUFilename)
            else:
                self.traceln("\t- no prediction to do for: %s" % sFilename)

            self.traceln("\t done [%.2fs]" % chronoOff("predict_1"))
        self.traceln(" done [%.2fs]" % chronoOff("predict"))

        if storeX:
            self.traceln("STORING [X] in %s" % storeX)
            self._mdl.gzip_cPickle_dump(storeX, lX)

        return lsOutputFilename
コード例 #5
0
ファイル: DU_CRF_Task.py プロジェクト: liladude/TranskribusDU
    def predict(self, lsColDir, docid=None):
        """
        Label the documents of the given collection(s) with the loaded model.

        Each input file is loaded as graph(s), each graph is labelled with the
        prediction of the model, and the document is written under the same name
        with MultiPageXml.sEXT replaced by "_du" + MultiPageXml.sEXT. Files that
        already end with that postfix are skipped.

        lsColDir: collection directories
        docid: if given, only the file docid + MultiPageXml.sEXT of the first
               collection directory is processed

        Return the list of produced files
        Raise Exception if no model is loaded, or if docid is given without any
        collection directory
        """
        sBanner = "-" * 50
        self.traceln(sBanner)
        self.traceln("Predicting for collection(s):", lsColDir)
        self.traceln(sBanner)

        if not self._mdl:
            raise Exception("The model must be loaded beforehand!")

        if docid is not None:
            # predict for this file only
            try:
                sColDir = lsColDir[0]
            except IndexError:
                raise Exception("a collection directory must be provided!")
            sDocPath = os.path.join(sColDir, docid + MultiPageXml.sEXT)
            lFilename = [os.path.abspath(sDocPath)]
        else:
            # list files
            _, lFilename = self.listMaxTimestampFile(lsColDir,
                                                     self.sXmlFilenamePattern)

        GraphClass = self.getGraphClass()

        lConstraint = GraphClass.getPageConstraint()
        for constraint in lConstraint or ():
            self.traceln("\t\t%s" % str(constraint))

        chronoOn("predict")
        self.traceln(
            "- loading collection as graphs, and processing each in turn. (%d files)"
            % len(lFilename))
        sOutputPostfix = "_du" + MultiPageXml.sEXT
        lsOutputFilename = []
        for sFilename in lFilename:
            if sFilename.endswith(sOutputPostfix):
                # an output file, not an input
                continue
            chronoOn("predict_1")
            lGraph = GraphClass.loadGraphs([sFilename],
                                           bDetach=False,
                                           bLabelled=False,
                                           iVerbose=1)
            if not lGraph:
                self.traceln("\t- no prediction to do for: %s" % sFilename)
            else:
                # usually one graph per file, but there may be several (e.g. one per page)
                for graph in lGraph:
                    doc = graph.doc
                    if lConstraint:
                        sTrace = "\t- prediction with logical constraints: %s"
                    else:
                        sTrace = "\t- prediction : %s"
                    self.traceln(sTrace % sFilename)
                    graph.setDomLabels(self._mdl.predict(graph))
                del lGraph

                MultiPageXml.setMetadata(doc, None, self.sMetadata_Creator,
                                         self.sMetadata_Comments)
                sBasename = sFilename[:-len(MultiPageXml.sEXT)]
                sDUFilename = sBasename + sOutputPostfix
                doc.write(sDUFilename,
                          xml_declaration=True,
                          encoding="utf-8",
                          pretty_print=True)

                lsOutputFilename.append(sDUFilename)

            self.traceln("\t done [%.2fs]" % chronoOff("predict_1"))
        self.traceln(" done [%.2fs]" % chronoOff("predict"))

        return lsOutputFilename