def convertDoc(self, sFilename):
    """Convert a human-annotated XML file into its machine-annotation counterpart.

    Parses *sFilename* (which must carry the human-annotation extension),
    runs the per-page annotation conversion, stamps the creator metadata,
    and writes the result next to the input under the machine-annotation
    extension.

    Returns the name of the file produced.
    """
    assert sFilename.endswith(self.sXml_HumanAnnotation_Extension)

    g = Graph_MultiPageXml()
    # BUGFIX: lxml's etree.parse() has no 'encoding' keyword (the original
    # call raised TypeError); the encoding is taken from the XML declaration.
    doc = etree.parse(sFilename)

    # the Heigh/Ho annotation runs over consecutive pages, so we keep those
    # values across pages
    self._initSegmentationLabel()
    self.lSeenResoNum = list()

    for pnum, page, domNdPage in g._iter_Page_DocNode(doc):
        self._convertPageAnnotation(pnum, page, domNdPage)

    MultiPageXml.setMetadata(doc, None, self.sMetadata_Creator, self.sMetadata_Comments)

    sDUFilename = (sFilename[:-len(self.sXml_HumanAnnotation_Extension)]
                   + self.sXml_MachineAnnotation_Extension)
    doc.write(sDUFilename,
              xml_declaration=True,
              encoding="utf-8",
              pretty_print=True)
    return sDUFilename
def run(self):
    """Convert the table cells of the loaded document and stamp its metadata.

    Returns the annotated DOM.
    """
    dom = self.loadDom()
    self.convertTableCells(dom)
    # Try the single-page metadata API first; a ValueError indicates the
    # document is multi-page, so retry with the multi-page variant.
    try:
        PageXml.setMetadata(dom, None, 'NLE', Comments='TableCell 2 TextRegion')
    except ValueError:
        MultiPageXml.setMetadata(dom, None, 'NLE', Comments='TableCell 2 TextRegion')
    return dom
def annotateDocument(self, lsTrnColDir):
    """
    from cell elements: annotate textLine elements

    lsTrnColDir -- list of collection directories whose most recent files
                   are loaded as graphs and annotated in place.

    For each graph: text lines are re-parented into the overlapping cell
    and labelled first/middle/last/single (lLabels[0..3]); horizontal
    separators near a cell border get lLabels[5]; anything still
    unlabelled afterwards gets the default label (lLabels[4] for text,
    lLabels[6] for separators).  The annotated file is saved with the
    labelled-XML extension.
    """
    # load graphs
    _, lFilename_trn = self.listMaxTimestampFile(lsTrnColDir, self.sXmlFilenamePattern)
    DU_GRAPH.addNodeType(ntA)
    lGraph_trn = DU_GRAPH.loadGraphs(lFilename_trn,
                                     bNeighbourhood=True, bLabelled=False, iVerbose=True)

    for i, graph in enumerate(lGraph_trn):
        # BUGFIX: filter() returns a one-shot iterator in Python 3, while this
        # code iterates lTextLine and lSeparator several times (grouping pass,
        # then the default-label pass) -- materialize real lists instead.
        lCells     = [nd for nd in graph.lNode if nd.node.name == 'TextRegion']
        lTextLine  = [nd for nd in graph.lNode if nd.node.name == 'TextLine']
        lTextLine  = self.unLinkBadBaselines(lTextLine)
        lSeparator = [nd for nd in graph.lNode if nd.node.name == 'SeparatorRegion']

        # sort textLine per page
        lPTextLine = {}
        for tl in lTextLine:
            lPTextLine.setdefault(tl.page, []).append(tl)

        # keep only (near-)horizontal separators, grouped per page
        lPSep = {}
        for sep in lSeparator:
            if (sep.x2 - sep.x1) > (sep.y2 - sep.y1):
                lPSep.setdefault(sep.page, []).append(sep)

        # need to test if elements on the same page!!
        # same test with cell bottom!
        delta = 50  # pixel tolerance when matching separators/lines to cell borders

        for cell in lCells:
            ## SEP: label separators lying close to this cell's top or bottom
            for sep in lPSep.get(cell.page, []):
                # extend height temporarily, restore afterwards
                bcky2 = sep.y2
                sep.y2 = sep.y2 + delta
                dh, dv = sep.getXYOverlap(cell)
                # separator must overlap horizontally and sit near a border
                if (dh > 0 and (abs(cell.y1 - sep.y1) < delta
                                or abs(cell.y1 - sep.y2) < delta
                                or abs(cell.y2 - sep.y2) < delta)):
                    sep.node.setProp(sep.type.sLabelAttr, lLabels[5])
                # restore y2!!
                sep.y2 = bcky2

            ## TEXT: re-parent overlapping text lines into the cell
            lText = []
            # BUGFIX: .get guards against a page holding cells but no text line
            # (the original direct indexing raised KeyError in that case)
            for line in lPTextLine.get(cell.page, []):
                dh, dv = line.getXYOverlap(cell)
                # at least 10% horizontal overlap and some vertical overlap
                if dh > (line.x2 - line.x1) * 0.1 and dv > 0:
                    line.node.unlinkNode()
                    cell.node.addChild(line.node)
                    lText.append(line)
            if len(lText) == 1:
                lText[0].node.setProp(lText[0].type.sLabelAttr, lLabels[3])
            elif len(lText) > 1:
                lText.sort(key=lambda x: x.y1)
                lText[0].node.setProp(lText[0].type.sLabelAttr, lLabels[0])
                lText[-1].node.setProp(lText[-1].type.sLabelAttr, lLabels[2])
                for x in lText[1:-1]:
                    x.node.setProp(x.type.sLabelAttr, lLabels[1])

        # check if all labelled; if not labelled: add default label
        ## TEXT
        for tl in lTextLine:
            try:
                sLabel = tl.type.parseDomNodeLabel(tl.node)
            except ValueError:
                tl.node.setProp(tl.type.sLabelAttr, lLabels[4])
        ## SEP
        for sep in lSeparator:
            try:
                sLabel = sep.type.parseDomNodeLabel(sep.node)
            except ValueError:
                sep.node.setProp(sep.type.sLabelAttr, lLabels[6])

        doc = graph.doc
        MultiPageXml.setMetadata(doc, None, self.sMetadata_Creator, self.sMetadata_Comments)
        # save mpxml as labelled file
        sDUFilename = lFilename_trn[i][:-len(MultiPageXml.sEXT)] + self.sLabeledXmlFilenameEXT
        print(sDUFilename)
        doc.saveFormatFileEnc(sDUFilename, "utf-8", True)  # True to indent the XML
        doc.freeDoc()
def runForExternalMLMethod(self, lsColDir, storeX, applyY, bRevertEdges=False):
    """
    HACK: to test new ML methods, not yet integrated in our SW: storeX=None, storeXY=None, applyY=None

    lsColDir     -- list of collection directories to process
    storeX       -- if set, filename where the list of graph features [X] is pickled
                    (no labelling is done in that mode)
    applyY       -- if set, filename of a pickled list of label vectors [Y], one per
                    graph, applied to the loaded graphs and saved as "_du" files
    bRevertEdges -- if True, revert the direction of every edge before processing

    Return the list of produced files
    """
    self.traceln("-" * 50)
    if storeX:
        traceln("Loading data and storing [X] (1 X per graph)")
    if applyY:
        traceln(
            "Loading data, loading Y, labelling data, storing annotated data"
        )
    self.traceln("-" * 50)
    # the two modes are mutually exclusive
    if storeX and applyY:
        raise ValueError("Either store X or applyY, not both")
    if not self._mdl:
        raise Exception("The model must be loaded beforehand!")
    #list files
    _, lFilename = self.listMaxTimestampFile(lsColDir, self.sXmlFilenamePattern)
    DU_GraphClass = self.getGraphClass()
    lPageConstraint = DU_GraphClass.getPageConstraint()
    if lPageConstraint:
        for dat in lPageConstraint:
            self.traceln("\t\t%s" % str(dat))
    if applyY:
        self.traceln("LOADING [Y] from %s" % applyY)
        lY = self._mdl.gzip_cPickle_load(applyY)
    if storeX:
        lX = []
    chronoOn("predict")
    self.traceln(
        "- loading collection as graphs, and processing each in turn. (%d files)"
        % len(lFilename))
    du_postfix = "_du" + MultiPageXml.sEXT
    lsOutputFilename = []
    for sFilename in lFilename:
        # skip files this very method produced on a previous run
        if sFilename.endswith(du_postfix):
            continue  #:)
        chronoOn("predict_1")
        lg = DU_GraphClass.loadGraphs([sFilename],
                                      bDetach=False,
                                      bLabelled=False,
                                      iVerbose=1)
        #normally, we get one graph per file, but in case we load one graph per page, for instance, we have a list
        if lg:
            for g in lg:
                doc = g.doc
                if bRevertEdges:
                    g.revertEdges()  #revert the directions of the edges
                if lPageConstraint:
                    self.traceln(
                        "\t- prediction with logical constraints: %s" %
                        sFilename)
                else:
                    self.traceln("\t- prediction : %s" % sFilename)
                if storeX:
                    # feature-extraction mode: collect X, do not label
                    [X] = self._mdl.get_lX([g])
                    lX.append(X)
                else:
                    # applyY mode: consume the next pre-computed Y in order
                    # NOTE(review): assumes lY is ordered exactly like the
                    # graphs loaded here -- confirm against the producer of Y
                    Y = lY.pop(0)
                    g.setDomLabels(Y)
            del lg
            if applyY:
                # save the labelled DOM as a "_du" file
                MultiPageXml.setMetadata(doc, None, self.sMetadata_Creator,
                                         self.sMetadata_Comments)
                sDUFilename = sFilename[:-len(MultiPageXml.sEXT
                                              )] + du_postfix
                doc.saveFormatFileEnc(sDUFilename, "utf-8",
                                      True)  #True to indent the XML
                doc.freeDoc()
                lsOutputFilename.append(sDUFilename)
        else:
            self.traceln("\t- no prediction to do for: %s" % sFilename)
        self.traceln("\t done [%.2fs]" % chronoOff("predict_1"))
    self.traceln(" done [%.2fs]" % chronoOff("predict"))
    if storeX:
        self.traceln("STORING [X] in %s" % storeX)
        self._mdl.gzip_cPickle_dump(storeX, lX)
    return lsOutputFilename
def predict(self, lsColDir, docid=None):
    """
    Predict labels for every file of the given collection(s), or for the
    single document *docid* when provided, writing one "_du" annotated
    file per input.

    Return the list of produced files
    """
    self.traceln("-" * 50)
    self.traceln("Predicting for collection(s):", lsColDir)
    self.traceln("-" * 50)

    if not self._mdl:
        raise Exception("The model must be loaded beforehand!")

    if docid is None:
        # whole collection(s): take the most recent matching files
        _, lFilename = self.listMaxTimestampFile(lsColDir, self.sXmlFilenamePattern)
    else:
        # predict for this file only
        try:
            sDocPath = os.path.join(lsColDir[0], docid + MultiPageXml.sEXT)
        except IndexError:
            raise Exception("a collection directory must be provided!")
        lFilename = [os.path.abspath(sDocPath)]

    DU_GraphClass = self.getGraphClass()

    lPageConstraint = DU_GraphClass.getPageConstraint()
    if lPageConstraint:
        for dat in lPageConstraint:
            self.traceln("\t\t%s" % str(dat))

    chronoOn("predict")
    self.traceln(
        "- loading collection as graphs, and processing each in turn. (%d files)"
        % len(lFilename))
    du_postfix = "_du" + MultiPageXml.sEXT
    lsOutputFilename = []
    for sFilename in lFilename:
        if sFilename.endswith(du_postfix):
            continue  #:)  (skip files produced by a previous run)
        chronoOn("predict_1")
        lg = DU_GraphClass.loadGraphs([sFilename],
                                      bDetach=False,
                                      bLabelled=False,
                                      iVerbose=1)
        # normally one graph per file; loading one graph per page yields several
        if not lg:
            self.traceln("\t- no prediction to do for: %s" % sFilename)
            self.traceln("\t done [%.2fs]" % chronoOff("predict_1"))
            continue

        for g in lg:
            doc = g.doc
            if lPageConstraint:
                self.traceln("\t- prediction with logical constraints: %s"
                             % sFilename)
            else:
                self.traceln("\t- prediction : %s" % sFilename)
            Y = self._mdl.predict(g)
            g.setDomLabels(Y)
            del Y
        del lg

        MultiPageXml.setMetadata(doc, None, self.sMetadata_Creator,
                                 self.sMetadata_Comments)
        sDUFilename = sFilename[:-len(MultiPageXml.sEXT)] + du_postfix
        doc.write(sDUFilename,
                  xml_declaration=True,
                  encoding="utf-8",
                  pretty_print=True)
        lsOutputFilename.append(sDUFilename)
        self.traceln("\t done [%.2fs]" % chronoOff("predict_1"))
    self.traceln(" done [%.2fs]" % chronoOff("predict"))
    return lsOutputFilename