def run(self, doc):
    """Mine the column structure of every table in *doc*.

    Loads the document into the lite XMLDS model, then either applies the
    manually supplied pattern (``bManual``) or runs ``columnMining`` on each
    table of each selected page.  Returns the tagged document.
    """
    self.doc = doc
    # lite version of the document model
    self.ODoc = XMLDSDocument()
    self.ODoc.loadFromDom(self.doc, listPages=range(self.firstPage, self.lastPage + 1))
    self.lPages = self.ODoc.getPages()
    if self.bManual:
        self.processWithTemplate(self.manualPattern, self.lPages)
    else:
        # self.mainMining(self.lPages)
        for curPage in self.lPages:
            print("page")
            for curTable in curPage.getAllNamedObjects(XMLDSTABLEClass):
                self.columnMining(curTable)
    self.addTagProcessToMetadata(self.doc)
    return self.doc
def run(self, domDoc):
    """Convert an XMLDS document into a list of PageXml documents.

    :param domDoc: source DS DOM
    :return: list of ``(pageXmlDoc, imageFilename)`` pairs, one per page
    """
    ODoc = XMLDSDocument()
    # ODoc.lastPage=1
    ODoc.loadFromDom(domDoc)
    lPageXmlDoc = []
    lPages = ODoc.getPages()
    for page in lPages:
        # print("%s %s"%(page, page.getAttribute('imageFilename')))
        # 'imageFilename' may be missing/None on some pages; fall back to a
        # dummy name.  FIX: catch the specific failures instead of a bare
        # `except` that also swallowed KeyboardInterrupt/SystemExit.
        try:
            filename = os.path.basename(page.getAttribute('imageFilename'))
        except (TypeError, AttributeError):
            filename = "fakename"
        pageXmlDoc, pageNode = PageXml.createPageXmlDocument(
            creatorName='NLE',
            filename=filename,
            imgW=convertDot2Pixel(self.dpi, page.getWidth()),
            imgH=convertDot2Pixel(self.dpi, page.getHeight()))
        self.pageXmlNS = etree.QName(pageXmlDoc.getroot()).namespace
        if self.bRegionOnly:
            self.convertOnlyRegion(page, pageNode)
        else:
            self.convertDSPage(page, pageNode)
        lPageXmlDoc.append((pageXmlDoc, page.getAttribute('imageFilename')))
    return lPageXmlDoc
def run(self):
    """Entry point: load the document into the lite XMLDS model and mine it.

    Applies the manual pattern when ``bManual`` is set, otherwise runs the
    incremental line miner.  Returns the tagged document.
    """
    odoc = XMLDSDocument()
    odoc.loadFromDom(self.doc, listPages=range(self.firstPage, self.lastPage + 1))
    self.ODoc = odoc
    self.lPages = odoc.getPages()
    if self.bManual:
        self.processWithTemplate(self.manualPattern, self.lPages)
    else:
        self.mainLineMining(self.lPages)
    self.addTagProcessToMetadata(self.doc)
    return self.doc
def run(self, doc):
    """Process a document with record templates (death/wedding/birth).

    main issue: how to select the template: to be done by CVL
    assuming IE and htr info are stored in the template

    :param doc: input DOM (PageXml when ``page2DS`` is set, else DS)
    """
    # self.firstPage = 117
    # self.lastPage= 118
    if self.page2DS:
        # convert PageXml to DS first
        dsconv = primaAnalysis()
        self.doc = dsconv.convert2DS(doc, self.docid)
    else:
        self.doc = doc
    self.ODoc = XMLDSDocument()
    self.ODoc.loadFromDom(self.doc, listPages=range(self.firstPage, self.lastPage + 1))
    self.lPages = self.ODoc.getPages()

    # select the record class and the matching processing routine
    if self.recordType == 'D':
        record = deathRecord(self.sModelName, self.sModelDir)
        processTable = self.processDeathWithTemplate
    elif self.recordType == 'W':
        record = weddingRecord(self.sModelName, self.sModelDir)
        processTable = self.processWeddingWithTemplate
    elif self.recordType == 'B':
        record = birthRecord(self.sModelName, self.sModelDir)
        processTable = self.processBirthWithTemplate
    else:
        # FIX: bail out early -- the original fell through with 'record' and
        # the processing function unbound, crashing with NameError on the
        # first table (and again at generateOutput).
        print(f'record type not supported: {self.recordType}')
        return

    for page in self.lPages:
        print("page: ", page.getNumber())
        # self.testGTText(page)
        # continue
        lTables = page.getAllNamedObjects(XMLDSTABLEClass)
        for table in lTables:
            if table.getNbRows() < 2:
                # degenerate region: most likely not a real table
                if self.bDebug:
                    print("page: %s : not a table? %d/%d" % (page.getNumber(), table.getNbRows(), table.getNbColumns()))
                continue
            if self.BuseStoredTemplate:
                if self.bDebug:
                    print("page: %s : table %d/%d" % (page.getNumber(), table.getNbRows(), table.getNbColumns()))
                processTable(table, record)
            else:
                self.mineTable(table, record)
    self.evalData = record.generateOutput(self.evalData)
def createRefPerPage(self, doc):
    """Create one reference file per page (DAS 2018: one ref per graph/page).

    For each page a standalone DOCUMENT/PAGE tree is built, tables are copied
    with their bounding boxes, and row cuts (min Y per row index) are
    generated.  Each page tree is written to ``<imageFilename minus ext>ref``.

    :param doc: source DS DOM
    :return: the ElementTree of the last page processed (None if no pages)
    """
    self.ODoc = XMLDSDocument()
    self.ODoc.loadFromDom(doc, listPages=range(self.firstPage, self.lastPage + 1))
    # FIX: initialize so the return below cannot raise NameError on an
    # empty page range.
    refdoc = None
    for page in self.ODoc.getPages():
        #imageFilename="..\col\30275\S_Freyung_021_0001.jpg" width="977.52" height="780.0">
        pageNode = etree.Element('PAGE')
        # pageNode.set("number",page.getAttribute('number'))
        # SINGLE PAGE ref: always numbered 1
        pageNode.set("number", '1')
        pageNode.set("imageFilename", page.getAttribute('imageFilename'))
        pageNode.set("width", page.getAttribute('width'))
        pageNode.set("height", page.getAttribute('height'))
        root = etree.Element("DOCUMENT")
        refdoc = etree.ElementTree(root)
        root.append(pageNode)
        lTables = page.getAllNamedObjects(XMLDSTABLEClass)
        for table in lTables:
            tableNode = etree.Element('TABLE')
            tableNode.set("x", table.getAttribute('x'))
            tableNode.set("y", table.getAttribute('y'))
            tableNode.set("width", table.getAttribute('width'))
            tableNode.set("height", table.getAttribute('height'))
            pageNode.append(tableNode)
            # FIX: dRows must be reset for every table -- it was created once
            # for the whole document, so cells of previous tables/pages leaked
            # into the Y-cuts of later ones.
            dRows = {}
            for cell in table.getAllNamedObjects(XMLDSTABLECELLClass):
                try:
                    dRows[int(cell.getAttribute("row"))].append(cell)
                except KeyError:
                    dRows[int(cell.getAttribute("row"))] = [cell]
            lYcuts = []
            for rowid in sorted(dRows.keys()):
                # one cut per row: the topmost Y among its cells
                lYcuts.append(min(list(map(lambda x: x.getY(), dRows[rowid]))))
            self.createRowsWithCuts(lYcuts, table, tableNode)
        # replace the 3-char extension with 'ref'
        self.outputFileName = os.path.basename(page.getAttribute('imageFilename')[:-3] + 'ref')
        print(self.outputFileName)
        self.writeDom(refdoc, bIndent=True)
    return refdoc
def run(self, doc):
    """Process a document of death records.

    main issue: how to select the template: to be done by CVL
    assuming IE and htr info are stored in the template

    :param doc: input DOM (PageXml when ``page2DS`` is set, else DS)
    """
    # self.firstPage = 9
    # self.lastPage= 9
    if self.page2DS:
        dsconv = primaAnalysis()
        self.doc = dsconv.convert2DS(doc, self.docid)
    else:
        self.doc = doc
    self.ODoc = XMLDSDocument()
    self.ODoc.loadFromDom(self.doc, listPages=range(self.firstPage, self.lastPage + 1))
    self.lPages = self.ODoc.getPages()
    dr = deathRecord(self.sModelName, self.sModelDir)
    ## selection of the templates first with X tables ###
    for page in self.lPages:
        print("page: ", page.getNumber())
        # self.testGTText(page)
        # continue
        lTables = page.getAllNamedObjects(XMLDSTABLEClass)
        for table in lTables:
            if table.getNbRows() < 2:
                print("page: %s : not a table? %d/%d" % (page.getNumber(), table.getNbRows(), table.getNbColumns()))
                continue
            if self.BuseStoredTemplate:
                # best-effort per table: keep going on failure.
                # FIX: catch Exception, not a bare `except` that also
                # swallowed KeyboardInterrupt/SystemExit.
                try:
                    self.processWithTemplate(table, dr)
                except Exception:
                    print('issue with page %s' % page)
            else:
                self.mineTable(table, dr)
    self.evalData = dr.generateOutput(self.evalData)
def run(self, doc): """ load dom and find rows """ # conver to DS if needed if self.bCreateRef: if self.do2DS: dsconv = primaAnalysis() doc = dsconv.convert2DS(doc, self.docid) refdoc = self.createRef(doc) return refdoc # single ref per page refdoc = self.createRefPerPage(doc) return None if self.do2DS: dsconv = primaAnalysis() self.doc = dsconv.convert2DS(doc, self.docid) else: self.doc = doc self.ODoc = XMLDSDocument() self.ODoc.loadFromDom(self.doc, listPages=range(self.firstPage, self.lastPage + 1)) self.lPages = self.ODoc.getPages() self.checkInputFormat(self.lPages) self.findColumnsInDoc(self.lPages) if self.bMining: self.documentMining(self.lPages) if self.bCreateRef: refdoc = self.createRef(self.doc) return refdoc # if self.do2DS: # # bakc to PageXml # conv= DS2PageXMLConvertor() # lPageXDoc = conv.run(self.doc) # conv.storeMultiPageXml(lPageXDoc,self.getOutputFileName()) # print self.getOutputFileName() # return None return self.doc
def createRef(self, doc):
    """Build a reference DOM (DOCUMENT/PAGE/TABLE with column cuts).

    Cells are grouped by their 'col' index; each column contributes one
    X-cut, the leftmost X of its cells.

    :param doc: source DS DOM
    :return: the reference ElementTree
    """
    self.ODoc = XMLDSDocument()
    self.ODoc.loadFromDom(doc, listPages=range(self.firstPage, self.lastPage + 1))
    root = etree.Element("DOCUMENT")
    refdoc = etree.ElementTree(root)
    for page in self.ODoc.getPages():
        #imageFilename="..\col\30275\S_Freyung_021_0001.jpg" width="977.52" height="780.0">
        pageNode = etree.Element('PAGE')
        pageNode.set("number", page.getAttribute('number'))
        pageNode.set("pagekey", os.path.basename(page.getAttribute('imageFilename')))
        pageNode.set("width", page.getAttribute('width'))
        pageNode.set("height", page.getAttribute('height'))
        root.append(pageNode)
        for table in page.getAllNamedObjects(XMLDSTABLEClass):
            cellsByCol = {}
            tableNode = etree.Element('TABLE')
            for attrName in ('x', 'y', 'width', 'height'):
                tableNode.set(attrName, table.getAttribute(attrName))
            pageNode.append(tableNode)
            # group cells by column index
            for cell in table.getAllNamedObjects(XMLDSTABLECELLClass):
                cellsByCol.setdefault(int(cell.getAttribute("col")), []).append(cell)
            # one X-cut per column: the leftmost cell X
            lXcuts = [min(c.getX() for c in cellsByCol[colid])
                      for colid in sorted(cellsByCol)]
            self.createColumnsWithCuts(lXcuts, table, tableNode)
    return refdoc
def run(self, doc):
    """Load the DOM, detect columns, and build the reference output.

    :param doc: input DOM (PageXml when ``do2DS`` is set, else DS)
    """
    # reference-generation mode: build the ref DOM and stop
    if self.bCreateRef:
        if self.do2DS:
            doc = primaAnalysis().convert2DS(doc, self.docid)
        return self.createRef(doc)
        # unreachable alternative kept from the original:
        # single ref per page
        # refdoc = self.createRefPerPage(doc)
        # return None
    # normal mode: convert to DS when requested, then find columns
    if self.do2DS:
        self.doc = primaAnalysis().convert2DS(doc, self.docid)
    else:
        self.doc = doc
    self.ODoc = XMLDSDocument()
    self.ODoc.loadFromDom(self.doc, listPages=range(self.firstPage, self.lastPage + 1))
    # self.ODoc.loadFromDom(self.doc,listPages = range(30,31))
    self.findColumnsInDoc(self.ODoc)
    refdoc = self.createRef(self.doc)
    # print refdoc.serialize('utf-8', 1)
    if self.do2DS:
        # back to PageXml
        converter = DS2PageXMLConvertor()
        pageDocs = converter.run(self.doc)
        converter.storeMultiPageXml(pageDocs, self.getOutputFileName())
        # print self.getOutputFileName()
        return None
    return self.doc
def processDocument(self, coldir, colid, docid, dom=None):
    """Run the table-processing workflow for a single document.

    The full intended pipeline (shell equivalents):
    1 python ../../src/xml_formats/PageXml.py trnskrbs_5400/col/17442 --ext=pxml
    2 python ../../src/tasks/performCVLLA.py --coldir=trnskrbs_5400/ --docid=17442 -i trnskrbs_5400/col/17442.mpxml --bl --regTL --form
    3 python ../../src/tasks/DU_ABPTable_T.py modelMultiType tableRow2 --run=trnskrbs_5400
    4 python ../../src/xml_formats/Page2DS.py --pattern=trnskrbs_5400/col/17442_du.mpxml -o trnskrbs_5400/xml/17442.ds_xml --docid=17442
    5 python src/IE_test.py -i trnskrbs_5400/xml/17442.ds_xml -o trnskrbs_5400/out/17442.ds_xml
    6 python ../../../TranskribusPyClient/src/TranskribusCommands/TranskribusDU_transcriptUploader.py --nodu trnskrbs_5400 5400 17442
    7 python ../../../TranskribusPyClient/src/TranskribusCommands/do_htrRnn.py <model-name> <dictionary-name> 5400 17442   (wait)
    8 python ../../../TranskribusPyClient/src/TranskribusCommands/Transkribus_downloader.py 5400 --force   (convert to ds)
    9 python ../../src/xml_formats/Page2DS.py --pattern=trnskrbs_5400/col/17442.mpxml -o trnskrbs_5400/xml/17442.ds_xml --docid=17442
    10 python src/IE_test.py -i trnskrbs_5400/xml/17442.ds_xml -o trnskrbs_5400/out/17442.ds_xml --doie --usetemplate

    NOTE(review): the bare `return` statements below deliberately cut the
    workflow short (experiment scaffolding); everything after the first
    `return` is currently dead code -- confirm before reviving or deleting.

    :param coldir: collection directory
    :param colid: collection id
    :param docid: document id
    :param dom: optional already-loaded mpxml DOM; loaded from disk when None
    """
    #create Transkribus client
    self.myTrKCient = TranskribusClient(sServerUrl=self.server, proxies={}, loggingLevel=logging.WARN)
    #login
    _ = self.login(self.myTrKCient, trace=trace, traceln=traceln)
    # self.downloadCollection(colid,coldir,docid,bNoImg=False,bForce=True)
    ## load dom
    if dom is None:
        self.inputFileName = os.path.abspath(
            os.path.join(coldir, TableProcessing.sCOL,
                         docid + TableProcessing.sMPXMLExtension))
        mpxml_doc = self.loadDom()
        nbPages = MultiPageXml.getNBPages(mpxml_doc)
    else:
        # load provided mpxml
        mpxml_doc = dom
        nbPages = MultiPageXml.getNBPages(mpxml_doc)
    # ### table registration: need to compute/select??? the template
    # (commented-out LAProcessor-based table registration omitted here: it
    # would perform separator/table-registration/baseline normalization via
    # performLA and re-upload the document -- see VCS history.)
    lJobIDs = self.applyLA_URO(colid, docid, nbPages)
    return
    # ----- dead code from here on (see NOTE in the docstring) -------------
    bWait = True
    assert lJobIDs != []
    jobid = lJobIDs[-1]
    traceln("waiting for job %s" % jobid)
    # NOTE(review): busy-wait with no sleep -- would hammer the server if revived
    while bWait:
        dInfo = self.myTrKCient.getJobStatus(jobid)
        bWait = dInfo['state'] not in ['FINISHED', 'FAILED']
    ## coldir???
    self.downloadCollection(colid, coldir, docid, bNoImg=True, bForce=True)
    ##STOP HERE FOR DAS newx testset:
    return
    # tag text for BIES cell
    #python ../../src/tasks/DU_ABPTable_T.py modelMultiType tableRow2 --run=trnskrbs_5400
    """ needed : doer = DU_ABPTable_TypedCRF(sModelName, sModelDir, """
    doer = DU_ABPTable_TypedCRF(self.sRowModelName, self.sRowModelDir)
    doer.load()
    ## needed predict at file level, and do not store dom, but return it
    rowpath = os.path.join(coldir, "col")
    BIESFiles = doer.predict([rowpath], docid)
    BIESDom = self.loadDom(BIESFiles[0])
    # res= BIESDom.saveFormatFileEnc('test.mpxml', "UTF-8",True)
    # MPXML2DS
    #python ../../src/xml_formats/Page2DS.py --pattern=trnskrbs_5400/col/17442_du.mpxml -o trnskrbs_5400/xml/17442.ds_xml --docid=17442
    dsconv = primaAnalysis()
    DSBIESdoc = dsconv.convert2DS(BIESDom, self.docid)
    # create XMLDOC object
    self.ODoc = XMLDSDocument()
    self.ODoc.loadFromDom(DSBIESdoc)  #,listPages = range(self.firstPage,self.lastPage+1))
    # create row
    #python src/IE_test.py -i trnskrbs_5400/xml/17442.ds_xml -o trnskrbs_5400/out/17442.ds_xml
    rdc = RowDetection()
    rdc.findRowsInDoc(self.ODoc)
    #python ../../src/xml_formats/DS2PageXml.py -i trnskrbs_5400/out/17442.ds_xml --multi
    # DS2MPXML
    DS2MPXML = DS2PageXMLConvertor()
    lPageXml = DS2MPXML.run(self.ODoc.getDom())
    if lPageXml != []:
        # if DS2MPXML.bMultiPages:
        newDoc = MultiPageXml.makeMultiPageXmlMemory(
            map(lambda xy: xy[0], lPageXml))
        outputFileName = os.path.join(
            self.coldir, sCOL,
            self.docid + TableProcessing.sMPXMLExtension)
        newDoc.write(outputFileName, xml_declaration=True,
                     encoding="UTF-8", pretty_print=True)
        # else:
        #     DS2MPXML.storePageXmlSetofFiles(lPageXml)
    return
    #upload
    # python ../../../TranskribusPyClient/src/TranskribusCommands/TranskribusDU_transcriptUploader.py --nodu trnskrbs_5400 5400 17442
    self.upLoadDocument(colid, coldir, docid, sNote='NLE workflow;table row done')
    ## apply HTR
    ## how to deal with specific dictionaries?
    ## here need to know the ontology and the template
    nbPages = 1
    jobid = self.applyHTR(colid, docid, nbPages, self.sHTRmodel, self.sDictName)
    bWait = True
    traceln("waiting for job %s" % jobid)
    while bWait:
        dInfo = self.myTrKCient.getJobStatus(jobid)
        bWait = dInfo['state'] not in ['FINISHED', 'FAILED', 'CANCELED']
    # download where???
    # python ../../../TranskribusPyClient/src/TranskribusCommands/Transkribus_downloader.py 5400 --force
    # coldir is not right!! coldir must refer to the parent folder!
    self.downloadCollection(colid, coldir, docid, bNoImg=True, bForce=True)
class lineMiner(Component.Component):
    """
    lineMiner class: a component to mine column-like page layout
    """
    #DEFINE the version, usage and description of this particular component
    usage = ""
    version = "v.01"
    description = "description: line miner "

    #--- INIT -------------------------------------------------------------------------------------------------------------
    def __init__(self):
        """
        Always call first the Component constructor.
        """
        Component.Component.__init__(self, "lineMiner", self.usage, self.version, self.description)
        # TH for comparing numerical features for X
        self.THNUMERICAL = 20
        # use for evaluation
        self.THCOMP = 10
        self.evalData = None
        # True when a pattern is supplied via setParams (manual mode)
        self.bManual = False

    def setParams(self, dParams):
        """
        Always call first the Component setParams
        Here, we set our internal attribute according to a possibly specified value
        (otherwise it stays at its default value)
        """
        Component.Component.setParams(self, dParams)
        if "pattern" in dParams:
            # NOTE(review): eval() of a user-supplied string -- only safe for
            # trusted command-line input; consider ast.literal_eval.
            self.manualPattern = eval(dParams["pattern"])
            self.bManual = True

    def mainLineMining(self, lPages):
        """
        mine with incremental length
        """
        import util.TwoDNeighbourhood as TwoDRel

        lVEdge = []
        lLElts = [[] for i in range(0, len(lPages))]
        # pass 1: collect text elements per page and link vertical neighbours
        for i, page in enumerate(lPages):
            lElts = page.getAllNamedObjects(XMLDSTEXTClass)
            for e in lElts:
                e.lnext = []
            ## filter elements!!!
            # lElts = filter(lambda x:min(x.getHeight(),x.getWidth()) > 10,lElts)
            # lElts = filter(lambda x:x.getHeight() > 10,lElts)
            # lElts = list(filter(lambda x:x.getX() > 60,lElts))
            lElts.sort(key=lambda x: x.getY())
            lLElts[i] = lElts
            lVEdge = TwoDRel.findVerticalNeighborEdges(lElts)
            for a, b in lVEdge:
                a.lnext.append(b)
        # pass 2: per page, compute features then mine with growing length
        for i, page, in enumerate(lPages):
            lElts = lLElts[i]
            for elt in lElts:
                ### need of more abstract features: justified, center, left, right + numerical
                # NOTE(review): '2' looks like a typo for 'x2'
                # (cf. mineLineFeatureBasic) -- confirm
                elt.setFeatureFunction(elt.getSetOfListedAttributes, self.THNUMERICAL, lFeatureList=['x', '2', 'xc', 'text'], myLevel=XMLDSTEXTClass)
                elt.computeSetofFeatures()
            seqGen = sequenceMiner()
            seqGen.setMaxSequenceLength(1)
            seqGen.setSDC(0.7)  # related to noise level AND STRUCTURES (if many columns)
            _ = seqGen.featureGeneration(lElts, 2)
            seqGen.setObjectLevel(XMLDSTEXTClass)
            # for registration: needs to be replaced by ._lRegValues
            print("sequence of elements and their features:")
            for elt in lElts:
                elt.lFeatureForParsing = elt.getSetofFeatures()
                # print (elt, elt.lFeatureForParsing)
            lTerminalTemplates = []
            lCurList = lElts[:]
            for iLen in range(1, 3):
                lCurList, lTerminalTemplates = self.mineLineFeature(seqGen, lCurList, lTerminalTemplates, iLen)
            del seqGen

    def mineLineFeature(self, seqGen, lCurList, lTerminalTemplates, iLen):
        """
        get a set of lines and mine them (one pass, sequences of length iLen)

        :return: (new current list, terminal templates) for the next pass
        """
        seqGen.setMinSequenceLength(iLen)
        seqGen.setMaxSequenceLength(iLen)
        print('***' * 20, iLen)
        seqGen.bDebug = False
        # (re)compute features; parsed (virtual) nodes get the 'virtual' feature
        for elt in lCurList:
            if elt.getSetofFeatures() is None:
                elt.resetFeatures()
                elt.setFeatureFunction(elt.getSetOfListedAttributes, self.THNUMERICAL, lFeatureList=['virtual'], myLevel=XMLDSTEXTClass)
                elt.computeSetofFeatures()
                elt.lFeatureForParsing = elt.getSetofFeatures()
            else:
                elt.setSequenceOfFeatures(elt.lFeatureForParsing)
                # print elt, elt.getSetofFeatures()
        lSortedFeatures = seqGen.featureGeneration(lCurList, 2)
        for cf in lSortedFeatures:
            cf.setWeight(len((cf.getNodes())))
        # for x in lCurList:
        #     print( x, x.getCanonicalFeatures())
        lmaxSequence = seqGen.generateItemsets(lCurList)
        seqGen.bDebug = False
        lSeq, _ = seqGen.generateMSPSData(lmaxSequence, lSortedFeatures + lTerminalTemplates, mis=0.01)
        lPatterns = seqGen.miningSequencePrefixScan(lSeq)
        # (commented-out PySpark PrefixSpan alternative omitted -- it built
        # lPatterns from model.freqSequences(); see VCS history)
        # lPatterns = seqGen.beginMiningSequences(lSeq,lSortedFeatures,lMIS)
        if lPatterns is None:
            return lCurList, lTerminalTemplates
        lPatterns.sort(key=lambda xy: xy[1], reverse=True)
        print("List of patterns and their support:")
        for p, support in lPatterns:
            if support >= 1:
                print(p, support)
        seqGen.THRULES = 0.95
        lSeqRules = seqGen.generateSequentialRules(lPatterns)
        " here store features which are redundant and consider only the core feature"
        _, dCP = self.getPatternGraph(lSeqRules)
        dTemplatesCnd = self.analyzeListOfPatterns(lPatterns, dCP)
        lFullTemplates, lTerminalTemplates, tranprob = seqGen.testTreeKleeneageTemplates(dTemplatesCnd, lCurList)
        ## here we have a graph; second : is it useful here to correct noise??
        ## allows for selecting templates ?
        self.selectFinalTemplates(lTerminalTemplates, tranprob, lCurList)
        ## store parsed sequences in mytemplate
        ### patterns are competing: generate a set of parsing ???
        for mytemplate in lFullTemplates[:1]:
            # dTemplatesCnd.keys():
            # for _,_, mytemplate in dTemplatesCnd[templateType][:1]:
            mytemplate.print_()
            isKleenePlus, _, lCurList = seqGen.parseWithTreeTemplate(mytemplate, lCurList, bReplace=True)
            for elt in lCurList:
                if elt.getSetofFeatures() is None:
                    elt.resetFeatures()
                    elt.setFeatureFunction(elt.getSetOfListedAttributes, self.THNUMERICAL, lFeatureList=['virtual'], myLevel=XMLDSTEXTClass)
                    elt.computeSetofFeatures()
                    elt.lFeatureForParsing = elt.getSetofFeatures()
                else:
                    elt.setSequenceOfFeatures(elt.lFeatureForParsing)
            # self.printTreeView(lCurList)
        print("final hierarchy")
        self.printTreeView(lCurList)
        # lRegions= self.getRegionsFromStructure(page,lCurList)
        # store all iteration
        # lPageRegions.append((page,lRegions,lCurList))
        return lCurList, lTerminalTemplates

    def mineLineFeatureBasic(self, lPages):
        """
        get a set of lines and mine them
        """
        import util.TwoDNeighbourhood as TwoDRel

        lPageRegions = []
        lVEdge = []
        lLElts = [[] for i in range(0, len(lPages))]
        for i, page in enumerate(lPages):
            lElts = page.getAllNamedObjects(XMLDSTEXTClass)
            for e in lElts:
                e.lnext = []
            ## filter elements!!!
            lElts = list(filter(lambda x: min(x.getHeight(), x.getWidth()) > 10, lElts))
            # lElts = filter(lambda x:x.getHeight() > 10,lElts)
            # lElts = list(filter(lambda x:x.getX() > 100,lElts))
            lElts.sort(key=lambda x: x.getY())
            lLElts[i] = lElts
            lVEdge = TwoDRel.findVerticalNeighborEdges(lElts)
            for a, b in lVEdge:
                a.lnext.append(b)
        for i, page, in enumerate(lPages):
            lElts = lLElts[i]
            for elt in lElts:
                elt.setFeatureFunction(elt.getSetOfListedAttributes, self.THNUMERICAL, lFeatureList=['x', 'x2', 'xc', 'text'], myLevel=XMLDSTEXTClass)
                elt.computeSetofFeatures()
                # print elt.getSetofFeatures()
            seqGen = sequenceMiner()
            seqGen.setMaxSequenceLength(1)
            seqGen.setSDC(0.5)  # related to noise level AND STRUCTURES (if many columns)
            lSortedFeatures = seqGen.featureGeneration(lElts, 2)
            seqGen.setObjectLevel(XMLDSTEXTClass)
            # for registration: needs to be replaced by ._lRegValues
            print("sequence of elements and their features:")
            for elt in lElts:
                elt.lFeatureForParsing = elt.getSetofFeatures()
                print(elt, elt.lFeatureForParsing)
            icpt = 0
            lTerminalTemplates = []
            lCurList = lElts
            # print len(lCurList)
            while icpt < 2:
                # seqGen.bDebug = False
                ## generate sequences
                if icpt > 0:
                    seqGen.setMinSequenceLength(2)
                    seqGen.setMaxSequenceLength(2)
                print('***' * 20, icpt)
                seqGen.bDebug = False
                for elt in lCurList:
                    if elt.getSetofFeatures() is None:
                        elt.resetFeatures()
                        elt.setFeatureFunction(elt.getSetOfListedAttributes, self.THNUMERICAL, lFeatureList=['virtual'], myLevel=XMLDSTEXTClass)
                        elt.computeSetofFeatures()
                        elt.lFeatureForParsing = elt.getSetofFeatures()
                    else:
                        elt.setSequenceOfFeatures(elt.lFeatureForParsing)
                lSortedFeatures = seqGen.featureGeneration(lCurList, 2)
                lmaxSequence = seqGen.generateItemsets(lCurList)
                seqGen.bDebug = False
                lSeq, lMIS = seqGen.generateMSPSData(lmaxSequence, lSortedFeatures + lTerminalTemplates, mis=0.2)
                lPatterns = seqGen.miningSequencePrefixScan(lSeq)
                if lPatterns is None:
                    return None
                # lPatterns = seqGen.beginMiningSequences(lSeq,lSortedFeatures,lMIS)
                lPatterns.sort(key=lambda xy: xy[1], reverse=True)
                print("List of patterns and their support:")
                for p, support in lPatterns:
                    if support >= 1:
                        print(p, support)
                seqGen.THRULES = 0.95
                lSeqRules = seqGen.generateSequentialRules(lPatterns)
                " here store features which are redundant and consider only the core feature"
                _, dCP = self.getPatternGraph(lSeqRules)
                # NOTE(review): analyzeListOfPatterns is defined with two
                # positional args (lPatterns, dCA) but called here with a
                # third (icpt) -- this would raise TypeError; confirm the
                # intended signature.
                dTemplatesCnd = self.analyzeListOfPatterns(lPatterns, dCP, icpt)
                lFullTemplates, lTerminalTemplates, tranprob = seqGen.testTreeKleeneageTemplates(dTemplatesCnd, lCurList)
                # lFullTemplates = seqGen.testTreeKleeneageTemplates(dTemplatesCnd, lCurList)
                # print tranprob
                # print lTerminalTemplates
                ## here we have a graph; second : is it useful here to correct noise??
                ## allows for selecting templates ?
                self.selectFinalTemplates(lTerminalTemplates, tranprob, lCurList)
                ## store parsed sequences in mytemplate
                ### patterns are competing: generate a set of parsing ???
                for mytemplate in lFullTemplates[:10]:
                    # dTemplatesCnd.keys():
                    # for _,_, mytemplate in dTemplatesCnd[templateType][:1]:
                    mytemplate.print_()
                    isKleenePlus, _, lCurList = seqGen.parseWithTreeTemplate(mytemplate, lCurList, bReplace=True)
                    for elt in lCurList:
                        if elt.getSetofFeatures() is None:
                            elt.resetFeatures()
                            elt.setFeatureFunction(elt.getSetOfListedAttributes, self.THNUMERICAL, lFeatureList=['virtual'], myLevel=XMLDSTEXTClass)
                            elt.computeSetofFeatures()
                            elt.lFeatureForParsing = elt.getSetofFeatures()
                        else:
                            elt.setSequenceOfFeatures(elt.lFeatureForParsing)
                    # self.printTreeView(lCurList)
                icpt += 1
                # print 'curList:',lCurList
                # print len(lCurList)
            print("final hierarchy")
            self.printTreeView(lCurList)
            # lRegions= self.getRegionsFromStructure(page,lCurList)
            # store all iteration
            # lPageRegions.append((page,lRegions,lCurList))
        return lPageRegions

    def selectFinalTemplates(self, lTemplates, transProb, lElts):
        """
        apply viterbi to select best sequence of templates
        """
        import spm.viterbi as viterbi

        if lTemplates == []:
            return None

        def buildObs(lTemplates, lElts):
            """
            build observation prob matrix: one row per template (+1 for
            'no template'), one column per element
            """
            N = len(lTemplates) + 1
            obs = np.zeros((N, len(lElts)), dtype=np.float16) + 10e-3
            for i, temp in enumerate(lTemplates):
                for j, elt in enumerate(lElts):
                    # how to deal with virtual nodes
                    try:
                        _, _, score = temp.registration(elt)
                    except:
                        score = 1
                    if score == -1:
                        score = 0.0
                    obs[i, j] = score
                    if np.isinf(obs[i, j]):
                        obs[i, j] = 64000
                    if np.isnan(obs[i, j]):
                        obs[i, j] = 0.0
                    # print i,j,elt,elt.lX, temp,score
            #add no-template:-1
            return obs / np.amax(obs)

        N = len(lTemplates) + 1
        initialProb = np.ones(N)
        initialProb = np.reshape(initialProb, (N, 1))
        obs = buildObs(lTemplates, lElts)
        np.set_printoptions(precision=3, linewidth=1000)
        # print "transProb"
        # print transProb
        # print obs
        d = viterbi.Decoder(initialProb, transProb, obs)
        states, score = d.Decode(np.arange(len(lElts)))
        # add empty template (last one in state)
        lTemplates.append(None)
        print(states, score)
        #assign to each elt the template assigned by viterbi
        for i, elt, in enumerate(lElts):
            # try: print elt,elt.lX, lTemplates[states[i]]
            # except: print elt, elt.lX, 'no template'
            mytemplate = lTemplates[states[i]]
            elt.resetTemplate()
            if mytemplate is not None:
                elt.addTemplate(mytemplate)
                try:
                    registeredPoints, lMissing, score = mytemplate.registration(elt)
                except:
                    registeredPoints = None
                if registeredPoints:
                    # print registeredPoints, lMissing , score
                    if lMissing != []:
                        registeredPoints.extend(zip(lMissing, lMissing))
                    registeredPoints.sort(key=lambda xy: xy[1].getValue())
                    lcuts = map(lambda refcut: refcut[1], registeredPoints)
                    ## store features for the final parsing!!!
                    # print elt, lcuts
                    # elt.addVSeparator(mytemplate,lcuts)
        # return the new list with kleenePlus elts for next iteration
        ## reparse ?? YES using the featureSet given by viterbi
        ##   -> create an objectClass per kleenePlus element: objects: sub tree
        # print lTemplates[0]
        # self.parseWithTemplate(lTemplates[0], lElts)
        # elt = template
        return score

    def createItemSetFromNext(self, lElts, iLen):
        """
        create itemset of length iLen using .lnext structures
        (NOTE(review): not implemented -- body is empty)
        """

    def getKleenePlusFeatures(self, lElts):
        """
        select KleenePlus elements based on .next (only possible for unigrams)
        """
        dFreqFeatures = {}
        dKleenePlusFeatures = {}
        lKleenePlus = []
        for elt in lElts:
            for fea in elt.getSetofFeatures():
                try: dFreqFeatures[fea] += 1
                except KeyError: dFreqFeatures[fea] = 1
                for nextE in elt.next:
                    if fea in nextE.getSetofFeatures():
                        try: dKleenePlusFeatures[fea].append((elt, nextE))
                        except KeyError: dKleenePlusFeatures[fea] = [(elt, nextE)]
        # keep features shared with the next element at least half the time
        for fea in dFreqFeatures:
            try:
                dKleenePlusFeatures[fea]
                if len(dKleenePlusFeatures[fea]) >= 0.5 * dFreqFeatures[fea]:
                    lKleenePlus.append(fea)
            except:
                pass
        return lKleenePlus

    def computePatternScore(self, pattern):
        """
        consider the frequency of the pattern and the weights of the features
        """
        fScore = 0
        #terminal
        if not isinstance(pattern, list):
            fScore += pattern.getCanonical().getWeight()
        else:
            for child in pattern:
                fScore += self.computePatternScore(child)
        # print 'score:',pattern ,fScore
        return fScore

    def analyzeListOfPatterns(self, lPatterns, dCA,):
        """
        select patterns with no ancestor
        other criteria ? if many with similar frequency: sort using
        computePatternScore?
        """
        # reorder lPatterns considering feature weights and # of elements
        # (for equally frequent patterns)
        # lPatterns.sort(key=lambda (x,y):self.computePatternScore(x),reverse=True)
        # for x,y in lPatterns:
        #     print x,y,self.computePatternScore(x)
        dTemplatesTypes = {}
        for pattern, support in filter(lambda xy: xy[1] > 1, lPatterns):
            # skip patterns that have an ancestor (already covered)
            try:
                dCA[str(pattern)]
                bSkip = True
            except KeyError: bSkip = False
            # if step > 0 and len(pattern) == 1:
            #     bSkip=True
            if not bSkip:
                template = treeTemplateClass()
                template.setPattern(pattern)
                template.buildTreeFromPattern(pattern)
                template.setType('lineTemplate')
                try: dTemplatesTypes[template.__class__.__name__].append((pattern, support, template))
                except KeyError: dTemplatesTypes[template.__class__.__name__] = [(pattern, support, template)]
        return dTemplatesTypes

    def processWithTemplate(self, lPattern, lPages):
        """
        apply a known pattern
        """
        def convertStringtoPattern(xcur):
            """convert a (possibly nested) list of strings into featureObjects"""
            ## need to integrate the 'virtual level'
            lRes = []
            for elt in xcur:
                if isinstance(elt, list):
                    lRes.extend([convertStringtoPattern(elt)])
                else:
                    # numeric string -> numerical X feature; else edit-distance text feature
                    try:
                        float(elt)
                        f = featureObject()
                        f.setName("x")
                        f.setType(featureObject.NUMERICAL)
                        f.setValue(elt)
                        f.setObjectName(elt)
                        f.setWeight(1)
                        f.setTH(self.THNUMERICAL)
                    except:
                        f = featureObject()
                        f.setName("f")
                        f.setType(featureObject.EDITDISTANCE)
                        f.setValue(elt)
                        f.setTH(100.0)
                    lRes.append(f)
            return lRes

        lfPattern = convertStringtoPattern(lPattern)
        # print lfPattern
        # create a template from lfPattern!
        mytemplate = treeTemplateClass()
        # mytemplate.setPattern(lfPattern)
        mytemplate.buildTreeFromPattern(lfPattern)
        mytemplate.print_()
        for page in lPages:
            seqGen = sequenceMiner()
            seqGen.setObjectLevel(XMLDSTEXTClass)
            lElts = page.getAllNamedObjects(XMLDSTEXTClass)
            for elt in lElts:
                elt.setFeatureFunction(elt.getSetOfListedAttributes, self.THNUMERICAL, lFeatureList=['x', 'x2', 'text'], myLevel=XMLDSTEXTClass)
                elt.computeSetofFeatures()
                elt.lFeatureForParsing = elt.getSetofFeatures()
            lParsing, lNewSeq = seqGen.parseWithTreeTemplate(mytemplate, lElts, bReplace=True)
            del seqGen
            self.printTreeView(lNewSeq)
            # process lNewSeq: create the output data structure?

    def printTreeView(self, lElts, level=0):
        """
        print the parsed hierarchy, indenting by nesting level
        move to structuralMining?
        """
        for elt in lElts:
            if elt.getAttribute('virtual'):
                print(" " * level, 'Node', elt.getAttribute('virtual'))
                self.printTreeView(elt.getObjects(), level + 1)
            else:
                print(" " * level, elt.getContent())
                # try:print (" "*level, elt, elt.getContent())
                # except: print (elt._content[:3])

    def getPatternGraph(self, lRules):
        """
        create a graph which links expanded patterns
        (a) -> (ab)
        (abc) -> (abcd)

        rule = (newPattern,item,i,pattern, fConfidence)
        RULE: [['x=19.0', 'x=48.0', 'x=345.0'], ['x=19.0', 'x=126.0', 'x=345.0']] => 'x=464.0'[0] (22.0/19.0 = 0.863636363636)

        can be used for tagging; go up until no parent
        """
        dParentChild = {}
        dChildParent = {}
        for lhs, rhs, itemsetIndex, fullpattern, fConfidence in lRules:
            try: dParentChild[str(fullpattern)].append(lhs)
            except KeyError: dParentChild[str(fullpattern)] = [lhs]
            try: dChildParent[str(lhs)].append(fullpattern)
            except KeyError: dChildParent[str(lhs)] = [fullpattern]
        # # for bigram: extend to grammy
        # for child in dChildParent.keys():
        #     ltmp=[]
        #     if len(eval(child)) == 2:
        #         for parent in dChildParent[child]:
        #             try:
        #                 ltmp.extend(dChildParent[str(parent)])
        #             except KeyError:pass
        #         dChildParent[child].extend(ltmp)
        return dParentChild, dChildParent

    def generateTestOutput(self, lPages):
        """
        create a run XML file (DOCUMENT/PAGE/SeparatorRegion) from the
        vertical separators found on each page
        """
        root = etree.Element('DOCUMENT')
        self.evalData = etree.ElementTree(root)
        for page in lPages:
            domp = etree.Element('PAGE')
            domp.set('number', page.getAttribute('number'))
            root.append(domp)
            for sep in page.lVSeparator:
                # print( page.lVSeparator)
                domsep = etree.Element('SeparatorRegion')
                domp.append(domsep)
                domsep.set('x', str(sep[0]))
        return self.evalData

    def getRegionsFromStructure(self, page, lTree):
        """
        compute the bounding box [x, y, x2, y2, sub-zones] of a parsed
        hierarchy; recurses into virtual nodes
        """
        lZone = []
        # sentinel extremes for the running bounding box
        srx, sry = 9e9, 9e9
        srx2, sry2 = 0, 0
        lSubZone = []
        for elt in lTree:
            if elt.getAttribute('virtual'):
                rx, ry, rx2, ry2, lsub = self.getRegionsFromStructure(page, elt.getObjects())
                srx = min(srx, rx)
                sry = min(sry, ry)
                srx2 = max(srx2, rx2)
                sry2 = max(sry2, ry2)
                lSubZone.append([rx, ry, rx2, ry2, lsub])
            else:
                lZone.append(elt)
        # get BB of the zone
        fMinX, fMinY = srx, sry
        fMaxX, fMaxY = srx2, sry2
        for e in lZone:
            if e.getX2() > fMaxX: fMaxX = e.getX2()
            if e.getY2() > fMaxY: fMaxY = e.getY2()
            if e.getX() < fMinX: fMinX = e.getX()
            if e.getY() < fMinY: fMinY = e.getY()
        # has substructure
        if srx != 9e9:
            return [fMinX, fMinY, fMaxX, fMaxY, lSubZone]
        else:
            return [fMinX, fMinY, fMaxX, fMaxY, []]

    def tagDom(self, page, region):
        """
        tag page with region (x,y,x2,y2); recurses into sub-regions
        """
        fMinX, fMinY, fMaxX, fMaxY, ltail = region
        # new region node
        regionNode = etree.Element('REGION')
        page.getNode().append(regionNode)
        regionNode.set('x', str(fMinX))
        regionNode.set('y', str(fMinY))
        regionNode.set('height', str(fMaxY - fMinY))
        regionNode.set('width', str(fMaxX - fMinX))
        print()
        print(region)
        print(regionNode)
        [self.tagDom(page, tail) for tail in ltail]
        return regionNode

    #--- RUN ---------------------------------------------------------------------------------------------------------------
    def run(self):
        """
        take a set of line in a page and mine it
        """
        # use the lite version
        self.ODoc = XMLDSDocument()
        self.ODoc.loadFromDom(self.doc, listPages=range(self.firstPage, self.lastPage + 1))
        self.lPages = self.ODoc.getPages()
        if self.bManual:
            self.processWithTemplate(self.manualPattern, self.lPages)
        else:
            self.mainLineMining(self.lPages)
            # lRes = self.mineLineFeature(self.lPages)
            # print lRes
            # returns the hierarchical set of elements (a list)
            # for page , region, tree in lRes:
            #     self.tagDom(page, region)
            # return
        self.addTagProcessToMetadata(self.doc)
        return self.doc

    #--- TESTS -------------------------------------------------------------------------------------------------------------
    #
    # Here we have the code used to test this component on a prepared testset (see under <ROOT>/test/common)
    # Do: python ../../src/common/TypicalComponent.py --test REF_TypicalComponent/
    #
    def testeval(self, srefData, srunData, bVisual):
        """
        Test found reftemplate and reftemplate

        Greedy matching of run separators against reference separators per
        page; returns (#ok, #error, #missed, detailed tuples).
        """
        cntOk = cntErr = cntMissed = 0
        RefData = etree.XML(srefData.strip("\n").encode('utf-8'))
        RunData = etree.XML(srunData.strip("\n").encode('utf-8'))
        lRun = []
        if RunData is not None:
            lpages = RunData.xpath('//%s' % ('PAGE'))
            for page in lpages:
                if page.get('reftemplate'):
                    # NOTE(review): eval() of attribute text -- trusted test data only
                    lRun.append(eval(page.get('reftemplate')))
        lRef = []
        lPages = RefData.xpath('//%s' % ('PAGE'))
        for page in lPages:
            if page.get('reftemplate'):
                lRef.append(eval(page.get('reftemplate')))
            else:
                lRef.append([])
        ltisRefsRunbErrbMiss = list()
        for i in range(0, len(lRef)):
            lRefCovered = []
            if lRun[i] == []:
                runLen = 0
            else:
                posrun = lRun[i][0]
                runLen = len(lRun[i][posrun + 1])
            if lRef[i] == []:
                refLen = 0
                refElt = None
                posref = None
            else:
                posref = lRef[i][0]
                refLen = len(lRef[i][posref + 1])
            curRun = curRef = 0
            while curRun <= runLen - 1:  # or curRef <= refLen -1:
                bErr, bMiss = False, False
                try:
                    runElt = lRun[i][posrun + 1][curRun]
                except IndexError:
                    runElt = None
                # print '___',curRun,runElt
                curRef = 0
                bFound = False
                while not bFound and curRef <= refLen - 1:
                    try:
                        refElt = lRef[i][posref + 1][curRef]
                    except IndexError:
                        refElt = None
                    # self.compareString(runElt,runElt)
                    if runElt and refElt not in lRefCovered and self.testComparePageVertical(runElt, refElt):
                        bFound = True
                        lRefCovered.append(refElt)
                        resRef = refElt
                    else:
                        curRef += 1
                if bFound:
                    if bVisual: print("FOUND:", runElt, ' -- ', lRefCovered[-1])
                    cntOk += 1
                    curRun += 1
                else:
                    resRef = ''
                    curRun += 1
                    cntErr += 1
                    bErr = True
                    # bMiss = True
                    if bVisual: print("ERROR:", runElt)
                ltisRefsRunbErrbMiss.append((i, resRef, runElt, bErr, bMiss))
            if posref is not None:
                # add missed elements!
                for ref in lRef[i][posref + 1]:
                    if ref not in lRefCovered:
                        ltisRefsRunbErrbMiss.append((i, ref, '', False, True))
                        # print 'missed', len(lRef[i][posref+1]) , len(lRefCovered), lRef[i][posref+1], lRefCovered
                        cntMissed += 1  #(len(lRef[i][posref+1]) - len(lRefCovered))
        ltisRefsRunbErrbMiss.sort(key=lambda x_y_z_t_u: x_y_z_t_u[0])
        return (cntOk, cntErr, cntMissed, ltisRefsRunbErrbMiss)

    def testRun(self, filename, outFile=None):
        """
        testRun is responsible for running the component on this file and
        returning a string that reflects the result in a way that is
        understandable to a human and to a program. Nicely serialized Python
        data or XML is fine
        """
        doc = self.loadDom(filename)
        self.doc = doc
        self.run()
        # doc.freeDoc()
        self.generateTestOutput(self.lPages)
        if outFile:
            self.writeDom(doc)
        return etree.tostring(self.evalData, encoding='unicode')

    def testCompare(self, srefData, srunData, bVisual=False):
        """
        Our comparison is very simple: same or different. N
        We anyway return this in term of precision/recall
        If we want to compute the error differently, we must define out own
        testInit testRecord, testReport
        """
        dicTestByTask = dict()
        dicTestByTask['EVAL'] = self.testeval(srefData, srunData, bVisual)
        return dicTestByTask
class columnDetection(Component.Component):
    """
    Detect table columns in a DS (XMLDSDocument) document.

    Entry point is run(); --test support goes through testRun()/testCompare(),
    and createRef()/createRefPerPage() build reference files from the
    ground-truth table cells of the input.
    """
    usage = ""
    version = "v.01"
    description = "description: column Detection"

    #--- INIT -------------------------------------------------------------------------------------------------------------
    def __init__(self):
        """
        Always call first the Component constructor.
        """
        Component.Component.__init__(self, "columnDetection", self.usage, self.version, self.description)

        self.colname = None   # collection name (currently unused)
        self.docid = None     # document id, used by the DS conversion
        self.do2DS = False    # convert PageXml input to DS first (and back on output)

        # for --test
        self.bCreateRef = False  # only build the reference file and stop
        self.evalData = None     # XML tree holding the evaluation data

    def setParams(self, dParams):
        """
        Always call first the Component setParams.
        Set our internal attributes according to possibly specified values
        (otherwise they keep their default value).
        """
        Component.Component.setParams(self, dParams)
        if "docid" in dParams:
            self.docid = dParams["docid"]
        if "dsconv" in dParams:
            self.do2DS = dParams["dsconv"]
        if "createref" in dParams:
            self.bCreateRef = dParams["createref"]

    # NOTE: a large commented-out createCells() implementation (BIESO re-cutting
    # of cells) was removed here as dead code; retrieve it from version control
    # if ever needed again.

    def createTable(self, page):
        """
        BB of all elements?  (not implemented)
        """

    def processPage(self, page):
        """
        Cluster the page's text objects by X position (a text belongs to a
        cluster if its [x, x+20] segment overlaps it) and trace each cluster's
        size. Column extraction proper is still to be done.
        """
        from util.XYcut import mergeSegments
        ### shrinking to be done:
        lCuts, x1, x2 = mergeSegments(
            [(x.getX(), x.getX() + 20, x) for x in page.getAllNamedObjects(XMLDSTEXTClass)],
            0)
        for x, y, cut in lCuts:
            ll = list(cut)
            ll.sort(key=lambda elt: elt.getY())
            traceln(len(ll))
            # traceln (list(map(lambda x:x.getContent(),ll)))

    def findColumnsInDoc(self, ODoc):
        """
        find columns for each page/table in ODoc
        """
        self.lPages = ODoc.getPages()
        # not always?
        # self.mergeLineAndCells(self.lPages)
        for page in self.lPages:
            traceln("page: %d" % page.getNumber())
            self.processPage(page)

    def run(self, doc):
        """
        Load the DOM, detect columns, and return the document (or None when
        the result was written back as multi-page PageXml).
        When bCreateRef is set, only the reference tree is built and returned.
        """
        # convert to DS if needed
        if self.bCreateRef:
            if self.do2DS:
                dsconv = primaAnalysis()
                doc = dsconv.convert2DS(doc, self.docid)
            # BUGFIX: an unreachable createRefPerPage() call used to follow
            # this return; it was dead code and has been removed.
            return self.createRef(doc)

        if self.do2DS:
            dsconv = primaAnalysis()
            self.doc = dsconv.convert2DS(doc, self.docid)
        else:
            self.doc = doc
        self.ODoc = XMLDSDocument()
        self.ODoc.loadFromDom(self.doc, listPages=range(self.firstPage, self.lastPage + 1))
        # self.ODoc.loadFromDom(self.doc,listPages = range(30,31))
        self.findColumnsInDoc(self.ODoc)
        refdoc = self.createRef(self.doc)  # kept for parity with the test path
        # print refdoc.serialize('utf-8', 1)

        if self.do2DS:
            # back to PageXml
            conv = DS2PageXMLConvertor()
            lPageXDoc = conv.run(self.doc)
            conv.storeMultiPageXml(lPageXDoc, self.getOutputFileName())
            # print self.getOutputFileName()
            return None
        return self.doc

    ################ TEST ##################
    def testRun(self, filename, outFile=None):
        """
        evaluate using the ABP new table dataset with tablecell ground truth
        """
        self.evalData = None
        doc = self.loadDom(filename)
        doc = self.run(doc)
        self.evalData = self.createRef(doc)
        if outFile:
            self.writeDom(doc)
        return etree.tostring(self.evalData, encoding='unicode', pretty_print=True)

    def overlapX(self, zone):
        """True if self and zone overlap horizontally."""
        [a1, a2] = self.getX(), self.getX() + self.getWidth()
        [b1, b2] = zone.getX(), zone.getX() + zone.getWidth()
        return min(a2, b2) >= max(a1, b1)

    def overlapY(self, zone):
        """True if self and zone overlap vertically."""
        [a1, a2] = self.getY(), self.getY() + self.getHeight()
        [b1, b2] = zone.getY(), zone.getY() + zone.getHeight()
        return min(a2, b2) >= max(a1, b1)

    def signedRatioOverlap(self, z1, z2):
        """
        overlap z1 and z2: return the fraction of z1's surface covered by z2.
        NOTE(review): the guard calls self.overlapX/Y(z2), i.e. it reads
        self's geometry, not z1's - looks copy-pasted from a zone class;
        confirm before relying on it.
        """
        [x1, y1, h1, w1] = z1.getX(), z1.getY(), z1.getHeight(), z1.getWidth()
        [x2, y2, h2, w2] = z2.getX(), z2.getY(), z2.getHeight(), z2.getWidth()
        fOverlap = 0.0
        if self.overlapX(z2) and self.overlapY(z2):
            [x11, y11, x12, y12] = [x1, y1, x1 + w1, y1 + h1]
            [x21, y21, x22, y22] = [x2, y2, x2 + w2, y2 + h2]
            s1 = w1 * h1
            # possible ?
            if s1 == 0:
                s1 = 1.0
            # intersection
            nx1 = max(x11, x21)
            nx2 = min(x12, x22)
            ny1 = max(y11, y21)
            ny2 = min(y12, y22)
            h = abs(nx2 - nx1)
            w = abs(ny2 - ny1)
            inter = h * w
            if inter > 0:
                fOverlap = inter / s1
            else:
                # if overX and Y this is not possible !
                fOverlap = 0.0
        return fOverlap

    def findSignificantOverlap(self, TOverlap, ref, run):
        """
        True if ref and run are on the same page and their rows overlap by at
        least TOverlap. ref/run are (pageNumber, rowObject) pairs.
        """
        pref, rowref = ref
        prun, rowrun = run
        if pref != prun:
            return False
        return rowref.ratioOverlap(rowrun) >= TOverlap

    def testCPOUM(self, TOverlap, srefData, srunData, bVisual=False):
        """
        TOverlap: threshold used for comparing two surfaces.
        Correct Detections: under- and over-segmentation?
        Returns (cntOk, cntErr, cntMissed, ltisRefsRunbErrbMiss).
        """
        cntOk = cntErr = cntMissed = 0
        RefData = etree.XML(srefData.strip("\n").encode('utf-8'))
        RunData = etree.XML(srunData.strip("\n").encode('utf-8'))

        lRun = []
        # BUGFIX: explicit None test - truth-testing an lxml Element is
        # deprecated/ambiguous (and testeval already uses "is not None")
        if RunData is not None:
            lpages = RunData.xpath('//%s' % ('PAGE'))
            for page in lpages:
                pnum = page.get('number')
                # record level!
                lRows = page.xpath(".//%s" % ("ROW"))
                lORows = map(lambda x: XMLDSTABLEROWClass(0, x), lRows)
                for row in lORows:
                    row.fromDom(row._domNode)
                    row.setIndex(row.getAttribute('id'))
                    lRun.append((pnum, row))
        print(lRun)

        lRef = []
        lPages = RefData.xpath('//%s' % ('PAGE'))
        for page in lPages:
            pnum = page.get('number')
            lRows = page.xpath(".//%s" % ("ROW"))
            lORows = map(lambda x: XMLDSTABLEROWClass(0, x), lRows)
            for row in lORows:
                row.fromDom(row._domNode)
                row.setIndex(row.getAttribute('id'))
                lRef.append((pnum, row))

        refLen = len(lRef)
        # bVisual = True
        ltisRefsRunbErrbMiss = list()
        lRefCovered = []
        for i in range(0, len(lRun)):
            iRef = 0
            bFound = False
            bErr, bMiss = False, False
            runElt = lRun[i]
            # print '\t\t===',runElt
            # first uncovered ref row with significant overlap wins
            while not bFound and iRef <= refLen - 1:
                curRef = lRef[iRef]
                if runElt and curRef not in lRefCovered and self.findSignificantOverlap(TOverlap, runElt, curRef):
                    bFound = True
                    lRefCovered.append(curRef)
                iRef += 1
            if bFound:
                if bVisual:
                    print("FOUND:", runElt, ' -- ', lRefCovered[-1])
                cntOk += 1
            else:
                curRef = ''
                cntErr += 1
                bErr = True
                if bVisual:
                    print("ERROR:", runElt)
            if bFound or bErr:
                ltisRefsRunbErrbMiss.append((int(runElt[0]), curRef, runElt, bErr, bMiss))

        # every uncovered reference row is a miss
        for i, curRef in enumerate(lRef):
            if curRef not in lRefCovered:
                if bVisual:
                    print("MISSED:", curRef)
                ltisRefsRunbErrbMiss.append((int(curRef[0]), curRef, '', False, True))
                cntMissed += 1
        ltisRefsRunbErrbMiss.sort(key=lambda xyztu: xyztu[0])

        # print cntOk, cntErr, cntMissed,ltisRefsRunbErrbMiss
        return (cntOk, cntErr, cntMissed, ltisRefsRunbErrbMiss)

    def testCompare(self, srefData, srunData, bVisual=False):
        """
        as in Shahad et al, DAS 2010:
        Correct Detections / Partial Detections / Over-Segmented /
        Under-Segmented / Missed / False Positive
        """
        dicTestByTask = dict()
        dicTestByTask['T50'] = self.testCPOUM(0.50, srefData, srunData, bVisual)
        # dicTestByTask['T75']= self.testCPOUM(0.750,srefData,srunData,bVisual)
        # dicTestByTask['T100']= self.testCPOUM(0.50,srefData,srunData,bVisual)
        # dicTestByTask['FirstName']= self.testFirstNameRecord(srefData, srunData,bVisual)
        # dicTestByTask['Year']= self.testYear(srefData, srunData,bVisual)
        return dicTestByTask

    def createColumnsWithCuts(self, lXCuts, table, tableNode, bTagDoc=False):
        """
        create one COL dom node per interval between consecutive X cuts,
        plus a final COL up to the table's right edge
        """
        # BUGFIX: an empty cut list used to raise NameError on 'index' below
        if not lXCuts:
            return
        prevCut = None
        lXCuts.sort()
        for index, cut in enumerate(lXCuts):
            # the first cut corresponds to the table border: no column before it
            if prevCut is not None:
                colNode = etree.Element("COL")
                tableNode.append(colNode)
                colNode.set('x', str(prevCut))
                colNode.set('width', "{:.2f}".format(cut - prevCut))
                colNode.set('y', str(table.getY()))
                colNode.set('height', str(table.getHeight()))
                colNode.set('id', str(index - 1))
            prevCut = cut

        # last column: from the last cut to the right edge of the table
        cut = table.getX2()
        colNode = etree.Element("COL")
        tableNode.append(colNode)
        colNode.set('x', "{:.2f}".format(prevCut))
        colNode.set('width', "{:.2f}".format(cut - prevCut))
        colNode.set('y', str(table.getY()))
        colNode.set('height', str(table.getHeight()))
        colNode.set('id', str(index))

    def createRef(self, doc):
        """
        create a ref tree (DOCUMENT/PAGE/TABLE/COL) from the xml one,
        using the leftmost cell of each ground-truth column as the cut
        """
        self.ODoc = XMLDSDocument()
        self.ODoc.loadFromDom(doc, listPages=range(self.firstPage, self.lastPage + 1))

        root = etree.Element("DOCUMENT")
        refdoc = etree.ElementTree(root)

        for page in self.ODoc.getPages():
            # imageFilename="..\col\30275\S_Freyung_021_0001.jpg" width="977.52" height="780.0">
            pageNode = etree.Element('PAGE')
            pageNode.set("number", page.getAttribute('number'))
            pageNode.set("pagekey", os.path.basename(page.getAttribute('imageFilename')))
            pageNode.set("width", page.getAttribute('width'))
            pageNode.set("height", page.getAttribute('height'))
            root.append(pageNode)
            lTables = page.getAllNamedObjects(XMLDSTABLEClass)
            for table in lTables:
                dCol = {}
                tableNode = etree.Element('TABLE')
                tableNode.set("x", table.getAttribute('x'))
                tableNode.set("y", table.getAttribute('y'))
                tableNode.set("width", table.getAttribute('width'))
                tableNode.set("height", table.getAttribute('height'))
                pageNode.append(tableNode)
                # group cells by column index
                for cell in table.getAllNamedObjects(XMLDSTABLECELLClass):
                    dCol.setdefault(int(cell.getAttribute("col")), []).append(cell)
                lXcuts = []
                for colid in sorted(dCol.keys()):
                    lXcuts.append(min(list(map(lambda x: x.getX(), dCol[colid]))))
                self.createColumnsWithCuts(lXcuts, table, tableNode)
        return refdoc

    def createRefPerPage(self, doc):
        """
        create ref files from the xml one: for DAS 2018, one ref per page
        (each written to <imageFilename>.ref); returns the last page's tree
        """
        self.ODoc = XMLDSDocument()
        self.ODoc.loadFromDom(doc, listPages=range(self.firstPage, self.lastPage + 1))

        refdoc = None  # BUGFIX: was unbound when the document had no page
        for page in self.ODoc.getPages():
            # BUGFIX: dRows was initialised once outside the loop, so cells
            # accumulated across pages; it must be reset for each page
            dRows = {}
            # imageFilename="..\col\30275\S_Freyung_021_0001.jpg" width="977.52" height="780.0">
            pageNode = etree.Element('PAGE')
            # SINGLE PAGE: each per-page ref claims to be page 1
            pageNode.set("number", '1')
            pageNode.set("imageFilename", page.getAttribute('imageFilename'))
            pageNode.set("width", page.getAttribute('width'))
            pageNode.set("height", page.getAttribute('height'))

            root = etree.Element("DOCUMENT")
            refdoc = etree.ElementTree(root)
            root.append(pageNode)

            lTables = page.getAllNamedObjects(XMLDSTABLEClass)
            for table in lTables:
                tableNode = etree.Element('TABLE')
                tableNode.set("x", table.getAttribute('x'))
                tableNode.set("y", table.getAttribute('y'))
                tableNode.set("width", table.getAttribute('width'))
                tableNode.set("height", table.getAttribute('height'))
                pageNode.append(tableNode)
                # group cells by row index
                for cell in table.getAllNamedObjects(XMLDSTABLECELLClass):
                    dRows.setdefault(int(cell.getAttribute("row")), []).append(cell)
                lYcuts = []
                for rowid in sorted(dRows.keys()):
                    lYcuts.append(min(list(map(lambda x: x.getY(), dRows[rowid]))))
                # NOTE(review): createRowsWithCuts is not defined on this class
                # (only createColumnsWithCuts is); presumably inherited or
                # copy-pasted from the row-detection component - confirm.
                self.createRowsWithCuts(lYcuts, table, tableNode)

            self.outputFileName = os.path.basename(page.getAttribute('imageFilename')[:-3] + 'ref')
            print(self.outputFileName)
            self.writeDom(refdoc, bIndent=True)
        return refdoc
class TableProcessing(Component.Component):
    """
    Table layout analysis pipeline driven by Transkribus: download a
    collection/document, run layout analysis and HTR server-side, convert
    between PageXml/MPXML and DS, and re-upload transcripts.
    NOTE(review): a second, near-identical TableProcessing class follows later
    in this file and shadows this one at import time - confirm which is live.
    """
    usage = ""
    version = "v.01"
    description = "description: table layout analysis based on template"

    sCOL = "col"
    sMPXMLExtension = ".mpxml"

    def __init__(self):
        """
        Always call first the Component constructor.
        """
        Component.Component.__init__(self, "TableProcessing", self.usage, self.version, self.description)

        self.colid = None     # Transkribus collection id
        self.docid = None     # Transkribus document id (None => whole collection)
        self.bFullCol = False  # process the whole collection
        # generate MPXML using Ext
        self.useExtForMPXML = False
        self.bRegenerateMPXML = False

        self.sRowModelName = None
        self.sRowModelDir = None
        self.sHTRmodel = None
        self.sDictName = None

    def setParams(self, dParams):
        """
        Always call first the Component setParams.
        Set our internal attributes according to possibly specified values
        (otherwise they keep their default value).
        """
        Component.Component.setParams(self, dParams)
        if "coldir" in dParams:
            self.coldir = dParams["coldir"]
        if "colid" in dParams:
            self.colid = dParams["colid"]
        # BUGFIX: this used to test "colid" a second time instead of "docid",
        # so docid was set whenever colid was present (and KeyError'd if
        # docid was absent)
        if "docid" in dParams:
            self.docid = dParams["docid"]
        if "useExt" in dParams:
            self.useExtForMPXML = dParams["useExt"]
        if 'regMPXML' in dParams:
            self.bRegenerateMPXML = True
        if "rowmodelname" in dParams:
            self.sRowModelName = dParams["rowmodelname"]
        if "rowmodeldir" in dParams:
            self.sRowModelDir = dParams["rowmodeldir"]
        if "htrmodel" in dParams:
            self.sHTRmodel = dParams["htrmodel"]
        if "dictname" in dParams:
            self.sDictName = dParams["dictname"]

        # Connection to Transkribus
        self.myTrKCient = None
        self.persist = False
        self.loginInfo = False
        # BUGFIX: dict.has_key() was removed in Python 3; use the "in" operator
        if "server" in dParams:
            self.server = dParams["server"]
        if "persist" in dParams:
            self.persist = dParams["persist"]
        if "login" in dParams:
            self.loginInfo = dParams["login"]

    def login(self, trnskrbs_client, trace=None, traceln=None):
        """
        deal with the complicated login variants...
        - trace and traceln are optional print methods
        return True or raises an exception
        """
        DEBUG = True
        bOk = False
        if self.persist:
            # try getting some persistent session token
            if DEBUG and trace:
                trace(" ---login--- Try reusing persistent session ... ")
            # NOTE(review): bare except kept deliberately - any failure here
            # simply falls back to an interactive/stored-credential login
            try:
                bOk = trnskrbs_client.reusePersistentSession()
                if DEBUG and traceln:
                    traceln("OK!")
            except:
                if DEBUG and traceln:
                    traceln("Failed")

        if not bOk:
            if self.loginInfo:
                login, pwd = self.loginInfo, self.pwd
            else:
                if trace:
                    DEBUG and trace(" ---login--- no login provided, looking for stored credentials... ")
                login, pwd = trnskrbs_client.getStoredCredentials(bAsk=False)
                if DEBUG and traceln:
                    traceln("OK")

            if DEBUG and traceln:
                trace(" ---login--- logging onto Transkribus as %s " % login)
            trnskrbs_client.auth_login(login, pwd)
            if DEBUG and traceln:
                traceln("OK")
            bOk = True
        return bOk

    def downloadCollection(self, colid, destDir, docid, bNoImg=True, bForce=False):
        """
        download colid into the current folder (destDir is forced to '.')
        and generate the per-document multi-page xml; returns the list of
        downloaded document ids
        """
        destDir = "."
        # download
        downloader = TranskribusDownloader(self.myTrKCient.getServerUrl(), self.myTrKCient.getProxies())
        downloader.setSessionId(self.myTrKCient.getSessionId())
        traceln("- Downloading collection %s to folder %s" % (colid, os.path.abspath(destDir)))
        col_ts, colDir, ldocids, dFileListPerDoc = downloader.downloadCollection(
            colid, destDir, bForce=bForce, bNoImage=bNoImg, sDocId=docid)
        traceln("- Done")

        with open(os.path.join(colDir, "config.txt"), "w") as fd:
            fd.write("server=%s\nforce=%s\nstrict=%s\n" % (self.server, True, False))

        downloader.generateCollectionMultiPageXml(
            os.path.join(colDir, TableProcessing.sCOL), dFileListPerDoc, False)
        traceln('- Done, see in %s' % colDir)
        return ldocids

    def upLoadDocument(self, colid, coldir, docid, sNote="", sTranscripExt='.mpxml'):
        """
        upload the transcript of docid back into collection colid
        """
        # NOTE(review): uses the module-level sCOL here, not
        # TableProcessing.sCOL - confirm both exist and agree
        uploader = TranskribusDUTranscriptUploader(
            self.myTrKCient.getServerUrl(), self.myTrKCient.getProxies())
        uploader.setSessionId(self.myTrKCient.getSessionId())
        traceln("- uploading document %s to collection %s" % (docid, colid))
        uploader.uploadDocumentTranscript(colid, docid, os.path.join(coldir, sCOL), sNote,
                                          'NLE Table', sTranscripExt, iVerbose=False)
        traceln("- Done")
        return

    def applyLA_URO(self, colid, docid, nbpages):
        """
        apply the CITlab textline finder page by page; returns the job ids
        """
        traceln('process %s pages...' % nbpages)
        lretJobIDs = []
        for i in range(1, nbpages + 1):
            LA = DoLAbatch(self.myTrKCient.getServerUrl(), self.myTrKCient.getProxies())
            LA._trpMng.setSessionId(self.myTrKCient.getSessionId())
            LA.setSessionId(self.myTrKCient.getSessionId())
            _, sPageDesc = LA.buildDescription(colid, "%s/%s" % (docid, i))
            sPageDesc = LA.jsonToXMLDescription(sPageDesc)
            _, lJobIDs = LA.run(colid, sPageDesc, "CITlabAdvancedLaJob", False)
            traceln(lJobIDs)
            lretJobIDs.extend(lJobIDs)
            traceln("- LA running for page %d job:%s" % (i, lJobIDs))
        return lretJobIDs

    def applyHTRForRegions(self, colid, docid, nbpages, modelname, dictionary):
        """
        apply an htr model at region level.
        The body was a byte-for-byte duplicate of applyHTR(); delegate to it.
        """
        return self.applyHTR(colid, docid, nbpages, modelname, dictionary)

    def applyHTR(self, colid, docid, nbpages, modelname, dictionary):
        """
        apply HTR on docid; the htr model id is resolved from modelname
        """
        htrComp = DoHtrRnn(self.myTrKCient.getServerUrl(), self.myTrKCient.getProxies())
        htrComp._trpMng.setSessionId(self.myTrKCient.getSessionId())
        htrComp.setSessionId(self.myTrKCient.getSessionId())
        _, sPageDesc = htrComp.buildDescription(colid, "%s/%s" % (docid, nbpages))

        sPages = "1-%d" % (nbpages)
        sModelID = None
        # resolve model name -> model id among the collection's models
        lColModels = self.myTrKCient.listRnns(colid)
        for model in lColModels:
            if str(model['htrId']) == str(modelname):
                sModelID = model['htrId']
                traceln('model id = %s' % sModelID)
            # some old? models do not have params field
            # try: traceln("%s\t%s\t%s" % (model['htrId'],model['name'],model['params']))
            # except KeyError: traceln("%s\t%s\tno params" % (model['htrId'],model['name']))
        if sModelID is None:
            # BUGFIX: Python-2 raise syntax ("raise Exception, msg") replaced
            raise Exception("no model ID found for %s" % (modelname))
        ret = htrComp.htrRnnDecode(colid, sModelID, dictionary, docid, sPageDesc, bDictTemp=False)
        traceln(ret)
        return ret

    def extractFileNamesFromMPXML(self, mpxmldoc):
        """
        to ensure correct file order!
        duplicated from performCVLLA.py
        NOTE(review): uses the libxml2-era API (getRootElement/prop) while the
        rest of the file uses lxml - confirm which dom flavor reaches here.
        """
        xmlpath = os.path.abspath(os.path.join(self.coldir, sCOL, self.docid))
        lNd = PageXml.getChildByName(mpxmldoc.getRootElement(), 'Page')
        return map(lambda x: "%s%s%s.xml" % (xmlpath, os.sep, x.prop('imageFilename')[:-4]), lNd)

    def processDocument(self, coldir, colid, docid, dom=None):
        """
        process a single document

        1 python ../../src/xml_formats/PageXml.py trnskrbs_5400/col/17442 --ext=pxml
        2 python ../../src/tasks/performCVLLA.py --coldir=trnskrbs_5400/ --docid=17442 -i trnskrbs_5400/col/17442.mpxml --bl --regTL --form
        3 python ../../src/tasks/DU_ABPTable_T.py modelMultiType tableRow2 --run=trnskrbs_5400
        4 python ../../src/xml_formats/Page2DS.py --pattern=trnskrbs_5400/col/17442_du.mpxml -o trnskrbs_5400/xml/17442.ds_xml --docid=17442
        5 python src/IE_test.py -i trnskrbs_5400/xml/17442.ds_xml -o trnskrbs_5400/out/17442.ds_xml
        6 python ../../../TranskribusPyClient/src/TranskribusCommands/TranskribusDU_transcriptUploader.py --nodu trnskrbs_5400 5400 17442
        7 python ../../../TranskribusPyClient/src/TranskribusCommands/do_htrRnn.py <model-name> <dictionary-name> 5400 17442
          wait
        8 python ../../../TranskribusPyClient/src/TranskribusCommands/Transkribus_downloader.py 5400 --force
          # convert to ds
        9 python ../../src/xml_formats/Page2DS.py --pattern=trnskrbs_5400/col/17442.mpxml -o trnskrbs_5400/xml/17442.ds_xml --docid=17442
        10 python src/IE_test.py -i trnskrbs_5400/xml/17442.ds_xml -o trnskrbs_5400/out/17442.ds_xml --doie --usetemplate
        """
        # create Transkribus client
        self.myTrKCient = TranskribusClient(sServerUrl=self.server, proxies={}, loggingLevel=logging.WARN)
        # login
        _ = self.login(self.myTrKCient, trace=trace, traceln=traceln)
        # self.downloadCollection(colid,coldir,docid,bNoImg=False,bForce=True)

        ## load dom
        if dom is None:
            self.inputFileName = os.path.abspath(
                os.path.join(coldir, TableProcessing.sCOL, docid + TableProcessing.sMPXMLExtension))
            mpxml_doc = self.loadDom()
            nbPages = MultiPageXml.getNBPages(mpxml_doc)
        else:
            # load provided mpxml
            mpxml_doc = dom
            nbPages = MultiPageXml.getNBPages(mpxml_doc)

        # ### table registration: need to compute/select??? the template
        # # perform LA separator, table registration, baseline with normalization
        # tableregtool= LAProcessor()
        # tableregtool.coldir = coldir
        # tableregtool.docid = docid
        # tableregtool.bTemplate, tableregtool.bSeparator , tableregtool.bBaseLine , tableregtool.bRegularTextLine = True,False,False,False
        # # creates xml and a new mpxml
        # mpxml_doc,nbPages = tableregtool.performLA(mpxml_doc)
        # self.upLoadDocument(colid, coldir,docid,sNote='NLE workflow;table reg done')

        lJobIDs = self.applyLA_URO(colid, docid, nbPages)
        # NOTE: deliberate early stop for the DAS test-set workflow; everything
        # below is the remainder of the full pipeline, currently disabled.
        return

        bWait = True
        assert lJobIDs != []
        jobid = lJobIDs[-1]
        traceln("waiting for job %s" % jobid)
        while bWait:
            dInfo = self.myTrKCient.getJobStatus(jobid)
            bWait = dInfo['state'] not in ['FINISHED', 'FAILED']

        ## coldir???
        self.downloadCollection(colid, coldir, docid, bNoImg=True, bForce=True)

        ## STOP HERE FOR DAS new testset:
        return

        # tag text for BIES cell
        doer = DU_ABPTable_TypedCRF(self.sRowModelName, self.sRowModelDir)
        doer.load()
        ## needed predict at file level, and do not store dom, but return it
        rowpath = os.path.join(coldir, "col")
        BIESFiles = doer.predict([rowpath], docid)
        BIESDom = self.loadDom(BIESFiles[0])
        # res= BIESDom.saveFormatFileEnc('test.mpxml', "UTF-8",True)

        # MPXML2DS
        dsconv = primaAnalysis()
        DSBIESdoc = dsconv.convert2DS(BIESDom, self.docid)

        # create XMLDOC object
        self.ODoc = XMLDSDocument()
        self.ODoc.loadFromDom(DSBIESdoc)  # ,listPages = range(self.firstPage,self.lastPage+1))

        # create row
        rdc = RowDetection()
        rdc.findRowsInDoc(self.ODoc)

        # DS2MPXML
        DS2MPXML = DS2PageXMLConvertor()
        lPageXml = DS2MPXML.run(self.ODoc.getDom())
        if lPageXml != []:
            # if DS2MPXML.bMultiPages:
            newDoc = MultiPageXml.makeMultiPageXmlMemory(map(lambda xy: xy[0], lPageXml))
            outputFileName = os.path.join(
                self.coldir, sCOL, self.docid + TableProcessing.sMPXMLExtension)
            newDoc.write(outputFileName, xml_declaration=True, encoding="UTF-8", pretty_print=True)
            # else:
            #     DS2MPXML.storePageXmlSetofFiles(lPageXml)

        return

        # upload
        self.upLoadDocument(colid, coldir, docid, sNote='NLE workflow;table row done')

        ## apply HTR
        ## how to deal with specific dictionaries?
        ## here need to know the ontology and the template
        nbPages = 1
        jobid = self.applyHTR(colid, docid, nbPages, self.sHTRmodel, self.sDictName)
        bWait = True
        traceln("waiting for job %s" % jobid)
        # NOTE(review): busy-wait without sleep - consider throttling
        while bWait:
            dInfo = self.myTrKCient.getJobStatus(jobid)
            bWait = dInfo['state'] not in ['FINISHED', 'FAILED', 'CANCELED']

        # download where??? coldir is not right!! coldir must refer to the parent folder!
        self.downloadCollection(colid, coldir, docid, bNoImg=True, bForce=True)
        # done!!

        # IE extraction: not here - specific to a use case
        # python src/IE_test.py -i trnskrbs_5400/xml/17442.ds_xml -o trnskrbs_5400/out/17442.ds_xml --doie --usetemplate

    def processCollection(self, coldir):
        """
        process all *.mpxml files in a collection folder
        """
        lsDocFilename = sorted(glob.iglob(os.path.join(coldir, "*" + TableProcessing.sMPXMLExtension)))
        lDocId = []
        for sDocFilename in lsDocFilename:
            sDocId = os.path.basename(sDocFilename)[:-len(TableProcessing.sMPXMLExtension)]
            try:
                docid = int(sDocId)
                lDocId.append(docid)
            except ValueError:
                traceln("Warning: folder %s : %s invalid docid, IGNORING IT" % (self.coldir, sDocId))
                continue

        # process each document
        for docid in lDocId:
            traceln("Processing %s : %s " % (self.coldir, docid))
            # BUGFIX: processDocument takes (coldir, colid, docid); the old
            # call passed (colid, docid), shifting every argument by one
            self.processDocument(self.coldir, self.colid, docid)
            # BUGFIX: the old message fed a 2-tuple to a single %s (TypeError)
            # and reused a stale sDocId from the previous loop
            traceln("\tProcessing done for %s : %s " % (self.coldir, docid))

    def processParameters(self):
        """
        what to do with the parameters provided by the command line;
        returns the regenerated MPXML dom, or None
        """
        if self.colid is None:
            print('collection id missing!')
            sys.exit(1)

        # BUGFIX: bFullCol means "process the full collection", i.e. no docid
        # was given; the old code computed the opposite (docid != None) and
        # run() then compared the boolean to None, so the collection branch
        # was unreachable.
        self.bFullCol = self.docid is None

        if self.bRegenerateMPXML and self.docid is not None:
            l = glob.glob(os.path.join(self.coldir, sCOL, self.docid, "*.pxml"))
            doc = MultiPageXml.makeMultiPageXml(l)
            outputFileName = os.path.join(
                self.coldir, sCOL, self.docid + TableProcessing.sMPXMLExtension)
            doc.write(outputFileName, xml_declaration=True, encoding="UTF-8", pretty_print=True)
            return doc
        return None

    def run(self):
        """
        process at collection level or document level
        """
        newMPXML = self.processParameters()
        if self.bFullCol:
            # BUGFIX: processCollection expects the collection folder, not the
            # collection id
            self.processCollection(self.coldir)
        else:
            self.processDocument(self.coldir, self.colid, self.docid, newMPXML)
class TableProcessing(Component.Component):
    """Table layout analysis based on a template.

    Drives a Transkribus workflow for one document or a whole collection:
    download, textline detection (LA), table-row tagging, HTR decoding and
    transcript upload.
    """
    usage = ""
    version = "v.01"
    description = "description: table layout analysis based on template"

    sCOL = "col"
    sMPXMLExtension = ".mpxml"

    def __init__(self):
        """
        Always call first the Component constructor.
        """
        Component.Component.__init__(self, "TableProcessing", self.usage,
                                     self.version, self.description)
        self.colid = None       # Transkribus collection id
        self.docid = None       # Transkribus document id (None -> whole collection)
        self.bFullCol = False   # process the whole collection?
        # generate MPXML using Ext
        self.useExtForMPXML = False
        self.bRegenerateMPXML = False
        # FIX: give bUROCVLMerge a default; run() reads it unconditionally but
        # setParams only assigned it when 'mergeTLC' was present in dParams.
        self.bUROCVLMerge = False
        self.sRowModelName = None
        self.sRowModelDir = None
        self.sHTRmodel = None
        self.sDictName = None

    def setParams(self, dParams):
        """
        Always call first the Component setParams.
        Set our internal attributes from the (optionally) provided values;
        missing keys keep their default value.
        """
        Component.Component.setParams(self, dParams)
        if "coldir" in dParams:
            self.coldir = dParams["coldir"]
        if "colid" in dParams:
            self.colid = dParams["colid"]
        # FIX: the original tested '"colid" in dParams' here, so it read the
        # possibly-missing 'docid' key whenever only a collection was given.
        if "docid" in dParams:
            self.docid = dParams["docid"]
        if "useExt" in dParams:
            self.useExtForMPXML = dParams["useExt"]
        if 'mergeTLC' in dParams:
            self.bUROCVLMerge = dParams["mergeTLC"]
        if 'regMPXML' in dParams:
            self.bRegenerateMPXML = dParams["regMPXML"]
        if "rowmodelname" in dParams:
            self.sRowModelName = dParams["rowmodelname"]
        if "rowmodeldir" in dParams:
            self.sRowModelDir = dParams["rowmodeldir"]
        if "htrmodel" in dParams:
            self.sHTRmodel = dParams["htrmodel"]
        if "dictname" in dParams:
            self.sDictName = dParams["dictname"]

        # Connection to Transkribus
        self.myTrKCient = None
        self.persist = False
        self.loginInfo = False
        if "server" in dParams:
            self.server = dParams["server"]
        if "persist" in dParams:
            self.persist = dParams["persist"]
        if "login" in dParams:
            self.loginInfo = dParams["login"]

    def login(self, trnskrbs_client, trace=None, traceln=None):
        """
        Deal with the complicated login variants:
        persistent session token, explicit login info, or stored credentials.
        -trace and traceln are optional print methods
        Return True or raise an exception.
        """
        DEBUG = True
        bOk = False
        if self.persist:
            # try getting some persistent session token
            if DEBUG and trace:
                trace(" ---login--- Try reusing persistent session ... ")
            try:
                bOk = trnskrbs_client.reusePersistentSession()
                if DEBUG and traceln:
                    traceln("OK!")
            except Exception:  # narrowed from bare except; still best-effort
                if DEBUG and traceln:
                    traceln("Failed")

        if not bOk:
            if self.loginInfo:
                # NOTE(review): self.pwd is never assigned in this class —
                # presumably set by the Component base or a caller; confirm.
                login, pwd = self.loginInfo, self.pwd
            else:
                if DEBUG and trace:
                    trace(" ---login--- no login provided, looking for stored credentials... ")
                login, pwd = trnskrbs_client.getStoredCredentials(bAsk=False)
                if DEBUG and traceln:
                    traceln("OK")
            if DEBUG and traceln:
                trace(" ---login--- logging onto Transkribus as %s " % login)
            trnskrbs_client.auth_login(login, pwd)
            if DEBUG and traceln:
                traceln("OK")
            bOk = True
        return bOk

    def downloadCollection(self, colid, destDir, docid, bNoImg=True, bForce=False):
        """
        Download collection colid (restricted to docid) and regenerate the
        multi-page XML files. Return the list of downloaded document ids.
        NOTE(review): destDir is deliberately overridden with '.' (kept from
        the original — "replace destDir by '.' ?").
        """
        destDir = "."
        downloader = TranskribusDownloader(self.myTrKCient.getServerUrl(),
                                           self.myTrKCient.getProxies())
        downloader.setSessionId(self.myTrKCient.getSessionId())
        traceln("- Downloading collection %s to folder %s"
                % (colid, os.path.abspath(destDir)))
        col_ts, colDir, ldocids, dFileListPerDoc = downloader.downloadCollection(
            colid, destDir, bForce=bForce, bNoImage=bNoImg, sDocId=docid)
        traceln("- Done")
        with open(os.path.join(colDir, "config.txt"), "w") as fd:
            fd.write("server=%s\nforce=%s\nstrict=%s\n" % (self.server, True, False))
        downloader.generateCollectionMultiPageXml(
            os.path.join(colDir, TableProcessing.sCOL), dFileListPerDoc, False)
        traceln('- Done, see in %s' % colDir)
        return ldocids

    def upLoadDocument(self, colid, coldir, docid, sNote="", sTranscripExt='.mpxml'):
        """
        Upload the transcript(s) of docid into collection colid.
        """
        uploader = TranskribusDUTranscriptUploader(
            self.myTrKCient.getServerUrl(), self.myTrKCient.getProxies())
        uploader.setSessionId(self.myTrKCient.getSessionId())
        traceln("- uploading document %s to collection %s" % (docid, colid))
        uploader.uploadDocumentTranscript(colid, docid, os.path.join(coldir, sCOL),
                                          sNote, 'NLE Table', sTranscripExt,
                                          iVerbose=False)
        traceln("- Done")
        return

    def applyLA_URO(self, colid, docid, nbpages):
        """
        Apply the textline finder (CITlabAdvancedLaJob), page by page.
        Return the list of launched job ids.
        """
        traceln('process %s pages...' % nbpages)
        lretJobIDs = []
        for i in range(1, nbpages + 1):
            LA = DoLAbatch(self.myTrKCient.getServerUrl(),
                           self.myTrKCient.getProxies())
            LA._trpMng.setSessionId(self.myTrKCient.getSessionId())
            LA.setSessionId(self.myTrKCient.getSessionId())
            _, sPageDesc = LA.buildDescription(colid, "%s/%s" % (docid, i))
            sPageDesc = LA.jsonToXMLDescription(sPageDesc)
            _, lJobIDs = LA.run(colid, sPageDesc, "CITlabAdvancedLaJob", False)
            traceln(lJobIDs)
            lretJobIDs.extend(lJobIDs)
            traceln("- LA running for page %d job:%s" % (i, lJobIDs))
        return lretJobIDs

    def _applyHTRModel(self, colid, docid, nbpages, modelname, dictionary):
        """
        Shared implementation of applyHTR / applyHTRForRegions (the two
        originals were byte-identical): resolve the model id by name in the
        collection and launch the RNN decoding.
        Raise Exception when no model matches modelname.
        """
        htrComp = DoHtrRnn(self.myTrKCient.getServerUrl(),
                           self.myTrKCient.getProxies())
        htrComp._trpMng.setSessionId(self.myTrKCient.getSessionId())
        htrComp.setSessionId(self.myTrKCient.getSessionId())
        _, sPageDesc = htrComp.buildDescription(colid, "%s/%s" % (docid, nbpages))
        sModelID = None
        # get modelID by (string-compared) name
        for model in self.myTrKCient.listRnns(colid):
            if str(model['htrId']) == str(modelname):
                sModelID = model['htrId']
                traceln('model id = %s' % sModelID)
                # some old? models do not have a 'params' field
        if sModelID is None:  # FIX: identity test instead of '== None'
            raise Exception("no model ID found for %s" % (modelname))
        ret = htrComp.htrRnnDecode(colid, sModelID, dictionary, docid, sPageDesc,
                                   bDictTemp=False)
        traceln(ret)
        return ret

    def applyHTRForRegions(self, colid, docid, nbpages, modelname, dictionary):
        """
        Apply an HTR model at region level.
        """
        return self._applyHTRModel(colid, docid, nbpages, modelname, dictionary)

    def applyHTR(self, colid, docid, nbpages, modelname, dictionary):
        """
        Apply HTR on docid; the model is designated by its name (modelname).
        """
        return self._applyHTRModel(colid, docid, nbpages, modelname, dictionary)

    def overlapX(self, zoneA, zoneB):
        """True when the bounding boxes of the two zones overlap on X."""
        [x11, y11, x12, y12] = zoneA.getBoundingBox()
        [x21, y21, x22, y22] = zoneB.getBoundingBox()
        [a1, a2] = x11, x12
        [b1, b2] = x21, x22
        return min(a2, b2) >= max(a1, b1)

    def overlapY(self, zoneA, zoneB):
        """True when the bounding boxes of the two zones overlap on Y."""
        [x11, y11, x12, y12] = zoneA.getBoundingBox()
        [x21, y21, x22, y22] = zoneB.getBoundingBox()
        [a1, a2] = y11, y12
        # FIX: the original used (y22, y22), an empty interval, so the test
        # compared against zoneB's bottom edge only.
        [b1, b2] = y21, y22
        return min(a2, b2) >= max(a1, b1)

    def signedOverlap(self, zoneA, zoneB):
        """
        Return the fraction of zoneA's surface covered by its intersection
        with zoneB (0.0 when the zones do not overlap).
        """
        [x11, y11, x12, y12] = zoneA.getBoundingBox()
        [x21, y21, x22, y22] = zoneB.getBoundingBox()
        w1 = x12 - x11
        h1 = y12 - y11
        fOverlap = 0.0
        if self.overlapX(zoneA, zoneB) and self.overlapY(zoneA, zoneB):
            s1 = w1 * h1
            # degenerate zoneA: avoid a division by zero
            if s1 == 0:
                s1 = 1.0
            # intersection rectangle
            nx1 = max(x11, x21)
            nx2 = min(x12, x22)
            ny1 = max(y11, y21)
            ny2 = min(y12, y22)
            inter = abs(nx2 - nx1) * abs(ny2 - ny1)
            if inter > 0:
                fOverlap = inter / s1
            # else: if overlapX and overlapY hold this is not possible!
        return fOverlap

    def mergeBaselineCells(self, coldir, colid, docid):
        """
        Take a file (pxml) with stuff processed on Transkribus and the CVL
        template tool xml (xml); merge them and regenerate a mpxml:
        each TextLine is re-attached to the table cell it overlaps most.
        """
        xmlpath = os.path.abspath(os.path.join(coldir, sCOL, docid))
        mpxml = xmlpath + ".mpxml"
        mpxmldoc = etree.parse(mpxml)
        lxml = glob.glob(os.path.join(xmlpath, "*.xml"))
        pxmldoc = MultiPageXml.makeMultiPageXml(lxml)
        lhtrxml = glob.glob(os.path.join(xmlpath, "*.pxml"))
        mpxmldoc = MultiPageXml.makeMultiPageXml(lhtrxml)

        lPXMLPage = PageXml.getChildByName(mpxmldoc.getroot(), 'Page')
        lXMLPage = PageXml.getChildByName(pxmldoc.getroot(), 'Page')
        assert len(lXMLPage) == len(lPXMLPage)
        for i, cvlpage in enumerate(lXMLPage):
            ## remove TextRegion from cvlpage
            lTextRegions = PageXml.getChildByName(cvlpage, 'TextRegion')
            for tr in lTextRegions:
                tr.getparent().remove(tr)
            pxmlpage = lPXMLPage[i]
            lTL = []
            lTextRegions = PageXml.getChildByName(pxmlpage, 'TextRegion')
            for x in lTextRegions:
                lTL.extend(PageXml.getChildByName(x, 'TextLine'))
            ltable = PageXml.getChildByName(cvlpage, 'TableRegion')
            if len(ltable) == 0:
                # FIX: 'raise "NO TABLE"' is a TypeError in Python 3
                raise ValueError("NO TABLE")
            lCells = PageXml.getChildByName(ltable[0], 'TableCell')
            lC = [Polygon(PageXml.getPointList(c)) for c in lCells]
            lT = [Polygon(PageXml.getPointList(t)) for t in lTL]
            # FIX: inner index renamed to j; the original reused (shadowed) i
            for j, _tl in enumerate(lT):
                ## normalize the TextLine polygon from its baseline
                lCoordsPoints = PageXml.getChildByName(lTL[j], 'Coords')
                lCoordsB = PageXml.getChildByName(lTL[j], 'Baseline')
                coordB = lCoordsB[0]
                coord = lCoordsPoints[0]
                iHeight = 30  # in pixel
                x1, y1, x2, y2 = Polygon(
                    PageXml.getPointList(coordB)).getBoundingBox()
                if coord is not None:
                    # region of the same size as the textline
                    coord.set('points', "%d,%d %d,%d %d,%d %d,%d"
                              % (x1, y1 - iHeight, x2, y1 - iHeight,
                                 x2, y2, x1, y2))
                tl = Polygon(PageXml.getPointList(coordB))
                lOverlap = [self.signedOverlap(c, tl) for c in lC]
                if max(lOverlap) == 0:
                    # no overlapping cell: wrap the line in its own TextRegion
                    region = PageXml.createPageXmlNode('TextRegion')
                    cvlpage.append(region)
                    region.append(lTL[j])
                else:
                    cell = lCells[lOverlap.index(max(lOverlap))]
                    cell.append(lTL[j])
        pxmldoc.write(mpxml)

    def extractFileNamesFromMPXML(self, mpxmldoc):
        """
        Return the page-XML file names in page order, to insure correct file
        order! (duplicated from performCVLLA.py)
        """
        xmlpath = os.path.abspath(os.path.join(self.coldir, sCOL, self.docid))
        # FIX: use the lxml API (getroot / .get) like the rest of this class;
        # the original used the retired libxml2 API (getRootElement / .prop).
        lNd = PageXml.getChildByName(mpxmldoc.getroot(), 'Page')
        return ["%s%s%s.xml" % (xmlpath, os.sep, nd.get('imageFilename')[:-4])
                for nd in lNd]

    def processDocument(self, coldir, colid, docid, dom=None):
        """
        Process a single document. Original 10-step recipe:
          1 PageXml.py <col>/<docid> --ext=pxml
          2 performCVLLA.py --coldir=... --docid=... --bl --regTL --form
          3 DU_ABPTable_T.py modelMultiType tableRow2 --run=...
          4 Page2DS.py --pattern=..._du.mpxml -o ....ds_xml --docid=...
          5 IE_test.py -i ....ds_xml -o ....ds_xml
          6 TranskribusDU_transcriptUploader.py --nodu <dir> <colid> <docid>
          7 do_htrRnn.py <model-name> <dictionary-name> <colid> <docid>; wait
          8 Transkribus_downloader.py <colid> --force
          9 Page2DS.py --pattern=....mpxml -o ....ds_xml --docid=...
         10 IE_test.py ... --doie --usetemplate
        """
        # create Transkribus client and log in
        self.myTrKCient = TranskribusClient(sServerUrl=self.server, proxies={},
                                            loggingLevel=logging.WARN)
        _ = self.login(self.myTrKCient, trace=trace, traceln=traceln)
        # self.downloadCollection(colid,coldir,docid,bNoImg=False,bForce=True)

        ## load dom
        if dom is None:
            self.inputFileName = os.path.abspath(
                os.path.join(coldir, TableProcessing.sCOL,
                             docid + TableProcessing.sMPXMLExtension))
            mpxml_doc = self.loadDom()
        else:
            # load provided mpxml
            mpxml_doc = dom
        nbPages = MultiPageXml.getNBPages(mpxml_doc)

        # ### table registration: need to compute/select??? the template
        # # perform LA separator, table registration, baseline w/ normalization
        # tableregtool = LAProcessor()
        # tableregtool.coldir = coldir
        # tableregtool.docid = docid
        # tableregtool.bTemplate, tableregtool.bSeparator, tableregtool.bBaseLine, tableregtool.bRegularTextLine = True, False, False, False
        # mpxml_doc, nbPages = tableregtool.performLA(mpxml_doc)
        # self.upLoadDocument(colid, coldir, docid, sNote='NLE workflow;table reg done')

        # FIX: the original called self.apply_URO(), which does not exist;
        # the intended method is applyLA_URO().
        lJobIDs = self.applyLA_URO(colid, docid, nbPages)
        return
        # ------------------------------------------------------------------
        # UNREACHABLE below this point: the remainder of the workflow is kept
        # verbatim from the original ("STOP HERE FOR DAS new testset"), gated
        # by the early return above.
        # ------------------------------------------------------------------
        bWait = True
        assert lJobIDs != []
        jobid = lJobIDs[-1]
        traceln("waiting for job %s" % jobid)
        while bWait:
            dInfo = self.myTrKCient.getJobStatus(jobid)
            bWait = dInfo['state'] not in ['FINISHED', 'FAILED']
        ## coldir???
        self.downloadCollection(colid, coldir, docid, bNoImg=True, bForce=True)
        return
        # tag text for BIES cell
        doer = DU_ABPTable_TypedCRF(self.sRowModelName, self.sRowModelDir)
        doer.load()
        # predict at file level; do not store the dom, return it
        rowpath = os.path.join(coldir, "col")
        BIESFiles = doer.predict([rowpath], docid)
        BIESDom = self.loadDom(BIESFiles[0])
        # MPXML2DS
        dsconv = primaAnalysis()
        DSBIESdoc = dsconv.convert2DS(BIESDom, self.docid)
        # create XMLDOC object
        self.ODoc = XMLDSDocument()
        self.ODoc.loadFromDom(DSBIESdoc)
        # create rows
        rdc = RowDetection()
        rdc.findRowsInDoc(self.ODoc)
        # DS2MPXML
        DS2MPXML = DS2PageXMLConvertor()
        lPageXml = DS2MPXML.run(self.ODoc.getDom())
        if lPageXml != []:
            newDoc = MultiPageXml.makeMultiPageXmlMemory(
                map(lambda xy: xy[0], lPageXml))
            outputFileName = os.path.join(
                self.coldir, sCOL, self.docid + TableProcessing.sMPXMLExtension)
            newDoc.write(outputFileName, xml_declaration=True, encoding="UTF-8",
                         pretty_print=True)
        return
        # upload
        self.upLoadDocument(colid, coldir, docid,
                            sNote='NLE workflow;table row done')
        ## apply HTR -- how to deal with specific dictionaries?
        ## here need to know the ontology and the template
        nbPages = 1
        jobid = self.applyHTR(colid, docid, nbPages, self.sHTRmodel,
                              self.sDictName)
        bWait = True
        traceln("waiting for job %s" % jobid)
        while bWait:
            dInfo = self.myTrKCient.getJobStatus(jobid)
            bWait = dInfo['state'] not in ['FINISHED', 'FAILED', 'CANCELED']
        # download (coldir must refer to the parent folder!)
        self.downloadCollection(colid, coldir, docid, bNoImg=True, bForce=True)
        # done!!

    def processCollection(self, coldir):
        """
        Process all files in a collection; needs the .mpxml files in coldir.
        """
        lsDocFilename = sorted(glob.iglob(
            os.path.join(coldir, "*" + TableProcessing.sMPXMLExtension)))
        lDocId = []
        for sDocFilename in lsDocFilename:
            sDocId = os.path.basename(
                sDocFilename)[:-len(TableProcessing.sMPXMLExtension)]
            try:
                lDocId.append(int(sDocId))
            except ValueError:
                traceln("Warning: folder %s : %s invalid docid, IGNORING IT"
                        % (self.coldir, sDocId))
                continue
        # process each document
        for docid in lDocId:
            # FIX: log the docid being processed; the original printed sDocId,
            # the last filename left over from the discovery loop.
            traceln("Processing %s : %s " % (self.coldir, docid))
            # FIX: processDocument takes (coldir, colid, docid); the original
            # passed only (self.colid, docid) -- a guaranteed TypeError.
            self.processDocument(self.coldir, self.colid, docid)
            # FIX: original format string had one %s for a two-value tuple
            traceln("\tProcessing done for %s : %s " % (self.coldir, docid))

    def processParameters(self):
        """
        Validate / act on the parameters provided by the command line.
        Return the regenerated mpxml document, or None.
        """
        if self.colid is None:
            print('collection id missing!')
            sys.exit(1)

        # FIX: full-collection mode means *no* docid was given; the original
        # computed 'self.docid != None', the exact opposite of the name, and
        # run() then tested the boolean with 'is None' (never true).
        self.bFullCol = self.docid is None

        if self.bRegenerateMPXML and self.docid is not None:
            l = glob.glob(os.path.join(self.coldir, sCOL, self.docid, "*.pxml"))
            doc = MultiPageXml.makeMultiPageXml(l)
            outputFileName = os.path.join(
                self.coldir, sCOL, self.docid + TableProcessing.sMPXMLExtension)
            doc.write(outputFileName, xml_declaration=True, encoding="UTF-8",
                      pretty_print=True)
            return doc
        return None

    def run(self):
        """
        Process at collection level or document level.
        """
        newMPXML = self.processParameters()
        # FIX: bFullCol is a boolean; the original tested 'is None', which made
        # the collection-level branch unreachable. Also pass the collection
        # *folder* -- processCollection globs it for mpxml files.
        if self.bFullCol:
            self.processCollection(self.coldir)
        else:
            if self.bUROCVLMerge:
                self.mergeBaselineCells(self.coldir, self.colid, self.docid)
                return
            self.processDocument(self.coldir, self.colid, self.docid, newMPXML)
class IETest(Component.Component): """ """ usage = "" version = "v.01" description = "description: Information Extraction Tool for the ABP collection (READ project)" #--- INIT ------------------------------------------------------------------------------------------------------------- def __init__(self): """ Always call first the Component constructor. """ Component.Component.__init__(self, "ABP_IE", self.usage, self.version, self.description) self.usage = self.usage = "python %prog" + self.usageComponent self.colname = None self.docid = None self.sTemplate = None self.BuseStoredTemplate = False # HTR model id self.htrModelID = None # IE model self.sModelDir = None self.sModelName = None self.lcsTH = 75 self.page2DS = False # for --test self.evalData = None def setParams(self, dParams): """ Always call first the Component setParams Here, we set our internal attribute according to a possibly specified value (otherwise it stays at its default value) """ Component.Component.setParams(self, dParams) if "coldir" in dParams: self.colname = dParams["coldir"] if "docid" in dParams: self.docid = dParams["docid"] if "htrid" in dParams: self.htrModelID = dParams["htrid"] if "template" in dParams: self.sTemplate = dParams["template"] if "UseStoredTemplate" in dParams: self.BuseStoredTemplate = dParams["UseStoredTemplate"] if 'modelName' in dParams: self.sModelName = dParams['modelName'] if 'modelDir' in dParams: self.sModelDir = dParams['modelDir'] if "2DS" in dParams: self.page2DS = dParams['2DS'] if "LCSTH" in dParams: self.lcsTH = dParams['LCSTH'] def labelTable(self, table): """ toy example label columns with tags """ table.getColumns()[0].label() def findNameColumn(self, table): """ find the column which corresponds to the people names c """ self.bDebug = False #tag fields with template lColPos = {} lColInvName = {} for cell in table.getCells(): try: lColPos[cell.getIndex()[1]] except: lColPos[cell.getIndex()[1]] = [] if cell.getIndex()[1] < 5: for field in 
cell.getFields(): if field is not None: res = field.applyTaggers(cell) # res [ (token,label,score) ...] extractedValues = field.extractLabel(res) if extractedValues != []: # extractedValues = map(lambda offset,value,label,score:(value,score),extractedValues) extractedValues = list( map(lambda x: (x[1], x[3]), extractedValues)) field.setOffset(res[0]) field.setValue(extractedValues) # field.addValue(extractedValues) lColPos[cell.getIndex()[1]].append(field.getName()) try: lColInvName[field.getName()].append( cell.getIndex()[1]) except: lColInvName[field.getName()] = [ cell.getIndex()[1] ] if self.bDebug: print('foundXX:', field.getName(), field.getValue()) cell.resetField() return max(lColInvName['firstname'], key=lColInvName['firstname'].count) def extractData(self, table, myRecord, lTemplate): """ layout tag content [use scoping for propagating scoping: for tagging and for data scope fieldname scope (fiedlname, fieldvalue)] find if possible a contiguous repetition of records find layout level for record completion extract data/record -inference if IEOnto """ # self.bDebug = False # table.buildNDARRAY() if lTemplate is not None: # convert string to tableTemplateObject template = tableTemplateClass() template.buildFromPattern(lTemplate) template.labelTable(table) else: return None # firstNameColIndex =self.findNameColumn(table) # create a batch for the full page #tag fields with template for cell in table.getCells(): if cell.getFields() != []: if self.bDebug: print(table.getPage(), cell.getIndex(), cell.getFields(), cell.getContent()) for field in cell.getFields(): if field is not None: res = field.applyTaggers(cell) # res [ (token,label,score) ...] 
extractedValues = field.extractLabel(res) if extractedValues != []: # extractedValues = map(lambda offset,value,label,score:(value,score),extractedValues) extractedValues = list( map(lambda x: (x[1], x[3]), extractedValues)) field.setOffset(res[0]) field.setValue(extractedValues) # field.addValue(extractedValues) if self.bDebug: print('found:', field, field.getValue()) ### now at record level ? ### scope = propagation using only docObject (hardcoded ?) ### where to put the propagation mechanism? # myRecord.propagate(table) ## 'backpropagation: select the rows, and collection subobjects with fields (cells) for row in table.getRows(): #if not row.isHeaders(): myRecord.addCandidate(row) # #for each cell: take the record and # ### FR NOW: TAKE THE FIRST COLUMN # firstCol = table.getColumns()[0] # for cell in firstCol.getCells(): # myRecord.addCandidate(cell) myRecord.rankCandidates() lcand = myRecord.getCandidates() # print lcand # myRecord.display() def mergeLineAndCells(self, lPages): """ assign lines(TEXT) to cells """ for page in lPages: lLines = page.getAllNamedObjects(XMLDSTEXTClass) lCells = page.getAllNamedObjects(XMLDSTABLECELLClass) dAssign = {} for line in lLines: bestscore = 0.0 for cell in lCells: ratio = line.ratioOverlap(cell) if ratio > bestscore: bestscore = ratio dAssign[line] = cell [dAssign[line].addObject(line) for line in dAssign.keys()] [cell.getObjects().sort(key=lambda x: x.getY()) for cell in lCells] def testGTText(self, page): """ extract region text and parse it """ from contentProcessing.taggerTrainKeras import DeepTagger myTagger = DeepTagger() myTagger.bPredict = True myTagger.sModelName = '2mix_cm' myTagger.dirName = 'IEdata/model/' myTagger.loadModels() for region in page.getObjects(): # print region.getContent().encode('utf-8') res = myTagger.predict([region.getContent()]) try: res = myTagger.predict([region.getContent()]) # print res except: print('SENT WITH ISSUES : [%s]' % (region.getContent().encode('utf-8'))) def mineTable(self, 
tabel, dr): """ from the current HTR: find the categories in each column (once NER applied) """ def selectTemplat(self, lTemplates): """ if a list of templates is available: take a couple pages and perform IE: simply sum up the score of the tagger """ def processWithTemplate(self, table, dr): """ according to the # of columns, apply the corresponding template """ # selection of the dictionaries per columns # template 5,10: first col = numbering # find calibration column: abp_names table.buildNDARRAY() # print (self.findNameColumn(table)) # lTemplateIE2 = [ # ((slice(1,None),slice(0,1)) ,[ 'numbering'],[ dr.getFieldByName('numbering') ]) # , ((slice(1,None),slice(1,2)) ,[ 'abp_names', 'names_aux','numbering','religion'],[ dr.getFieldByName('lastname'), dr.getFieldByName('firstname'),dr.getFieldByName('religion') ]) # , ((slice(1,None),slice(2,3)) ,[ 'abp_profession','religion' ] ,[ dr.getFieldByName('occupation'), dr.getFieldByName('religion') ]) # , ((slice(1,None),slice(3,4)) ,[ 'abp_location' ] ,[ dr.getFieldByName('location') ]) # , ((slice(1,None),slice(4,5)) ,[ 'abp_family' ] ,[ dr.getFieldByName('situation') ]) # ,((slice(1,None),slice(5,6)) ,[ 'deathreason','artz'] ,[ dr.getFieldByName('deathreason'),dr.getFieldByName('doktor')]) # , ((slice(1,None),slice(6,7)) ,[] , [ ]) #binding # , ((slice(1,None),slice(7,8)) ,['abp_dates', 'abp_dates' ,'abp_year'] ,[,dr.getFieldByName('deathDate'),dr.getFieldByName('deathYear') ]) # , ((slice(1,None),slice(8,9)) ,[ 'abp_dates','abp_location' ] ,[ dr.getFieldByName('burialDate'),dr.getFieldByName('burialLocation') ]) # , ((slice(1,None),slice(9,10)) ,[ 'abp_age','abp_ageunit'] ,[ dr.getFieldByName('age'), dr.getFieldByName('ageUnit')]) # # , ((slice(1,None),slice(9,10)) ,[ dr.getFieldByName('priester')]) # # , ((slice(1,None),slice(10,11)),[ dr.getFieldByName('notes')]) # ] #fuzzy lTemplateIECAL = [ ((slice(1, None), slice(0, 4)), ['abp_names', 'names_aux', 'numbering', 'religion'], [ dr.getFieldByName('lastname'), 
dr.getFieldByName('firstname'), dr.getFieldByName('religion') ]), ((slice(1, None), slice(1, 4)), ['abp_profession', 'religion'], [dr.getFieldByName('occupation'), dr.getFieldByName('religion')]) ] #detect empty left columns ? template = tableTemplateClass() template.buildFromPattern(lTemplateIECAL) template.labelTable(table) iRef = self.findNameColumn(table) lTemplateIE = [ ((slice(1, None), slice(iRef, iRef + 1)), ['abp_names', 'names_aux', 'numbering', 'religion'], [ dr.getFieldByName('lastname'), dr.getFieldByName('firstname'), dr.getFieldByName('religion') ]), ((slice(1, None), slice(iRef + 1, iRef + 2)), ['abp_profession', 'religion'], [dr.getFieldByName('occupation'), dr.getFieldByName('religion')]), ((slice(1, None), slice(iRef + 2, iRef + 3)), ['abp_location'], [dr.getFieldByName('location')]), ((slice(1, None), slice(iRef + 3, iRef + 4)), ['abp_family'], [dr.getFieldByName('situation')]) #[] binding , ((slice(1, None), slice(iRef + 4, iRef + 6)), ['abp_deathreason', 'artz'], [dr.getFieldByName('deathreason'), dr.getFieldByName('doktor')]), ((slice(1, None), slice(iRef + 5, iRef + 7)), ['abp_dates', 'abp_year'], [ dr.getFieldByName('MonthDayDateGenerator'), dr.getFieldByName('deathDate'), dr.getFieldByName('deathYear') ]), ((slice(1, None), slice(iRef + 6, iRef + 8)), ['abp_dates', 'abp_year', 'abp_location'], [ dr.getFieldByName('burialDate'), dr.getFieldByName('deathYear'), dr.getFieldByName('burialLocation') ]), ((slice(1, None), slice(iRef + 8, iRef + 10)), ['abp_age', 'abp_ageunit'], [dr.getFieldByName('age'), dr.getFieldByName('ageUnit')]) # , ((slice(1,None),slice(9,10)) ,[ dr.getFieldByName('priester')]) # , ((slice(1,None),slice(10,11)),[ dr.getFieldByName('notes')]) ] # recalibrate template # # lTemplate = lTemplateIE # if table.getNbColumns() >= 12: # lTemplate = lTemplateIE2 # else: # lTemplate = lTemplateIE self.extractData(table, dr, lTemplateIE) # select best solutions # store inthe proper final format return dr def run(self, doc): """ main 
issue: how to select the template: to be done by CVL assuming IE and htr info are stored in the template """ # self.firstPage = 9 # self.lastPage= 9 if self.page2DS: dsconv = primaAnalysis() self.doc = dsconv.convert2DS(doc, self.docid) else: self.doc = doc self.ODoc = XMLDSDocument() self.ODoc.loadFromDom(self.doc, listPages=range(self.firstPage, self.lastPage + 1)) self.lPages = self.ODoc.getPages() dr = deathRecord(self.sModelName, self.sModelDir) ## selection of the templates first with X tables ### for page in self.lPages: print("page: ", page.getNumber()) # self.testGTText(page) # continue lTables = page.getAllNamedObjects(XMLDSTABLEClass) for table in lTables: if table.getNbRows() < 2: print("page: %s : not a table? %d/%d" % (page.getNumber(), table.getNbRows(), table.getNbColumns())) continue if self.BuseStoredTemplate: # self.processWithTemplate(table, dr) try: self.processWithTemplate(table, dr) except: print('issue with page %s' % page) else: self.mineTable(table, dr) self.evalData = dr.generateOutput(self.evalData) # print self.evalData.serialize('utf-8',True) def generateTestOutput(self): """ <PAGE number="1" pagenum="1" nbrecords="10" years="0"> <RECORD lastname="Riesinger" firstname="Korona" role="Verstorbener" location="Neuhofen" occupation="" year="0" month="0" day="0"/> <RECORD lastname="Ringseisen" firstname="Georg" role="Verstorbener" location="Etzing" occupation="" year="0" month="0" day="0"/> <RECORD lastname="Nebel" firstname="Theresia" role="Verstorbener" location="Sandbach" occupation="" year="0" month="0" day="0"/> <RECORD lastname="Schlögl" firstname="Cäcilia" role="Verstorbener" location="Stampfing" occupation="" year="0" month="0" day="0"/> <RECORD lastname="Riedinger" firstname="Theresia" role="Verstorbener" location="Lapperding" occupation="Austragsbäuerin" year="0" month="0" day="0"/> <RECORD lastname="Wührer" firstname="Joseph" role="Verstorbener" location="Haizing" occupation="" year="0" month="0" day="0"/> <RECORD 
lastname="Wilmerdinger" firstname="Theresia" role="Verstorbener" location="Hausmanning" occupation="" year="0" month="0" day="0"/> <RECORD lastname="Ratzinger" firstname="Mathias" role="Verstorbener" location="Kafferding" occupation="Bauer" year="0" month="0" day="0"/> <RECORD lastname="Deixelberger" firstname="Joseph" role="Verstorbener" location="Gaishofen" occupation="Inwohner" year="0" month="0" day="0"/> <RECORD lastname="Beham" firstname="Martin" role="Verstorbener" location="Socking" occupation="Austragsbauer" year="0" month="0" day="0"/> </PAGE> """ root = etree.Element('DOCUMENT') self.evalData = etree.ElementTree(root) for page in lPages: domp = etree.Elemen('PAGE') domp.set('number', page.getAttribute('number')) domp.set('pagenum', os.path.basename(page.getAttribute('imageFilename'))[:-4]) root.append(domp) domp.set('template', page.getNode().prop('template')) domp.set('reftemplate', page.getNode().prop('reftemplate')) return self.evalData def testFirstNameLastNameRecord(self, srefData, srunData, bVisual): """ test firstname in record group by page """ cntOk = cntErr = cntMissed = 0 # srefData = srefData.decode('utf-8') #.strip("\n") RefData = etree.XML(srefData.strip("\n").encode('utf-8')) RunData = etree.XML(srunData.strip("\n").encode('utf-8')) lRef = [] lPages = RefData.xpath('//%s' % ('PAGE[@number]')) lRefKeys = {} for page in lPages: pnum = page.get('number') key = page.get('pagenum') lRefKeys[key] = 1 xpath = "./%s" % ("RECORD") lrecord = page.xpath(xpath) if len(lrecord) == 0: lRef.append([]) else: for record in lrecord: xpath = "./%s" % ("./@firstname") lf = record.xpath(xpath) xpath = "./%s" % ("./@lastname") ln = record.xpath(xpath) if len(lf) > 0: lRef.append((pnum, key, lf[0], ln[0])) lPageMapping = {} lRun = [] if RunData is not None: lpages = RunData.xpath('//%s' % ('PAGE[@number]')) for page in lpages: pnum = page.get('number') key = page.get('pagenum') # key= page.get('number') if key in lRefKeys.keys(): lPageMapping[key] = pnum #record 
level! xpath = "./%s" % ("RECORD[@firstname and @lastname]") lrecord = page.xpath(xpath) if len(lrecord) == 0: pass else: for record in lrecord: xpath = "./%s" % ("./@firstname") lf = record.xpath(xpath) xpath = "./%s" % ("./@lastname") ln = record.xpath(xpath) if len( lf ) > 0: # and lf[0].getContent() != ln[0].getContent(): lRun.append((pnum, key, lf[0], ln[0])) ltisRefsRunbErrbMiss = list() for key in lRunPerPage: # for key in ['Neuoetting_009_05_0150']: lRun = lRunPerPage[key] lRef = lRefKeys[key] runLen = len(lRunPerPage[key]) refLen = len(lRefKeys[key]) bT = False if refLen <= runLen: rows = lRef cols = lRun else: rows = lRun cols = lRef bT = True cost_matrix = np.zeros((len(rows), len(cols)), dtype=float) for a, i in enumerate(rows): curRef = i for b, j in enumerate(cols): runElt = j ret, val = self.testCompareRecordField(curRef, runElt) val /= 100 if val == 0: dist = 10 else: dist = 1 / val cost_matrix[a, b] = dist m = linear_sum_assignment(cost_matrix) r1, r2 = m # print (bT,r1,r2) # print (list(x[2] for x in rows)) # print (list(x[2] for x in cols)) lcsTH = self.lcsTH / 100 lCovered = [] for a, i in enumerate(r2): # print (key,a,r1[a],i,rows[r1[a]][2],cols[i][2], 1/cost_matrix[r1[a],i]) if 1 / cost_matrix[r1[a], i] > lcsTH: cntOk += 1 if bT: ltisRefsRunbErrbMiss.append( (runElt[1], int(runElt[0]), cols[i], rows[r1[a]], False, False)) else: ltisRefsRunbErrbMiss.append( (runElt[1], int(runElt[0]), rows[r1[a]], cols[i], False, False)) else: #too distant: false if bT: lCovered.append(i) ltisRefsRunbErrbMiss.append( (runElt[1], int(runElt[0]), "", rows[r1[a]], True, False)) else: lCovered.append(r1[a]) ltisRefsRunbErrbMiss.append((runElt[1], int(runElt[0]), "", cols[i], True, False)) cntErr += 1 for iref in r1: if iref not in r2: ltisRefsRunbErrbMiss.append((runElt[1], int(runElt[0]), lRef[iref], '', False, True)) cntMissed += 1 for iref in lCovered: ltisRefsRunbErrbMiss.append( (runElt[1], int(runElt[0]), lRef[iref], '', False, True)) cntMissed += 1 
ltisRefsRunbErrbMiss.sort(key=lambda x: x[0]) # runLen = len(lRun) # refLen = len(lRef) # # bVisual = True # ltisRefsRunbErrbMiss= list() # lRefCovered = [] # for i in range(0,len(lRun)): # iRef = 0 # bFound = False # bErr , bMiss= False, False # runElt = lRun[i] # # print '\t\t===',runElt # while not bFound and iRef <= refLen - 1: # curRef = lRef[iRef] # if runElt and curRef not in lRefCovered and self.testCompareRecordFirstNameLastName(curRef,runElt): # bFound = True # lRefCovered.append(curRef) # iRef+=1 # if bFound: # if bVisual:print("FOUND:", runElt, ' -- ', lRefCovered[-1]) # cntOk += 1 # else: # curRef='' # cntErr += 1 # bErr = True # if bVisual:print("ERROR:", runElt) # if bFound or bErr: # ltisRefsRunbErrbMiss.append( (runElt[1],int(runElt[0]), curRef, runElt,bErr, bMiss) ) # for i,curRef in enumerate(lRef): # if curRef not in lRefCovered: # if bVisual:print("MISSED:", curRef) # ltisRefsRunbErrbMiss.append( (curRef[1],int(lPageMapping[curRef[1]]), curRef, '',False, True) ) # cntMissed+=1 # # ltisRefsRunbErrbMiss.sort(key=lambda x:x[0]) return (cntOk, cntErr, cntMissed, ltisRefsRunbErrbMiss) def testRecordField(self, lfieldName, lfieldInRef, srefData, srunData, bVisual): """ test fieldName in record """ assert len(lfieldName) == len((lfieldInRef)) for i, f in enumerate(lfieldName): if lfieldInRef[i] is None: lfieldInRef[i] = f cntOk = cntErr = cntMissed = 0 # srefData = srefData.decode('utf-8') #.strip("\n") RefData = etree.XML(srefData.strip("\n").encode('utf-8')) RunData = etree.XML(srunData.strip("\n").encode('utf-8')) lPages = RefData.xpath('//%s' % ('PAGE[@number]')) lRefKeys = {} for page in lPages: pnum = page.get('number') key = page.get('pagenum') xpath = "./%s" % ("RECORD") lrecord = page.xpath(xpath) if len(lrecord) == 0: lRef.append([]) else: for record in lrecord: lf = [] for fieldInRef in lfieldInRef: xpath = "./%s" % ("./@%s" % fieldInRef) ln = record.xpath(xpath) if ln and len(ln[0]) > 0: lf.append(ln[0]) if lf != []: try: # if 
(pnum,key,lf) in lRefKeys[key]: # print ('duplicated',(pnum,key,lf)) # else: # lRefKeys[key].append((pnum,key,lf)) lRefKeys[key].append((pnum, key, lf)) except KeyError: lRefKeys[key] = [(pnum, key, lf)] lRunPerPage = {} lPageMapping = {} if RunData: lpages = RunData.xpath('//%s' % ('PAGE[@number]')) for page in lpages: pnum = page.get('number') key = page.get('pagenum') lPageMapping[key] = pnum if key in lRefKeys: #record level! xpath = "./%s" % ("RECORD") lrecord = page.xpath(xpath) if len(lrecord) == 0: pass # lRun.append([]) else: for record in lrecord: lf = [] for fieldName in lfieldName: xpath = "./%s" % ("./@%s" % fieldName) ln = record.xpath(xpath) if len(ln) > 0 and len(ln[0]) > 0: lf.append(ln[0]) if len(lf) == len(lfieldName): try: lRunPerPage[key].append((pnum, key, lf)) except KeyError: lRunPerPage[key] = [(pnum, key, lf)] ltisRefsRunbErrbMiss = list() for key in lRunPerPage: # for key in ['Neuoetting_008_03_0032']: lRun = lRunPerPage[key] lRef = lRefKeys[key] runLen = len(lRunPerPage[key]) refLen = len(lRefKeys[key]) bT = False if refLen <= runLen: rows = lRef cols = lRun else: rows = lRun cols = lRef bT = True cost_matrix = np.zeros((len(rows), len(cols)), dtype=float) for a, i in enumerate(rows): curRef = i for b, j in enumerate(cols): runElt = j ret, val = self.testCompareRecordField(curRef, runElt) dist = 100 - val cost_matrix[a, b] = dist # print (curRef,runElt,val,dist) m = linear_sum_assignment(cost_matrix) r1, r2 = m if False: print(len(lRef), lRef) print(len(lRun), lRun) print(bT, r1, r2) lcsTH = self.lcsTH lCovered = [] lMatched = [] for a, i in enumerate(r2): # print (key,a,r1[a],i,rows[r1[a]][2],cols[i][2], 100-cost_matrix[r1[a],i]) if 100 - cost_matrix[r1[a, ], i] > lcsTH: cntOk += 1 if bT: ltisRefsRunbErrbMiss.append( (rows[r1[a]][1], int(rows[r1[a]][0]), cols[i][2], rows[r1[a]][2], False, False)) lMatched.append(i) else: ltisRefsRunbErrbMiss.append( (cols[i][1], int(cols[i][0]), rows[r1[a]][2], cols[i][2], False, False)) 
lMatched.append(r1[a]) else: #too distant: false if bT: lCovered.append(i) ltisRefsRunbErrbMiss.append( (rows[r1[a]][1], int(rows[r1[a]][0]), "", rows[r1[a]][2], True, False)) else: lCovered.append(r1[a]) ltisRefsRunbErrbMiss.append( (cols[i][1], int(cols[i][0]), "", cols[i][2], True, False)) cntErr += 1 # print ('matched',lMatched) for i, iref in enumerate(lRef): if i not in lMatched: # print ('not mathced',i,iref) ltisRefsRunbErrbMiss.append( (lRef[i][1], int(lPageMapping[lRef[i][1]]), lRef[i][2], '', False, True)) cntMissed += 1 # else:print('machtg!',i,lRef[i]) ltisRefsRunbErrbMiss.sort(key=lambda x: x[0]) # for x in ltisRefsRunbErrbMiss: # print (x) return (cntOk, cntErr, cntMissed, ltisRefsRunbErrbMiss) def testCompareRecordFirstNameLastName(self, refdata, rundata, bVisual=False): if refdata[1] != rundata[1]: return False refall = refdata[2].lower() + refdata[3].lower() reflen = len(refdata[2]) + len(refdata[3]) runall = rundata[2].lower() + rundata[3].lower() runlen = len(rundata[2]) + len(rundata[3]) runall.replace('n̄', 'nn') runall.replace('m̄', 'mm') return matchLCS(0, (refall, reflen), (runall, runlen)) def testCompareRecordField(self, refdata, rundata, bVisual=False): # same page !! 
if refdata[1] != rundata[1]: return False, 0 if rundata[2] == []: return False, 0 if refdata[2] == []: return False, 0 runall = " ".join(rundata[2]).strip().lower() refall = " ".join(refdata[2]).strip().lower() return matchLCS(0, (refall, len(refall)), (runall, len(runall))) return res, val def testCompareFullRecord(self, refdata, rundata, bVisual=False): bOK = True for i, attr in enumerate(refdata): bOK = bOK and refdata[i] == rundata[i] return bOK ################ TEST ################## def createFakeData(self): """ for testing purpose <PAGE number="1" pagenum="1" nbrecords="10" years="0"> <RECORD lastname="Riesinger" firstname="Korona" role="Verstorbener" location="Neuhofen" occupation="" year="0" month="0" day="0"/> <RECORD lastname="Ringseisen" firstname="Georg" role="Verstorbener" location="Etzing" occupation="" year="0" month="0" day="0"/> <RECORD lastname="Nebel" firstname="Theresia" role="Verstorbener" location="Sandbach" occupation="" year="0" month="0" day="0"/> <RECORD lastname="Schlögl" firstname="Cäcilia" role="Verstorbener" location="Stampfing" occupation="" year="0" month="0" day="0"/> <RECORD lastname="Riedinger" firstname="Theresia" role="Verstorbener" location="Lapperding" occupation="Austragsbäuerin" year="0" month="0" day="0"/> <RECORD lastname="Wührer" firstname="Joseph" role="Verstorbener" location="Haizing" occupation="" year="0" month="0" day="0"/> <RECORD lastname="Wilmerdinger" firstname="Theresia" role="Verstorbener" location="Hausmanning" occupation="" year="0" month="0" day="0"/> <RECORD lastname="Ratzinger" firstname="Mathias" role="Verstorbener" location="Kafferding" occupation="Bauer" year="0" month="0" day="0"/> <RECORD lastname="Deixelberger" firstname="Joseph" role="Verstorbener" location="Gaishofen" occupation="Inwohner" year="0" month="0" day="0"/> <RECORD lastname="Beham" firstname="Martin" role="Verstorbener" location="Socking" occupation="Austragsbauer" year="0" month="0" day="0"/> </PAGE> """ self.evalData = 
libxml2.newDoc('1.0') root = libxml2.newNode('DOCUMENT') self.evalData.setRootElement(root) domp = libxml2.newNode('PAGE') domp.setProp('number', '182') domp.setProp('nbrecords', 'None') domp.setProp('years', '1876;1877') root.addChild(domp) record = libxml2.newNode('RECORD') domp.addChild(record) record.setProp('lastname', 'Riedinger') record.setProp('firstname', 'Theresia') print( etree.tostring(self.evalData, encoding='unicode', pretty_print=True)) return self.evalData def testRun(self, filename, outFile=None): """ testRun is responsible for running the component on this file and returning a string that reflects the result in a way that is understandable to a human and to a program. Nicely serialized Python data or XML is fine """ self.evalData = None doc = self.loadDom(filename) self.run(doc) # self.generateTestOutput() # self.createFakeData() if outFile: self.writeDom(doc) # return unicode return etree.tostring(self.evalData, encoding='unicode', pretty_print=True) def testCompare(self, srefData, srunData, bVisual=False): """ Our comparison is very simple: same or different. 
N We anyway return this in term of precision/recall If we want to compute the error differently, we must define out own testInit testRecord, testReport """ dicTestByTask = dict() # dicTestByTask['Names']= self.testFirstNameLastNameRecord(srefData, srunData,bVisual) dicTestByTask['lastname'] = self.testRecordField(['lastname'], [None], srefData, srunData, bVisual) dicTestByTask['firstname'] = self.testRecordField(['firstname'], [None], srefData, srunData, bVisual) dicTestByTask['occupation'] = self.testRecordField(['occupation'], [None], srefData, srunData, bVisual) dicTestByTask['location'] = self.testRecordField(['location'], [None], srefData, srunData, bVisual) dicTestByTask['deathreason'] = self.testRecordField(['deathreason'], [None], srefData, srunData, bVisual) dicTestByTask['names'] = self.testRecordField( ['firstname', 'lastname'], [None, None], srefData, srunData, bVisual) dicTestByTask['doktor'] = self.testRecordField(['doktor'], ['helfer_name'], srefData, srunData, bVisual) # dicTestByTask['namedeathlocationoccupation']= self.testRecordField(['firstname','lastname','deathreason','location','occupation'],[None,None,None,None,None],srefData, srunData,bVisual) dicTestByTask['situation'] = self.testRecordField(['situation'], ['family'], srefData, srunData, bVisual) # dicTestByTask['Year']= self.testYear(srefData, srunData,bVisual) return dicTestByTask def testRecordHtml(self, filename, data, nOk, nErr, nMiss): if nOk == None: assert nErr == None and nMiss == None, "INTERNAL ERROR" #we are reporting on multiple tasks!! lltisRefsRunbErrbMiss = data #this is a list of (taskName, nOk, nErr, nMiss, ltisRefsRunbErrbMiss) else: lltisRefsRunbErrbMiss = [(None, nOk, nErr, nMiss, data)] #let's produce an HTML report!! 
sCollecDir = os.path.dirname(self.testDirXML) sCollec = os.path.basename(sCollecDir) sFile = os.path.basename(self.getRefFileName(filename))[:-4] sViewBaseUrl = "http://" #+ sHttpHost fHtml = open(self.getHtmRunFileName(filename), "w", encoding='utf-8') sCss = """ <style type="text/css"> .OK { color: green; } .Error { color: red; } .Error\+Miss { color: darkred; } .Miss { color: orange; } </style> """ sRpt = self.makeHTMLReportHeader(sViewBaseUrl, "dla_pdf", sCss, sCollec + " - " + sFile, sCollec + " - " + sFile) fHtml.write(sRpt) #sRpt += " Doc Prec. Recall F1\t nOk\t nErr\t nMiss\tFilename\n" for taskName, nOk, nErr, nMiss, ltisRefsRunbErrbMiss in lltisRefsRunbErrbMiss: if taskName == None: taskName = "" sRpt = """ <hr/> <h2>%s</h2> <table> <tr align="left"> <th></th> <th>Page</th> <th>Reference</th> <th>Run</th> <th></th> </tr> """ % taskName fHtml.write(sRpt) ipnum_prev = -1 key_prev = -1 for (key, ipnum, sRef, sRun, bErr, bMiss) in ltisRefsRunbErrbMiss: if bErr and bMiss: sRptType = "Error+Miss" else: if bErr: sRptType = "Error" elif bMiss: sRptType = "Miss" else: sRptType = "OK" sPfFile = sCollec + "/" + sFile + "/" + "pf%06d" % ipnum srefenc = " ".join(x for x in sRef) srun = " ".join(x for x in sRun) # if ipnum > ipnum_prev: #a new page if key != key_prev: fHtml.write('<tr ><td>%s (%s)</td></tr>' % (key, ipnum)) fHtml.write( '<tr class="%s"><td>%s</td><td>%s</td><td>%s</td><td>%s</td></tr>\n' % ( sRptType, sRptType, "" #ipnum , srefenc #sRef , srun #sRun # , " - ".join(lsViews) )) else: #some more results for the same pafe fHtml.write( '<tr class="%s"><td>%s</td><td>%s</td><td>%s</td><td>%s</td><td>%s</td></tr>\n' % (sRptType, sRptType, "", srefenc, srun, "")) ipnum_prev = ipnum key_prev = key fHtml.write('</table>') fHtml.write('<p/>') fHtml.write( self.genHtmlTableReport(sCollec, None, [(filename, nOk, nErr, nMiss, None)])) fHtml.write('<hr>') fHtml.close() return
class columnDetection(Component.Component):
    """
    Detect table columns in a DS document.

    Per page, text elements are linked into horizontal/vertical clusters;
    optionally a cross-page sequence-mining step (tableColumnMiner) mines the
    recurring column positions. Can also emit a reference document (createRef)
    for evaluation.
    """
    # component metadata used by the Component base class
    usage = ""
    version = "v.01"
    description = "description: column Detection"

    #--- INIT -------------------------------------------------------------------------------------------------------------
    def __init__(self):
        """
        Always call first the Component constructor.
        """
        Component.Component.__init__(self, "columnDetection", self.usage, self.version, self.description)
        self.colname = None
        self.docid = None
        # convert PAGE (prima) input to DS format before processing
        self.do2DS = False
        # minimum support for the column-mining step
        self.THHighSupport = 0.75
        # for --test
        self.bCreateRef = False
        self.evalData = None

    def setParams(self, dParams):
        """
        Always call first the Component setParams
        Here, we set our internal attribute according to a possibly specified value (otherwise it stays at its default value)
        """
        Component.Component.setParams(self, dParams)
        if "docid" in dParams:
            self.docid = dParams["docid"]
        if "dsconv" in dParams:
            self.do2DS = dParams["dsconv"]
        # NOTE(review): unguarded access — assumes "mining" is always present in dParams
        self.bMining = dParams["mining"]
        if "createref" in dParams:
            self.bCreateRef = dParams["createref"]

    def createTable(self, page):
        """
        Create one table covering the bounding box of all text elements of the
        page; returns the new table, or None for an empty page.
        BB of all elements? todo: Ignore O!
        """
        x1, y1, x2, y2 = self.getBounbingBox(page)
        if x1 is None:
            return None
        myTable = XMLDSTABLEClass()
        myTable.setX(x1)
        myTable.setY(y1)
        myTable.setWidth(x2 - x1)
        myTable.setHeight(y2 - y1)
        page.addObject(myTable)
        return myTable

    def processPage(self, page, emptyTable):
        """
        Populate emptyTable with columns from an X-projection of the page's
        text elements. Only the leftmost quarter of each element is projected
        (crude shrinking to reduce sensitivity to horizontal overlap).
        """
        from util.XYcut import mergeSegments
        ### shrinking to be done: use center ?
        lCuts, _, _ = mergeSegments([(x.getX(), x.getX() + 0.25 * x.getWidth(), x) for x in page.getAllNamedObjects(XMLDSTEXTClass)], 0)
        for i, (x, _, cut) in enumerate(lCuts):
            ll = list(cut)
            ll.sort(key=lambda x: x.getY())  # NOTE(review): ll is not used afterwards
            # add column
            myCol = XMLDSTABLECOLUMNClass(i)
            myCol.setPage(page)
            myCol.setParent(emptyTable)
            emptyTable.addObject(myCol)
            myCol.setX(x)
            myCol.setY(emptyTable.getY())
            myCol.setHeight(emptyTable.getHeight())
            if i + 1 < len(lCuts):
                # column runs up to the next cut
                myCol.setWidth(lCuts[i + 1][0] - x)
            else:
                # last column: use the table right edge
                myCol.setWidth(emptyTable.getX2() - x)
            emptyTable.addColumn(myCol)
            if not self.bMining:
                myCol.tagMe(ds_xml.sCOL)

    def getBounbingBox(self, page):
        """
        Return the (x1, y1, x2, y2) bounding box of all text elements of the
        page, or (None, None, None, None) for an empty page.
        (name typo kept as-is: callers use it)
        """
        lElts = page.getAllNamedObjects(XMLDSTEXTClass)
        if lElts == []:
            return None, None, None, None
        lX1, lX2, lY1, lY2 = zip(*[(x.getX(), x.getX2(), x.getY(), x.getY2()) for x in lElts])
        return min(lX1), min(lY1), max(lX2), max(lY2)

    def findColumnsInDoc(self, lPages):
        """
        find columns for each table in ODoc
        (currently only computes the hor./vert. clusters per page;
        the table-based path is disabled)
        """
        for page in lPages:
            traceln("page: %d" % page.getNumber())
            # NOTE(review): the returned clusters are currently unused
            lch, lcv = self.mergeHorVerClusters(page)

    def createContourFromListOfElements(self, lElts):
        """
        create a polyline from a list of elements
        input : list of elements
        output: Polygon object (union), or None if the union fails
        Uses the 'points' attribute when present, else the element bounding box.
        """
        from shapely.geometry import Polygon as pp
        from shapely.ops import cascaded_union  # NOTE(review): deprecated; unary_union in Shapely >= 1.8
        lP = []
        for elt in lElts:
            sPoints = elt.getAttribute('points')
            if sPoints is None:
                lP.append(pp([(elt.getX(), elt.getY()), (elt.getX(), elt.getY2()), (elt.getX2(), elt.getY2()), (elt.getX2(), elt.getY())]))
            else:
                # 'points' is a flat comma-separated list: x1,y1,x2,y2,...
                lP.append(pp([(float(x), float(y)) for x, y in zip(*[iter(sPoints.split(','))] * 2)]))
        try:
            ss = cascaded_union(lP)
        except ValueError:
            print(lElts, lP)
            return None
        return ss  # list(ss.convex_hull.exterior.coords)

    def mergeHorVerClusters(self, page):
        """
        build Horizontal and vertical clusters

        Each text element gets lright/lleft/ltop/lbottom neighbor lists; the
        horizontal neighbors are found by rotating the elements 90 degrees and
        reusing the vertical-neighbor search. Chains of right (resp. bottom)
        neighbors are then walked to build row (resp. column) clusters, each
        paired with its union contour.
        Returns (lHClusters, lVClusters) as lists of (elements, contour).
        NOTE(review): element heights/widths are shrunk in place ("dirty!")
        and not restored.
        """
        from util import TwoDNeighbourhood as TwoDRel
        lTexts = page.getAllNamedObjects(XMLDSTEXTClass)
        for e in lTexts:
            e.lright = []
            e.lleft = []
            e.ltop = []
            e.lbottom = []
        lVEdge = TwoDRel.findVerticalNeighborEdges(lTexts)
        for a, b in lVEdge:
            a.lbottom.append(b)
            b.ltop.append(a)
        for elt in lTexts:
            # dirty!
            elt.setHeight(max(5, elt.getHeight() - 3))
            elt.setWidth(max(5, elt.getWidth() - 3))
            TwoDRel.rotateMinus90degOLD(elt)
        # after rotation, "vertical" neighbors are the horizontal ones
        lHEdge = TwoDRel.findVerticalNeighborEdges(lTexts)
        for elt in lTexts:
            TwoDRel.rotatePlus90degOLD(elt)
        for a, b in lHEdge:
            a.lright.append(b)
            b.lleft.append(a)
        for elt in lTexts:
            elt.lleft.sort(key=lambda x: x.getX(), reverse=True)
            # keep at most one right neighbor (best Y-overlap)
            if len(elt.lright) > 1:
                elt.lright = []
            elt.lright.sort(key=lambda x: elt.signedRatioOverlapY(x), reverse=True)
            elt.ltop.sort(key=lambda x: x.getY())
            # keep at most one bottom neighbor (best X-overlap)
            if len(elt.lbottom) > 1:
                elt.lbottom = []
            elt.lbottom.sort(key=lambda x: elt.signedRatioOverlapX(x), reverse=True)
        lHClusters = []
        # Horizontal: walk right-neighbor chains, leftmost element first
        lTexts.sort(key=lambda x: x.getX())
        lcovered = []
        for text in lTexts:
            if text not in lcovered:
                lcovered.append(text)
                lcurRow = [text]
                curText = text
                while curText is not None:
                    try:
                        nextT = curText.lright[0]
                        if nextT not in lcovered:
                            lcurRow.append(nextT)
                            lcovered.append(nextT)
                        # advance even through covered elements; chain ends on IndexError
                        curText = nextT
                    except IndexError:
                        curText = None
                if len(lcurRow) > 0:
                    # create a contour for visualization
                    contour = self.createContourFromListOfElements(lcurRow)
                    lHClusters.append((lcurRow, contour))
        # Vertical: walk bottom-neighbor chains, topmost element first
        lVClusters = []
        lTexts.sort(key=lambda x: x.getY())
        lcovered = []
        for text in lTexts:
            if text not in lcovered:
                lcovered.append(text)
                lcurCol = [text]
                curText = text
                while curText is not None:
                    try:
                        nextT = curText.lbottom[0]
                        if nextT not in lcovered and len(nextT.lbottom) == 1:
                            lcurCol.append(nextT)
                            lcovered.append(nextT)
                        curText = nextT
                    except IndexError:
                        curText = None
                if len(lcurCol) > 0:
                    contour = self.createContourFromListOfElements(lcurCol)
                    lVClusters.append((lcurCol, contour))
                    if contour:
                        # materialize the cluster as a 'cc' object on the page
                        r = XMLDSObjectClass()
                        r.setName('cc')
                        r.setParent(page)
                        x1, y1, x2, y2 = contour.bounds
                        r.setXYHW(x1, y1, y2 - y1, x2 - x1)
                        page.addObject(r)
        print(page.getAllNamedObjects('cc'))
        return lHClusters, lVClusters

    def documentMining(self, lPages):
        """
        Mine recurring column positions across pages.
        need to clean up REGION nodes
        """
        seqMiner = tableColumnMiner()
        seqMiner.columnMining(lPages, self.THHighSupport, sTag=ds_xml.sCOL)

    def checkInputFormat(self, lPages):
        """
        delete regions : copy regions elements at page object
        unlink subnodes
        """
        for page in lPages:
            lRegions = page.getAllNamedObjects("REGION")
            lElts = []
            [lElts.extend(x.getObjects()) for x in lRegions]
            [page.addObject(x, bDom=True) for x in lElts]
            [page.removeObject(x, bDom=True) for x in lRegions]

    def run(self, doc):
        """
        load dom and find rows
        Returns the ref document in createref mode, else the processed DS doc.
        """
        # convert to DS if needed
        if self.bCreateRef:
            if self.do2DS:
                dsconv = primaAnalysis()
                doc = dsconv.convert2DS(doc, self.docid)
            refdoc = self.createRef(doc)
            return refdoc
            # NOTE(review): dead code below (after return) — single-ref-per-page variant
            refdoc = self.createRefPerPage(doc)
            return None
        if self.do2DS:
            dsconv = primaAnalysis()
            self.doc = dsconv.convert2DS(doc, self.docid)
        else:
            self.doc = doc
        self.ODoc = XMLDSDocument()
        self.ODoc.loadFromDom(self.doc, listPages=range(self.firstPage, self.lastPage + 1))
        self.lPages = self.ODoc.getPages()
        self.checkInputFormat(self.lPages)
        self.findColumnsInDoc(self.lPages)
        if self.bMining:
            self.documentMining(self.lPages)
        # NOTE(review): unreachable — bCreateRef was already handled (early return) above
        if self.bCreateRef:
            refdoc = self.createRef(self.doc)
            return refdoc
        return self.doc

    ################ TEST ##################
    def testRun(self, filename, outFile=None):
        """
        evaluate using ABP new table dataset with tablecell
        """
        self.evalData = None
        doc = self.loadDom(filename)
        doc = self.run(doc)
        self.evalData = self.createRef(doc)
        if outFile:
            self.writeDom(doc)
        return etree.tostring(self.evalData, encoding='unicode', pretty_print=True)

    def overlapX(self, zone):
        # True iff self and zone overlap on the X axis (touching counts)
        [a1, a2] = self.getX(), self.getX() + self.getWidth()
        [b1, b2] = zone.getX(), zone.getX() + zone.getWidth()
        return min(a2, b2) >= max(a1, b1)

    def overlapY(self, zone):
        # True iff self and zone overlap on the Y axis (touching counts)
        [a1, a2] = self.getY(), self.getY() + self.getHeight()
        [b1, b2] = zone.getY(), zone.getY() + zone.getHeight()
        return min(a2, b2) >= max(a1, b1)

    def signedRatioOverlap(self, z1, z2):
        """
        overlap self and zone
        return surface of z1 covered by the intersection with z2 (in [0, 1])
        NOTE(review): despite the names, h holds the X extent and w the Y extent
        of the intersection; the product (area) is still correct.
        """
        [x1, y1, h1, w1] = z1.getX(), z1.getY(), z1.getHeight(), z1.getWidth()
        [x2, y2, h2, w2] = z2.getX(), z2.getY(), z2.getHeight(), z2.getWidth()
        fOverlap = 0.0
        if self.overlapX(z2) and self.overlapY(z2):
            [x11, y11, x12, y12] = [x1, y1, x1 + w1, y1 + h1]
            [x21, y21, x22, y22] = [x2, y2, x2 + w2, y2 + h2]
            s1 = w1 * h1
            # possible ?
            if s1 == 0:
                s1 = 1.0
            # intersection
            nx1 = max(x11, x21)
            nx2 = min(x12, x22)
            ny1 = max(y11, y21)
            ny2 = min(y12, y22)
            h = abs(nx2 - nx1)
            w = abs(ny2 - ny1)
            inter = h * w
            if inter > 0:
                fOverlap = inter / s1
            else:
                # if overX and Y this is not possible !
                fOverlap = 0.0
        return fOverlap

    def findSignificantOverlap(self, TOverlap, ref, run):
        """
        True iff ref and run are on the same page and their row overlap ratio
        reaches TOverlap.
        """
        pref, rowref = ref
        prun, rowrun = run
        if pref != prun:
            return False
        return rowref.ratioOverlap(rowrun) >= TOverlap

    def testCPOUM(self, TOverlap, srefData, srunData, bVisual=False):
        """
        TOverlap: Threshold used for comparing two surfaces
        Correct Detections: under and over segmentation?
        Greedy matching of run rows against reference rows (first sufficient
        overlap wins); returns (cntOk, cntErr, cntMissed, details).
        """
        cntOk = cntErr = cntMissed = 0
        RefData = etree.XML(srefData.strip("\n").encode('utf-8'))
        RunData = etree.XML(srunData.strip("\n").encode('utf-8'))
        lRun = []
        if RunData:
            lpages = RunData.xpath('//%s' % ('PAGE'))
            for page in lpages:
                pnum = page.get('number')
                # record level!
                lRows = page.xpath(".//%s" % ("ROW"))
                lORows = map(lambda x: XMLDSTABLECOLUMNClass(0, x), lRows)
                for row in lORows:
                    row.fromDom(row._domNode)
                    row.setIndex(row.getAttribute('id'))
                    lRun.append((pnum, row))
        print(lRun)
        lRef = []
        lPages = RefData.xpath('//%s' % ('PAGE'))
        for page in lPages:
            pnum = page.get('number')
            lRows = page.xpath(".//%s" % ("ROW"))
            lORows = map(lambda x: XMLDSTABLECOLUMNClass(0, x), lRows)
            for row in lORows:
                row.fromDom(row._domNode)
                row.setIndex(row.getAttribute('id'))
                lRef.append((pnum, row))
        refLen = len(lRef)
        ltisRefsRunbErrbMiss = list()
        lRefCovered = []
        for i in range(0, len(lRun)):
            iRef = 0
            bFound = False
            bErr, bMiss = False, False
            runElt = lRun[i]
            # first not-yet-covered ref row with significant overlap wins
            while not bFound and iRef <= refLen - 1:
                curRef = lRef[iRef]
                if runElt and curRef not in lRefCovered and self.findSignificantOverlap(TOverlap, runElt, curRef):
                    bFound = True
                    lRefCovered.append(curRef)
                iRef += 1
            if bFound:
                if bVisual:
                    print("FOUND:", runElt, ' -- ', lRefCovered[-1])
                cntOk += 1
            else:
                curRef = ''
                cntErr += 1
                bErr = True
                if bVisual:
                    print("ERROR:", runElt)
            if bFound or bErr:
                ltisRefsRunbErrbMiss.append((int(runElt[0]), curRef, runElt, bErr, bMiss))
        # reference rows never covered: missed
        for i, curRef in enumerate(lRef):
            if curRef not in lRefCovered:
                if bVisual:
                    print("MISSED:", curRef)
                ltisRefsRunbErrbMiss.append((int(curRef[0]), curRef, '', False, True))
                cntMissed += 1
        ltisRefsRunbErrbMiss.sort(key=lambda xyztu: xyztu[0])
        return (cntOk, cntErr, cntMissed, ltisRefsRunbErrbMiss)

    def testCompare(self, srefData, srunData, bVisual=False):
        """
        as in Shahad et al, DAS 2010
        Correct Detections
        Partial Detections
        Over-Segmented
        Under-Segmented
        Missed
        False Positive
        """
        dicTestByTask = dict()
        dicTestByTask['T50'] = self.testCPOUM(0.50, srefData, srunData, bVisual)
        return dicTestByTask

    def createColumnsWithCuts(self, lXCuts, table, tableNode, bTagDoc=False):
        """
        create column dom node(s) under tableNode from sorted X cut positions.
        NOTE(review): relies on the loop variables (index, prevCut) after the
        loop — raises if lXCuts is empty; also the first 'x' attributes use
        str() while the last uses "{:.2f}" formatting (inconsistent output).
        """
        prevCut = None
        lXCuts.sort()
        for index, cut in enumerate(lXCuts):
            # first cut corresponds to the table edge: no column before it
            if prevCut is not None:
                colNode = etree.Element("COL")
                tableNode.append(colNode)
                colNode.set('x', str(prevCut))
                colNode.set('width', "{:.2f}".format(cut - prevCut))
                colNode.set('y', str(table.getY()))
                colNode.set('height', str(table.getHeight()))
                colNode.set('id', str(index - 1))
            prevCut = cut
        # last column: closes at the table right edge
        cut = table.getX2()
        colNode = etree.Element("COL")
        tableNode.append(colNode)
        colNode.set('x', "{:.2f}".format(prevCut))
        colNode.set('width', "{:.2f}".format(cut - prevCut))
        colNode.set('y', str(table.getY()))
        colNode.set('height', str(table.getHeight()))
        colNode.set('id', str(index))

    def createRef(self, doc):
        """
        create a ref file from the xml one
        For each table, columns are derived from the leftmost X of the cells of
        each column index.
        """
        self.ODoc = XMLDSDocument()
        self.ODoc.loadFromDom(doc, listPages=range(self.firstPage, self.lastPage + 1))
        root = etree.Element("DOCUMENT")
        refdoc = etree.ElementTree(root)
        for page in self.ODoc.getPages():
            # e.g. imageFilename="..\col\30275\S_Freyung_021_0001.jpg" width="977.52" height="780.0"
            pageNode = etree.Element('PAGE')
            pageNode.set("number", page.getAttribute('number'))
            pageNode.set("pagekey", os.path.basename(page.getAttribute('imageFilename')))
            pageNode.set("width", page.getAttribute('width'))
            pageNode.set("height", page.getAttribute('height'))
            root.append(pageNode)
            lTables = page.getAllNamedObjects(XMLDSTABLEClass)
            for table in lTables:
                dCol = {}
                tableNode = etree.Element('TABLE')
                tableNode.set("x", table.getAttribute('x'))
                tableNode.set("y", table.getAttribute('y'))
                tableNode.set("width", table.getAttribute('width'))
                tableNode.set("height", table.getAttribute('height'))
                pageNode.append(tableNode)
                # group cells by their column index
                for cell in table.getAllNamedObjects(XMLDSTABLECELLClass):
                    try:
                        dCol[int(cell.getAttribute("col"))].append(cell)
                    except KeyError:
                        dCol[int(cell.getAttribute("col"))] = [cell]
                lXcuts = []
                for colid in sorted(dCol.keys()):
                    lXcuts.append(min(list(map(lambda x: x.getX(), dCol[colid]))))
                self.createColumnsWithCuts(lXcuts, table, tableNode)
        return refdoc
class tableRowMiner(Component.Component):
    """
    tableRowMiner class: a component to mine tables to find out horizontal
    cuts (hence rows).
    """
    # DEFINE the version, usage and description of this particular component
    usage = ""
    version = "v.01"
    description = "description: table row miner "

    #--- INIT -------------------------------------------------------------------------------------------------------------
    def __init__(self):
        """
        Always call first the Component constructor.
        """
        Component.Component.__init__(self, "tableRowMiner", self.usage,
                                     self.version, self.description)

        # TH for comparing numerical features
        # NOTE(review): may need to be a function of leading: when leading is
        # small, THNUMERICAL small, line height reduced as well?
        self.THNUMERICAL = 25
        # used for evaluation
        self.THCOMP = 10
        self.evalData = None
        # minimal unigram support ratio for a cut to be kept
        self.THHighSupport = 0.33
        self.bManual = False

    def setParams(self, dParams):
        """
        Always call first the Component setParams.
        Here we set our internal attributes according to possibly specified
        values (otherwise they stay at their default value).
        """
        Component.Component.setParams(self, dParams)
        if "pattern" in dParams:
            # SECURITY: eval() on an externally supplied string — only safe
            # if the parameter source is trusted (command line / config).
            self.manualPattern = eval(dParams["pattern"])
            self.bManual = True
        if "thhighsupport" in dParams:
            # given as a percentage on the command line
            self.THHighSupport = dParams["thhighsupport"] * 0.01

    def testHighSupport(self, sequences, th):
        """
        Compute unigram support over the itemsets and return the items whose
        support ratio is >= th.
        """
        # from mssp
        from collections import Counter
        import itertools

        sequence_count = len(sequences)

        flattened_sequences = [
            list(set(itertools.chain(*sequence))) for sequence in sequences
        ]
        support_counts = dict(
            Counter(item for flattened_sequence in flattened_sequences
                    for item in flattened_sequence))
        actual_supports = {
            item: support_counts.get(item) / float(sequence_count)
            for item in support_counts.keys()
        }
        lOneSupport = [k for k, v in actual_supports.items() if v >= th]
        return lOneSupport

    def createFeatureFromValue(self, elt, value, name):
        """
        Build a numerical featureObject named `name` with `value`, attached
        to element `elt`, using the component's numerical threshold.
        """
        feature = featureObject()
        feature.setName(name)
        feature.setTH(self.THNUMERICAL)
        feature.addNode(elt)
        feature.setObjectName(elt)
        feature.setValue(float(value))
        feature.setType(featureObject.NUMERICAL)
        return feature

    def columnMining(self, table, thnum=None, th=None, predefinedCuts=None):
        """
        For a table: itemset=column, item=cell(Y) (+ separators).
        Test whether a same row grid holds for all pages: rows at fixed
        positions.  PREPRO: get the Y of overlapping separators for each col.

        thnum: optional override of self.THNUMERICAL.
        th: minimal support ratio; defaults to self.THHighSupport.
        predefinedCuts: optional list of Y values injected as features.
        Returns the list of high-support Y features, sorted by value.
        """
        # FIX: keyword defaults added so callers may omit thnum/th
        # (run() calls columnMining(table)); avoids a mutable [] default too.
        if th is None:
            th = self.THHighSupport
        if predefinedCuts is None:
            predefinedCuts = []
        if thnum is not None:
            self.THNUMERICAL = thnum

        lElts = table.getColumns()  # getAllNamedObjects(XMLDSTABLECOLUMNClass)
        for elt in lElts:
            # how to add separator?
            # for c in elt.getCells(): c.setHeight()
            elt.resetFeatures()
            elt.setFeatureFunction(elt.getSetOfListedAttributes,
                                   self.THNUMERICAL,
                                   lFeatureList=['y'],
                                   myLevel=XMLDSTABLECELLClass)
            elt.computeSetofFeatures()
            # inject the predefined cuts as additional 'y' features
            for prevFea in predefinedCuts:
                f = self.createFeatureFromValue(elt, round(prevFea), 'y')
                elt.addFeature(f)

        seqGen = sequenceMiner()
        seqGen.bDebug = False
        seqGen.setMaxSequenceLength(1)
        # seqGen.setSDC(0.7)  # related to noise level AND STRUCTURES (if many columns)
        seqGen.setObjectLevel(XMLDSTABLECELLClass)

        for elt in lElts:
            elt.lFeatureForParsing = elt.getSetofFeatures()
            elt.lFeatureForParsing.sort(key=lambda x: x.getValue())

        # FIX: this line was commented out, leaving lSortedFeatures undefined
        # below (NameError in generateMSPSData).
        lSortedFeatures = seqGen.featureGeneration(lElts, 2)

        lmaxSequence = seqGen.generateItemsets(lElts)
        lSeq, _ = seqGen.generateMSPSData(lmaxSequence,
                                          lSortedFeatures,
                                          mis=0.5)
        lOneSupport = self.testHighSupport(lSeq, th)
        lOneSupport.sort(key=lambda x: x.getValue())
        return lOneSupport
# return def mainMining(self, lPages): """ mine with incremental length """ import util.TwoDNeighbourhood as TwoDRel lLElts = [[] for i in range(0, len(lPages))] for i, page in enumerate(lPages): lElts = page.getAllNamedObjects( XMLDSTABLECELLClass ) #+page.getAllNamedObjects(XMLDSGRAPHLINEClass) for e in lElts: e.lnext = [] ## filter elements!!! lElts = filter(lambda x: min(x.getHeight(), x.getWidth()) > 10, lElts) lElts = filter(lambda x: x.getHeight() > 10, lElts) lElts.sort(key=lambda x: x.getY()) lLElts[i] = lElts for elt in lElts: TwoDRel.rotateMinus90deg( elt ) #rotate by 90 degrees and look for vertical neighbors :-) lHEdge = TwoDRel.findVerticalNeighborEdges(lElts) for elt in lElts: TwoDRel.rotatePlus90deg(elt) # lVEdge = TwoDRel.findVerticalNeighborEdges(lElts) for a, b in lHEdge: a.lnext.append(b) for i, page, in enumerate(lPages): lElts = lLElts[i] for elt in lElts: elt.setFeatureFunction(elt.getSetOfListedAttributes, self.THNUMERICAL, lFeatureList=['y'], myLevel=XMLDSTABLECELLClass) elt.computeSetofFeatures() # print elt.getSetofFeatures() seqGen = sequenceMiner() seqGen.setMaxSequenceLength(1) seqGen.setSDC( 0.7 ) # related to noise level AND STRUCTURES (if many columns) _ = seqGen.featureGeneration( lElts, 2) # more at token level, but at text level: freq=2 seqGen.setObjectLevel(XMLDSTABLECELLClass) # lKleendPlus = self.getKleenePlusFeatures(lElts) # print lKleendPlus for elt in lElts: elt.lFeatureForParsing = elt.getSetofFeatures() # print elt, elt.lFeatureForParsing # lTerminalTemplates = [] lCurList = lElts[:] lCurList, lTerminalTemplates = self.mineLineFeature( seqGen, lCurList, lTerminalTemplates) # print lTerminalTemplates for mytemplate in lTerminalTemplates: page.addVerticalTemplate(mytemplate) page.addVSeparator(mytemplate, mytemplate.getPattern()) del seqGen # self.tagAsRegion(lPages) def mineLineFeature(self, seqGen, lCurList, lTerminalTemplates): """ get a set of lines and mine them """ seqGen.setMinSequenceLength(1) 
seqGen.setMaxSequenceLength(1) print('***' * 20) seqGen.bDebug = False for elt in lCurList: if elt.getSetofFeatures() is None: elt.resetFeatures() elt.setFeatureFunction(elt.getSetOfListedAttributes, self.THNUMERICAL, lFeatureList=['virtual'], myLevel=XMLDSTABLECELLClass) elt.computeSetofFeatures() elt.lFeatureForParsing = elt.getSetofFeatures() else: elt.setSequenceOfFeatures(elt.lFeatureForParsing) # print elt, elt.getSetofFeatures() lSortedFeatures = seqGen.featureGeneration(lCurList, 2) for cf in lSortedFeatures: cf.setWeight(len((cf.getNodes()))) lmaxSequence = seqGen.generateItemsets(lCurList) seqGen.bDebug = False lSeq, _ = seqGen.generateMSPSData(lmaxSequence, lSortedFeatures + lTerminalTemplates, mis=0.01) lPatterns = seqGen.miningSequencePrefixScan(lSeq) # lPatterns = seqGen.beginMiningSequences(lSeq,lSortedFeatures,lMIS) if lPatterns is None: return lCurList, lTerminalTemplates lPatterns.sort(key=lambda xy: xy[0], reverse=True) print("List of patterns and their support:") for p, support in lPatterns: if support >= 1: print(p, support) seqGen.THRULES = 0.95 lSeqRules = seqGen.generateSequentialRules(lPatterns) " here store features which are redundant and consider only the core feature" dTemplatesCnd = self.analyzeListOfPatterns(lPatterns, {}) lFullTemplates, lTerminalTemplates, tranprob = seqGen.testTreeKleeneageTemplates( dTemplatesCnd, lCurList, iterMax=40) return lCurList, lTerminalTemplates def selectFinalTemplates(self, lTemplates, transProb, lElts): """ apply viterbi to select best sequence of templates """ import spm.viterbi as viterbi if lTemplates == []: return None def buildObs(lTemplates, lElts): """ build observation prob """ N = len(lTemplates) + 1 obs = np.zeros((N, len(lElts)), dtype=np.float16) + 10e-3 for i, temp in enumerate(lTemplates): for j, elt in enumerate(lElts): # how to dela with virtual nodes try: _, _, score = temp.registration(elt) except: score = 1 if score == -1: score = 0.0 obs[i, j] = score if np.isinf(obs[i, j]): obs[i, 
j] = 64000 if np.isnan(obs[i, j]): obs[i, j] = 0.0 # print i,j,elt,elt.lX, temp,score #add no-template:-1 return obs / np.amax(obs) N = len(lTemplates) + 1 initialProb = np.ones(N) initialProb = np.reshape(initialProb, (N, 1)) obs = buildObs(lTemplates, lElts) np.set_printoptions(precision=3, linewidth=1000) # print "transProb" # print transProb # print # print obs d = viterbi.Decoder(initialProb, transProb, obs) states, score = d.Decode(np.arange(len(lElts))) # add empty template (last one in state) lTemplates.append(None) print(states, score) #assign to each elt the template assigned by viterbi for i, elt, in enumerate(lElts): # try: print elt,elt.lX, lTemplates[states[i]] # except: print elt, elt.lX, 'no template' mytemplate = lTemplates[states[i]] elt.resetTemplate() if mytemplate is not None: elt.addTemplate(mytemplate) try: registeredPoints, lMissing, score = mytemplate.registration( elt) except: registeredPoints = None if registeredPoints: # print registeredPoints, lMissing , score if lMissing != []: registeredPoints.extend(zip(lMissing, lMissing)) registeredPoints.sort(key=lambda xy: xy[0].getValue()) lcuts = map(lambda refcut: refcut[1], registeredPoints) ## store features for the final parsing!!! # print elt, lcuts # elt.addVSeparator(mytemplate,lcuts) # return the new list with kleenePlus elts for next iteration ## reparse ?? 
YES using the featureSet given by viterbi -> create an objectClass per kleenePlus element: objects: sub tree # print lTemplates[0] # self.parseWithTemplate(lTemplates[0], lElts) # elt = template return score def getKleenePlusFeatures(self, lElts): """ select KleenePlus elements based on .next (only possible for unigrams) """ dFreqFeatures = {} dKleenePlusFeatures = {} lKleenePlus = [] for elt in lElts: for fea in elt.getSetofFeatures(): try: dFreqFeatures[fea] += 1 except KeyError: dFreqFeatures[fea] = 1 for nextE in elt.lnext: if fea in nextE.getSetofFeatures(): try: dKleenePlusFeatures[fea].append((elt, nextE)) except KeyError: dKleenePlusFeatures[fea] = [(elt, nextE)] for fea in dFreqFeatures: try: dKleenePlusFeatures[fea] if len(dKleenePlusFeatures[fea]) >= 0.5 * dFreqFeatures[fea]: lKleenePlus.append(fea) except: pass return lKleenePlus def computePatternScore(self, pattern): """ consider the frequency of the pattern and the weights of the features """ fScore = 0 #terminal if not isinstance(pattern, list): fScore += pattern.getCanonical().getWeight() else: for child in pattern: fScore += self.computePatternScore(child) # print 'score:',pattern ,fScore return fScore def analyzeListOfPatterns( self, lPatterns, dCA, ): """ select patterns with no ancestor other criteria ? if many with similar frequency: sort using computePatternScore? 
""" # reorder lPatterns considering feature weights and # of elements (for equally frequent patterns) # lPatterns.sort(key=lambda (x,y):self.computePatternScore(x),reverse=True) # for x,y in lPatterns: # print x,y,self.computePatternScore(x) dTemplatesTypes = {} for pattern, support in filter(lambda xy: xy[1] > 1, lPatterns): try: dCA[str(pattern)] bSkip = True except KeyError: bSkip = False # if step > 0 and len(pattern) == 1: # bSkip=True if not bSkip: template = treeTemplateClass() template.setPattern(pattern) template.buildTreeFromPattern(pattern) template.setType('lineTemplate') try: dTemplatesTypes[template.__class__.__name__].append( (pattern, support, template)) except KeyError: dTemplatesTypes[template.__class__.__name__] = [ (pattern, support, template) ] return dTemplatesTypes def processWithTemplate(self, lPattern, lPages): """ process sequence of pqges with given pattern create table """ lfPattern = [] for itemset in lPattern: fItemset = [] for item in itemset: f = featureObject() f.setName("x") f.setType(featureObject.NUMERICAL) f.setValue(item) f.setTH(self.THNUMERICAL) fItemset.append(f) lfPattern.append(fItemset) pattern = lfPattern print(pattern) ### in prodf: mytemplate given by page.getVerticalTemplates() mytemplate = treeTemplateClass() mytemplate.setPattern(pattern[0]) # registration provides best matching ## from registration matched: select the final cuts for i, p in enumerate(lPages): p.lFeatureForParsing = p.lf_XCut sys.stdout.flush() registeredPoints1, lMissing1, score1 = mytemplate.registration(p) if score1 >= 0: lfinalCuts = map( lambda xy: xy[1], filter(lambda xy: xy[0] != 'EMPTY', registeredPoints1)) print(p, 'final1:', lfinalCuts, lMissing1) p.addVerticalTemplate(mytemplate) p.addVSeparator(mytemplate, lfinalCuts) else: print('NO REGISTRATION') self.tagAsRegion(lPages) return 1 def tagAsRegion(self, lPages): """ create regions """ for page in lPages: if page.getNode(): # if several template ??? 
for template in page.getVerticalTemplates(): page.getdVSeparator(template).sort( key=lambda x: x.getValue()) print(page.getdVSeparator(template)) page.getNode().setProp('template', str(page.getdVSeparator(template))) XMinus = 1 prevcut = 10 # print page, page.getdVSeparator(template) for cut in page.getdVSeparator(template): cellNode = etree.Element('REGION') cellNode.set("y", str(prevcut)) ## it is better to avoid YMinus = 10 cellNode.set("x", str(XMinus)) cellNode.set("width", str(page.getWidth() - 2 * YMinus)) cellNode.set("height", str(cut.getValue() - prevcut)) page.getNode().addChild(cellNode) prevcut = cut.getValue() def generateTestOutput(self, lPages): """ create a run XML file """ root = etree.Element('DOCUMENT') self.evalData = etree.ElementTree(root) for page in lPages: domp = etree.Element('PAGE') domp.set('number', page.getAttribute('number')) root.addChild(domp) for sep in page.lVSeparator: print(page.lVSeparator) domsep = etree.Element('SeparatorRegion') domp.append(domsep) # domsep.setProp('x', str(sep[0].getValue())) domsep.set('x', str(sep[0])) #--- RUN --------------------------------------------------------------------------------------------------------------- def run(self, doc): """ take a set of line in a page and mine it """ self.doc = doc # use the lite version self.ODoc = XMLDSDocument() self.ODoc.loadFromDom(self.doc, listPages=range(self.firstPage, self.lastPage + 1)) self.lPages = self.ODoc.getPages() if self.bManual: self.processWithTemplate(self.manualPattern, self.lPages) else: # self.mainMining(self.lPages) for page in self.lPages: print("page") lTables = page.getAllNamedObjects(XMLDSTABLEClass) for table in lTables: self.columnMining(table) self.addTagProcessToMetadata(self.doc) return self.doc