def TrainUsingCRF(xmls, preprocessor, trainer,
                  xmlloc="../TrainingData/xmls/cs/",
                  annotatedxmlloc="../TrainingData/annotated/"):
    """Train a CRF on annotated XML documents and save its weights to disk.

    For every name in ``xmls`` a font dictionary is built from the parsed
    file ``xmlloc + name + ".xml"`` and the annotation is read from
    ``annotatedxmlloc + name + "_annotated"``.  The resulting
    ``[annotatedxml, fontdict]`` pairs are handed to ``CRF.domaintrain`` and
    the model's ``trainedweights`` are written, one ``str(weight)`` per
    line, to the file ``TrainedWeightsCRF`` in the current working directory.

    Args:
        xmls: iterable of document base names (no directory, no extension).
        preprocessor: object providing ``getFontDictionary(parsed_xml)``.
        trainer: object providing ``readAnnotatedXml(path)``.
        xmlloc: path prefix of the raw XML files.  Defaults to the location
            that used to be hard-coded here.
        annotatedxmlloc: path prefix of the annotated files.  Defaults to
            the location that used to be hard-coded here.

    Returns:
        None.
    """
    CRFImpl = CRF()
    annotatedxmllist = []
    for xmlname in xmls:
        fontdict = preprocessor.getFontDictionary(
            ET.parse(xmlloc + xmlname + ".xml"))
        # Original note on the data layout:
        # list(pages), pages -> list(cols), col -> list(<Sparse/NonSparse, tag>)
        annotatedxml = trainer.readAnnotatedXml(
            annotatedxmlloc + xmlname + "_annotated")
        annotatedxmllist.append([annotatedxml, fontdict])

    CRFImpl.domaintrain(annotatedxmllist)

    # ``with`` guarantees the weights file is closed even if a write fails.
    with open("TrainedWeightsCRF", 'w') as f:
        for weight in CRFImpl.trainedweights:
            f.write(str(weight) + "\n")
def TestUsingCRF(predictxmlname, location):
    """Run the stored CRF model over one XML document and print its tables.

    The document at ``location + predictxmlname + ".xml"`` is parsed for its
    font dictionary and preprocessed by the module-level ``preprocessor``.
    Every column holding at least two lines has each line tagged with its
    index, is classified by the model, and every classified line is printed.
    The module-level ``postprocessor`` then extracts tables from the
    prediction; once all pages are processed, each collected table is
    printed row by row after a separator line.

    Args:
        predictxmlname: base name of the XML file (without ``.xml``).
        location: path prefix that is prepended to ``predictxmlname``.

    Returns:
        None; all results are printed.
    """
    model = getModelwithTrainedWeights()
    xmlpath = location + predictxmlname + ".xml"
    fontdict = preprocessor.getFontDictionary(ET.parse(xmlpath))
    # Original note on the data layout:
    # list(pages), pages -> list(cols), col -> list(<Sparse/NonSparse, tag>)
    pages = preprocessor.preprocessxml(xmlpath)

    collected = []
    for page in pages:
        for col in page:
            if len(col) < 2:
                continue

            # Tag every line of the column with its position in the column.
            for lineno, line in enumerate(col):
                line.append(lineno)

            predicted = model.predict(col, fontdict)
            for r in predicted:
                print(r[1].text + " *** Line no *** " + str(r[2]) + " -- " + str(r[0]))

            tables = postprocessor.findTables(predicted)
            if len(tables) != 0:
                collected.extend(tables)

    for table in collected:
        print("=============================================")
        for row in table:
            print(row[1].text + " " + str(row[0]))
def TestUsingCRF(predictxmlname, location, TDsvm=None):
    """Detect tables in one XML document with the stored CRF model.

    The document at ``location + predictxmlname + ".xml"`` is parsed for its
    font dictionary and preprocessed by the module-level ``preprocessor``.
    For each column, lines without text are dropped, the remaining lines
    are tagged with their index and classified by the model, and the
    module-level ``postprocessor`` extracts tables from the prediction.

    When ``TDsvm`` is None the rows of every table are printed as ASCII.
    Otherwise each table is passed to
    ``TDsvm.domainpredictforTableDecomposition`` and every row of the first
    element of its result is printed with a HEADER or DATA label.

    Args:
        predictxmlname: base name of the XML file (without ``.xml``).
        location: path prefix that is prepended to ``predictxmlname``.
        TDsvm: optional table-decomposition model; see above.

    Returns:
        ``[errorcount, sparseerror, ntlafterpostproc]``: the sums of the
        second and third elements of every ``predict`` result, and the
        number of printed rows whose label equals
        ``SparseType.NONTABLELINE`` (only counted when ``TDsvm`` is None).
    """
    model = getModelwithTrainedWeights()
    xmlpath = location + predictxmlname + ".xml"
    fontdict = preprocessor.getFontDictionary(ET.parse(xmlpath))
    # Original note on the data layout:
    # list(pages), pages -> list(cols), col -> list(<Sparse/NonSparse, tag>)
    preprocessedxml = preprocessor.preprocessxml(xmlpath)

    alltables = []
    errorcount = 0
    sparseerror = 0
    ntlafterpostproc = 0

    for page in preprocessedxml:
        for col in page:
            if len(col) < 2:
                continue

            # Drop lines that carry no text.  The column is rebuilt in place
            # instead of calling col.remove() while iterating over col: that
            # skipped the element following every removal, so runs of
            # consecutive blank lines were only partially removed.
            col[:] = [tup for tup in col
                      if tup[1].text is not None and tup[1].text.strip() != '']

            # Filtering may leave fewer than two lines; apply the same
            # minimum-size rule as above to what is actually classified.
            if len(col) < 2:
                continue

            for lineno, tup in enumerate(col):
                tup.append(lineno)

            result = model.predict(col, fontdict)
            predicted = result[0]
            errorcount += result[1]
            sparseerror += result[2]

            alltables.extend(postprocessor.findTables(predicted))

    if TDsvm is None:
        for table in alltables:
            print("=============================================")
            for row in table:
                if int(row[0]) == SparseType.NONTABLELINE:
                    ntlafterpostproc += 1
                print(row[1].text.encode('ascii', 'ignore'))
            print("==============================================")
    else:
        for t in alltables:
            decomposed = TDsvm.domainpredictforTableDecomposition(t)
            print("==============================================")
            for r in decomposed[0]:
                if r[0] == SparseType.HEADER:
                    print(r[1].text + " ---> HEADER ")
                else:
                    print(r[1].text + " ---> DATA ")
            print("==============================================")

    return [errorcount, sparseerror, ntlafterpostproc]
def TrainUsingCRF(xmls, preprocessor, trainer, xmlloc, annotatedxmlloc):
    """Train a CRF on annotated XML documents and save its weights to disk.

    For every name in ``xmls`` a font dictionary is built from the parsed
    file ``xmlloc + name + ".xml"`` and the annotation is read from
    ``annotatedxmlloc + name + "_annotated"``.  The resulting
    ``[annotatedxml, fontdict]`` pairs are handed to ``CRF.domaintrain``.
    The trained weights are printed and then written, one ``str(weight)``
    per line, to the file ``TrainedWeightsCRF`` in the current working
    directory.

    Args:
        xmls: iterable of document base names (no directory, no extension).
        preprocessor: object providing ``getFontDictionary(parsed_xml)``.
        trainer: object providing ``readAnnotatedXml(path)``.
        xmlloc: path prefix of the raw XML files.
        annotatedxmlloc: path prefix of the annotated files.

    Returns:
        None.
    """
    CRFImpl = CRF()
    annotatedxmllist = []
    for xmlname in xmls:
        fontdict = preprocessor.getFontDictionary(
            ET.parse(xmlloc + xmlname + ".xml"))
        # Original note on the data layout:
        # list(pages), pages -> list(cols), col -> list(<Sparse/NonSparse, tag>)
        annotatedxml = trainer.readAnnotatedXml(
            annotatedxmlloc + xmlname + "_annotated")
        annotatedxmllist.append([annotatedxml, fontdict])

    CRFImpl.domaintrain(annotatedxmllist)
    print(CRFImpl.trainedweights)

    # ``with`` guarantees the weights file is closed even if a write fails.
    with open("TrainedWeightsCRF", 'w') as f:
        for weight in CRFImpl.trainedweights:
            f.write(str(weight) + "\n")
def TestUsingCRF(predictxmlname, location, TDsvm=None):
    """Find tables in one XML document using the stored CRF model.

    Reads ``location + predictxmlname + ".xml"``, builds its font dictionary
    and preprocesses it with the module-level ``preprocessor``.  Each column
    is stripped of text-less lines, its lines are numbered, the model
    classifies them, and the module-level ``postprocessor`` turns the
    prediction into tables.

    With ``TDsvm`` left as None every table row is printed as ASCII text.
    With a ``TDsvm`` given, each table goes through
    ``TDsvm.domainpredictforTableDecomposition`` and the rows in the first
    element of its result are printed labelled HEADER or DATA.

    Args:
        predictxmlname: base name of the XML file (without ``.xml``).
        location: path prefix that is prepended to ``predictxmlname``.
        TDsvm: optional table-decomposition model; see above.

    Returns:
        ``[errorcount, sparseerror, ntlafterpostproc]``: the accumulated
        second and third elements of every ``predict`` result, and the
        count of printed rows labelled ``SparseType.NONTABLELINE`` (this
        last value is only incremented when ``TDsvm`` is None).
    """
    model = getModelwithTrainedWeights()
    xmlpath = location + predictxmlname + ".xml"
    fontdict = preprocessor.getFontDictionary(ET.parse(xmlpath))
    # Original note on the data layout:
    # list(pages), pages -> list(cols), col -> list(<Sparse/NonSparse, tag>)
    preprocessedxml = preprocessor.preprocessxml(xmlpath)

    alltables = []
    errorcount = 0
    sparseerror = 0
    ntlafterpostproc = 0

    for page in preprocessedxml:
        for col in page:
            if len(col) < 2:
                continue

            # Keep only lines that have text.  Removing items from col while
            # looping over it made the iterator skip the item right after
            # each removal, leaving some blank lines behind; rebuilding the
            # list in place removes all of them.
            kept = []
            for tup in col:
                text = tup[1].text
                if text is not None and text.strip() != '':
                    kept.append(tup)
            col[:] = kept

            # The column may have shrunk below the two-line minimum.
            if len(col) < 2:
                continue

            for lineno, tup in enumerate(col):
                tup.append(lineno)

            result = model.predict(col, fontdict)
            predicted = result[0]
            errorcount += result[1]
            sparseerror += result[2]

            alltables.extend(postprocessor.findTables(predicted))

    if TDsvm is None:
        for table in alltables:
            print("=============================================")
            for row in table:
                if int(row[0]) == SparseType.NONTABLELINE:
                    ntlafterpostproc += 1
                print(row[1].text.encode('ascii','ignore'))
            print("==============================================")
    else:
        for t in alltables:
            decomposed = TDsvm.domainpredictforTableDecomposition(t)
            print("==============================================")
            for r in decomposed[0]:
                if r[0] == SparseType.HEADER:
                    print(r[1].text + " ---> HEADER ")
                else:
                    print(r[1].text + " ---> DATA ")
            print("==============================================")

    return [errorcount, sparseerror, ntlafterpostproc]
def getModelwithTrainedWeights(isCRF=True):
    """Build a model from weights previously saved in the working directory.

    Reads one float per line from ``TrainedWeightsCRF`` when ``isCRF`` is
    truthy, otherwise from ``TrainedWeightsLR``, and passes the resulting
    list to ``CRF`` or ``LogisticRegressor`` respectively.

    Args:
        isCRF: selects the CRF model (default) or the logistic regressor.

    Returns:
        The constructed ``CRF`` or ``LogisticRegressor`` instance.

    Raises:
        IOError: if the weights file cannot be opened.
        ValueError: if a non-blank line cannot be parsed as a float.
    """
    weightsfile = "TrainedWeightsCRF" if isCRF else "TrainedWeightsLR"

    # ``with`` closes the file even when float() rejects a line.  Blank
    # lines (e.g. a stray trailing newline) are skipped instead of aborting
    # the whole load.
    with open(weightsfile, "r") as f:
        trainedweights = [float(line) for line in f if line.strip()]

    if isCRF:
        return CRF(trainedweights)
    return LogisticRegressor(trainedweights)