import math
import random

# LRFeatures, LRTDFeatures, Constants and SparseType are project-local
# definitions; the import paths below are assumptions about the repository
# layout and may need adjusting.
from LRFeatures import LRFeatures
from LRTDFeatures import LRTDFeatures
from SparseType import SparseType
import Constants


class LogisticRegressor:
    """Binary classifier trained with stochastic gradient descent on a
    sigmoid unit; used both for table-line detection and for table
    decomposition (header vs. data rows)."""

    def __init__(self, trainedweights=None):
        self.Features = LRFeatures()
        self.TDFeatures = LRTDFeatures()
        # A None default avoids the shared-mutable-default pitfall of
        # "trainedweights=list()".
        self.trainedweights = trainedweights if trainedweights is not None else list()
        self.learningrate = Constants.LR_LEARNING_RATE

    def domaintrain(self, annotatedxmllist):
        collist = list()
        for annotatedxml in annotatedxmllist:
            for page in annotatedxml[0]:
                for col in page:
                    if len(col) < 2:
                        continue
                    # Drop empty lines via an in-place filtered copy; removing
                    # items from a list while iterating over it skips elements.
                    col[:] = [tup for tup in col
                              if tup[1].text is not None and tup[1].text.strip() != '']
                    trainfeatures = list()
                    for i in xrange(0, len(col)):
                        trainfeatures.append(
                            self.Features.domainfindfeatureFunction(i, col, annotatedxml[1]))
                    collist.append([col, trainfeatures])
        self.train(collist)

    def domainpredict(self, col, fontdict):
        errorcount = 0
        sparseerror = 0
        for i in xrange(0, len(col)):
            featurevector = self.Features.domainfindfeatureFunction(i, col, fontdict)
            predicted = self.predict(featurevector)
            if predicted != int(col[i][0]):
                errorcount += 1
                if predicted == SparseType.NONTABLELINE:
                    sparseerror += 1
            col[i][0] = predicted
        return [col, errorcount, sparseerror]

    def domaintrainforTableDecomposition(self, tableslist):
        datalist = list()
        labelslist = list()
        for table in tableslist:
            for i in xrange(0, len(table)):
                labelslist.append(table[i][0])
                datalist.append(self.TDFeatures.domainfindfeatureFunction(i, table, None))
        self.trainforTD(datalist, labelslist)

    def domainpredictforTableDecomposition(self, table):
        errorcount = 0
        sparseerror = 0
        for i in xrange(0, len(table)):
            test_list = self.TDFeatures.domainfindfeatureFunction(i, table, None)
            predicted = self.predictforTD(test_list)
            if predicted != int(table[i][0]):
                errorcount += 1
                if predicted == SparseType.HEADER:
                    sparseerror += 1
            table[i][0] = predicted
        return [table, errorcount, sparseerror]

    def predict(self, featurevector):
        sigmoid = self.getSigmoid(featurevector, self.trainedweights)
        if sigmoid > 0.5:
            return SparseType.TABLELINE
        return SparseType.NONTABLELINE

    def predictforTD(self, featurevector):
        sigmoid = self.getSigmoid(featurevector, self.trainedweights)
        if sigmoid > 0.5:
            return SparseType.HEADER
        return SparseType.DATA

    def trainforTD(self, datalist, labelslist):
        # Re-initialise the weights with small random values, one per feature.
        self.trainedweights = [random.uniform(-0.1, 0.1)
                               for _ in xrange(len(datalist[0]))]
        for r in xrange(Constants.LR_EPOCHS):
            errorcount = 0.0
            sparseerrorcount = 0.0
            totalcount = 0.0
            for datarow in xrange(len(datalist)):
                totalcount += 1
                inputVector = datalist[datarow]
                sigmoidExpected = 0
                if int(labelslist[datarow]) == SparseType.HEADER:
                    sigmoidExpected = 1
                self.stochasticGradientDescent(inputVector, self.trainedweights,
                                               sigmoidExpected, self.learningrate)
                predicted = self.predictforTD(inputVector)
                if predicted != int(labelslist[datarow]):
                    errorcount += 1
                    if int(labelslist[datarow]) == SparseType.HEADER:
                        # Domain-specific count of errors on the sparse
                        # (HEADER) class.
                        sparseerrorcount += 1
            # Exponential learning-rate decay; note the time scale reuses the
            # CRF epoch constant.
            self.learningrate = Constants.INITIAL_LEARNING_RATE * math.exp(
                -(float(r) / Constants.CRF_NUM_EPOCHS))
            print "Iteration " + str(r) + " Learning Rate " + str(self.learningrate) \
                + " Count= " + str(totalcount) + " Total Error = " + str(errorcount) \
                + " Sparse Error = " + str(sparseerrorcount)

    def train(self, collist):
        # Re-initialise the weights with small random values, one per feature.
        self.trainedweights = [random.uniform(-0.1, 0.1)
                               for _ in xrange(len(collist[0][1][0]))]
        for r in xrange(Constants.LR_EPOCHS):
            errorcount = 0.0
            sparseerrorcount = 0.0
            totalcount = 0.0
            for colnum in xrange(len(collist)):
                for lineno in xrange(len(collist[colnum][0])):
                    totalcount += 1
                    inputVector = collist[colnum][1][lineno]
                    sigmoidExpected = 0
                    if int(collist[colnum][0][lineno][0]) == SparseType.TABLELINE:
                        sigmoidExpected = 1
                    self.stochasticGradientDescent(inputVector, self.trainedweights,
                                                   sigmoidExpected, self.learningrate)
                    predicted = self.predict(inputVector)
                    if predicted != int(collist[colnum][0][lineno][0]):
                        errorcount += 1
                        if int(collist[colnum][0][lineno][0]) == SparseType.TABLELINE:
                            # Domain-specific count of errors on the sparse
                            # (TABLELINE) class.
                            sparseerrorcount += 1
            # Exponential learning-rate decay; note the time scale reuses the
            # CRF epoch constant.
            self.learningrate = Constants.INITIAL_LEARNING_RATE * math.exp(
                -(float(r) / Constants.CRF_NUM_EPOCHS))
            print "Iteration " + str(r) + " Learning Rate " + str(self.learningrate) \
                + " Count= " + str(totalcount) + " Total Error = " + str(errorcount) \
                + " Sparse Error = " + str(sparseerrorcount)

    def stochasticGradientDescent(self, inputVector, weightVector, sigmoidExpected, eta):
        sigmoid = self.getSigmoid(inputVector, weightVector)
        # Delta-rule update for the squared error (t - y)^2 / 2 of a sigmoid
        # unit: dE/dw_r = -(t - y) * y * (1 - y) * x_r, so each weight moves
        # against the gradient.
        for r in xrange(len(weightVector)):
            weightVector[r] += eta * (sigmoidExpected - sigmoid) \
                * sigmoid * (1 - sigmoid) * inputVector[r]

    def getSigmoid(self, inputVector, weightVector):
        # Logistic activation of the dot product of features and weights.
        net = 0
        for r in xrange(len(inputVector)):
            net += inputVector[r] * weightVector[r]
        return 1 / (1 + math.exp(-net))
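# Worked example of a single stochasticGradientDescent step (illustrative
# values only, not taken from the project): with input x = [1.0, 2.0], weights
# w = [0.0, 0.0], target t = 1 and eta = 0.5, the net input is 0, so
# y = 1 / (1 + e^0) = 0.5, and each weight moves by
# eta * (t - y) * y * (1 - y) * x_r = 0.5 * 0.5 * 0.25 * x_r,
# giving w = [0.0625, 0.125] after the update.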
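# Minimal smoke test for the table-decomposition path, runnable on its own.
# The four feature vectors below are made-up stand-ins (a bias term plus two
# scores); everything else comes from the class and project constants above.
if __name__ == '__main__':
    datalist = [[1.0, 0.9, 0.1],
                [1.0, 0.8, 0.2],
                [1.0, 0.1, 0.9],
                [1.0, 0.2, 0.8]]
    labelslist = [SparseType.HEADER, SparseType.HEADER,
                  SparseType.DATA, SparseType.DATA]
    regressor = LogisticRegressor()
    regressor.trainforTD(datalist, labelslist)
    for row in datalist:
        print "features " + str(row) + " -> " + str(regressor.predictforTD(row))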