Example #1
0
class SVMImpl:
    """Sparse / non-sparse line classifier backed by a single SVM.

    Column entries are indexable, mutable records whose first item is an
    int-convertible ``SparseType`` label.  Feature vectors come from
    ``SVMFeatures.domainfindfeatureFunction``.
    """

    def __init__(self):
        self.Features = SVMFeatures()
        self.svminstance = SVM()

    def domaintrain(self, annotatedxmllist):
        """Build a labelled training set from annotated documents and train.

        Each item of ``annotatedxmllist`` is indexed as ``item[0]`` (an
        iterable of pages, each an iterable of columns) and ``item[1]``
        (handed to the feature function as its third argument).  Columns
        holding fewer than two entries contribute nothing.  An entry is
        labelled "S" when its stored type equals ``SparseType.OTHERSPARSE``
        and "NS" otherwise.
        """
        samples = []
        labels = []
        for annotatedxml in annotatedxmllist:
            for page in annotatedxml[0]:
                for col in page:
                    if len(col) >= 2:
                        for idx in xrange(len(col)):
                            is_sparse = (
                                int(col[idx][0]) == SparseType.OTHERSPARSE)
                            labels.append("S" if is_sparse else "NS")
                            samples.append(
                                self.Features.domainfindfeatureFunction(
                                    idx, col, annotatedxml[1]))
        self.train(samples, labels)

    def domainpredict(self, col, fontdict):
        """Classify every entry of ``col`` and store the result in place.

        The first item of each entry is overwritten with
        ``SparseType.OTHERSPARSE`` when the SVM answers 'S' and with
        ``SparseType.NONSPARSE`` otherwise.  Returns the same ``col`` object.
        """
        for idx in xrange(len(col)):
            # The classifier is queried with one sample at a time.
            sample = [
                self.Features.domainfindfeatureFunction(idx, col, fontdict)]
            answered_sparse = self.predict(sample) == 'S'
            col[idx][0] = (SparseType.OTHERSPARSE if answered_sparse
                           else SparseType.NONSPARSE)
        return col

    def train(self, datalist, labelslist):
        """Train the SVM (C = 20, 'gaussian' kernel) on the given samples."""
        dataset = SparseDataSet(datalist, L=labelslist)
        self.svminstance.C = 20
        dataset.attachKernel('gaussian', degree=5)
        self.svminstance.train(dataset)
        # Cross-validation (self.svminstance.cv(dataset, 2)) was used here
        # during development and is intentionally not run.

    def predict(self, datalist):
        """Return the predicted label of the first sample in ``datalist``."""
        outcome = self.svminstance.test(SparseDataSet(datalist))
        return outcome.getPredictedLabels()[0]
 def __init__(self):
     """Instantiate SVMFeatures, SVMTDFeatures and SVM and keep them on self."""
     # Stored under the attribute names Features, TDFeatures and svminstance.
     self.Features = SVMFeatures()
     self.TDFeatures = SVMTDFeatures()
     self.svminstance = SVM()
class SVMImpl:
    """SVM-backed classifier for table-line detection and table decomposition.

    One ``SVM`` instance is shared by both tasks: ``train`` and
    ``trainforTD`` each retrain ``self.svminstance``, so an object should be
    trained for a single task before its matching predict method is used.

    Column / table entries are indexable, mutable records whose first item
    is an int-convertible ``SparseType`` value; in ``domaintrain`` the second
    item must expose a ``text`` attribute.
    """

    def __init__(self):
        self.Features = SVMFeatures()
        self.TDFeatures = SVMTDFeatures()
        self.svminstance = SVM()

    @staticmethod
    def _drop_blank_lines(col):
        """Remove, in place, every entry whose ``text`` is None or blank.

        The list is rebuilt and slice-assigned rather than calling
        ``col.remove`` inside a loop over ``col``: removing while iterating
        makes the iterator skip the entry that follows each removal, which
        let consecutive blank entries slip through.
        """
        col[:] = [tup for tup in col
                  if tup[1].text is not None and tup[1].text.strip() != '']

    def domaintrain(self, annotatedxmllist):
        """Train the table-line classifier from annotated documents.

        Each item of ``annotatedxmllist`` is indexed as ``item[0]`` (an
        iterable of pages, each an iterable of column lists) and ``item[1]``
        (passed to the feature function as its third argument, the position
        ``domainpredict`` uses for ``fontdict``).

        Columns with fewer than two entries are skipped.  Blank entries are
        removed from the remaining columns in place before features are
        extracted.  Entries whose type equals ``SparseType.TABLELINE`` are
        labelled "S", all others "NS".
        """
        datalist = []
        labelslist = []
        for annotatedxml in annotatedxmllist:
            for page in annotatedxml[0]:
                for col in page:
                    if len(col) < 2:
                        continue
                    self._drop_blank_lines(col)
                    for i, line in enumerate(col):
                        if int(line[0]) == SparseType.TABLELINE:
                            labelslist.append("S")
                        else:
                            labelslist.append("NS")
                        datalist.append(
                            self.Features.domainfindfeatureFunction(
                                i, col, annotatedxml[1]))
        self.train(datalist, labelslist)

    def domaintrainforTableDecomposition(self, tableslist):
        """Train the header / data row classifier from a list of tables.

        Rows whose type equals ``SparseType.HEADER`` are labelled "HEADER",
        all others "DATA".
        """
        labelslist = []
        datalist = []
        for table in tableslist:
            for i, row in enumerate(table):
                if int(row[0]) == SparseType.HEADER:
                    labelslist.append("HEADER")
                else:
                    labelslist.append("DATA")
                datalist.append(
                    self.TDFeatures.domainfindfeatureFunction(i, table, None))
        self.trainforTD(datalist, labelslist)

    def domainpredictforTableDecomposition(self, table):
        """Predict HEADER / DATA for every row of ``table``, in place.

        Returns ``[table, errorcount, sparseerror]`` where ``errorcount`` is
        the number of rows whose previous type differs from the prediction
        and ``sparseerror`` is how many of those were predicted as
        ``SparseType.HEADER``.
        """
        errorcount = 0
        sparseerror = 0
        for i, row in enumerate(table):
            test_list = [
                self.TDFeatures.domainfindfeatureFunction(i, table, None)]
            if self.predict(test_list) == 'HEADER':
                predicted = SparseType.HEADER
            else:
                predicted = SparseType.DATA
            # Compare against the existing type before overwriting it.
            if predicted != int(row[0]):
                errorcount += 1
                if predicted == SparseType.HEADER:
                    sparseerror += 1
            row[0] = predicted

        return [table, errorcount, sparseerror]

    def domainpredict(self, col, fontdict):
        """Predict TABLELINE / NONTABLELINE for every entry of ``col``, in place.

        Returns ``[col, errorcount, sparseerror]`` where ``errorcount`` is
        the number of entries whose previous type differs from the
        prediction and ``sparseerror`` is how many of those were predicted
        as ``SparseType.NONTABLELINE``.
        """
        errorcount = 0
        sparseerror = 0
        for i, line in enumerate(col):
            test_list = [
                self.Features.domainfindfeatureFunction(i, col, fontdict)]
            if self.predict(test_list) == 'S':
                predicted = SparseType.TABLELINE
            else:
                predicted = SparseType.NONTABLELINE
            # Compare against the existing type before overwriting it.
            if predicted != int(line[0]):
                errorcount += 1
                if predicted == SparseType.NONTABLELINE:
                    sparseerror += 1
            line[0] = predicted

        return [col, errorcount, sparseerror]

    def train(self, datalist, labelslist):
        """Train the SVM for table-line detection (C = 20, 'gaussian' kernel)."""
        data = SparseDataSet(datalist, L=labelslist)
        self.svminstance.C = 20
        # NOTE(review): 'degree' is usually a polynomial-kernel parameter;
        # confirm the gaussian kernel actually honours it.
        data.attachKernel('gaussian', degree=5)
        self.svminstance.train(data)
        # Cross-validation (self.svminstance.cv(data, 5)) is left disabled.

    def trainforTD(self, datalist, labelslist):
        """Train the SVM for table decomposition with the data set defaults."""
        data = SparseDataSet(datalist, L=labelslist)
        self.svminstance.train(data)
        # Cross-validation (self.svminstance.cv(data, 6)) is left disabled.

    def predict(self, datalist):
        """Return the predicted label of the first sample in ``datalist``."""
        data = SparseDataSet(datalist)
        results = self.svminstance.test(data)
        return results.getPredictedLabels()[0]

    def save(self, filename):
        """Save the trained SVM through ``self.svminstance.save(filename)``."""
        self.svminstance.save(filename)
 def __init__(self):
     """Instantiate SVMFeatures, SVMTDFeatures and SVM and keep them on self."""
     # Stored under the attribute names Features, TDFeatures and svminstance.
     self.Features = SVMFeatures()
     self.TDFeatures = SVMTDFeatures()
     self.svminstance = SVM()
class SVMImpl:
    """SVM-backed classifier for table-line detection and table decomposition.

    The same ``SVM`` instance serves both tasks; ``train`` and
    ``trainforTD`` both retrain ``self.svminstance``, so one object should be
    trained for one task only before predicting with it.

    Column / table entries are indexable, mutable records whose first item
    is an int-convertible ``SparseType`` value; ``domaintrain`` additionally
    reads a ``text`` attribute from the second item.
    """

    def __init__(self):
        self.Features = SVMFeatures()
        self.TDFeatures = SVMTDFeatures()
        self.svminstance = SVM()

    @staticmethod
    def _drop_blank_lines(col):
        """Strip entries with missing or whitespace-only ``text`` from ``col``.

        Works in place through slice assignment.  Calling ``col.remove``
        while looping over ``col`` skips the entry after every removal, so
        runs of blank entries were only partially removed.
        """
        col[:] = [
            tup for tup in col
            if tup[1].text is not None and tup[1].text.strip() != ''
        ]

    def domaintrain(self, annotatedxmllist):
        """Train the table-line classifier from annotated documents.

        Every item of ``annotatedxmllist`` provides pages of columns at
        index 0 and, at index 1, the value forwarded as the third argument
        of the feature function (the slot ``domainpredict`` fills with
        ``fontdict``).

        Columns shorter than two entries are ignored; the others have their
        blank entries removed in place.  An entry is labelled "S" when its
        type equals ``SparseType.TABLELINE`` and "NS" otherwise.
        """
        datalist = []
        labelslist = []
        for annotatedxml in annotatedxmllist:
            for page in annotatedxml[0]:
                for col in page:
                    if len(col) < 2:
                        continue
                    self._drop_blank_lines(col)
                    for i, line in enumerate(col):
                        is_table_line = int(line[0]) == SparseType.TABLELINE
                        labelslist.append("S" if is_table_line else "NS")
                        datalist.append(
                            self.Features.domainfindfeatureFunction(
                                i, col, annotatedxml[1]))
        self.train(datalist, labelslist)

    def domaintrainforTableDecomposition(self, tableslist):
        """Train the header / data row classifier from a list of tables.

        A row is labelled "HEADER" when its type equals
        ``SparseType.HEADER`` and "DATA" otherwise.
        """
        labelslist = []
        datalist = []
        for table in tableslist:
            for i, row in enumerate(table):
                is_header = int(row[0]) == SparseType.HEADER
                labelslist.append("HEADER" if is_header else "DATA")
                datalist.append(
                    self.TDFeatures.domainfindfeatureFunction(i, table, None))
        self.trainforTD(datalist, labelslist)

    def domainpredictforTableDecomposition(self, table):
        """Predict HEADER / DATA for each row of ``table`` and store it.

        Returns ``[table, errorcount, sparseerror]``: ``errorcount`` counts
        rows whose previous type disagrees with the prediction, and
        ``sparseerror`` counts the disagreeing rows that were predicted as
        ``SparseType.HEADER``.
        """
        errorcount = 0
        sparseerror = 0
        for i, row in enumerate(table):
            test_list = [
                self.TDFeatures.domainfindfeatureFunction(i, table, None)]
            if self.predict(test_list) == 'HEADER':
                predicted = SparseType.HEADER
            else:
                predicted = SparseType.DATA
            # The stored type is read before it is overwritten below.
            if predicted != int(row[0]):
                errorcount += 1
                if predicted == SparseType.HEADER:
                    sparseerror += 1

            row[0] = predicted

        return [table, errorcount, sparseerror]

    def domainpredict(self, col, fontdict):
        """Predict TABLELINE / NONTABLELINE for each entry of ``col``.

        The prediction replaces the first item of every entry.  Returns
        ``[col, errorcount, sparseerror]``: ``errorcount`` counts entries
        whose previous type disagrees with the prediction, and
        ``sparseerror`` counts the disagreeing entries that were predicted
        as ``SparseType.NONTABLELINE``.
        """
        errorcount = 0
        sparseerror = 0
        for i, line in enumerate(col):
            test_list = [
                self.Features.domainfindfeatureFunction(i, col, fontdict)]
            if self.predict(test_list) == 'S':
                predicted = SparseType.TABLELINE
            else:
                predicted = SparseType.NONTABLELINE
            # The stored type is read before it is overwritten below.
            if predicted != int(line[0]):
                errorcount += 1
                if predicted == SparseType.NONTABLELINE:
                    sparseerror += 1
            line[0] = predicted

        return [col, errorcount, sparseerror]

    def train(self, datalist, labelslist):
        """Train the SVM for table-line detection (C = 20, 'gaussian' kernel)."""
        data = SparseDataSet(datalist, L=labelslist)
        self.svminstance.C = 20
        # NOTE(review): 'degree' is usually a polynomial-kernel parameter;
        # confirm the gaussian kernel actually honours it.
        data.attachKernel('gaussian', degree=5)
        self.svminstance.train(data)
        # Cross-validation (self.svminstance.cv(data, 5)) is left disabled.

    def trainforTD(self, datalist, labelslist):
        """Train the SVM for table decomposition with the data set defaults."""
        data = SparseDataSet(datalist, L=labelslist)
        self.svminstance.train(data)
        # Cross-validation (self.svminstance.cv(data, 6)) is left disabled.

    def predict(self, datalist):
        """Return the predicted label of the first sample in ``datalist``."""
        data = SparseDataSet(datalist)
        results = self.svminstance.test(data)
        return results.getPredictedLabels()[0]

    def save(self, filename):
        """Save the trained SVM through ``self.svminstance.save(filename)``."""
        self.svminstance.save(filename)