def train(self, xmlfile_list):
        """Train one AdaBoost model per label from a list of XML files.

        Each file is parsed with a recovering XML parser and the first child
        of its root element is handed to ``self.getAllFeature``, which yields
        a ``(feats, labels)`` pair. Files that produce no features are
        skipped. The remaining features are formatted with
        ``self.formatFeature`` and stacked into one training set; labels are
        mapped through ``LABEL``.

        For every key in ``self.label_set`` a one-vs-rest target vector is
        built (1 where the sample's label equals ``LABEL[key]``, else 0) and
        passed to ``adaboost.train`` with ``balance_factor=15.0``.

        Args:
            xmlfile_list: iterable of XML sources accepted by ``etree.parse``.

        Returns:
            dict mapping each key of ``self.label_set`` to the value returned
            by ``adaboost.train`` for that key.

        Raises:
            ValueError: if no features could be extracted from any file
                (including an empty ``xmlfile_list``).
        """
        parser = etree.XMLParser(recover=True)
        featpile = []
        labelpile = []
        print("loading")
        for xmlfile in xmlfile_list:
            root = etree.parse(xmlfile, parser).getroot()
            htmlnode = root[0]
            feats, labels = self.getAllFeature(htmlnode)
            # len() rather than truthiness: feats may be an array, whose
            # truth value is ambiguous.
            if len(feats) > 0:
                featpile.append(self.formatFeature(feats))
                labelpile.append(array([LABEL[k] for k in labels]))

        # vstack/hstack reject an empty sequence with a generic message;
        # fail early with one that names the actual cause.
        if not featpile:
            raise ValueError(
                "no training features were extracted from xmlfile_list")

        allFeats = vstack(featpile)
        allLabels = hstack(labelpile)
        print(len(allFeats))
        print(len(allLabels))
        print("load complete")
        models = {}
        for key in self.label_set:
            print("training", key)
            # One-vs-rest target: 1 for samples of this label, 0 otherwise.
            keyLabel = numpy.int8(allLabels == LABEL[key])
            models[key] = adaboost.train(allFeats, keyLabel, balance_factor=15.0)
        return models
Exemple #2
0
from scipy import *
from zhiqiang import adaboost

##tset = array([[1,2,3],[0,5,1]])
##lset = array([1,0])
##param = adaboost.train(tset,lset)
##testset = array([[9,10,-1],[-1,2,3]])
##pls = adaboost.predict(testset,param)

from scipy import io
# Binary AdaBoost experiment on the 'train0'/'train1' and 'test0'/'test1'
# arrays of the MNIST .mat file: class "0" is labelled -1, class "1" is
# labelled +1.
dataset = io.loadmat('G:/dataset/mnist/mnist_all.mat')

# Training matrix: every 'train0' row followed by every 'train1' row, as floats.
trainset = concatenate((dataset['train0'], dataset['train1'])).astype('float')
n1, m = dataset['train1'].shape
n0, m = dataset['train0'].shape

# Label vector in the same row order: n0 times -1, then n1 times +1.
labels = concatenate((-ones(n0), ones(n1)))

# NOTE(review): the meaning of the third positional argument (2) is defined
# by adaboost.train, which is not visible here -- confirm against that module.
param = adaboost.train(trainset, labels, 2)

# Test matrix built the same way as the training matrix.
testset = concatenate((dataset['test0'], dataset['test1'])).astype('float')
pls = adaboost.predict(testset, param)

# Ground-truth test labels: tn0 times -1, then +1 for the remaining rows.
tn0 = len(dataset['test0'])
tn = len(testset)
tls = concatenate((-ones(tn0), ones(tn - tn0)))

# Number of positions where the prediction equals the true label
# (assumes predict returns one label per test row -- TODO confirm).
print(sum(tls == pls))