Ejemplo n.º 1
0
    def analyse(self, xmlfilelist):
        """Evaluate detection accuracy over a list of XML files and print the stats.

        Each file is parsed (with a recovering parser), its first child node is
        passed through ``StructureExtractor.drawFeature``, ``Extractor.process``
        and ``self.detector.detect``, and the result is scored with
        ``self.calcAccTitleLine``.  Column accuracy (``self.calcAccColumn``) is
        only evaluated for files that produced no title-line false negatives.

        A snapshot of the running statistics is printed before every 200th
        file (including the very first one) and once more after the last file.

        Args:
            xmlfilelist: sized sequence of XML files accepted by ``etree.parse``;
                ``len()`` is taken on it for the progress ratio.

        Returns:
            None.  All results are written to stdout.
        """
        xml_parser = etree.XMLParser(recover=True)

        total_tp = 0
        total_fp = 0
        total_fn = 0
        # Per-forum ('fid' attribute of the document root) counts of files with
        # at least one false negative / false positive.
        fnfid = {}
        fpfid = {}
        # Number of files in which each column was reported by calcAccColumn.
        cerrs = {
            column: 0
            for column in ('AUTHOR', 'COUNT_READ', 'COUNT_REPLY', 'TITLE', 'TM_POST', 'TM_REPLY')
        }
        # basename -> (columns reported by calcAccColumn, fid) for files with column errors.
        cerrfiles = {}

        def report():
            # Dump the statistics gathered so far; reads the enclosing
            # variables at call time, so it always shows current values.
            print('REV.150')
            print('total_tp:', total_tp)
            print('total_fp:', total_fp)
            print('total_fn:', total_fn)
            print('fnfid:', fnfid)
            print('fpfid:', fpfid)

            print('cerrs:', cerrs)
            print('cerrfiles:', cerrfiles)

        for index, xmlfile in enumerate(xmlfilelist):
            if index % 200 == 0:
                # Fraction of the list processed so far, then the snapshot.
                print(index / len(xmlfilelist))
                report()

            root = etree.parse(xmlfile, xml_parser).getroot()
            htmlnode = root[0]
            StructureExtractor().drawFeature(htmlnode)
            Extractor(htmlnode).process()
            self.detector.detect(htmlnode)

            tp, fp, fn = self.calcAccTitleLine(htmlnode)
            total_tp += tp
            total_fp += fp
            total_fn += fn

            if fn == 0:
                column_errors = self.calcAccColumn(htmlnode)
                for column in column_errors:
                    cerrs[column] += 1
                if len(column_errors) > 0:
                    cerrfiles[path.basename(xmlfile)] = (column_errors, root.attrib['fid'])

            if fp > 0:
                fid = root.attrib['fid']
                fpfid[fid] = fpfid.get(fid, 0) + 1

            if fn > 0:
                # Files with missed title lines are listed as they are found.
                print(xmlfile)
                fid = root.attrib['fid']
                fnfid[fid] = fnfid.get(fid, 0) + 1

        report()
Ejemplo n.º 2
0
 def process(self, root):
     """Run feature extraction and detection on ``root`` and register the detector.

     The steps run in a fixed order: ``StructureExtractor.drawFeature``,
     ``Extractor.process`` (the extractor is kept on ``self.extractor``),
     ``Config.init``, then a freshly constructed ``Detector`` is run on
     ``root`` and handed to ``self.toolbox.setDetector``.

     Args:
         root: the node to process; passed unchanged to every step.
     """
     StructureExtractor().drawFeature(root)

     self.extractor = Extractor(root)
     self.extractor.process()

     # Config is initialised before the detector is constructed.
     Config.init()
     fresh_detector = Detector()
     fresh_detector.detect(root)
     self.toolbox.setDetector(fresh_detector)
Ejemplo n.º 3
0
 def performAnalyse(self, htmlnode):
     """Run the full pipeline on one node and return its accuracy figures.

     ``htmlnode`` is passed through ``StructureExtractor.drawFeature``,
     ``Extractor.process`` and a new ``Detector``'s ``detect``, then scored.

     Args:
         htmlnode: the node to analyse.

     Returns:
         A 4-tuple ``(tp, fp, fn, cerr)``: the three values unpacked from
         ``self.calcAccTitleLine(htmlnode)`` followed by the result of
         ``self.calcAccColumn(htmlnode)``.
     """
     # Each helper is single-use here, so no local handles are kept.
     StructureExtractor().drawFeature(htmlnode)
     Extractor(htmlnode).process()
     Detector().detect(htmlnode)

     tp, fp, fn = self.calcAccTitleLine(htmlnode)
     column_errors = self.calcAccColumn(htmlnode)
     return (tp, fp, fn, column_errors)
Ejemplo n.º 4
0
    def train(self, xmlfile_list):
        """Fit the naive-Bayes model used for title line confirmation.

        Every file is parsed with a recovering parser and its first child node
        is passed through ``StructureExtractor.drawFeature``,
        ``Extractor.process`` and ``self.detect``.  Each node whose ``predict``
        attribute equals ``LABEL['TITLE_LINE']`` then contributes one sample:
        class 1 if its ``label`` attribute also equals ``LABEL['TITLE_LINE']``,
        class 0 otherwise.

        Per class the model holds:
          * tag: relative frequency of each ``node.tag`` (discrete);
          * position: relative frequency of "first" (``position == '0'``)
            versus "not first";
          * length, size, left, width, height: ``[mean, std]`` of the integer
            attribute values (Gaussian parameters, population std).

        Args:
            xmlfile_list: iterable of XML files accepted by ``etree.parse``.

        Returns:
            The tuple ``(mtagid, mpos, mlen, msize, mleft, mwidth, mheight)``.
            Every element is indexed by class (0 or 1) and is also stored on
            ``self`` under the same name.

        Raises:
            ZeroDivisionError: if either class received no samples, because the
                position frequencies cannot be normalised.
            KeyError: if a sampled node lacks one of the attributes read here.
        """
        # ``scipy.array`` was only a re-export of ``numpy.array``; that alias
        # was deprecated in SciPy 1.4 and is unavailable in current releases,
        # so NumPy is used directly.  Imported locally to keep the method
        # self-contained.
        import numpy as np

        parser = etree.XMLParser(recover=True)

        gaussian_features = ('length', 'size', 'left', 'width', 'height')
        # All accumulators are indexed by class: [negative, positive].
        tag_counts = [{}, {}]
        pos_counts = [[0, 0], [0, 0]]  # per class: [first, not first]
        samples = {feature: [[], []] for feature in gaussian_features}

        # The query is identical for every file, so build it once.
        predicted_xpath = './/*[@predict="{}"]'.format(LABEL['TITLE_LINE'])

        for xmlfile in xmlfile_list:
            print(xmlfile)
            root = etree.parse(xmlfile, parser).getroot()
            htmlnode = root[0]
            se = StructureExtractor()
            se.drawFeature(htmlnode)
            extractor = Extractor(htmlnode)
            extractor.process()
            self.detect(htmlnode)

            for node in htmlnode.findall(predicted_xpath):
                is_title = ('label' in node.attrib
                            and node.attrib['label'] == LABEL['TITLE_LINE'])
                cls = 1 if is_title else 0

                # tagid feature: discrete
                tag_counts[cls][node.tag] = tag_counts[cls].get(node.tag, 0) + 1
                # position: first or not first
                pos_counts[cls][0 if node.attrib['position'] == '0' else 1] += 1
                # length / size / left / width / height: Gaussian
                for feature in gaussian_features:
                    samples[feature][cls].append(int(node.attrib[feature]))

        # tagid feature: normalise the counts into per-class frequencies.
        self.mtagid = []
        for counts in tag_counts:
            total = sum(counts.values())
            self.mtagid.append({tag: count / total for tag, count in counts.items()})

        # position: per-class frequency of first / not first.
        self.mpos = [[count / sum(counts) for count in counts] for counts in pos_counts]

        def fit_gaussian(feature):
            # [[mean, std] for class 0, [mean, std] for class 1]
            fitted = []
            for values in samples[feature]:
                arr = np.array(values)
                fitted.append([arr.mean(), arr.std()])
            return fitted

        self.mlen = fit_gaussian('length')
        self.msize = fit_gaussian('size')
        self.mleft = fit_gaussian('left')
        self.mwidth = fit_gaussian('width')
        self.mheight = fit_gaussian('height')

        return (self.mtagid, self.mpos, self.mlen, self.msize, self.mleft, self.mwidth, self.mheight)