Beispiel #1
0
    def analyse(self, xmlfilelist):
        """Run the detection pipeline over every file and print accuracy totals.

        Each file in ``xmlfilelist`` is parsed with a recovering XML parser; the
        first child of the document root is pushed through ``StructureExtractor``,
        ``Extractor`` and ``self.detector`` and then scored with
        ``calcAccTitleLine``.  Column errors (``calcAccColumn``) are only
        evaluated for files whose title-line false-negative count is zero.

        A progress line plus the running totals are printed before every 200th
        file, and the final totals are printed once all files are done.
        Nothing is returned.
        """
        parser = etree.XMLParser(recover=True)
        total_tp = total_fp = total_fn = 0
        fnfid = {}      # fid -> number of files with at least one false negative
        fpfid = {}      # fid -> number of files with at least one false positive
        cerrs = {key: 0 for key in ('AUTHOR', 'COUNT_READ', 'COUNT_REPLY',
                                    'TITLE', 'TM_POST', 'TM_REPLY')}
        cerrfiles = {}  # file basename -> (column errors, fid)

        def report():
            # Reads the enclosing counters at call time, so it always shows
            # the totals accumulated so far.
            print('REV.150')
            print('total_tp:', total_tp)
            print('total_fp:', total_fp)
            print('total_fn:', total_fn)
            print('fnfid:', fnfid)
            print('fpfid:', fpfid)

            print('cerrs:',cerrs)
            print('cerrfiles:', cerrfiles)

        for num, xmlfile in enumerate(xmlfilelist):
            if num % 200 == 0:
                # Fraction of the file list handled so far.
                print(num/len(xmlfilelist))
                report()

            root = etree.parse(xmlfile, parser).getroot()
            htmlnode = root[0]
            StructureExtractor().drawFeature(htmlnode)
            Extractor(htmlnode).process()
            self.detector.detect(htmlnode)

            tp, fp, fn = self.calcAccTitleLine(htmlnode)
            total_tp += tp
            total_fp += fp
            total_fn += fn

            if fn == 0:
                column_errors = self.calcAccColumn(htmlnode)
                for column in column_errors:
                    cerrs[column] += 1
                if len(column_errors) > 0:
                    cerrfiles[path.basename(xmlfile)] = (column_errors, root.attrib['fid'])
            if fp > 0:
                fid = root.attrib['fid']
                fpfid[fid] = fpfid.get(fid, 0) + 1
            if fn > 0:
                print(xmlfile)
                fid = root.attrib['fid']
                fnfid[fid] = fnfid.get(fid, 0) + 1

        report()
Beispiel #2
0
 def performAnalyse(self, htmlnode):
     """Run the full pipeline on ``htmlnode`` and return its accuracy figures.

     The node is passed, in order, through ``StructureExtractor.drawFeature``,
     ``Extractor.process`` and a freshly created ``Detector``; afterwards the
     title-line and column accuracy helpers of this object are evaluated.

     Returns the tuple ``(tp, fp, fn, cerr)`` where the first three values
     come from ``calcAccTitleLine`` and ``cerr`` from ``calcAccColumn``.
     """
     StructureExtractor().drawFeature(htmlnode)
     Extractor(htmlnode).process()
     Detector().detect(htmlnode)

     tp, fp, fn = self.calcAccTitleLine(htmlnode)
     column_errors = self.calcAccColumn(htmlnode)
     return (tp, fp, fn, column_errors)
Beispiel #3
0
 def process(self, root):
     """Prepare ``root`` for display: extract features and run detection.

     Stores the ``Extractor`` on ``self.extractor`` and hands the
     ``Detector`` that processed ``root`` over to ``self.toolbox``.
     """
     StructureExtractor().drawFeature(root)

     page_extractor = Extractor(root)
     self.extractor = page_extractor
     page_extractor.process()

     # Config is initialised before the detector is constructed.
     Config.init()
     page_detector = Detector()
     page_detector.detect(root)
     self.toolbox.setDetector(page_detector)
Beispiel #4
0
class MainWindow(QtGui.QScrollArea):
    """Scrollable main window hosting a ``WebWireFrameVisualizer`` for one page.

    On construction the window shows itself, creates the ``ToolBox`` and
    starts crawling ``DEFAULT_URL``.  The page can alternatively be loaded
    from a labelled XML file with ``fromLocal``.
    """

    # Defaults used when fromInternet()/fromLocal() are called without
    # arguments; these are the values that used to be hard-coded in the methods.
    DEFAULT_URL = 'http://www.19lou.com/forum-72-1.html'
    DEFAULT_LOCAL_FILE = '../data/pages_crawled1125/with_label_120314/18.txt.xml'

    def __init__(self):
        QtGui.QScrollArea.__init__(self)
        self.setGeometry(300, 40, 600, 800)
        self.setWindowTitle('WebWireFrameVisualizer')
        self.show()

        self.toolbox = ToolBox()

        # Load the default page from the network; use fromLocal() instead to
        # work from a stored XML file.
        self.fromInternet()

    def fromInternet(self, url=DEFAULT_URL):
        """Start crawling ``url`` and pass ``loadComplete`` to the crawler.

        ``url`` defaults to ``DEFAULT_URL``.  The crawler is kept on
        ``self.crawler``.
        """
        self.crawler = crawlWithGeometry.Crawler()
        # NOTE(review): loadComplete is presumably invoked by the crawler with
        # the page's HTML string once loading finishes -- confirm in Crawler.
        self.crawler.crawlSinglePage(url, self.loadComplete)

    def initUI(self, root):
        """Build the visualizer widget for ``root`` and show it with the toolbox.

        ``root`` must carry ``width`` and ``height`` attributes; they size
        the frame that is installed as this scroll area's widget.
        """
        predicted = root.findall('.//*[@predict!="0"]')
        print('num of predicted pl:', len(predicted))

        self.visualizer = WebWireFrameVisualizer(root, self.toolbox, self)

        layout = QtGui.QHBoxLayout()
        layout.addWidget(self.visualizer)
        layout.setSizeConstraint(QtGui.QLayout.SetNoConstraint)
        layout.setStretchFactor(self.visualizer, 1)

        frame = QtGui.QWidget(self)
        frame.setLayout(layout)
        # The frame is made larger than the page itself (+500 wide, +1000 high).
        frame.setGeometry(0, 0, int(root.attrib['width']) + 500, int(root.attrib['height']) + 1000)
        frame.show()
        self.setWidget(frame)
        self.toolbox.show()
        pylab.show()

    def fromLocal(self, xmlfile=DEFAULT_LOCAL_FILE):
        """Load a page from the XML file ``xmlfile`` and display it.

        ``xmlfile`` defaults to ``DEFAULT_LOCAL_FILE``.  The document root's
        ``fid`` attribute is printed, and the ``html`` element found in the
        tree is passed to ``initUI`` and returned.  ``process`` is not run
        on this path.
        """
        parser = etree.XMLParser(recover=True)
        # err file: 280,1640,9382, 6083, 11348, 10212, 11175, 1459, 5722
        tree = etree.parse(xmlfile, parser=parser)
        print('fid:', tree.getroot().attrib['fid'])
        root = tree.find('//html')
        self.initUI(root)
        return root

    def loadComplete(self, htmlstring):
        """Parse, clean, process and display the page given as ``htmlstring``.

        Returns the cleaned root element after it has been run through
        ``process`` and shown with ``initUI``.
        """
        print('loadComplete')
        root = html.fromstring(htmlstring)
        cleaner = cleanpage.PageCleaner()
        root = cleaner.clean(root)
        self.process(root)
        self.initUI(root)
        return root

    def process(self, root):
        """Extract features from ``root`` and run detection on it.

        Stores the ``Extractor`` on ``self.extractor`` and registers the
        ``Detector`` with ``self.toolbox``.
        """
        se = StructureExtractor()
        se.drawFeature(root)
        self.extractor = Extractor(root)
        self.extractor.process()
        # Config is initialised before the detector is constructed.
        Config.init()
        detector = Detector()
        detector.detect(root)
        self.toolbox.setDetector(detector)
Beispiel #5
0
    def train(self, xmlfile_list):
        """Estimate the naive-Bayes model used to confirm predicted title lines.

        Every file in ``xmlfile_list`` is parsed, run through
        ``StructureExtractor``, ``Extractor`` and ``self.detect``, and each
        node predicted as a title line is put into one of two classes:
        index 1 when its ``label`` attribute equals ``LABEL['TITLE_LINE']``,
        index 0 otherwise.

        Per class the method estimates
          * tag frequencies (discrete)            -> ``self.mtagid``
          * first / not-first position frequency  -> ``self.mpos``
          * ``[mean, std]`` of length, size, left, width and height
            (Gaussian) -> ``self.mlen``, ``self.msize``, ``self.mleft``,
            ``self.mwidth``, ``self.mheight``

        Returns these seven models as a tuple, in the order listed above.
        """
        parser = etree.XMLParser(recover=True)
        gaussian_features = ('length', 'size', 'left', 'width', 'height')

        # All collectors are indexed by class: 0 = not a title line, 1 = title line.
        tag_counts = [{}, {}]
        position_counts = [[0, 0], [0, 0]]
        samples = {name: [[], []] for name in gaussian_features}

        def mean_std_per_class(per_class):
            # Build both arrays first, then summarise each as [mean, std].
            arrays = [scipy.array(values) for values in per_class]
            return [[arr.mean(), arr.std()] for arr in arrays]

        for xmlfile in xmlfile_list:
            print(xmlfile)
            root = etree.parse(xmlfile, parser).getroot()
            htmlnode = root[0]
            StructureExtractor().drawFeature(htmlnode)
            Extractor(htmlnode).process()
            self.detect(htmlnode)

            predicted = htmlnode.findall('.//*[@predict="{}"]'.format(LABEL['TITLE_LINE']))
            for node in predicted:
                attrs = node.attrib
                is_title = 'label' in attrs and attrs['label'] == LABEL['TITLE_LINE']
                cls = 1 if is_title else 0

                # tagid feature: discrete
                tag_counts[cls][node.tag] = tag_counts[cls].get(node.tag, 0) + 1
                # position: slot 0 = first, slot 1 = not first
                slot = 0 if attrs['position'] == '0' else 1
                position_counts[cls][slot] += 1
                # length / size / left / width / height: Gaussian
                for name in gaussian_features:
                    samples[name][cls].append(int(attrs[name]))

        # tagid feature: relative frequency of each tag within its class
        self.mtagid = []
        for counts in tag_counts:
            total = sum(counts.values())
            self.mtagid.append({tag: count/total for tag, count in counts.items()})

        # position: relative frequency of first / not first within each class
        self.mpos = [[c[0]/sum(c), c[1]/sum(c)] for c in position_counts]

        # Gaussian features: [mean, std] per class
        self.mlen = mean_std_per_class(samples['length'])
        self.msize = mean_std_per_class(samples['size'])
        self.mleft = mean_std_per_class(samples['left'])
        self.mwidth = mean_std_per_class(samples['width'])
        self.mheight = mean_std_per_class(samples['height'])

        return (self.mtagid, self.mpos, self.mlen, self.msize, self.mleft, self.mwidth, self.mheight)