def analyse(self, xmlfilelist):
    parser = etree.XMLParser(recover=True)
    total_tp = 0
    total_fp = 0
    total_fn = 0
    fnfid = {}
    fpfid = {}
    cerrs = dict.fromkeys(['AUTHOR', 'COUNT_READ', 'COUNT_REPLY', 'TITLE', 'TM_POST', 'TM_REPLY'], 0)
    cerrfiles = {}
    num = 0
    for xmlfile in xmlfilelist:
        #print(xmlfile)
        if num % 200 == 0:
            # progress report every 200 files: fraction done plus running totals
            print(num / len(xmlfilelist))
            print('REV.150')
            print('total_tp:', total_tp)
            print('total_fp:', total_fp)
            print('total_fn:', total_fn)
            print('fnfid:', fnfid)
            print('fpfid:', fpfid)
            print('cerrs:', cerrs)
            print('cerrfiles:', cerrfiles)
        num += 1
        # run the extraction + detection pipeline on this page, then score it
        root = etree.parse(xmlfile, parser).getroot()
        htmlnode = root[0]
        se = StructureExtractor()
        se.drawFeature(htmlnode)
        extractor = Extractor(htmlnode)
        extractor.process()
        self.detector.detect(htmlnode)
        (tp, fp, fn) = self.calcAccTitleLine(htmlnode)
        total_tp += tp
        total_fp += fp
        total_fn += fn
        if fn == 0:
            # count column-level errors only when every title line was found
            cerr = self.calcAccColumn(htmlnode)
            for k in cerr:
                cerrs[k] += 1
            if len(cerr) > 0:
                cerrfiles[path.basename(xmlfile)] = (cerr, root.attrib['fid'])
        if fp > 0:
            #print(xmlfile)
            fpfid[root.attrib['fid']] = fpfid.get(root.attrib['fid'], 0) + 1
        if fn > 0:
            print(xmlfile)
            fnfid[root.attrib['fid']] = fnfid.get(root.attrib['fid'], 0) + 1
    # final summary over the whole file list
    print('REV.150')
    print('total_tp:', total_tp)
    print('total_fp:', total_fp)
    print('total_fn:', total_fn)
    print('fnfid:', fnfid)
    print('fpfid:', fpfid)
    print('cerrs:', cerrs)
    print('cerrfiles:', cerrfiles)
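# Hedged sketch, not part of the original module: how the accumulated tp/fp/fn
# counters printed by analyse() translate into precision, recall, and F1.
# The helper name summarise_title_line_accuracy is hypothetical.
def summarise_title_line_accuracy(total_tp, total_fp, total_fn):
    precision = total_tp / (total_tp + total_fp) if (total_tp + total_fp) else 0.0
    recall = total_tp / (total_tp + total_fn) if (total_tp + total_fn) else 0.0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
    return precision, recall, f1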
def performAnalyse(self, htmlnode):
    se = StructureExtractor()
    se.drawFeature(htmlnode)
    extractor = Extractor(htmlnode)
    extractor.process()
    detector = Detector()
    detector.detect(htmlnode)
    (tp, fp, fn) = self.calcAccTitleLine(htmlnode)
    cerr = self.calcAccColumn(htmlnode)
    return (tp, fp, fn, cerr)
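# Hedged usage sketch, not part of the original module: calling performAnalyse()
# on a single labelled page. The file handling mirrors analyse() above;
# _demo_perform_analyse is a hypothetical helper name and Analyser is assumed to
# be the class that owns performAnalyse (see the commented-out call in
# MainWindow.initUI below).
def _demo_perform_analyse(xmlfile):
    parser = etree.XMLParser(recover=True)
    htmlnode = etree.parse(xmlfile, parser).getroot()[0]
    ana = Analyser()
    tp, fp, fn, cerr = ana.performAnalyse(htmlnode)
    print('tp/fp/fn:', tp, fp, fn, 'column errors:', cerr)
    return tp, fp, fn, cerr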
def process(self, root):
    se = StructureExtractor()
    se.drawFeature(root)
    self.extractor = Extractor(root)
    self.extractor.process()
    #self.crossP = self.extractor.string2sparse(self.htmlnode.attrib['crossP'], self.extractor.totalheight + 1)
    Config.init()
    detector = Detector()  #Config.nbTLstr)
    detector.detect(root)
    self.toolbox.setDetector(detector)
class MainWindow(QtGui.QScrollArea):
    def __init__(self):
        QtGui.QScrollArea.__init__(self)
        self.setGeometry(300, 40, 600, 800)
        self.setWindowTitle('WebWireFrameVisualizer')
        self.show()
        self.toolbox = ToolBox()
        self.fromInternet()
        #root = self.fromLocal()

    def fromInternet(self):
        self.crawler = crawlWithGeometry.Crawler()
        self.crawler.crawlSinglePage('http://www.19lou.com/forum-72-1.html', self.loadComplete)

    def initUI(self, root):
        ptl = root.findall('.//*[@predict!="0"]')
        print('num of predicted pl:', len(ptl))
        #ana = Analyser()
        #print(ana.performAnalyse(root))
        self.visualizer = WebWireFrameVisualizer(root, self.toolbox, self)
        layout = QtGui.QHBoxLayout()
        #self.visualizer.setGeometry(50, 30, self.extractor.totalwidth, self.extractor.totalheight)
        layout.addWidget(self.visualizer)
        layout.setSizeConstraint(QtGui.QLayout.SetNoConstraint)
        layout.setStretchFactor(self.visualizer, 1)
        frame = QtGui.QWidget(self)
        frame.setLayout(layout)
        frame.setGeometry(0, 0, int(root.attrib['width']) + 500, int(root.attrib['height']) + 1000)
        frame.show()
        self.setWidget(frame)
        self.toolbox.show()
        pylab.show()

    def fromLocal(self):
        parser = etree.XMLParser(recover=True)
        # err file: 280, 1640, 9382, 6083, 11348, 10212, 11175, 1459, 5722
        tree = etree.parse('../data/pages_crawled1125/with_label_120314/18.txt.xml', parser=parser)
        print('fid:', tree.getroot().attrib['fid'])
        root = tree.find('//html')
        #self.process(root)
        self.initUI(root)
        return root

    def loadComplete(self, htmlstring):
        print('loadComplete')
        root = html.fromstring(htmlstring)
        cleaner = cleanpage.PageCleaner()
        root = cleaner.clean(root)
        #print(html.tostring(tree, encoding='unicode', method='xml', pretty_print=True))
        self.process(root)
        self.initUI(root)
        return root

    def process(self, root):
        se = StructureExtractor()
        se.drawFeature(root)
        self.extractor = Extractor(root)
        self.extractor.process()
        #self.crossP = self.extractor.string2sparse(self.htmlnode.attrib['crossP'], self.extractor.totalheight + 1)
        Config.init()
        detector = Detector()  #Config.nbTLstr)
        detector.detect(root)
        self.toolbox.setDetector(detector)
def train(self, xmlfile_list):
    parser = etree.XMLParser(recover=True)
    # model for title line confirmation, using naive bayes
    ptagid = [{}, {}]
    ppos = [[0, 0], [0, 0]]
    plen = [[], []]
    psize = [[], []]
    pleft = [[], []]
    pwidth = [[], []]
    pheight = [[], []]
    for xmlfile in xmlfile_list:
        print(xmlfile)
        root = etree.parse(xmlfile, parser).getroot()
        htmlnode = root[0]
        se = StructureExtractor()
        se.drawFeature(htmlnode)
        extractor = Extractor(htmlnode)
        extractor.process()
        self.detect(htmlnode)
        for node in htmlnode.findall('.//*[@predict="{}"]'.format(LABEL['TITLE_LINE'])):
            if 'label' in node.attrib and node.attrib['label'] == LABEL['TITLE_LINE']:
                # tagid feature: discrete
                ptagid[1][node.tag] = ptagid[1].get(node.tag, 0) + 1
                # position: first or not first
                if node.attrib['position'] == '0':
                    ppos[1][0] += 1
                else:
                    ppos[1][1] += 1
                # length: Gaussian
                plen[1].append(int(node.attrib['length']))
                # size: Gaussian
                psize[1].append(int(node.attrib['size']))
                # left: Gaussian
                pleft[1].append(int(node.attrib['left']))
                # width: Gaussian
                pwidth[1].append(int(node.attrib['width']))
                # height: Gaussian
                pheight[1].append(int(node.attrib['height']))
            else:
                ptagid[0][node.tag] = ptagid[0].get(node.tag, 0) + 1
                if node.attrib['position'] == '0':
                    ppos[0][0] += 1
                else:
                    ppos[0][1] += 1
                plen[0].append(int(node.attrib['length']))
                psize[0].append(int(node.attrib['size']))
                pleft[0].append(int(node.attrib['left']))
                pwidth[0].append(int(node.attrib['width']))
                pheight[0].append(int(node.attrib['height']))
    # tagid feature: discrete
    tsum = sum([ptagid[0][k] for k in ptagid[0]])
    self.mtagid = []
    self.mtagid.append({k: ptagid[0][k] / tsum for k in ptagid[0]})
    tsum = sum([ptagid[1][k] for k in ptagid[1]])
    self.mtagid.append({k: ptagid[1][k] / tsum for k in ptagid[1]})
    # position: first or not first
    self.mpos = [[ppos[0][0] / sum(ppos[0]), ppos[0][1] / sum(ppos[0])],
                 [ppos[1][0] / sum(ppos[1]), ppos[1][1] / sum(ppos[1])]]
    # length: Gaussian
    arr0 = scipy.array(plen[0])
    arr1 = scipy.array(plen[1])
    self.mlen = [[arr0.mean(), arr0.std()], [arr1.mean(), arr1.std()]]
    # size: Gaussian
    arr0 = scipy.array(psize[0])
    arr1 = scipy.array(psize[1])
    self.msize = [[arr0.mean(), arr0.std()], [arr1.mean(), arr1.std()]]
    # left: Gaussian
    arr0 = scipy.array(pleft[0])
    arr1 = scipy.array(pleft[1])
    self.mleft = [[arr0.mean(), arr0.std()], [arr1.mean(), arr1.std()]]
    # width: Gaussian
    arr0 = scipy.array(pwidth[0])
    arr1 = scipy.array(pwidth[1])
    self.mwidth = [[arr0.mean(), arr0.std()], [arr1.mean(), arr1.std()]]
    # height: Gaussian
    arr0 = scipy.array(pheight[0])
    arr1 = scipy.array(pheight[1])
    self.mheight = [[arr0.mean(), arr0.std()], [arr1.mean(), arr1.std()]]
    return (self.mtagid, self.mpos, self.mlen, self.msize, self.mleft, self.mwidth, self.mheight)
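# Hedged sketch, not part of the original module: how the parameters returned by
# train() could be combined under the naive Bayes assumption when confirming a
# title-line candidate. The real scoring lives in detect(), which is not shown
# here; gaussian_pdf and score_candidate are hypothetical names, and the 1e-6
# smoothing constant for unseen tags is an assumption.
import math

def gaussian_pdf(x, mean, std):
    # Gaussian likelihood for the continuous features (length, size, left, width, height).
    if std == 0:
        return 1.0 if x == mean else 1e-6
    return math.exp(-((x - mean) ** 2) / (2 * std ** 2)) / (math.sqrt(2 * math.pi) * std)

def score_candidate(node, cls, mtagid, mpos, mlen, msize, mleft, mwidth, mheight):
    # Product of per-feature likelihoods for class cls (0 = not a title line, 1 = title line).
    p = mtagid[cls].get(node.tag, 1e-6)
    p *= mpos[cls][0] if node.attrib['position'] == '0' else mpos[cls][1]
    for model, attr in ((mlen, 'length'), (msize, 'size'), (mleft, 'left'),
                        (mwidth, 'width'), (mheight, 'height')):
        p *= gaussian_pdf(int(node.attrib[attr]), model[cls][0], model[cls][1])
    return p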