def all(self, dom, marker):
    """Score every collected table and mark each one on ``marker``.

    A ``Node`` tree is loaded from ``dom`` and ``self.tDfs`` fills a list
    with the nodes to inspect.  For each of them ``match`` is run with this
    instance's ``maxDist``, ``height`` and ``tags``; the truthy entries of
    the result are counted.  A prediction of 1 is produced when at least two
    entries are truthy and the ``td / tr`` ratio returned by
    ``self.count_table`` is greater than 1, otherwise 0.  The prediction is
    handed to ``self.c`` and the table's DOM node is marked as ``'table'``.

    @param dom: DOM object to analyse (must offer ``getElementsByTagName``)
    @param marker: object whose ``mark(dom_node, label)`` is called per table
    """
    # NOTE(review): the result of this lookup is never read in this method.
    tables = dom.getElementsByTagName('table')
    root = Node().loadNodeTree(dom, 0)
    itables = []
    self.tDfs(root, itables)

    for table in itables:
        table.result = match(table, self.maxDist, self.height, self.tags)
        # Third element of the tuple is not used here.
        # presumably tr/td are row and cell counts -- TODO confirm
        tr, td, _unused = self.count_table(table)

        # Tally the truthy entries of the match result per value.
        tally = {}
        for label in table.result:
            if label:
                tally[label] = tally.get(label, 0) + 1
        matched = sum(tally.values())

        if matched >= 2 and tr > 0 and td / float(tr) > 1:
            pred = 1
        else:
            pred = 0

        self.c(table, pred)
        # NOTE(review): every table is marked regardless of ``pred``;
        # confirm this is the intended nesting.
        marker.mark(table.dom, 'table')
def dfs(self, node, maxDist, height, tags):
    """Depth-first walk that runs ``match`` on each visited node.

    The result of ``match`` is stored on ``node.result``.  The walk then
    recurses only into the children whose entry in that result is falsy;
    children with a truthy entry are not descended into.

    @param Node node: current node of the traversal
    @param float maxDist: maximum proportion of difference for nodes to be
        grouped into the same component (forwarded to ``match``)
    @param height: forwarded unchanged to ``match``
    @param tags: forwarded unchanged to ``match``
    """
    node.result = match(node, maxDist, height, tags)
    child_count = len(node.childNodes)
    for idx in xrange(child_count):
        if node.result[idx]:
            # Truthy entry for this child: skip its subtree.
            continue
        self.dfs(node.childNodes[idx], maxDist, height, tags)
def _mark2(self, dom, marker, postProcess=False): tables = dom.getElementsByTagName('table') tree = Node().loadNodeTree(dom,0) itables = [] self.tDfs(tree,itables) #print 'tables', len(tables) #print 'itables', len(itables) for table in itables: p = False if postProcess: (tr,td, t) = self.count_tr_td(table) else: (tr, td) = 0,1 if tr > 0 and td/float(tr) > 1: p = True if p or not postProcess: table.result = match(table, self.maxDist,self.height, self.tags) #print table.result d = {} for o in table.result: if o: d.setdefault(o,0) d[o]+=1 c = 0 for i in d: c += d[i] if postProcess: if c >= 2: print 'mark', td/float(tr) marker.mark(table.dom,'table') else: if c >= 2: marker.mark(table.dom,'table')