def getFeatures(self):
    """Generate the feature strings of ``self.root``.

    Two groups of features are yielded, in this order:

    1. For every direct child of ``self.root`` whose tag name is in
       ``nodenames``: the trimmed markup up to and including its last
       ``>``, then the trimmed text following that ``>`` when non-empty.
    2. After ``self.root.remove`` has been called once for every distinct
       tag name seen among the children: each non-empty trimmed piece
       returned by ``wordsplit.split`` on ``self.root.text()``.

    This is a generator, so none of the work (including the ``remove``
    calls on ``self.root``) happens until it is iterated.
    """
    kids = self.root.children()
    seen_tags = []
    for index in range(len(kids)):
        kid = kids.eq(index)
        tag_name = getTagName(kid)
        # remember each distinct tag name once, in first-seen order
        if tag_name and tag_name not in seen_tags:
            seen_tags.append(tag_name)
        if tag_name not in nodenames:
            continue
        # special tag nodes are features themselves.
        # pq bug workaround: "<img>hello" is mistakenly treated as one
        # node, so cut the serialized child at its last '>' to separate
        # the tag from the text that follows it.
        markup, bracket, trailing = str(kid).rpartition('>')
        yield trim(markup + bracket)
        trailing = trim(trailing)
        if trailing:
            yield trailing

    # remove all collected tags before reading the text
    for tag_name in seen_tags:
        self.root.remove(tag_name)

    # words as features
    words = [trim(piece) for piece in wordsplit.split(self.root.text())]
    for word in words:
        if not word:
            continue
        yield word
def setNode(self, node):
    """Load ``node`` as the current source and build ``self.root`` from it.

    ``self.source`` is always set to ``str(node)``.  When that source is
    non-empty after ``trim`` and ``getTagName`` finds a tag in it,
    ``self.root`` becomes ``pq(self.source)(tag)`` with ``script`` and
    ``style`` removed.

    Returns True when ``self.root`` was (re)built, False otherwise.  On
    failure ``self.root`` is left untouched.
    """
    self.source = str(node)
    if not trim(self.source):
        return False

    tag = getTagName(self.source)
    if not tag:
        # previously a bare ``return`` (None); return False so that both
        # failure paths give the same value
        return False

    self.root = pq(self.source)(tag)
    #clean source
    self.root.remove('script')
    self.root.remove('style')
    return True
def addStyleNode(node):
    """Register a style node for ``node`` and queue its children.

    A ``StyleNode`` built from ``self.dic`` is generated from ``node`` and
    registered on ``element``.  Every direct child of ``node`` whose tag
    name is not in ``nodenames`` is then pushed on ``self.stack`` as
    ``[child, style_child]``, where ``style_child`` is taken from the
    registered style node with ``getChild(j)`` and ``j`` counts only the
    children that were pushed.

    ``self`` and ``element`` come from the enclosing scope.

    Raises AssertionError when ``node`` is None.
    """
    # check first: the original asserted only after node.children() had
    # already been called, so the guard could never fire for None
    assert node is not None, "addStyleNode(None)"

    childnodes = node.children()
    stylenode = StyleNode(self.dic)
    stylenode.generateStyleNode(node)
    registered = element.registerStyleNode(stylenode)

    # index among the non-special children only
    j = -1
    for i in range(len(childnodes)):
        child = childnodes.eq(i)
        if getTagName(child) in nodenames:
            continue
        j += 1
        self.stack.push([child, registered.getChild(j)])
if __name__ == '__main__': strr = ''' <div id="nav"> plain text <a href=#>hello world</a> <a href=#>hello world</a> <a href=#>hello world</a> <p> tex in p </p> <img src='hello'/>after img <script src="hello">alert('ehllo'); </script> </div> ''' #print strr print 'tagname: %s' % getTagName(strr) d = DatatagExtractor() #strr = open('./test/2').read() d.setNode(strr) data = d.getFeatures() ''' for da in data: print da print '-'*50 print '-'*50 print '-'*50 ''' strr = ''' <title>hello</title>