Esempio n. 1
0
    def getFeatures(self):
        """Yield the features found under ``self.root``, lazily.

        Features are produced in two phases:

        1. For each direct child whose tag name is in ``nodenames``: the
           trimmed markup up to and including its last ``>``, then the
           trimmed text following that ``>`` when it is non-empty.
        2. Every distinct tag name seen among the direct children is passed
           to ``self.root.remove``; the remaining text is split with
           ``wordsplit`` and each non-empty trimmed piece is yielded.

        Because this is a generator, the removal in phase 2 only happens
        once the consumer has iterated past all phase-1 features.
        """
        kids = self.root.children()

        # Distinct tag names of the direct children, in first-seen order.
        seen_tags = []

        for idx in range(len(kids)):
            node = kids.eq(idx)
            name = getTagName(node)
            if name and name not in seen_tags:
                seen_tags.append(name)
            if name not in nodenames:
                continue

            # Special tag nodes are features themselves.
            # Original author's note: works around a pq bug that mistakenly
            # treats "<img>hello" as a single node, so the serialized child
            # is split into its tag part and its trailing text part.
            markup = str(node)
            cut = markup.rfind('>') + 1
            yield trim(markup[:cut])
            tail = trim(markup[cut:])
            if tail:
                yield tail

        # Remove all tags seen above before reading the remaining text.
        for name in seen_tags:
            self.root.remove(name)

        # Words as features; the list is built in full before any is yielded.
        remaining = self.root.text()
        cleaned = [trim(piece) for piece in wordsplit.split(remaining)]
        for word in cleaned:
            if word:
                yield word
Esempio n. 2
0
 def setNode(self, node):
     """Store ``node`` as the current source and build ``self.root`` from it.

     ``node`` is converted with ``str()`` and kept in ``self.source``. When
     the trimmed source is non-empty and ``getTagName`` reports a tag name,
     ``self.root`` is set to ``pq(self.source)(tag)`` and ``script`` and
     ``style`` are removed from it.

     Returns:
         True when ``self.root`` was rebuilt; False when the source is blank
         or no tag name could be determined (``self.root`` is then left
         untouched).
     """
     self.source = str(node)
     if not trim(self.source):
         return False

     tag = getTagName(self.source)
     if not tag:
         # Report failure with an explicit False, like the blank-source
         # path, so the method always returns a bool.
         return False

     self.root = pq(self.source)(tag)
     #clean source
     self.root.remove('script')
     self.root.remove('style')
     return True
Esempio n. 3
0
 def addStyleNode(node):
     """Register a style node for ``node`` and queue its children.

     A ``StyleNode`` is created from ``self.dic``, generated from ``node``
     and registered on ``element``. Each direct child of ``node`` whose tag
     name is not in ``nodenames`` is then pushed onto ``self.stack`` as a
     ``[child, registered_child]`` pair, where ``registered_child`` is
     ``getChild(j)`` of the registered style node and ``j`` counts only the
     children that were not skipped.

     ``self`` and ``element`` are taken from the enclosing scope.
     """
     # Check before the first use of ``node`` so that a None argument fails
     # with this message instead of an AttributeError on ``node.children()``.
     assert node is not None, "addStyleNode(None)"
     childnodes = node.children()
     stylenode = StyleNode(self.dic)
     stylenode.generateStyleNode(node)
     _stylenode = element.registerStyleNode(stylenode)

     # Index into the registered node's children; advances only for
     # children that are not skipped below.
     j = -1
     for i in range(len(childnodes)):
         child = childnodes.eq(i)
         if getTagName(child) in nodenames:
             continue
         j += 1
         self.stack.push([child, _stylenode.getChild(j)])
Esempio n. 4
0

if __name__ == '__main__':
    strr = '''
        <div id="nav">
            plain text 
            <a href=#>hello world</a>
            <a href=#>hello world</a>
            <a href=#>hello world</a>
            <p> tex in p </p>
            <img src='hello'/>after img
            <script src="hello">alert('ehllo'); </script>
        </div>
    '''
    #print strr
    print 'tagname: %s' % getTagName(strr)
    d = DatatagExtractor()
    #strr = open('./test/2').read()
    d.setNode(strr)
    data = d.getFeatures()
    '''
    for da in data:
        print da

    print '-'*50
    print '-'*50
    print '-'*50
    '''

    strr = '''
    <title>hello</title>