Esempio n. 1
0
 def __init__(self, dic):
     '''
     Set up the collaborators this object works with.

     dic -- forwarded unchanged to StyleTree; it is not stored on self.
     '''
     # helpers: the style tree built from `dic`, a work stack and the
     # feature extractor
     self.styletree = StyleTree(dic)
     self.stack = Stack()
     self.datatagextractor = datatagextractor.DatatagExtractor()
     # central dictionary: a fresh Dic, published as self.dic and then
     # populated through its own fromfile() (independent of the `dic` argument)
     central = Dic()
     self.dic = central
     central.fromfile()
Esempio n. 2
0
class SourceParser:
    '''
    Parse an HTML source and add its nodes to a style tree.

    setSource() must be called before parse(): parse() reads self.pq,
    which only setSource() assigns.
    '''
    def __init__(self, dic):
        '''
        Create the style tree, the work stack and the feature extractor.

        dic -- forwarded unchanged to StyleTree; it is not stored on self.
        '''
        self.styletree = StyleTree(dic)
        self.stack = Stack()
        self.datatagextractor = datatagextractor.DatatagExtractor()
        # central dictionary: a fresh Dic populated by its own fromfile();
        # this is a different object from the `dic` argument above
        self.dic = Dic()
        self.dic.fromfile()

    def setSource(self, source):
        '''
        Wrap `source` with pq() and keep the result as self.pq.
        '''
        self.pq = pq(source)
        # Stripping <script>/<style> was disabled in the original code
        # (it lived in a no-op string literal); kept here for reference:
        #   self.pq.remove('script')
        #   self.pq.remove('style')

    def setPagenum(self, num):
        '''
        Forward the page number to the style tree.
        '''
        self.styletree.setPagenum(num)

    def parse(self):
        '''
        Walk the document starting at <body>.

        Resets the stack, seeds it with the pair
        [body selection, self.styletree.body] and runs parseIter().
        '''
        body = self.pq('body')
        # seed the traversal with the root pair
        self.stack.init()
        self.stack.push(
            [
                body,
                self.styletree.body
            ]
        )
        self.parseIter()

    def parseIter(self):
        '''
        Drain the stack of [node, element] pairs.

        For every popped pair the node's features are added to the
        element's data node, a StyleNode generated from the node is
        registered on the element, and the node's children whose tag is
        not in `nodenames` are pushed for later processing.
        '''

        def addDataNode(fnode, element):
            '''
            Extract features from str(fnode) with the datatag extractor
            and add them to the data node returned by
            element.getDataNode().
            '''
            dn = element.getDataNode()
            self.datatagextractor.setNode(str(fnode))
            features = self.datatagextractor.getFeatures()
            dn.addFeatures(features)

        def addStyleNode(node, element):
            '''
            Register a StyleNode generated from `node` on `element` and
            push the node's eligible children onto the stack.

            `element` is an explicit parameter; the original read it
            implicitly from the enclosing loop variable.
            '''
            # check the invariant before `node` is dereferenced
            assert node is not None, "addStyleNode(None)"
            childnodes = node.children()
            stylenode = StyleNode(self.dic)
            stylenode.generateStyleNode(node)
            registered = element.registerStyleNode(stylenode)
            # Children are fetched by index through .eq(i), as the original
            # did (presumably so each child is a wrapped selection rather
            # than a raw element -- confirm against the pq implementation).
            style_index = 0
            for i in range(len(childnodes)):
                child = childnodes.eq(i)
                if getTagName(child) in nodenames:
                    # tags listed in nodenames get no stack entry
                    continue
                # style_index counts only the children that are kept, so
                # it is the index passed to registered.getChild()
                self.stack.push([child, registered.getChild(style_index)])
                style_index += 1

        while not self.stack.empty():
            node, element = self.stack.pop()
            addDataNode(node, element)
            addStyleNode(node, element)

    def _getTag(self, node):
        '''
        Return str(node) up to and including its first '>'.

        Raises ValueError (from str.index) when str(node) has no '>'.
        '''
        markup = str(node)
        return markup[:markup.index('>') + 1]