Beispiel #1
0
def getcss(doc):
    """Collect the CSS rules used by ``doc`` into a single dictionary.

    Tags yielded by ``tagsinList(doc, ['style', 'link'])`` are processed in
    the order they are yielded:

    * ``link`` tags whose ``rel`` attribute equals ``'stylesheet'`` have
      their ``href`` target read and parsed.  The target is fetched with
      ``urllib.urlopen`` when the href contains ``http:`` or ``https:``;
      otherwise it is opened as a local file via ``urllib.url2pathname``.
      Links without an href are skipped.
    * ``style`` tags have every text or comment child parsed.

    Afterwards every tag yielded by ``tagsbyAttr(doc, 'style')`` has its
    inline ``style`` attribute parsed; each declaration set that is not
    already a value of the result is stored under a generated key
    ``.ecss1``, ``.ecss2``, ...

    Uses the Python 2 ``urllib`` API (``urlopen`` / ``url2pathname``).

    :param doc: document object handed to the project's DOM helpers.
    :returns: dict mapping each name yielded by ``cssplit1`` (or a generated
        ``.ecssN`` key) to a dict of the ``(cmd, value)`` pairs yielded by
        ``cssplit2``.
    """
    import urllib

    cssdict, count = dict(), 1

    def fullsplit(csstring):
        # Later rules with the same name replace earlier ones wholesale.
        for classname, cmds in cssplit1(csstring):
            cssdict[classname] = dict(cssplit2(cmds))

    def readsheet(url):
        # Remote sheets go through urllib, everything else is treated as a
        # local path.  The handle is always closed, even if read() fails.
        if 'http:' in url or 'https:' in url:
            handle = urllib.urlopen(url)
        else:
            handle = open(urllib.url2pathname(url))
        try:
            return handle.read()
        finally:
            handle.close()

    for tag in tagsinList(doc, ['style', 'link']):
        if isname(tag, 'link'):
            if attget(tag, 'rel') != 'stylesheet':
                continue
            url = attget(tag, 'href')
            if not url:
                # A stylesheet link without a target has nothing to load.
                continue
            fullsplit(readsheet(url))
        elif isname(tag, 'style'):
            for child in tag.childNodes:
                if istext(child) or iscomment(child):
                    fullsplit(child.data)

    for tag in tagsbyAttr(doc, 'style'):
        subcss = dict(cssplit2(attget(tag, 'style')))
        # Identical inline declaration sets share one generated class.
        if subcss not in cssdict.values():
            cssdict['.ecss' + str(count)] = subcss
            count += 1
    return cssdict
Beispiel #2
0
def fixspans(doc):
    """Clamp and reconcile ``rowspan``/``colspan`` attributes of table cells.

    For every table yielded by ``tags(doc, htmlns, 'table')``:

    1. count the table's ``tr`` children and the largest number of ``td``
       children found in any single child of the table;
    2. lower any ``rowspan`` that exceeds the row count, or (for cells
       without a ``rowspan``) any ``colspan`` that exceeds the column count;
    3. build a grid holding the ``td`` cells of each ``tr`` and repeat each
       row-spanning cell in the rows that follow it;
    4. re-check every ``colspan`` against the grid, possibly replacing it
       with a smaller value, and delete ``colspan`` values of 1 or less.

    The document is modified in place through ``attset``/``attdel``; the
    function returns ``None``.

    :param doc: document object handed to the project's DOM helpers.
    :raises ValueError: if a ``rowspan``/``colspan`` value is not an integer
        (raised by ``int``).
    """

    def rows_below(cell):
        # Following siblings of the cell's row, at most ``rowspan - 1`` of
        # them.  Walking stops at the end of the sibling chain instead of
        # dereferencing ``None``.
        found = []
        current = cell.parentNode
        for _ in range(int(attget(cell, 'rowspan')) - 1):
            current = current.nextSibling
            if current is None:
                break
            if current:
                found.append(current)
        return found

    for table in list(tags(doc, htmlns, 'table')):
        # Table dimensions: number of rows and widest row.
        trcount, tdcount = 0, 0
        for child in table.childNodes:
            if child.localName == 'tr':
                trcount += 1
            width = 0
            for cell in child.childNodes:
                if cell.localName == 'td':
                    width += 1
            if width > tdcount:
                tdcount = width

        # Clamp spans that exceed the table dimensions.
        # NOTE(review): because of the ``elif``, a cell carrying both
        # attributes never has its colspan clamped here - confirm intent.
        for child in table.childNodes:
            for cell in child.childNodes:
                if not iselement(cell):
                    continue
                if hasatt(cell, 'rowspan'):
                    if int(attget(cell, 'rowspan')) > trcount:
                        attset(cell, 'rowspan', str(trcount))
                elif hasatt(cell, 'colspan'):
                    if int(attget(cell, 'colspan')) > tdcount:
                        attset(cell, 'colspan', str(tdcount))

        # One list of td cells per tr; rows without td cells are left out.
        grid = []
        for child in table.childNodes:
            if iselement(child) and isname(child, 'tr'):
                cells = [c for c in child.childNodes
                         if iselement(c) and isname(c, 'td')]
                if cells:
                    grid.append(cells)

        # Repeat each row-spanning cell in the rows it reaches into, at the
        # column index it has in its own row.
        for cell in list(tagsbyAttr(table, 'rowspan')):
            rows = rows_below(cell)
            loc = None  # column of ``cell``; unknown until its row is seen
            for nodelist in grid:
                if cell in nodelist:
                    loc = nodelist.index(cell)
                for row in rows:
                    # Cells that are not part of the grid have no column to
                    # be inserted at and are skipped.
                    if row.firstChild in nodelist and loc is not None:
                        nodelist.insert(loc, cell)

        # Reconcile colspans against the grid.
        for nodelist in grid:
            for td in nodelist:
                if not hasatt(td, 'colspan'):
                    continue
                index = nodelist.index(td)
                colspan = int(attget(td, 'colspan'))
                offset, largest = len(nodelist[:index]), 0
                for nlist in grid:
                    # NOTE(review): ``offset`` always equals ``index``, so
                    # for a cell in column 0 this slice is ``[0:-0]`` and
                    # therefore empty - confirm that is intended.
                    window = nlist[index:-offset]
                    actual = len(window)
                    if colspan > actual and actual > largest:
                        spanned = 0
                        for other in window:
                            if hasatt(other, 'colspan'):
                                spanned += int(attget(other, 'colspan'))
                        if spanned != colspan:
                            largest = actual
                if largest:
                    attset(td, 'colspan', str(largest))
                if int(attget(td, 'colspan')) <= 1:
                    attdel(td, 'colspan')
Beispiel #3
0
def htmlgrid(doc, structure):
    """Arrange the nodes of ``structure`` into a list of node lists.

    ``structure`` is scanned in order:

    * a ``tr`` or ``table`` node becomes its own single-element entry
      ``[node]``;
    * a ``td`` node is collected, together with the directly following
      ``td`` siblings that also occur in ``structure`` and have not been
      collected yet, into a pending list that is added to the grid when the
      next ``tr``/``table`` node (or the end of ``structure``) is reached.

    Afterwards every node yielded by ``tagsbyAttr(doc, 'rowspan')`` is
    appended to each grid entry that contains the first child of one of the
    rows it spans into.

    :param doc: document object searched for ``rowspan`` cells.
    :param structure: sequence of nodes; it is iterated and also used for
        membership tests, so it should be a re-usable container.
    :returns: the grid, a list of lists of nodes.
    """

    def rows_below(cell):
        # Following siblings of the cell's row, at most ``rowspan - 1`` of
        # them.  Walking stops at the end of the sibling chain instead of
        # dereferencing ``None``.
        found = []
        current = cell.parentNode
        for _ in range(int(attget(cell, 'rowspan')) - 1):
            current = current.nextSibling
            if current is None:
                break
            if current:
                found.append(current)
        return found

    grid, pending, seen = [], [], []
    for node in structure:
        if isname(node, 'tr'):
            if pending:
                # Pending cells go before the new row when their parent is
                # in the last grid entry, after it otherwise.
                # NOTE(review): ``grid[-1]`` raises IndexError when cells
                # were collected before any tr/table entry exists - confirm
                # callers never pass such a structure.
                if pending[0].parentNode in grid[-1]:
                    if pending not in grid:
                        grid.append(pending)
                    grid.append([node])
                else:
                    grid.append([node])
                    if pending not in grid:
                        grid.append(pending)
            else:
                grid.append([node])
            pending = []
        elif isname(node, 'td'):
            if node not in seen:
                pending.append(node)
                seen.append(node)
            # Pull in the run of td siblings that directly follows this
            # cell; the run ends at the first sibling that is missing, not
            # a td, already collected, or not part of ``structure``.
            limit = len(node.parentNode.childNodes) - 1
            sibling = node.nextSibling
            for _ in range(limit):
                if not sibling or sibling.localName != 'td':
                    break
                if sibling in seen or sibling not in structure:
                    break
                pending.append(sibling)
                seen.append(sibling)
                sibling = sibling.nextSibling
        elif isname(node, 'table'):
            # The pending list is deliberately kept (not reset) here, as in
            # the original control flow.
            if pending and pending not in grid:
                grid.append(pending)
            grid.append([node])
    if pending:
        grid.append(pending)

    # Add row-spanning cells to the entries of the rows they reach into.
    for cell in list(tagsbyAttr(doc, 'rowspan')):
        rows = rows_below(cell)
        for nodelist in grid:
            for row in rows:
                if row.firstChild in nodelist:
                    nodelist.append(cell)
    return grid