Python tagextract Examples, htmldata.tagextract Python Examples

Example #1

0

Show file

File: mw2html.py Project: weetmuts/rodin-b-sharp-rodincore

def post_html_transform(doc, url, config):
    """
    User-customizable HTML transform.

    Given an HTML document (with URLs already rewritten), returns
    modified HTML document.

    Steps, each gated by a flag on ``config``:

    * ``config.hack_skin``: apply the monobook sidebar/skin fixes when
      ``config.skin == MONOBOOK_SKIN``; any other skin raises ValueError.
    * ``config.move_href``: run ``fix_move_href_tags``.
    * ``config.remove_history``: run ``html_remove_image_history``.

    Finally every ``<div>`` tag that is immediately followed by ``</div>``
    is replaced by an HTML comment holding the repr of both tags.

    ``url`` is accepted for interface compatibility; this implementation
    does not read it.
    """
    if config.hack_skin:
        if config.skin == MONOBOOK_SKIN:
            doc = monobook_fix_html_sidebar(doc, config)
            doc = monobook_hack_skin_html(doc, config)
        else:
            raise ValueError('unknown skin')
    if config.move_href:
        doc = fix_move_href_tags(doc, config)
    if config.remove_history:
        doc = html_remove_image_history(doc, config)

    # Remove empty divs.  An opening <div> is held back in `pending_div`
    # until the next item shows whether the div is empty.
    kept = []
    pending_div = None
    for item in htmldata.tagextract(doc):
        is_tag = isinstance(item, tuple)
        if is_tag and item[0] == 'div':
            # A new <div> proves the held-back one was not empty.
            if pending_div is not None:
                kept.append(pending_div)
            pending_div = item
        elif pending_div is not None:
            if is_tag and item[0] == '/div':
                # <div> directly followed by </div>: replace the pair with
                # a comment so the removal stays visible in the output.
                logging.debug("Remove item %r %r", pending_div, item)
                kept.append('<!-- %r %r -->' % (pending_div, item))
            else:
                kept.append(pending_div)
                kept.append(item)
            pending_div = None
        else:
            kept.append(item)

    # Flush a <div> that was the very last item; without this it would be
    # silently dropped from the output.
    if pending_div is not None:
        kept.append(pending_div)

    return htmldata.tagjoin(kept)

Example #2

0

Show file

File: clean.py Project: eventohk/News-Aggregator

def safe_html(data):
    """
    Remove all tag attributes from html except a.href and img.src

    The input is first passed through ``normalize_html``, then split with
    ``htmldata.tagextract``; attribute dicts of tag items are pruned in
    place and the result is re-joined with ``htmldata.tagjoin``.
    """
    data = normalize_html(data)
    tree = htmldata.tagextract(data)
    for elem in tree:
        if not isinstance(elem, tuple):
            continue
        name, attrs = elem[0], elem[1]
        # Iterate over a snapshot of the keys: deleting from a dict while
        # iterating its live key view raises RuntimeError on Python 3.
        for attr in list(attrs):
            if name == 'a' and attr == 'href':
                continue
            # NOTE(review): only the tag name 'img/' is matched here; an
            # item named plain 'img' would lose its src -- confirm that
            # normalize_html always yields the 'img/' form.
            if name == 'img/' and attr == 'src':
                continue
            del attrs[attr]
    return htmldata.tagjoin(tree)

Example #3

0

Show file

File: clean.py Project: vidarmagnusson/FrodiCleaned

def safe_html(data):
    """
    Remove all tag attributes from html except a.href and img.src

    The input is first passed through ``normalize_html``, then split with
    ``htmldata.tagextract``; attribute dicts of tag items are pruned in
    place, the result is re-joined with ``htmldata.tagjoin`` and
    normalized once more before being returned.
    """
    data = normalize_html(data)
    tree = htmldata.tagextract(data)
    for elem in tree:
        if not isinstance(elem, tuple):
            continue
        name, attrs = elem[0], elem[1]
        # Iterate over a snapshot of the keys: deleting from a dict while
        # iterating its live key view raises RuntimeError on Python 3.
        for attr in list(attrs):
            if name == 'a' and attr == 'href':
                continue
            # NOTE(review): only the tag name 'img/' is matched here; an
            # item named plain 'img' would lose its src -- confirm that
            # normalize_html always yields the 'img/' form.
            if name == 'img/' and attr == 'src':
                continue
            del attrs[attr]
    data = htmldata.tagjoin(tree)
    # HACK: temporary workaround. htmldata.tagjoin can return an invalid
    # DIV, so the joined markup is normalized a second time.
    # Data for testing: http://py-algorithm.blogspot.com/2011/04/blog-post_3267.html
    data = normalize_html(data)
    return data

Example #4

0

Show file

def fix_move_href_tags(doc):
    """
    Return copy of doc with all MOVE_HREF tags removed.

    Each ``<MOVE_HREF ...> ... </MOVE_HREF>`` span is replaced by its inner
    items.  The wrapper's ``href`` attribute (``""`` when absent) is
    written onto every inner item that already has an ``href``; when the
    wrapper's href is empty, such inner items are dropped instead.

    Raises ValueError (from ``str.index``) if an opening tag has no
    matching closing tag or closing ``>``.
    """
    opener = "<" + MOVE_HREF
    closer = "</" + MOVE_HREF
    while opener in doc:
        start = doc.index(opener)
        close_at = doc.index(closer, start + 1)
        # `end` is one past the ">" that terminates the closing tag.
        end = doc.index(">", close_at + 1) + 1
        tags = htmldata.tagextract(doc[start:end])
        assert tags[0][0] == MOVE_HREF
        assert tags[-1][0] == "/" + MOVE_HREF
        href = tags[0][1].get("href", "")
        new_tags = []
        for tag in tags[1:-1]:
            if len(tag) == 2 and "href" in tag[1]:
                if href == "":
                    # Nothing to move onto this item: drop it.
                    continue
                tag[1]["href"] = href
            new_tags.append(tag)
        doc = doc[:start] + htmldata.tagjoin(new_tags) + doc[end:]
    return doc

Example #5

0

Show file

File: mw2html.py Project: weetmuts/rodin-b-sharp-rodincore

def fix_move_href_tags(doc, config):
    """
    Return copy of doc with all MOVE_HREF tags removed.

    Every ``<MOVE_HREF ...> ... </MOVE_HREF>`` span is unwrapped.  Inner
    items carrying an ``href`` receive the wrapper's ``href``; if the
    wrapper has no (or an empty) href, those inner items are discarded.

    ``config`` is accepted but not read by this function.

    Raises ValueError (from ``str.index``) when the closing tag or its
    terminating ``>`` cannot be found.
    """
    open_marker = '<' + MOVE_HREF
    while True:
        start = doc.find(open_marker)
        if start == -1:
            # No wrapper tags left.
            return doc
        close_start = doc.index('</' + MOVE_HREF, start + 1)
        end = doc.index('>', close_start + 1) + 1
        tags = htmldata.tagextract(doc[start:end])
        assert tags[0][0] == MOVE_HREF
        assert tags[-1][0] == '/' + MOVE_HREF
        href = tags[0][1].get('href', '')
        rewritten = []
        for inner in tags[1:-1]:
            has_href = len(inner) == 2 and 'href' in inner[1]
            if has_href and href == '':
                # Wrapper supplies no target: drop the href-bearing item.
                continue
            if has_href:
                inner[1]['href'] = href
            rewritten.append(inner)
        doc = doc[:start] + htmldata.tagjoin(rewritten) + doc[end:]

Example #6

0

Show file

File: mw2html.py Project: Angeldude/audacity

def fix_move_href_tags(doc):
    """
    Return copy of doc with all MOVE_HREF tags removed.

    The contents of each ``<MOVE_HREF ...> ... </MOVE_HREF>`` span replace
    the span itself.  An inner item that has an ``href`` gets the
    wrapper's ``href`` value; if that value is empty (or missing) the
    inner item is left out of the result.

    Raises ValueError (from ``str.index``) if a wrapper is not closed.
    """
    while '<' + MOVE_HREF in doc:
        begin = doc.index('<' + MOVE_HREF)
        closing = doc.index('</' + MOVE_HREF, begin + 1)
        finish = doc.index('>', closing + 1) + 1
        head, span, tail = doc[:begin], doc[begin:finish], doc[finish:]

        parsed = htmldata.tagextract(span)
        assert parsed[0][0] == MOVE_HREF
        assert parsed[-1][0] == '/' + MOVE_HREF
        target = parsed[0][1].get('href', '')

        survivors = []
        for node in parsed[1:-1]:
            if len(node) != 2 or 'href' not in node[1]:
                # Items without an href pass through untouched.
                survivors.append(node)
            elif target != '':
                node[1]['href'] = target
                survivors.append(node)
            # else: href-bearing item with an empty target is dropped.
        doc = head + htmldata.tagjoin(survivors) + tail
    return doc