def post_html_transform(doc, url, config):
    """
    User-customizable HTML transform.

    Given an HTML document (with URLs already rewritten), returns
    modified HTML document.

    The following steps run in order, each gated by a config flag:

      * config.hack_skin     -- apply the skin-specific fixes; only
                                MONOBOOK_SKIN is handled.
      * config.move_href     -- run fix_move_href_tags().
      * config.remove_history -- run html_remove_image_history().

    Afterwards every empty div (an opening 'div' tag immediately followed
    by its closing '/div' tag, with nothing at all in between) is replaced
    by an HTML comment holding the repr() of both tags.

    The url argument is accepted but not used by this function.

    Raises ValueError if config.hack_skin is set and config.skin is not
    MONOBOOK_SKIN.
    """
    if config.hack_skin:
        if config.skin != MONOBOOK_SKIN:
            raise ValueError('unknown skin')
        doc = monobook_fix_html_sidebar(doc, config)
        doc = monobook_hack_skin_html(doc, config)
    if config.move_href:
        doc = fix_move_href_tags(doc, config)
    if config.remove_history:
        doc = html_remove_image_history(doc, config)

    # NOTE: a pass that stripped the contents of <span class="editsection">
    # (the edit links) used to run inside the loop below; it was disabled.

    # Remove empty divs.  An opening div is held back in `pending` until the
    # next item shows whether the div is empty or not.
    items = htmldata.tagextract(doc)
    result = []
    pending = None
    for item in items:
        is_tag = isinstance(item, tuple)
        if is_tag and item[0] == 'div':
            # A new opening div: the one held back (if any) was not empty.
            if pending is not None:
                result.append(pending)
            pending = item
        elif pending is None:
            result.append(item)
        elif is_tag and item[0] == '/div':
            # <div ...></div> with nothing in between: swap both tags for a
            # comment so the removal stays visible in the output.
            logging.debug("Remove item %r %r", pending, item)
            result.append('<!-- %s %s -->' % (repr(pending), repr(item)))
            pending = None
        else:
            result.append(pending)
            result.append(item)
            pending = None

    # Flush an opening div that was the very last item; without this the
    # tag would silently vanish from the output.
    if pending is not None:
        result.append(pending)

    return htmldata.tagjoin(result)
def safe_html(data):
    """
    Remove all tag attributes from html except a.href and img.src

    The input is first passed through normalize_html(), then split with
    htmldata.tagextract(); the attribute dicts of the tag tuples are
    pruned in place and the result is re-assembled with htmldata.tagjoin().
    """
    data = normalize_html(data)
    tree = htmldata.tagextract(data)
    for elem in tree:
        if not isinstance(elem, tuple):
            # Plain text between tags carries no attributes.
            continue
        tag_name, attrs = elem[0], elem[1]
        # Iterate over a snapshot of the keys: deleting from a dict while
        # iterating its live key view raises RuntimeError on Python 3.
        for attr in list(attrs.keys()):
            if tag_name == 'a' and attr == 'href':
                continue
            # NOTE(review): only the self-closing 'img/' name is matched; a
            # tag named plain 'img' would lose its src.  Presumably
            # normalize_html() always yields the self-closing form -- confirm.
            if tag_name == 'img/' and attr == 'src':
                continue
            del attrs[attr]
    return htmldata.tagjoin(tree)
def safe_html(data):
    """
    Remove all tag attributes from html except a.href and img.src

    The input is normalized with normalize_html(), split with
    htmldata.tagextract(), stripped of every other attribute, re-joined
    with htmldata.tagjoin() and finally normalized a second time.
    """
    data = normalize_html(data)
    tree = htmldata.tagextract(data)
    for elem in tree:
        if not isinstance(elem, tuple):
            # Plain text between tags carries no attributes.
            continue
        tag_name, attrs = elem[0], elem[1]
        # Iterate over a snapshot of the keys: deleting from a dict while
        # iterating its live key view raises RuntimeError on Python 3.
        for attr in list(attrs.keys()):
            if tag_name == 'a' and attr == 'href':
                continue
            # NOTE(review): only the self-closing 'img/' name is matched; a
            # tag named plain 'img' would lose its src.  Presumably
            # normalize_html() always yields the self-closing form -- confirm.
            if tag_name == 'img/' and attr == 'src':
                continue
            del attrs[attr]
    data = htmldata.tagjoin(tree)

    # HACK (temporary): htmldata.tagjoin() can return an invalid DIV, so the
    # joined markup is normalized once more before being returned.
    # Data for testing: http://py-algorithm.blogspot.com/2011/04/blog-post_3267.html
    data = normalize_html(data)
    return data
def fix_move_href_tags(doc, config=None):
    """
    Return copy of doc with all MOVE_HREF tags removed.

    Each <MOVE_HREF ...> ... </MOVE_HREF> span is replaced by its inner
    content, in which every tag that has an href attribute gets that href
    overwritten with the href of the enclosing MOVE_HREF tag.  When the
    MOVE_HREF tag has no (or an empty) href, inner tags that carry an href
    are dropped instead; their closing tags and the text stay.

    config is not used.  It is optional and exists only so that callers
    which pass a config object after doc (as post_html_transform does)
    work with this signature.

    Raises ValueError (from str.index) when an opening MOVE_HREF tag has no
    matching closing tag, and AssertionError when the extracted span does
    not begin and end with the MOVE_HREF tags.
    """
    open_marker = "<" + MOVE_HREF
    close_marker = "</" + MOVE_HREF
    while open_marker in doc:
        start = doc.index(open_marker)
        close_at = doc.index(close_marker, start + 1)
        # One past the '>' that terminates the closing tag.
        end = doc.index(">", close_at + 1) + 1

        tags = htmldata.tagextract(doc[start:end])
        assert tags[0][0] == MOVE_HREF
        assert tags[-1][0] == "/" + MOVE_HREF
        href = tags[0][1].get("href", "")

        new_tags = []
        for tag in tags[1:-1]:
            # Only tag tuples (name, attrs) can carry an href; text items
            # are plain strings and are copied through untouched.
            if isinstance(tag, tuple) and "href" in tag[1]:
                if href == "":
                    # Nothing to move in: drop the link tag itself.
                    continue
                tag[1]["href"] = href
            new_tags.append(tag)

        doc = doc[:start] + htmldata.tagjoin(new_tags) + doc[end:]
    return doc
def fix_move_href_tags(doc, config):
    """
    Strip every MOVE_HREF element from doc and return the resulting text.

    The content between an opening and closing MOVE_HREF tag is kept.
    Inside it, each tag with an href attribute takes over the href of the
    surrounding MOVE_HREF tag; if that href is missing or empty, such tags
    are left out of the result.  config is accepted but not consulted.
    """
    opener = '<' + MOVE_HREF
    closer = '</' + MOVE_HREF
    while opener in doc:
        begin = doc.index(opener)
        closer_pos = doc.index(closer, begin + 1)
        finish = doc.index('>', closer_pos + 1) + 1

        pieces = htmldata.tagextract(doc[begin:finish])
        first, last = pieces[0], pieces[-1]
        assert first[0] == MOVE_HREF
        assert last[0] == '/' + MOVE_HREF

        moved_href = first[1].get('href', '')
        kept = []
        for piece in pieces[1:-1]:
            if len(piece) == 2 and 'href' in piece[1]:
                if moved_href == '':
                    # No href to hand down: omit this tag.
                    continue
                piece[1]['href'] = moved_href
            kept.append(piece)

        head = doc[:begin]
        tail = doc[finish:]
        doc = head + htmldata.tagjoin(kept) + tail
    return doc
def fix_move_href_tags(doc, config=None):
    """
    Return copy of doc with all MOVE_HREF tags removed.

    Each <MOVE_HREF ...> ... </MOVE_HREF> span is replaced by its inner
    content, in which every tag that has an href attribute gets that href
    overwritten with the href of the enclosing MOVE_HREF tag.  When the
    MOVE_HREF tag has no (or an empty) href, inner tags that carry an href
    are dropped instead; their closing tags and the text stay.

    config is not used.  It is optional and exists only so that callers
    which pass a config object after doc (as post_html_transform does)
    work with this signature.

    Raises ValueError (from str.index) when an opening MOVE_HREF tag has no
    matching closing tag, and AssertionError when the extracted span does
    not begin and end with the MOVE_HREF tags.
    """
    open_marker = '<' + MOVE_HREF
    close_marker = '</' + MOVE_HREF
    while open_marker in doc:
        start = doc.index(open_marker)
        close_at = doc.index(close_marker, start + 1)
        # One past the '>' that terminates the closing tag.
        end = doc.index('>', close_at + 1) + 1

        tags = htmldata.tagextract(doc[start:end])
        assert tags[0][0] == MOVE_HREF
        assert tags[-1][0] == '/' + MOVE_HREF
        href = tags[0][1].get('href', '')

        new_tags = []
        for tag in tags[1:-1]:
            # Only tag tuples (name, attrs) can carry an href; text items
            # are plain strings and are copied through untouched.
            if isinstance(tag, tuple) and 'href' in tag[1]:
                if href == '':
                    # Nothing to move in: drop the link tag itself.
                    continue
                tag[1]['href'] = href
            new_tags.append(tag)

        doc = doc[:start] + htmldata.tagjoin(new_tags) + doc[end:]
    return doc