Example #1
0
    'type',
    'width',
    'face',
    'size',  # font tags
    'flashvars',  # Not sure about flashvars - if any harm can come from it
    'classid',  # FF needs the classid on object tags for flash
    'name',
    'value',
    'quality',
    'data',
    'scale',  # for flash embed param tags, could limit to just param if this is harmful
    'salign',
    'align',
    'wmode',
))  # Bad attributes: 'allowscriptaccess', 'xmlns', 'target'
# Tag-name replacement table given to the scrubber: 'b' maps to 'strong' and
# 'i' maps to 'em' (presumably applied while normalizing markup -- confirm
# against the scrubber implementation).
scrubber.normalized_tag_replacements = {
    'b': 'strong',
    'i': 'em',
}
# Any giveaway classes that definitely identify a footer.
footer_classes = ['gmail_quote', 'moz-signature']
# Used for "on the 25th of DEC so and so wrote:" style patterns.
dates = [
    'JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN', 'JUL', 'AUG', 'SEP', 'OCT',
    'NOV', 'DEC'
]
# Giveaway email reply strings.
email_heads = ['FROM', 'TO', 'SUBJECT']
# We can boost certainty of a signoffNodes significantly by looking for common
# one-liners like "cheers" <- a word many people say online but not IRL?
# NOTE: one entry per line, each with its own trailing comma. Adjacent string
# literals are silently concatenated by Python, which previously merged
# 'thankyou' and 'thank you' into the single bogus entry 'thankyouthank you'.
common_signoffs = [
    'thanks',
    'cheers',
    'thankyou',
    'thank you',
    'regards',
    'sincerely',
]
# When we are trying to detect a line, we need to know if an element is inline. These are the ones we are likely to encounter in an email (no textarea) (:
    'table', 'tbody', 'td', 'th', 'thead', 'tr', 'tt', 'ul', 'u',
    'var', 'wbr',
))
# Tags registered under the scrubber's "disallowed, save content" setting
# (presumably the tag itself is dropped while its children are kept --
# confirm against the scrubber implementation).
scrubber.disallowed_tags_save_content = set([
    'blink',
    'body',
    'html',
    'font',
])

# Attribute names the scrubber is configured to allow.
scrubber.allowed_attributes = set([
    # General presentation / linking attributes.
    'align',
    'alt',
    'border',
    'cite',
    'dir',
    'height',
    'href',
    'src',
    'title',
    'type',
    'width',
    # font tags
    'face',
    'size',
    # Not sure about flashvars - if any harm can come from it
    'flashvars',
    # FF needs the classid on object tags for flash
    'classid',
    # for flash embed param tags, could limit to just param if this is harmful
    'name',
    'value',
    'quality',
    'data',
    'scale',
    'salign',
    'wmode',
])
# Bad attributes: 'allowscriptaccess', 'xmlns', 'target'

# Tag-name replacement table: 'b' maps to 'strong', 'i' maps to 'em'.
scrubber.normalized_tag_replacements = {
    'b': 'strong',
    'i': 'em',
}
# Any giveaway classes that definitely identify a footer.
footer_classes = ['gmail_quote', 'moz-signature']
# Used for "on the 25th of DEC so and so wrote:" style patterns.
dates = [
    'JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN',
    'JUL', 'AUG', 'SEP', 'OCT', 'NOV', 'DEC',
]
# Giveaway email reply strings.
email_heads = ['FROM', 'TO', 'SUBJECT']
# We can boost certainty of a signoffNodes significantly by looking for common
# one-liners like "cheers" <- a word many people say online but not IRL?
# NOTE: the comma after 'thankyou' was missing, so Python's implicit string
# concatenation produced the single entry 'thankyouthank you' and neither
# 'thankyou' nor 'thank you' was actually in the list.
common_signoffs = [
    'thanks',
    'cheers',
    'thankyou',
    'thank you',
    'regards',
    'sincerely',
]
# When we are trying to detect a line, we need to know if an element is
# inline. These are the ones we are likely to encounter in an email
# (no textarea) (:
inline_elements = [
    'A', 'ABBR', 'ACRONYM', 'B', 'BIG', 'CITE', 'CODE', 'EM', 'FONT',
    'I', 'IMG', 'SMALL', 'SPAN', 'STRIKE', 'STRONG', 'SUB', 'SUP', 'U',
]

def split_html_message(messageText, tolerance = {
                                      'comment'        : 3,
                                      'seek'           : 10,
                                      'words_per_line' : 3,