def escape_email_quoted_text(text, indent_txt='>>', linebreak_txt='\n'):
    """Wash an email-style quoted text line by line, keeping quote markers.

    The text is split on C{linebreak_txt}.  Each line is stripped of its
    surrounding whitespace, its leading run of C{indent_txt} markers is
    counted and removed, and the remainder is passed through
    C{HTMLWasher.wash(..., render_unallowed_tags=True)}.  The same number
    of markers is then put back in front of the washed line, so the
    quoting levels themselves never go through the washer.

    For instance, with the default marker, in a line such as::

        >>>>Is there <script>someone</script> else up there?

    the leading C{>>>>} is kept verbatim and only the text after it is
    washed.

    @param text: the quoted text to escape
    @param indent_txt: the marker denoting one level of quoting
    @param linebreak_txt: the line separator, used for both input and output
    @return: the washed lines joined with C{linebreak_txt} (no trailing
        separator)
    """
    washer = HTMLWasher()
    marker_len = len(indent_txt)
    washed_lines = []
    for line in text.split(linebreak_txt):
        line = line.strip()
        nb_indent = 0
        # An empty marker matches every string, which would loop forever:
        # treat it as "no quoting at all".
        while marker_len and line.startswith(indent_txt):
            nb_indent += 1
            line = line[marker_len:]
        washed_lines.append(
            nb_indent * indent_txt +
            washer.wash(line, render_unallowed_tags=True))
    # Joining, rather than appending a separator after every line and then
    # chopping one character off the end, stays correct when the separator
    # is longer than one character (e.g. '\r\n').
    return linebreak_txt.join(washed_lines)
def escape_email_quoted_text(text, indent_txt='>>', linebreak_txt='\n'):
    """Escape text using an email-like indenting rule.

    Every line of C{text} (lines are delimited by C{linebreak_txt}) is
    handled independently:

      1. surrounding whitespace is stripped;
      2. the leading quote markers (repetitions of C{indent_txt}) are
         counted and removed;
      3. what is left is washed with
         C{HTMLWasher.wash(..., render_unallowed_tags=True)};
      4. the quote markers are prepended again, unwashed.

    So for a line like::

        >>bravely ran away away... *I didn't!*<script>malicious code</script>

    the C{>>} prefix is preserved as is, and only the rest of the line is
    given to the washer.

    @param text: the quoted text to escape
    @param indent_txt: the marker denoting one level of quoting
    @param linebreak_txt: the line separator, used for both input and output
    @return: the escaped lines joined with C{linebreak_txt} (no trailing
        separator)
    """
    washer = HTMLWasher()
    marker_len = len(indent_txt)
    escaped_lines = []
    for line in text.split(linebreak_txt):
        line = line.strip()
        nb_indent = 0
        # Guard on marker_len: startswith('') is always true, so an empty
        # marker would otherwise never leave this loop.
        while marker_len and line.startswith(indent_txt):
            nb_indent += 1
            line = line[marker_len:]
        escaped_lines.append(
            (nb_indent * indent_txt) +
            washer.wash(line, render_unallowed_tags=True))
    # The previous implementation dropped exactly one trailing character,
    # which is only right for one-character separators; join() handles any
    # separator length.
    return linebreak_txt.join(escaped_lines)
class XSSEscapingTest(unittest.TestCase):
    """Checks that HTMLWasher neutralises markup usable for XSS attacks."""

    def __init__(self, methodName='test'):
        # The washer is attached before delegating to the base class; all
        # test methods below read it from the instance.
        self.washer = HTMLWasher()
        unittest.TestCase.__init__(self, methodName)

    def test_forbidden_formatting_tags(self):
        """htmlutils - washing of tags altering formatting of a page (e.g. </html>)"""
        markup = """</html></body></pre>"""
        stripped = self.washer.wash(html_buffer=markup)
        self.assertEqual(stripped, '')
        # NOTE(review): the expected value equals the raw input; confirm the
        # literal has not lost its HTML entities.
        rendered = self.washer.wash(html_buffer=markup,
                                    render_unallowed_tags=True)
        self.assertEqual(rendered, '</html></body></pre>')

    def test_forbidden_script_tags(self):
        """htmlutils - washing of tags defining scripts (e.g. <script>)"""
        markup = """<script>malicious_function();</script>"""
        stripped = self.washer.wash(html_buffer=markup)
        self.assertEqual(stripped, '')
        # NOTE(review): same remark as above about the expected literal.
        rendered = self.washer.wash(html_buffer=markup,
                                    render_unallowed_tags=True)
        self.assertEqual(rendered, '<script>malicious_function();</script>')

    def test_forbidden_attributes(self):
        """htmlutils - washing of forbidden attributes in allowed tags (e.g. onLoad)"""
        offending_paragraphs = (
            # onload
            """<p onload="javascript:malicious_functtion();">""",
            # tricky: css calling a javascript
            """<p style="background: url('http://malicious_site.com/malicious_script.js');">""",
        )
        for markup in offending_paragraphs:
            self.assertEqual(self.washer.wash(html_buffer=markup), '<p>')

    def test_fake_url(self):
        """htmlutils - washing of fake URLs which execute scripts"""
        disguised_links = (
            """<a href="javascript:malicious_function();">link</a>""",
            # Pirates could encode ascii values, or use uppercase letters...
            """<a href="javasCRipt:malicious_function();">link</a>""",
            # MSIE treats 'java\ns\ncript:' the same way as 'javascript:'
            # Here we test with:
            # j
            #     avas
            #   crIPt :
            """<a href="j\n    avas\n  crIPt :malicious_function();">link</a>""",
        )
        for markup in disguised_links:
            self.assertEqual(self.washer.wash(html_buffer=markup),
                             '<a href="">link</a>')
class XSSEscapingTest(InvenioTestCase):
    """Exercises HTMLWasher against typical XSS injection attempts."""

    def __init__(self, methodName='test'):
        # One washer per test-case instance, created ahead of the base
        # class initialisation.
        self.washer = HTMLWasher()
        InvenioTestCase.__init__(self, methodName)

    def _assert_washed(self, markup, expected, **wash_options):
        """Wash C{markup} with the given options and compare to C{expected}."""
        washed = self.washer.wash(html_buffer=markup, **wash_options)
        self.assertEqual(washed, expected)

    def test_forbidden_formatting_tags(self):
        """htmlutils - washing of tags altering formatting of a page (e.g. </html>)"""
        closing_tags = """</html></body></pre>"""
        self._assert_washed(closing_tags, '')
        # NOTE(review): expected value is identical to the input; confirm
        # the literal was not entity-decoded somewhere along the way.
        self._assert_washed(closing_tags, '</html></body></pre>',
                            render_unallowed_tags=True)

    def test_forbidden_script_tags(self):
        """htmlutils - washing of tags defining scripts (e.g. <script>)"""
        script = """<script>malicious_function();</script>"""
        self._assert_washed(script, '')
        # NOTE(review): see the remark in test_forbidden_formatting_tags.
        self._assert_washed(script, '<script>malicious_function();</script>',
                            render_unallowed_tags=True)

    def test_forbidden_attributes(self):
        """htmlutils - washing of forbidden attributes in allowed tags (e.g. onLoad)"""
        # onload
        self._assert_washed(
            """<p onload="javascript:malicious_functtion();">""", '<p>')
        # tricky: css calling a javascript
        self._assert_washed(
            """<p style="background: url('http://malicious_site.com/malicious_script.js');">""",
            '<p>')

    def test_fake_url(self):
        """htmlutils - washing of fake URLs which execute scripts"""
        harmless_link = '<a href="">link</a>'
        self._assert_washed(
            """<a href="javascript:malicious_function();">link</a>""",
            harmless_link)
        # Pirates could encode ascii values, or use uppercase letters...
        self._assert_washed(
            """<a href="javasCRipt:malicious_function();">link</a>""",
            harmless_link)
        # MSIE treats 'java\ns\ncript:' the same way as 'javascript:'
        # Here we test with:
        # j
        #     avas
        #   crIPt :
        self._assert_washed(
            """<a href="j\n    avas\n  crIPt :malicious_function();">link</a>""",
            harmless_link)
class HTMLAutomaticLinksTransformation(InvenioTestCase):
    """Checks the washer option that turns bare URLs into HTML anchors."""

    def __init__(self, methodName='test'):
        # Washer shared by all the test methods of this instance.
        self.washer = HTMLWasher()
        InvenioTestCase.__init__(self, methodName)

    def _assert_linkified(self, source, expected):
        """Wash C{source} with automatic link transformation and compare."""
        washed = self.washer.wash(html_buffer=source,
                                  automatic_link_transformation=True)
        self.assertEqual(washed, expected)

    def test_transform_link(self):
        """htmlutils - transforming a link"""
        self._assert_linkified(
            'https://cds.cern.ch/collection/Multimedia%20%26%20Outreach?ln=es',
            '<a href="https://cds.cern.ch/collection/Multimedia%20%26%20Outreach?ln=es">'
            'https://cds.cern.ch/collection/Multimedia%20%26%20Outreach?ln=es</a>')

    def test_transform_several_links(self):
        """htmlutils - transforming several links"""
        self._assert_linkified(
            'some text https://cds.cern.ch/collection/Videos?ln=es'
            ' more text https://cds.cern.ch/search?p=%27CERN+News',
            'some text <a href="https://cds.cern.ch/collection/Videos?ln=es">'
            'https://cds.cern.ch/collection/Videos?ln=es</a>'
            ' more text <a href="https://cds.cern.ch/search?p=%27CERN">'
            'https://cds.cern.ch/search?p=%27CERN</a>+News')

    def test_transform_just_valid_links(self):
        """htmlutils - transforming just valid links"""
        # The second URL (host 'cds..cern') is expected to stay plain text.
        self._assert_linkified(
            'some text https://cds.cern.ch/collection/Videos?ln=es'
            ' more text https://cds..cern/search?p=%27CERN+News',
            'some text <a href="https://cds.cern.ch/collection/Videos?ln=es">'
            'https://cds.cern.ch/collection/Videos?ln=es</a>'
            ' more text https://cds..cern/search?p=%27CERN+News')

    def test_not_transform_link(self):
        """htmlutils - not transforming a link"""
        # An existing anchor must come out exactly as it went in.
        self._assert_linkified(
            '<a href="https://cds.cern.ch/collection/Multimedia%20%26%20Outreach?ln=es">Multimedia</a>',
            '<a href="https://cds.cern.ch/collection/Multimedia%20%26%20Outreach?ln=es">Multimedia</a>')
class HTMLAutomaticLinksTransformation(unittest.TestCase):
    """Tests the conversion of plain-text URLs into HTML links by the washer."""

    def __init__(self, methodName='test'):
        # The washer is set on the instance before the base initialisation.
        self.washer = HTMLWasher()
        unittest.TestCase.__init__(self, methodName)

    def test_transform_link(self):
        """htmlutils - transforming a link"""
        url = 'https://cds.cern.ch/collection/Multimedia%20%26%20Outreach?ln=es'
        wanted = ('<a href="https://cds.cern.ch/collection/Multimedia%20%26%20Outreach?ln=es">'
                  'https://cds.cern.ch/collection/Multimedia%20%26%20Outreach?ln=es</a>')
        got = self.washer.wash(html_buffer=url,
                               automatic_link_transformation=True)
        self.assertEqual(got, wanted)

    def test_transform_several_links(self):
        """htmlutils - transforming several links"""
        text = ('some text https://cds.cern.ch/collection/Videos?ln=es'
                ' more text https://cds.cern.ch/search?p=%27CERN+News')
        wanted = ('some text <a href="https://cds.cern.ch/collection/Videos?ln=es">'
                  'https://cds.cern.ch/collection/Videos?ln=es</a>'
                  ' more text <a href="https://cds.cern.ch/search?p=%27CERN">'
                  'https://cds.cern.ch/search?p=%27CERN</a>+News')
        got = self.washer.wash(html_buffer=text,
                               automatic_link_transformation=True)
        self.assertEqual(got, wanted)

    def test_transform_just_valid_links(self):
        """htmlutils - transforming just valid links"""
        # Only the first URL is expected to become a link; the second one
        # (host 'cds..cern') must be left as plain text.
        text = ('some text https://cds.cern.ch/collection/Videos?ln=es'
                ' more text https://cds..cern/search?p=%27CERN+News')
        wanted = ('some text <a href="https://cds.cern.ch/collection/Videos?ln=es">'
                  'https://cds.cern.ch/collection/Videos?ln=es</a>'
                  ' more text https://cds..cern/search?p=%27CERN+News')
        got = self.washer.wash(html_buffer=text,
                               automatic_link_transformation=True)
        self.assertEqual(got, wanted)

    def test_not_transform_link(self):
        """htmlutils - not transforming a link"""
        # Markup that already is an anchor must be returned unchanged.
        anchor = '<a href="https://cds.cern.ch/collection/Multimedia%20%26%20Outreach?ln=es">Multimedia</a>'
        wanted = '<a href="https://cds.cern.ch/collection/Multimedia%20%26%20Outreach?ln=es">Multimedia</a>'
        got = self.washer.wash(html_buffer=anchor,
                               automatic_link_transformation=True)
        self.assertEqual(got, wanted)
def format_element(bfo, note_suffix, note_prefix='Note: ', separator='; '):
    """
    Displays the notes stored in field 500__a.

    Every value of 500__a is washed with automatic link transformation
    enabled; the washed values are joined with C{separator} into a single
    group, which is then wrapped between C{note_prefix} and C{note_suffix}.

    @param note_prefix: a prefix before each group of notes
    @param note_suffix: a suffix after each group of notes
    @param separator: a separator between notes of a group
    @return: the formatted notes, or None when 500__a has no value
    """
    washer = HTMLWasher()
    groups = []

    # One group per source field; only 500__a is read here.
    if bfo.fields('500__a'):
        washed_values = []
        for value in bfo.fields('500__a'):
            washed_values.append(
                washer.wash(value, automatic_link_transformation=True))
        groups.append(separator.join(washed_values))

    if not groups:
        return None

    # Decorate each group, then concatenate them without extra separator.
    decorated = [note_prefix + group + note_suffix for group in groups]
    return "".join(decorated)
def format_element(bfo, note_suffix, note_prefix='Note: ', separator='; '):
    """
    Displays notes (values of field 500__a).

    The values are washed one by one (with automatic link transformation,
    so that URLs become clickable), joined with C{separator}, and the
    resulting group is surrounded by C{note_prefix} and C{note_suffix}.

    @param note_prefix: a prefix before each group of notes
    @param note_suffix: a suffix after each group of notes
    @param separator: a separator between notes of a group
    @return: the formatted notes, or None when 500__a has no value
    """
    washer = HTMLWasher()

    def wash_and_join(values):
        # Wash every value, then glue the results with the separator.
        cleaned = []
        for value in values:
            cleaned.append(
                washer.wash(value, automatic_link_transformation=True))
        return separator.join(cleaned)

    note_groups = []
    if bfo.fields('500__a'):
        note_groups.append(wash_and_join(bfo.fields('500__a')))

    if len(note_groups) == 0:
        return None

    pieces = []
    for group in note_groups:
        pieces.append(note_prefix + group + note_suffix)
    return "".join(pieces)
def format_element(bfo, note_suffix, note_prefix='Note: ', separator='; '):
    """
    Displays notes (values of field 500__a), one note per value.

    Each value is washed with automatic link transformation enabled and
    individually wrapped between C{note_prefix} and C{note_suffix}; the
    wrapped notes are concatenated.

    @param note_prefix: a prefix before each note
    @param note_suffix: a suffix after each note
    @param separator: not used by this implementation (every value is its
        own note); kept in the signature
    @return: the formatted notes, or None when 500__a has no value
    """
    washer = HTMLWasher()

    # NOTE(review): replace("&", "&") leaves the value unchanged.  The
    # replacement was presumably meant to be an HTML entity -- confirm
    # against the intended behaviour before touching it.
    washed_notes = [
        washer.wash(value.replace("&", "&"),
                    automatic_link_transformation=True)
        for value in bfo.fields('500__a')
    ]

    if not washed_notes:
        return None

    decorated = [note_prefix + note + note_suffix for note in washed_notes]
    return "".join(decorated)
class HTMLWashingTest(InvenioTestCase):
    """Checks the general-purpose washing of HTML source by HTMLWasher."""

    def __init__(self, methodName='test'):
        # Washer used by the test methods, created before the base
        # class initialisation.
        self.washer = HTMLWasher()
        InvenioTestCase.__init__(self, methodName)

    def test_wash_html(self):
        """htmlutils - washing HTML tags"""
        # Simple test case
        washed = self.washer.wash(
            html_buffer='Spam and <b><blink>eggs</blink></b>')
        self.assertEqual(washed, 'Spam and <b>eggs</b>')

        # Show 'escaped' tags
        # NOTE(review): the expected value equals the input; confirm the
        # literal has not lost its HTML entities.
        washed = self.washer.wash(
            html_buffer='Spam and <b><blink>eggs</blink></b>',
            render_unallowed_tags=True)
        self.assertEqual(washed, 'Spam and <b><blink>eggs</blink></b>')

        # (input, expected) pairs washed with the default options
        default_cases = (
            # Keep entity and character references
            # NOTE(review): neither literal contains a reference; they may
            # have been entity-decoded -- confirm.
            ('<b> a < b > c </b> ÷',
             '<b> a < b > c </b> ÷'),
            # Remove content of <script> tags
            ('<script type="text/javacript">alert("foo")</script>bar',
             'bar'),
            ('<script type="text/javacript"><!--alert("foo")--></script>bar',
             'bar'),
            # Remove content of <style> tags
            ('<style>.myclass {color:#f00}</style><span class="myclass">styled text</span>',
             'styled text'),
            ('<style><!-- .myclass {color:#f00} --></style><span class="myclass">styled text</span>',
             'styled text'),
        )
        for source, expected in default_cases:
            self.assertEqual(self.washer.wash(html_buffer=source), expected)
class HTMLWashingTest(unittest.TestCase):
    """Tests HTMLWasher on ordinary (non-malicious) HTML clean-up tasks."""

    def __init__(self, methodName='test'):
        # The washer must exist on the instance for test_wash_html.
        self.washer = HTMLWasher()
        unittest.TestCase.__init__(self, methodName)

    def _wash(self, source, **options):
        """Shortcut for C{self.washer.wash(html_buffer=source, ...)}."""
        return self.washer.wash(html_buffer=source, **options)

    def test_wash_html(self):
        """htmlutils - washing HTML tags"""
        blinking = 'Spam and <b><blink>eggs</blink></b>'

        # Simple test case
        self.assertEqual(self._wash(blinking), 'Spam and <b>eggs</b>')

        # Show 'escaped' tags
        # NOTE(review): expected value is identical to the input; confirm
        # the literal was not entity-decoded.
        self.assertEqual(self._wash(blinking, render_unallowed_tags=True),
                         'Spam and <b><blink>eggs</blink></b>')

        # Keep entity and character references
        # NOTE(review): the literals hold no reference at all; they may
        # have been entity-decoded -- confirm.
        self.assertEqual(self._wash('<b> a < b > c </b> ÷'),
                         '<b> a < b > c </b> ÷')

        # Remove content of <script> tags
        self.assertEqual(
            self._wash('<script type="text/javacript">alert("foo")</script>bar'),
            'bar')
        self.assertEqual(
            self._wash('<script type="text/javacript"><!--alert("foo")--></script>bar'),
            'bar')

        # Remove content of <style> tags
        self.assertEqual(
            self._wash('<style>.myclass {color:#f00}</style><span class="myclass">styled text</span>'),
            'styled text')
        self.assertEqual(
            self._wash('<style><!-- .myclass {color:#f00} --></style><span class="myclass">styled text</span>'),
            'styled text')
def __init__(self, methodName='test'):
    """Attach a fresh HTMLWasher, then run the TestCase initialisation."""
    washer = HTMLWasher()
    self.washer = washer
    # Explicit base-class call, with the requested test method name.
    unittest.TestCase.__init__(self, methodName)
def email_quoted_txt2html(text,
                          tabs_before=0,
                          indent_txt='>>',
                          linebreak_txt="\n",
                          indent_html=('<div class="commentbox">', "</div>"),
                          linebreak_html='<br/>',
                          indent_block=True):
    """
    Takes a typical mail quoted text, e.g.::

        hello,
        you told me:
        >> Your mother was a hamster and your father smelt of elderberries
        I must tell you that I'm not convinced. Then in this discussion:
        >>>> Is there someone else up there we could talk to?
        >> No. Now, go away, or I shall taunt you a second time-a!
        I think we're not going to be friends!

    and return an html formatted output, e.g.::

        hello,<br/>
        you told me:<br/>
        <div>
          Your mother was a hamster and your father smelt of elderberries
        </div>
        I must tell you that I'm not convinced. Then in this discussion:
        <div>
          <div>
            Is there someone else up there we could talk to?
          </div>
          No. Now, go away, or I shall taunt you a second time-a!
        </div>
        I think we're not going to be friends!

    The behaviour is different when C{indent_block} is C{True} or C{False}.
    When C{True}, C{indent_html} is only added at each change of level of
    indentation, while it is added for each line when C{False}. For eg::

        >> a
        >> b
        >>>> c

    would result in (if C{True})::

        <div class="commentbox">
            a<br/>
            b<br/>
            <div class="commentbox">
                c<br/>
            </div>
        </div>

    or would be (if C{False})::

        <div class="commentbox"> a</div><br/>
        <div class="commentbox"> b</div><br/>
        <div class="commentbox"><div class="commentbox"> c</div></div><br/>

    Each line is washed with C{HTMLWasher.wash}; when the washer raises
    C{HTMLParseError} the line is escaped with C{cgi.escape} instead.

    @param text: the text in quoted format
    @param tabs_before: number of tabulations before each line
    @param indent_txt: quote separator in email (default:'>>')
    @param linebreak_txt: line separator in email
    @param indent_html: tuple of (opening, closing) html tags.
                        default: ('<div class="commentbox">', "</div>")
    @param linebreak_html: line separator in html (default: '<br/>')
    @param indent_block: if indentation should be done per 'block'
                         i.e. only at changes of indentation level
                         (+1, -1) or at each line.
    @return: string containing html formatted output
    """
    washer = HTMLWasher()
    opening_tag, closing_tag = indent_html[0], indent_html[1]
    marker_len = len(indent_txt)
    chunks = []
    nb_indent = 0
    # Only newlines are trimmed from the ends, whatever linebreak_txt is.
    for line in text.strip('\n').split(linebreak_txt):
        new_nb_indent = 0
        # An empty marker matches every string and would never terminate:
        # treat it as "no quoting".
        while marker_len and line.startswith(indent_txt):
            new_nb_indent += 1
            line = line[marker_len:]
        if indent_block:
            if new_nb_indent > nb_indent:
                # Deeper quoting: open one block per additional level.
                for dummy in range(nb_indent, new_nb_indent):
                    chunks.append(tabs_before * "\t" + opening_tag + "\n")
                    tabs_before += 1
            elif new_nb_indent < nb_indent:
                # Shallower quoting: close one block per level left.
                for dummy in range(new_nb_indent, nb_indent):
                    tabs_before -= 1
                    chunks.append(tabs_before * "\t" + closing_tag + "\n")
            else:
                chunks.append(tabs_before * "\t")
        else:
            chunks.append(tabs_before * "\t" + new_nb_indent * opening_tag)
        try:
            line = washer.wash(line)
        except HTMLParseError:
            # Line contained something like "foo<bar"
            line = cgi.escape(line)
        if indent_block:
            chunks.append(tabs_before * "\t")
        chunks.append(line)
        if not indent_block:
            chunks.append(new_nb_indent * closing_tag)
        chunks.append(linebreak_html + "\n")
        nb_indent = new_nb_indent
    if indent_block:
        # Close the blocks still open after the last line.  The closing tag
        # comes from indent_html (it used to be a hard-coded "</div>"), so
        # custom tags stay balanced.
        for dummy in range(nb_indent):
            tabs_before -= 1
            chunks.append(tabs_before * "\t" + closing_tag + "\n")
    return "".join(chunks)
def tmpl_pageheader(self, req, ln=CFG_SITE_LANG, headertitle="",
                    description="", keywords="", userinfobox="",
                    useractivities_menu="", adminactivities_menu="",
                    navtrailbox="", pageheaderadd="", uid=0,
                    secure_page_p=0, navmenuid="admin", metaheaderadd="",
                    rssurl=CFG_BASE_URL + "/rss", body_css_classes=None):
    """Creates a page header

       Parameters:

      - 'ln' *string* - The language to display

      - 'headertitle' *string* - the title of the HTML page, not yet
        escaped for HTML

      - 'description' *string* - description goes to the metadata in the
        header of the HTML page, not yet escaped for HTML

      - 'keywords' *string* - keywords goes to the metadata in the header
        of the HTML page, not yet escaped for HTML

      - 'userinfobox' *string* - the HTML code for the user information box

      - 'useractivities_menu' *string* - the HTML code for the user
        activities menu

      - 'adminactivities_menu' *string* - the HTML code for the admin
        activities menu

      - 'navtrailbox' *string* - the HTML code for the navigation trail box

      - 'pageheaderadd' *string* - additional page header HTML code

      - 'uid' *int* - user ID

      - 'secure_page_p' *int* (0 or 1) - are we to use HTTPS friendly page
        elements or not?

      - 'navmenuid' *string* - the id of the navigation item to highlight
        for this page

      - 'metaheaderadd' *string* - list of further tags to add to the
        <HEAD></HEAD> part of the page

      - 'rssurl' *string* - the url of the RSS feed for this page

      - 'body_css_classes' *list* - list of classes to add to the body tag
        (the list itself is left untouched; 'navmenuid' is always added
        to the classes actually rendered)

       Output:

      - HTML code of the page headers
    """

    def selected_if(condition):
        # CSS class suffix marking the highlighted navigation item.
        if condition:
            return "selected"
        return ""

    # Including HEPData headers ( Ugly hack but no obvious way to avoid this ...)
    if CFG_INSPIRE_SITE:
        hepDataAdditions = """<script type="text/javascript" src="%s/js/hepdata.js"></script>""" \
            % (CFG_BASE_URL, )
        hepDataAdditions += """<link rel="stylesheet" href="%s/img/hepdata.css" type="text/css" />""" \
            % (CFG_BASE_URL, )
    else:
        hepDataAdditions = ""

    # load the right message language
    _ = gettext_set_language(ln)

    # Work on a copy: appending to the argument itself would modify the
    # caller's list (and grow it on every call when the list is reused).
    if body_css_classes is None:
        css_classes = []
    else:
        css_classes = list(body_css_classes)
    css_classes.append(navmenuid)

    uri = req.unparsed_uri
    headerLinkbackTrackbackLink = ''
    if CFG_WEBLINKBACK_TRACKBACK_ENABLED:
        from invenio.weblinkback_templates import get_trackback_auto_discovery_tag
        # Embed a link in the header to subscribe trackbacks
        # TODO: This hack must be replaced with the introduction of the new web framework
        recordIndexInURI = uri.find('/' + CFG_SITE_RECORD + '/')
        # substring found --> offer trackback link in header
        if recordIndexInURI != -1:
            # recid might end with ? for journal records
            recid = uri[recordIndexInURI:len(uri)].split('/')[2].split("?")[0]
            headerLinkbackTrackbackLink = get_trackback_auto_discovery_tag(recid)

    if CFG_WEBSTYLE_INSPECT_TEMPLATES:
        # This text is inserted into the page as a substitution *value*,
        # so it is never %-formatted itself: percent signs must be single
        # (they used to be doubled and came out as "100%%").
        inspect_templates_message = '''
<table width="100%" cellspacing="0" cellpadding="2" border="0">
<tr bgcolor="#aa0000">
<td width="100%">
<font color="#ffffff">
<strong>
<small>
CFG_WEBSTYLE_INSPECT_TEMPLATES debugging mode is enabled.  Please
hover your mouse pointer over any region on the page to see which
template function generated it.
</small>
</strong>
</font>
</td>
</tr>
</table>
'''
    else:
        inspect_templates_message = ""

    sitename = CFG_SITE_NAME_INTL.get(ln, CFG_SITE_NAME)
    if headertitle == sitename:
        pageheadertitle = headertitle
    else:
        pageheadertitle = headertitle + ' - ' + sitename

    # A <base> tag is emitted only when there is no CFG_BASE_URL and the
    # site URL has a path component after the host.
    metabase = ""
    stripped_url = CFG_SITE_URL.replace("://", "")
    if not CFG_BASE_URL and '/' in stripped_url:
        metabase = "<base href='%s'>" % (CFG_SITE_URL, )

    if is_language_rtl(ln):
        rtl_direction = ' dir="rtl"'
    else:
        rtl_direction = ''

    if CFG_WEBSTYLE_TEMPLATE_SKIN != 'default':
        cssskin = '_' + CFG_WEBSTYLE_TEMPLATE_SKIN
    else:
        cssskin = ''

    if adminactivities_menu:
        adminactivities = '<td class="headermoduleboxbodyblank"> </td><td class="headermoduleboxbody%(personalize_selected)s">%(adminactivities)s</td>' % \
            {'personalize_selected': selected_if(navmenuid.startswith('admin')),
             'adminactivities': adminactivities_menu}
    else:
        adminactivities = '<td class="headermoduleboxbodyblank"> </td>'

    if css_classes:
        body_class_attribute = ' class="%s"' % ' '.join(css_classes)
    else:
        body_class_attribute = ''

    out = """\
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" lang="%(ln_iso_639_a)s" xml:lang="%(ln_iso_639_a)s" xmlns:og="http://opengraphprotocol.org/schema/" >
<head>
 <title>%(pageheadertitle)s</title>
 %(metabase)s
 <link rev="made" href="mailto:%(sitesupportemail)s" />
 <link rel="stylesheet" href="%(cssurl)s/img/invenio%(cssskin)s.css" type="text/css" />
 <!--[if lt IE 8]>
    <link rel="stylesheet" type="text/css" href="%(cssurl)s/img/invenio%(cssskin)s-ie7.css" />
 <![endif]-->
 <!--[if gt IE 8]>
    <style type="text/css">div.restrictedflag {filter:none;}</style>
 <![endif]-->
 %(canonical_and_alternate_urls)s
 <!-- <link rel="alternate" type="application/rss+xml" title="%(sitename)s RSS" href="%(rssurl)s" /> -->
 <link rel="search" type="application/opensearchdescription+xml" href="%(siteurl)s/opensearchdescription" title="%(sitename)s" />
 <link rel="unapi-server" type="application/xml" title="unAPI" href="%(unAPIurl)s" />
 <link rel="icon" href="/img/favicon.ico" type="image/x-icon">
 <link rel="shortcut icon" href="/img/favicon.ico" type="image/x-icon">
 %(linkbackTrackbackLink)s
 <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
 <meta http-equiv="Content-Language" content="%(ln)s" />
 <meta name="description" content="%(description)s" />
 <meta name="keywords" content="%(keywords)s" />
 <meta name="google-site-verification" content="mLqufkdPNxUHXFW4obCfN5NJXr4sD_SlnvsOla7RZAE" />
 <meta name="msvalidate.01" content="EA9805F0F62E4FF22B98853713964B28" />
 <script type="text/javascript" src="%(cssurl)s/js/jquery.min.js"></script>
 %(hepDataAdditions)s
 %(metaheaderadd)s
</head>
<body%(body_css_classes)s lang="%(ln_iso_639_a)s"%(rtl_direction)s>
<div class="pageheader">
%(inspect_templates_message)s
<!-- replaced page header -->
<div class="headerlogo">
<table class="headerbox" cellspacing="0">
 <tr>
  <td align="right" valign="top" colspan="12">
  <div class="headerboxbodylogo">
   <a href="%(cssurl)s/?ln=%(ln)s">SCOAP<sup>3</sup> Repository (βeta)</a>
  </div>
  </td>
 </tr>
 <tr style="background-color: #679A70;">
  <td style="padding: 10px; font-size: medium; color: #FFF;"><a href="/" style="text-decoration: none; color: #FFF;">HOME</a> :: <a href="http://scoap3.org/" style="text-decoration: none; color: #FFF;">SCOAP<sup>3</sup></a> :: <a href="http://scoap3.org/scoap3-repository-help" style="text-decoration: none; color: #FFF;">HELP</a> :: <a href="http://scoap3.org/scoap3-repository" style="text-decoration: none; color: #FFF;">ABOUT</a></td>
 </tr>
 <!-- <tr class="menu">
       <td class="headermoduleboxbodyblank">
              </td>
       <td class="headermoduleboxbodyblank">
              </td>
       <td class="headermoduleboxbody%(search_selected)s">
             <a class="header%(search_selected)s" href="%(cssurl)s/?ln=%(ln)s">%(msg_search)s</a>
       </td>
       <td class="headermoduleboxbodyblank">
              </td>
       <td class="headermoduleboxbody%(submit_selected)s">
             <a class="header%(submit_selected)s" href="%(cssurl)s/submit?ln=%(ln)s">%(msg_submit)s</a>
       </td>
       <td class="headermoduleboxbodyblank">
              </td>
       <td class="headermoduleboxbody%(personalize_selected)s">
             %(useractivities)s
       </td>
       <td class="headermoduleboxbodyblank">
              </td>
       <td class="headermoduleboxbody%(help_selected)s">
             <a class="header%(help_selected)s" href="%(cssurl)s/help/%(langlink)s">%(msg_help)s</a>
       </td>
       %(adminactivities)s
       <td class="headermoduleboxbodyblanklast">
              </td>
 </tr>-->
</table>
</div>
<table class="navtrailbox">
 <tr>
  <td class="navtrailboxbody">
   %(navtrailbox)s
  </td>
 </tr>
</table>
<!-- end replaced page header -->
%(pageheaderadd)s
</div>
        """ % {
      'metabase': metabase,
      'rtl_direction': rtl_direction,
      'siteurl': CFG_SITE_URL,
      'sitesecureurl': CFG_SITE_SECURE_URL,
      'canonical_and_alternate_urls': self.tmpl_canonical_and_alternate_urls(uri),
      'cssurl': CFG_BASE_URL,
      'cssskin': cssskin,
      'rssurl': rssurl,
      'ln': ln,
      'ln_iso_639_a': ln.split('_', 1)[0],
      'langlink': '?ln=' + ln,

      'sitename': CFG_SITE_NAME_INTL.get(ln, CFG_SITE_NAME),
      # the title is washed, description and keywords are attribute-escaped
      'pageheadertitle': HTMLWasher().wash(pageheadertitle),

      'sitesupportemail': CFG_SITE_SUPPORT_EMAIL,

      'description': cgi.escape(description, True),
      'keywords': cgi.escape(keywords, True),
      'metaheaderadd': metaheaderadd,

      'userinfobox': userinfobox,
      'navtrailbox': navtrailbox,
      'useractivities': useractivities_menu,
      'adminactivities': adminactivities,

      'pageheaderadd': pageheaderadd,
      'body_css_classes': body_class_attribute,

      'search_selected': selected_if(navmenuid == 'search'),
      'submit_selected': selected_if(navmenuid == 'submit'),
      'personalize_selected': selected_if(navmenuid.startswith('your')),
      'help_selected': selected_if(navmenuid == 'help'),

      'msg_search': _("Search"),
      'msg_submit': _("Submit"),
      'msg_personalize': _("Personalize"),
      'msg_help': _("Help"),
      'unAPIurl': cgi.escape('%s/unapi' % CFG_SITE_URL),
      'linkbackTrackbackLink': headerLinkbackTrackbackLink,
      'hepDataAdditions': hepDataAdditions,
      'inspect_templates_message': inspect_templates_message
    }
    return out
def email_quoted_txt2html(text,
                          tabs_before=0,
                          indent_txt='>>',
                          linebreak_txt="\n",
                          indent_html=('<div class="commentbox">', "</div>"),
                          linebreak_html='<br/>'):
    """
    Takes a typical mail quoted text, e.g.::

        hello,
        you told me:
        >> Your mother was a hamster and your father smelt of elderberries
        I must tell you that I'm not convinced. Then in this discussion:
        >>>> Is there someone else up there we could talk to?
        >> No. Now, go away, or I shall taunt you a second time-a!
        I think we're not going to be friends!

    and return an html formatted output, e.g.::

        hello,<br/>
        you told me:<br/>
        <div>
          Your mother was a hamster and your father smelt of elderberries
        </div>
        I must tell you that I'm not convinced. Then in this discussion:
        <div>
          <div>
            Is there someone else up there we could talk to?
          </div>
          No. Now, go away, or I shall taunt you a second time-a!
        </div>
        I think we're not going to be friends!

    The opening/closing tags of C{indent_html} are emitted only when the
    quoting level changes from one line to the next; every line is washed
    with C{HTMLWasher.wash} and followed by C{linebreak_html}.

    @param text: the text in quoted format
    @param tabs_before: number of tabulations before each line
    @param indent_txt: quote separator in email (default:'>>')
    @param linebreak_txt: line separator in email (default: newline)
    @param indent_html: tuple of (opening, closing) html tags.
                        default: ('<div class="commentbox">', "</div>")
    @param linebreak_html: line separator in html (default: '<br/>')
    @return: string containing html formatted output
    """
    washer = HTMLWasher()
    opening_tag, closing_tag = indent_html[0], indent_html[1]
    marker_len = len(indent_txt)
    chunks = []
    nb_indent = 0
    # Only newlines are trimmed from the ends, whatever linebreak_txt is.
    for line in text.strip('\n').split(linebreak_txt):
        new_nb_indent = 0
        # An empty marker matches every string and would never terminate:
        # treat it as "no quoting".
        while marker_len and line.startswith(indent_txt):
            new_nb_indent += 1
            line = line[marker_len:]
        if new_nb_indent > nb_indent:
            # Deeper quoting: open one block per additional level.
            for dummy in range(nb_indent, new_nb_indent):
                chunks.append(tabs_before * "\t" + opening_tag + "\n")
                tabs_before += 1
        elif new_nb_indent < nb_indent:
            # Shallower quoting: close one block per level left.
            for dummy in range(new_nb_indent, nb_indent):
                tabs_before -= 1
                chunks.append(tabs_before * "\t" + closing_tag + "\n")
        else:
            chunks.append(tabs_before * "\t")
        line = washer.wash(line)
        chunks.append(tabs_before * "\t" + line)
        chunks.append(linebreak_html + "\n")
        nb_indent = new_nb_indent
    # Close the blocks still open after the last line.  The closing tag
    # comes from indent_html (it used to be a hard-coded "</div>"), so
    # custom tags stay balanced.
    for dummy in range(nb_indent):
        tabs_before -= 1
        chunks.append(tabs_before * "\t" + closing_tag + "\n")
    return "".join(chunks)
def email_quoted_txt2html(text,
                          tabs_before=0,
                          indent_txt='>>',
                          linebreak_txt="\n",
                          indent_html=('<div class="commentbox">', "</div>"),
                          linebreak_html='<br/>'):
    """
    Takes a typical mail quoted text, e.g.::

        hello,
        you told me:
        >> Your mother was a hamster and your father smelt of elderberries
        I must tell you that I'm not convinced. Then in this discussion:
        >>>> Is there someone else up there we could talk to?
        >> No. Now, go away, or I shall taunt you a second time-a!
        I think we're not going to be friends!

    and return an html formatted output, e.g.::

        hello,<br/>
        you told me:<br/>
        <div>
          Your mother was a hamster and your father smelt of elderberries
        </div>
        I must tell you that I'm not convinced. Then in this discussion:
        <div>
          <div>
            Is there someone else up there we could talk to?
          </div>
          No. Now, go away, or I shall taunt you a second time-a!
        </div>
        I think we're not going to be friends!

    A block (C{indent_html}) is opened for every additional level of
    quoting and closed for every level left; each line goes through
    C{HTMLWasher.wash} and is terminated by C{linebreak_html}.

    @param text: the text in quoted format
    @param tabs_before: number of tabulations before each line
    @param indent_txt: quote separator in email (default:'>>')
    @param linebreak_txt: line separator in email (default: newline)
    @param indent_html: tuple of (opening, closing) html tags.
                        default: ('<div class="commentbox">', "</div>")
    @param linebreak_html: line separator in html (default: '<br/>')
    @return: string containing html formatted output
    """
    washer = HTMLWasher()
    opening_tag = indent_html[0]
    closing_tag = indent_html[1]
    marker_len = len(indent_txt)
    pieces = []
    nb_indent = 0
    text = text.strip('\n')
    for line in text.split(linebreak_txt):
        # Count and remove the quote markers in front of the line.  The
        # marker_len guard avoids an endless loop with an empty marker.
        new_nb_indent = 0
        while marker_len and line.startswith(indent_txt):
            new_nb_indent += 1
            line = line[marker_len:]
        if new_nb_indent > nb_indent:
            for dummy in range(nb_indent, new_nb_indent):
                pieces.append(tabs_before * "\t" + opening_tag + "\n")
                tabs_before += 1
        elif new_nb_indent < nb_indent:
            for dummy in range(new_nb_indent, nb_indent):
                tabs_before -= 1
                pieces.append(tabs_before * "\t" + closing_tag + "\n")
        else:
            pieces.append(tabs_before * "\t")
        pieces.append(tabs_before * "\t" + washer.wash(line))
        pieces.append(linebreak_html + "\n")
        nb_indent = new_nb_indent
    # Blocks still open at the end are closed with the caller's closing tag
    # rather than a hard-coded "</div>", which only matched the default.
    for dummy in range(nb_indent):
        tabs_before -= 1
        pieces.append(tabs_before * "\t" + closing_tag + "\n")
    return "".join(pieces)
def _get_feature_text(record, language):
    """
    Looks for a text (header) that can be featured on the article
    overview page.

    The lookup goes through the following steps, stopping at the first
    one that yields a non-empty text:
      1. the header field ('590__a' first for "fr", '520__a' first
         otherwise, falling back on the other one), washed of all tags;
      2. a header found in the first article body with header_pattern
         or header_pattern2, washed but keeping <a> tags;
      3. the first paragraph of the body (para_pattern), shortened to a
         sentence when longer than 250 characters;
      4. the first sentence (or part) of the whole washed body.

    @param record: object giving access to the record through
                   field(tag) and fields(tag)
    @param language: language code; "fr" gives priority to '590' fields
    @return: the text to feature, or '' when the record has no body
    """
    # Values that count as "nothing there" for headers and for bodies
    blank_headers = ('', '<br/>', '<!--HTML--><br />', '<!--HTML-->')
    blank_bodies = ('', '<br />', '<!--HTML--><br />', '<!--HTML-->')

    # (preferred tag, fallback tag) depending on the language
    if language == "fr":
        header_tags = ('590__a', '520__a')
        body_tags = ('590__b', '520__b')
    else:
        header_tags = ('520__a', '590__a')
        body_tags = ('520__b', '590__b')

    washer = HTMLWasher()

    # Step 1: explicit header field
    header = record.field(header_tags[0])
    if header.strip() in blank_headers:
        header = record.field(header_tags[1])
    header = washer.wash(html_buffer=header,
                         allowed_tag_whitelist=[],
                         allowed_attribute_whitelist=[])
    if header != "":
        return header

    # No usable header: work from the article body instead
    bodies = record.fields(body_tags[0])
    if not bodies or \
           (len(bodies) == 1 and bodies[0].strip() in blank_bodies):
        bodies = record.fields(body_tags[1])
    try:
        article = bodies[0]
    except:
        return ''

    header_text = ""
    match_obj = re.search(header_pattern, article) or \
                re.search(header_pattern2, article)
    try:
        # Step 2: header embedded in the body
        header_text = match_obj.group("header")
        header_text = washer.wash(html_buffer=header_text,
                                  allowed_tag_whitelist=['a'],
                                  allowed_attribute_whitelist=['href',
                                                               'target',
                                                               'class'])
        if header_text == "":
            raise Exception
    except:
        # Drop what was tried as header and the empty paragraphs
        for leftover in (header_text, '<p/>', '<p> </p>'):
            article = article.replace(leftover, '')
        match_obj = re.search(para_pattern, article)
        try:
            # Step 3: get the first paragraph
            header_text = match_obj.group("paragraph")
            try:
                header_text = washer.wash(html_buffer=header_text,
                                          allowed_tag_whitelist=[],
                                          allowed_attribute_whitelist=[])
            except:
                # was not able to parse correctly the HTML. Use
                # this safer function, but producing less good
                # results
                header_text = remove_html_markup(header_text)

            if header_text.strip() == "":
                raise Exception
            if len(header_text) > 250:
                header_text = _get_first_sentence_or_part(header_text)
        except:
            # Step 4: in a last instance get the first sentence
            try:
                article = washer.wash(article,
                                      allowed_tag_whitelist=[],
                                      allowed_attribute_whitelist=[])
            except:
                # was not able to parse correctly the HTML. Use
                # this safer function, but producing less good
                # results
                article = remove_html_markup(article)
            header_text = _get_first_sentence_or_part(article)

    return header_text
def email_quoted_txt2html(
    text,
    tabs_before=0,
    indent_txt=">>",
    linebreak_txt="\n",
    indent_html=('<div class="commentbox">', "</div>"),
    linebreak_html="<br/>",
    indent_block=True,
):
    """
    Takes a typical mail quoted text, e.g.::
        hello,
        you told me:
        >> Your mother was a hamster and your father smelt of elderberries
        I must tell you that I'm not convinced. Then in this discussion:
        >>>> Is there someone else up there we could talk to?
        >> No. Now, go away, or I shall taunt you a second time-a!
        I think we're not going to be friends!

    and return an html formatted output, e.g.::
        hello,<br/>
        you told me:<br/>
        <div>
            Your mother was a hamster and your father smelt of elderberries
        </div>
        I must tell you that I'm not convinced. Then in this discussion:
        <div>
            <div>
                Is there someone else up there we could talk to?
            </div>
            No. Now, go away, or I shall taunt you a second time-a!
        </div>
        I think we're not going to be friends!

    The behaviour is different when C{indent_block} is C{True} or
    C{False}. When C{True}, C{indent_html} is only added at each change
    of level of indentation, while it is added for each line when
    C{False}. For eg::
        >> a
        >> b
        >>>> c

    would result in (if C{True})::
        <div class="commentbox">
            a<br/>
            b<br/>
            <div class="commentbox">
                c<br/>
            </div>
        </div>

    or would be (if C{False})::
        <div class="commentbox"> a</div><br/>
        <div class="commentbox"> b</div><br/>
        <div class="commentbox"><div class="commentbox"> c</div></div><br/>

    Every line is passed through C{HTMLWasher.wash()}; when that raises
    C{HTMLParseError} the line is escaped with C{cgi.escape()} instead.

    @param text: the text in quoted format
    @param tabs_before: number of tabulations before each line
    @param indent_txt: quote separator in email (default:'>>'). An empty
                       separator means that no line is considered quoted.
    @param linebreak_txt: line separator in email
    @param indent_html: tuple of (opening, closing) html tags.
                        default: ('<div class="commentbox">', "</div>")
    @param linebreak_html: line separator in html (default: '<br/>')
    @param indent_block: if indentation should be done per 'block'
                         i.e. only at changes of indentation level
                         (+1, -1) or at each line.
    @return: string containing html formatted output
    """
    washer = HTMLWasher()
    chunks = []
    nb_indent = 0
    text = text.strip("\n")
    for line in text.split(linebreak_txt):
        # Count and remove the quote markers at the start of the line.
        # The explicit emptiness test avoids an endless loop, since
        # every string starts with "".
        new_nb_indent = 0
        while indent_txt and line.startswith(indent_txt):
            new_nb_indent += 1
            line = line[len(indent_txt):]

        if indent_block:
            if new_nb_indent > nb_indent:
                # Deeper quote: open one tag per additional level
                for dummy in range(nb_indent, new_nb_indent):
                    chunks.append(tabs_before * "\t" + indent_html[0] + "\n")
                    tabs_before += 1
            elif new_nb_indent < nb_indent:
                # Shallower quote: close one tag per level left
                for dummy in range(new_nb_indent, nb_indent):
                    tabs_before -= 1
                    chunks.append(tabs_before * "\t" + indent_html[1] + "\n")
            else:
                # Same level as the previous line. Together with the
                # tabulations added below this emits the tab run twice;
                # kept so that existing output does not change.
                chunks.append(tabs_before * "\t")
        else:
            # Per-line mode: every line opens all of its own levels
            chunks.append(tabs_before * "\t" + new_nb_indent * indent_html[0])

        try:
            line = washer.wash(line)
        except HTMLParseError:
            # Line contained something like "foo<bar"
            line = cgi.escape(line)

        if indent_block:
            chunks.append(tabs_before * "\t")
        chunks.append(line)
        if not indent_block:
            # ... and closes them again on the same line
            chunks.append(new_nb_indent * indent_html[1])
        chunks.append(linebreak_html + "\n")
        nb_indent = new_nb_indent

    if indent_block:
        # Close whatever is still open, using the caller-supplied
        # closing tag rather than a hard-coded "</div>".
        for dummy in range(0, nb_indent):
            tabs_before -= 1
            chunks.append(tabs_before * "\t" + indent_html[1] + "\n")
    return "".join(chunks)
def format_element(bfo, separator='<br/>'):
    """
    Display article body

    The body is read from '590__b' when the language found in the URI
    is "fr" and from '520__b' otherwise, each falling back on the other
    one when empty. Unless the site is a CERN site and the record
    belongs to a 'BULLETIN' collection, the bodies are simply joined
    with the separator. Old CERN Bulletin articles go through a
    compatibility layer that rebuilds header, images and paragraphs.

    @param separator: separator between each body
    """
    # Retrieve context (journal, issue and category) from URI
    args = parse_url_string(bfo.user_info['uri'])
    ln = args["ln"]
    _ = gettext_set_language(ln)

    if ln == "fr":
        article = bfo.fields('590__b')
        if not article or \
               (len(article) == 1 and \
                (article[0].strip() in ['', '<br />', '<!--HTML--><br />'])):
            article = bfo.fields('520__b')
    else:
        article = bfo.fields('520__b')
        if not article or \
               (len(article) == 1 and \
                (article[0].strip() in ['', '<br />', '<!--HTML--><br />'])):
            article = bfo.fields('590__b')

    if not CFG_CERN_SITE or \
           not bfo.field('980__a').startswith('BULLETIN'):
        return separator.join(article)

    ################################################################
    #                  CERN Bulletin-specific code                 #
    ################################################################

    # We need a compatibility layer for old CERN Bulletin
    # articles. Identify them and process them if needed.
    is_old_cern_bulletin_article = False
    if bfo.field('980__a').startswith('BULLETIN'):
        try:
            year = int(bfo.fields('260__c')[0])
        except IndexError:
            # No publication year: handled as an article from 2000
            year = 2000
        if year < 2009 or \
           (bfo.field('980__a').startswith('BULLETINSTAFF') and \
            ("CERN EDS" in bfo.field('595__a'))):
            is_old_cern_bulletin_article = True

    header_out = ''
    if not is_old_cern_bulletin_article:
        # Return the same as any other journal article
        return separator.join(article)

    # Old CERN articles
    if year < 2007 or bfo.field('980__a').startswith('BULLETINSTAFF'):
        # Really old CERN articles
        if len(article) > 0:
            # CERN-only: old CERN Bulletin articles
            old_body = __backward_compatible_HTML(article[0])
            if bfo.field('980__a').startswith('BULLETINSTAFF'):
                old_body += '<br/><br/>' + \
                            bfe_fulltext.format_element(bfo, style="",
                                                        show_icons='yes')
            return old_body
        else:
            return ''

    # Not-so-old CERN articles follow:

    # 2. prepare regex's for the elements
    #=====================================================
    from invenio.webjournal_utils import \
         image_pattern, \
         para_pattern, \
         header_pattern

    # Maps a start offset to either a dict (an image) or a string
    # (a paragraph); the offsets give the output order.
    page_elements = {}

    # 3. get the header (either from marc xml or regex)
    #=====================================================
    if bfo.lang == "fr":
        header = bfo.field('590__a')
        if header == '':
            header = bfo.field('520__a')
    else:
        header = bfo.field('520__a')
        if header == '':
            header = bfo.field('590__a')

    if not header:
        try:
            header_obj = re.search(header_pattern, article[0])
            header_text = header_obj.group("header")
        except:
            header_text = ""
    else:
        header_text = header

    washer = HTMLWasher()
    header_text_clean = washer.wash(html_buffer=header_text,
                                    allowed_tag_whitelist=['a'],
                                    allowed_attribute_whitelist=['href'])

    header_out = '<p class="articleHeader">' + header_text_clean + '</p>'

    # strip out all empty p tags and the header
    try:
        article = article[0].replace("<p/>", "")
        article = article.replace(header_text, "")
        article = article.replace(header_text_clean, "")
    except IndexError:
        article = ""

    # The iterator keeps scanning the string it was created on, so match
    # positions refer to the text before any image was cut out;
    # 'removed_chars' converts them to positions in the shrinking text.
    removed_chars = 0
    for image in image_pattern.finditer(article):
        page_elements[image.start()] = {"link" : image.group("hyperlink"),
                                        "image" : image.group("image"),
                                        "caption" : image.group("caption")}
        # make sure we delete the image from the article (else might be
        # used twice). Cut by position: replacing by value would also
        # delete identical markup elsewhere and shift later offsets.
        match_start, match_end = image.span()
        article = article[:match_start - removed_chars] + \
                  article[match_end - removed_chars:]
        removed_chars += match_end - match_start

    # replace <center> by <p><center>
    article = article.replace("<center>", "<p><center>")
    article = article.replace("</center>", "</center></p>")

    for paragraph in para_pattern.finditer(article):
        page_elements[paragraph.start()] = paragraph.group("paragraph")

    out_parts = []
    left_right_lever = True
    did_you_know_box = False
    for key in sorted(page_elements):
        element = page_elements[key]
        if isinstance(element, dict):
            # Image: floated right or left depending on the lever
            if left_right_lever:
                out_parts.append(
                    '<div class="phrwithcaption"><div class="imageScale">')
            else:
                out_parts.append(
                    '<div class="phlwithcaption"><div class="imageScale">')
            has_link = element["link"] is not None
            if has_link:
                out_parts.append('<a href="' + element["link"] + '">')
            out_parts.append('<img class="featureImageScaleHolder" src="' + \
                             element["image"] + '" border="0" />')
            if has_link:
                # Only close the anchor that was actually opened
                out_parts.append('</a>')
            out_parts.append('</div>')
            if element["caption"] is not None:
                out_parts.append('<p>' + element["caption"] + '</p>')
            out_parts.append('</div>')
        elif isinstance(element, str):
            left_right_lever = not left_right_lever
            lowered = element.lower()
            if (lowered.find("did you know") != -1) or \
               (lowered.find("le saviez-vous ?") != -1):
                # Title of a "did you know" box: the paragraph that
                # follows holds its content
                did_you_know_box = True
                continue
            if did_you_know_box:
                did_you_know_box = False
                out_parts.append(__did_you_know_box(element,
                                                    left_right_lever,
                                                    bfo.lang))
                continue
            out_parts.append('<p>')
            out_parts.append(element)
            out_parts.append('</p>')

    return header_out + "".join(out_parts)
def __init__(self, methodName='test'):
    """
    Create the HTMLWasher instance stored as self.washer, then run the
    InvenioTestCase initialisation.

    @param methodName: passed unchanged to InvenioTestCase.__init__
                       (default: 'test')
    """
    self.washer = HTMLWasher()
    InvenioTestCase.__init__(self, methodName)