def _inner(value):
    if isinstance(value, six.string_types):
        washer = HTMLWasher()
        return washer.wash(value,
                           allowed_tag_whitelist=allowed_tag_whitelist)
    else:
        return value
class XSSEscapingTest(InvenioTestCase):
    """Test functions related to the prevention of XSS attacks."""

    def __init__(self, methodName='test'):
        self.washer = HTMLWasher()
        InvenioTestCase.__init__(self, methodName)

    def test_forbidden_formatting_tags(self):
        """htmlutils - washing of tags altering formatting of a page (e.g. </html>)"""
        test_str = """</html></body></pre>"""
        self.assertEqual(self.washer.wash(html_buffer=test_str),
                         '')
        self.assertEqual(self.washer.wash(html_buffer=test_str,
                                          render_unallowed_tags=True),
                         '&lt;/html&gt;&lt;/body&gt;&lt;/pre&gt;')

    def test_forbidden_script_tags(self):
        """htmlutils - washing of tags defining scripts (e.g. <script>)"""
        test_str = """<script>malicious_function();</script>"""
        self.assertEqual(self.washer.wash(html_buffer=test_str),
                         '')
        self.assertEqual(self.washer.wash(html_buffer=test_str,
                                          render_unallowed_tags=True),
                         '&lt;script&gt;malicious_function();&lt;/script&gt;')

    def test_forbidden_attributes(self):
        """htmlutils - washing of forbidden attributes in allowed tags (e.g. onLoad)"""
        # onload
        test_str = """<p onload="javascript:malicious_function();">"""
        self.assertEqual(self.washer.wash(html_buffer=test_str), '<p>')
        # tricky: css calling a javascript
        test_str = """<p style="background: url('http://malicious_site.com/malicious_script.js');">"""
        self.assertEqual(self.washer.wash(html_buffer=test_str), '<p>')

    def test_fake_url(self):
        """htmlutils - washing of fake URLs which execute scripts"""
        test_str = """<a href="javascript:malicious_function();">link</a>"""
        self.assertEqual(self.washer.wash(html_buffer=test_str),
                         '<a href="">link</a>')
        # Pirates could encode ascii values, or use uppercase letters...
        test_str = """<a href="javasCRipt:malicious_function();">link</a>"""
        self.assertEqual(self.washer.wash(html_buffer=test_str),
                         '<a href="">link</a>')
        # MSIE treats 'java\ns\ncript:' the same way as 'javascript:'
        # Here we test with:
        # j
        #  avas
        #  crIPt :
        test_str = """<a href="j\n avas\n crIPt :malicious_function();">link</a>"""
        self.assertEqual(self.washer.wash(html_buffer=test_str),
                         '<a href="">link</a>')
class HTMLAutomaticLinksTransformation(InvenioTestCase):
    """Test functions related to transforming links into HTML context"""

    def __init__(self, methodName='test'):
        self.washer = HTMLWasher()
        InvenioTestCase.__init__(self, methodName)

    def test_transform_link(self):
        """htmlutils - transforming a link"""
        body_input = 'https://cds.cern.ch/collection/Multimedia%20%26%20Outreach?ln=es'
        body_expected = '<a href="https://cds.cern.ch/collection/Multimedia%20%26%20Outreach?ln=es">https://cds.cern.ch/collection/Multimedia%20%26%20Outreach?ln=es</a>'
        self.assertEqual(self.washer.wash(html_buffer=body_input,
                                          automatic_link_transformation=True),
                         body_expected)

    def test_transform_several_links(self):
        """htmlutils - transforming several links"""
        body_input = 'some text https://cds.cern.ch/collection/Videos?ln=es more text https://cds.cern.ch/search?p=%27CERN+News'
        body_expected = 'some text <a href="https://cds.cern.ch/collection/Videos?ln=es">https://cds.cern.ch/collection/Videos?ln=es</a> more text <a href="https://cds.cern.ch/search?p=%27CERN">https://cds.cern.ch/search?p=%27CERN</a>+News'
        self.assertEqual(self.washer.wash(html_buffer=body_input,
                                          automatic_link_transformation=True),
                         body_expected)

    def test_transform_just_valid_links(self):
        """htmlutils - transforming just valid links"""
        body_input = 'some text https://cds.cern.ch/collection/Videos?ln=es more text https://cds..cern/search?p=%27CERN+News'
        body_expected = 'some text <a href="https://cds.cern.ch/collection/Videos?ln=es">https://cds.cern.ch/collection/Videos?ln=es</a> more text https://cds..cern/search?p=%27CERN+News'
        self.assertEqual(self.washer.wash(html_buffer=body_input,
                                          automatic_link_transformation=True),
                         body_expected)

    def test_not_transform_link(self):
        """htmlutils - not transforming a link"""
        body_input = '<a href="https://cds.cern.ch/collection/Multimedia%20%26%20Outreach?ln=es">Multimedia</a>'
        body_expected = '<a href="https://cds.cern.ch/collection/Multimedia%20%26%20Outreach?ln=es">Multimedia</a>'
        self.assertEqual(self.washer.wash(html_buffer=body_input,
                                          automatic_link_transformation=True),
                         body_expected)
class HTMLWashingTest(InvenioTestCase):
    """Test functions related to general washing of HTML source"""

    def __init__(self, methodName='test'):
        self.washer = HTMLWasher()
        InvenioTestCase.__init__(self, methodName)

    def test_wash_html(self):
        """htmlutils - washing HTML tags"""

        # Simple test case
        test_str = 'Spam and <b><blink>eggs</blink></b>'
        self.assertEqual(self.washer.wash(html_buffer=test_str),
                         'Spam and <b>eggs</b>')

        # Show 'escaped' tags
        test_str = 'Spam and <b><blink>eggs</blink></b>'
        self.assertEqual(self.washer.wash(html_buffer=test_str,
                                          render_unallowed_tags=True),
                         'Spam and <b>&lt;blink&gt;eggs&lt;/blink&gt;</b>')

        # Keep entity and character references
        test_str = '<b> a &lt; b &gt; c </b> &#247;'
        self.assertEqual(self.washer.wash(html_buffer=test_str),
                         '<b> a &lt; b &gt; c </b> &#247;')

        # Remove content of <script> tags
        test_str = '<script type="text/javascript">alert("foo")</script>bar'
        self.assertEqual(self.washer.wash(html_buffer=test_str),
                         'bar')
        test_str = '<script type="text/javascript"><!--alert("foo")--></script>bar'
        self.assertEqual(self.washer.wash(html_buffer=test_str),
                         'bar')

        # Remove content of <style> tags
        test_str = '<style>.myclass {color:#f00}</style><span class="myclass">styled text</span>'
        self.assertEqual(self.washer.wash(html_buffer=test_str),
                         'styled text')
        test_str = '<style><!-- .myclass {color:#f00} --></style><span class="myclass">styled text</span>'
        self.assertEqual(self.washer.wash(html_buffer=test_str),
                         'styled text')
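# Usage sketch (illustrative, not part of the test suite): a minimal example
# of the HTMLWasher calls exercised by the tests above. It assumes HTMLWasher
# is already imported in this module, and uses only the keyword arguments that
# appear in the tests (html_buffer, render_unallowed_tags,
# allowed_tag_whitelist, allowed_attribute_whitelist,
# automatic_link_transformation).
def _htmlwasher_usage_sketch():
    washer = HTMLWasher()
    # Disallowed tags are stripped by default.
    stripped = washer.wash(html_buffer='Spam and <b><blink>eggs</blink></b>')
    # With render_unallowed_tags=True they are kept in escaped form instead.
    escaped = washer.wash(html_buffer='<blink>eggs</blink>',
                          render_unallowed_tags=True)
    # Whitelists restrict which tags and attributes survive the washing.
    linked = washer.wash(html_buffer='<a href="http://cds.cern.ch" onclick="x()">CDS</a>',
                         allowed_tag_whitelist=['a'],
                         allowed_attribute_whitelist=['href'])
    # Bare URLs can be turned into links on request.
    auto = washer.wash(html_buffer='see https://cds.cern.ch/record/1',
                       automatic_link_transformation=True)
    return stripped, escaped, linked, auto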
def format_element(bfo, separator='<br/>', max_chars=""):
    """
    Display article body

    @param separator: separator between each body
    @param max_chars: if defined, limit the output to given char length
    """
    ln = bfo.lang
    _ = gettext_set_language(ln)

    if ln == "fr":
        article = bfo.fields('590__b')
        if not article or \
               (len(article) == 1 and \
                (article[0].strip() in ['', '<br />', '<!--HTML--><br />'])):
            article = bfo.fields('520__b')
    else:
        article = bfo.fields('520__b')
        if not article or \
               (len(article) == 1 and \
                (article[0].strip() in ['', '<br />', '<!--HTML--><br />'])):
            article = bfo.fields('590__b')

    if not CFG_CERN_SITE or \
           not bfo.field('980__a').startswith('BULLETIN'):
        output = separator.join(article)
        if max_chars.isdigit() and \
               int(max_chars) > 0 and len(output) > int(max_chars):
            output = output[:int(max_chars)] + ' [...]'
        return output

    ################################################################
    #                 CERN Bulletin-specific code                  #
    ################################################################

    # We need a compatibility layer for old CERN Bulletin
    # articles. Identify them and process them if needed.
    is_old_cern_bulletin_article = False
    if bfo.field('980__a').startswith('BULLETIN'):
        try:
            year = int(bfo.fields('260__c')[0])
        except IndexError:
            year = 2000
        if year < 2009 or \
               (bfo.field('980__a').startswith('BULLETINSTAFF') and \
                ("CERN EDS" in bfo.field('595__a'))):
            is_old_cern_bulletin_article = True

    header_out = ''

    if not is_old_cern_bulletin_article:
        # Return the same as any other journal article
        output = separator.join(article)
        if max_chars.isdigit() and \
               int(max_chars) > 0 and len(output) > int(max_chars):
            output = output[:int(max_chars)] + ' [...]'
        return output

    # Old CERN articles
    if year < 2007 or bfo.field('980__a').startswith('BULLETINSTAFF'):
        # Really old CERN articles
        if len(article) > 0:
            # CERN-only: old CERN Bulletin articles
            return __backward_compatible_HTML(article[0]) + \
                   (bfo.field('980__a').startswith('BULLETINSTAFF') and \
                    ('<br/><br/>' + bfe_fulltext.format_element(bfo, style="", show_icons='yes')) \
                    or '')
        else:
            return ''

    # Not-so-old CERN articles follow:

    # 2. prepare regex's for the elements
    #=====================================================
    from invenio.legacy.webjournal.utils import \
         image_pattern, \
         para_pattern, \
         header_pattern

    page_elements = {}

    # 3. get the header (either from marc xml or regex)
    #=====================================================
    if bfo.lang == "fr":
        header = bfo.field('590__a')
        if header == '':
            header = bfo.field('520__a')
    else:
        header = bfo.field('520__a')
        if header == '':
            header = bfo.field('590__a')

    if not header:
        try:
            header_obj = re.search(header_pattern, article[0])
            header_text = header_obj.group("header")
        except:
            header_text = ""
    else:
        header_text = header

    washer = HTMLWasher()
    header_text_clean = washer.wash(html_buffer=header_text,
                                    allowed_tag_whitelist=['a'],
                                    allowed_attribute_whitelist=['href'])

    header_out = '<p class="articleHeader">' + header_text_clean + '</p>'

    # strip out all empty p tags and the header
    try:
        article = article[0].replace("<p/>", "")
        article = article.replace(header_text, "")
        article = article.replace(header_text_clean, "")
    except IndexError:
        article = ""

    image_iter = image_pattern.finditer(article)

    difference_from_original = 0
    for image in image_iter:
        page_elements[image.start()] = {"link": image.group("hyperlink"),
                                        "image": image.group("image"),
                                        "caption": image.group("caption")}
        # make sure we delete the image from the article
        # (else might be used twice)
        start_index = image.span()[0] - difference_from_original
        end_index = image.span()[1] - difference_from_original
        article = article.replace(article[start_index:end_index], "")
        difference_from_original += image.span()[1] - image.span()[0]

    # replace <center> by <p><center>
    article = article.replace("<center>", "<p><center>")
    article = article.replace("</center>", "</center></p>")

    para_iter = para_pattern.finditer(article)

    for paragraph in para_iter:
        page_elements[paragraph.start()] = paragraph.group("paragraph")

    # TODO: find a way to do this inline in the dict
    ordered_keys = page_elements.keys()
    ordered_keys.sort()

    article_out = ""
    left_right_lever = True
    did_you_know_box = False
    for key in ordered_keys:
        if type(page_elements[key]) == types.DictType:
            if left_right_lever == True:
                article_out += '<div class="phrwithcaption"><div class="imageScale">'
            else:
                article_out += '<div class="phlwithcaption"><div class="imageScale">'
            if page_elements[key]["link"] != None:
                article_out += '<a href="' + page_elements[key]["link"] + '">'
            article_out += '<img class="featureImageScaleHolder" src="' + \
                           page_elements[key]["image"] + '" border="0" />' + \
                           '</a>' + \
                           '</div>'
            if page_elements[key]["caption"] != None:
                article_out += '<p>' + page_elements[key]["caption"] + \
                               '</p>'
            article_out += '</div>'
        elif type(page_elements[key]) == types.StringType:
            left_right_lever = not left_right_lever
            if (page_elements[key].lower().find("did you know") != -1) or \
                   (page_elements[key].lower().find("le saviez-vous ?") != -1):
                did_you_know_box = True
                continue
            if did_you_know_box == True:
                did_you_know_box = False
                article_out += __did_you_know_box(page_elements[key],
                                                  left_right_lever,
                                                  bfo.lang)
                continue
            article_out += '<p>'
            article_out += page_elements[key]
            article_out += '</p>'

    output = header_out + article_out

    if max_chars.isdigit() and \
           int(max_chars) > 0 and len(output) > int(max_chars):
        output = output[:int(max_chars)] + ' [...]'

    return output
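# Illustration only: the real image_pattern / para_pattern / header_pattern
# used by format_element() are defined in invenio.legacy.webjournal.utils.
# The simplified stand-in below is a hypothetical pattern, not the Invenio
# one; it only demonstrates the technique used above of collecting matches
# keyed by their start offset and then walking them in document order.
import re

_demo_para_pattern = re.compile(r'<p>(?P<paragraph>.*?)</p>', re.DOTALL)

def _demo_collect_paragraphs(html_buffer):
    """Return paragraph bodies in the order they appear in html_buffer."""
    page_elements = {}
    for match in _demo_para_pattern.finditer(html_buffer):
        page_elements[match.start()] = match.group("paragraph")
    return [page_elements[key] for key in sorted(page_elements)]

# Example: _demo_collect_paragraphs('<p>one</p><p>two</p>') == ['one', 'two']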
def _get_feature_text(record, language):
    """
    Looks for a text (header) that can be featured on the article overview
    page.
    """
    washer = HTMLWasher()
    header_text = ""
    # Check if there is a header
    if language == "fr":
        header = record.field('590__a')
        if header.strip() in \
               ['', '<br/>', '<!--HTML--><br />', '<!--HTML-->']:
            header = record.field('520__a')
    else:
        header = record.field('520__a')
        if header.strip() in \
               ['', '<br/>', '<!--HTML--><br />', '<!--HTML-->']:
            header = record.field('590__a')
    header = washer.wash(html_buffer=header,
                         allowed_tag_whitelist=[],
                         allowed_attribute_whitelist=[])
    if header != "":
        header_text = header
    else:
        if language == "fr":
            article = record.fields('590__b')
            if not article or \
                   (len(article) == 1 and \
                    article[0].strip() in \
                    ['', '<br />', '<!--HTML--><br />', '<!--HTML-->']):
                article = record.fields('520__b')
        else:
            article = record.fields('520__b')
            if not article or \
                   (len(article) == 1 and \
                    article[0].strip() in \
                    ['', '<br />', '<!--HTML--><br />', '<!--HTML-->']):
                article = record.fields('590__b')
        try:
            article = article[0]
        except:
            return ''

        match_obj = re.search(header_pattern, article)
        if not match_obj:
            match_obj = re.search(header_pattern2, article)
        try:
            header_text = match_obj.group("header")
            header_text = washer.wash(html_buffer=header_text,
                                      allowed_tag_whitelist=['a'],
                                      allowed_attribute_whitelist=['href',
                                                                   'target',
                                                                   'class'])
            if header_text == "":
                raise Exception
        except:
            article = article.replace(header_text, '')
            article = article.replace('<p/>', '')
            article = article.replace('<p>&nbsp;</p>', '')
            match_obj = re.search(para_pattern, article)
            try:
                # get the first paragraph
                header_text = match_obj.group("paragraph")
                try:
                    header_text = washer.wash(html_buffer=header_text,
                                              allowed_tag_whitelist=[],
                                              allowed_attribute_whitelist=[])
                except:
                    # could not parse the HTML correctly; fall back to
                    # this safer function, which produces poorer results
                    header_text = remove_html_markup(header_text)
                if header_text.strip() == "":
                    raise Exception
                else:
                    if len(header_text) > 250:
                        header_text = _get_first_sentence_or_part(header_text)
            except:
                # as a last resort, get the first sentence
                try:
                    article = washer.wash(article,
                                          allowed_tag_whitelist=[],
                                          allowed_attribute_whitelist=[])
                except:
                    # could not parse the HTML correctly; fall back to
                    # this safer function, which produces poorer results
                    article = remove_html_markup(article)

                header_text = _get_first_sentence_or_part(article)

    return header_text
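# _get_first_sentence_or_part() is defined elsewhere in this module and is not
# shown here. The sketch below is a hypothetical stand-in (different name, not
# the actual Invenio implementation) illustrating the kind of truncation the
# fallback above relies on: prefer the first sentence, otherwise cut the text
# near a length limit on a word boundary.
def _first_sentence_or_part_sketch(text, max_length=250):
    """Hypothetical helper: return the first sentence of text if it fits,
    otherwise truncate near max_length on a word boundary."""
    period = text.find('. ')
    if 0 < period < max_length:
        return text[:period + 1]
    if len(text) <= max_length:
        return text
    return text[:max_length].rsplit(' ', 1)[0] + ' [...]'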