def _inner(value):
    if isinstance(value, six.string_types):
        washer = HTMLWasher()
        return washer.wash(value,
                           allowed_tag_whitelist=allowed_tag_whitelist)
    else:
        return value
class XSSEscapingTest(InvenioTestCase):
    """Test functions related to the prevention of XSS attacks."""

    def __init__(self, methodName='test'):
        self.washer = HTMLWasher()
        InvenioTestCase.__init__(self, methodName)

    def test_forbidden_formatting_tags(self):
        """htmlutils - washing of tags altering formatting of a page (e.g. </html>)"""
        test_str = """</html></body></pre>"""
        self.assertEqual(self.washer.wash(html_buffer=test_str),
                         '')
        self.assertEqual(self.washer.wash(html_buffer=test_str,
                                          render_unallowed_tags=True),
                         '&lt;/html&gt;&lt;/body&gt;&lt;/pre&gt;')

    def test_forbidden_script_tags(self):
        """htmlutils - washing of tags defining scripts (e.g. <script>)"""
        test_str = """<script>malicious_function();</script>"""
        self.assertEqual(self.washer.wash(html_buffer=test_str),
                         '')
        self.assertEqual(self.washer.wash(html_buffer=test_str,
                                          render_unallowed_tags=True),
                         '&lt;script&gt;malicious_function();&lt;/script&gt;')

    def test_forbidden_attributes(self):
        """htmlutils - washing of forbidden attributes in allowed tags (e.g. onLoad)"""
        # onload
        test_str = """<p onload="javascript:malicious_function();">"""
        self.assertEqual(self.washer.wash(html_buffer=test_str), '<p>')
        # tricky: css calling a javascript
        test_str = """<p style="background: url('http://malicious_site.com/malicious_script.js');">"""
        self.assertEqual(self.washer.wash(html_buffer=test_str), '<p>')

    def test_fake_url(self):
        """htmlutils - washing of fake URLs which execute scripts"""
        test_str = """<a href="javascript:malicious_function();">link</a>"""
        self.assertEqual(self.washer.wash(html_buffer=test_str),
                         '<a href="">link</a>')
        # Pirates could encode ascii values, or use uppercase letters...
        test_str = """<a href="javasCRipt:malicious_function();">link</a>"""
        self.assertEqual(self.washer.wash(html_buffer=test_str),
                         '<a href="">link</a>')
        # MSIE treats 'java\ns\ncript:' the same way as 'javascript:'
        # Here we test with:
        # j
        #  avas
        #  crIPt :
        test_str = """<a href="j\n avas\n crIPt :malicious_function();">link</a>"""
        self.assertEqual(self.washer.wash(html_buffer=test_str),
                         '<a href="">link</a>')
class HTMLAutomaticLinksTransformation(InvenioTestCase):
    """Test functions related to transforming links into HTML context"""

    def __init__(self, methodName='test'):
        self.washer = HTMLWasher()
        InvenioTestCase.__init__(self, methodName)

    def test_transform_link(self):
        """htmlutils - transforming a link"""
        body_input = 'https://cds.cern.ch/collection/Multimedia%20%26%20Outreach?ln=es'
        body_expected = '<a href="https://cds.cern.ch/collection/Multimedia%20%26%20Outreach?ln=es">https://cds.cern.ch/collection/Multimedia%20%26%20Outreach?ln=es</a>'
        self.assertEqual(self.washer.wash(html_buffer=body_input,
                                          automatic_link_transformation=True),
                         body_expected)

    def test_transform_several_links(self):
        """htmlutils - transforming several links"""
        body_input = 'some text https://cds.cern.ch/collection/Videos?ln=es more text https://cds.cern.ch/search?p=%27CERN+News'
        body_expected = 'some text <a href="https://cds.cern.ch/collection/Videos?ln=es">https://cds.cern.ch/collection/Videos?ln=es</a> more text <a href="https://cds.cern.ch/search?p=%27CERN">https://cds.cern.ch/search?p=%27CERN</a>+News'
        self.assertEqual(self.washer.wash(html_buffer=body_input,
                                          automatic_link_transformation=True),
                         body_expected)

    def test_transform_just_valid_links(self):
        """htmlutils - transforming just valid links"""
        body_input = 'some text https://cds.cern.ch/collection/Videos?ln=es more text https://cds..cern/search?p=%27CERN+News'
        body_expected = 'some text <a href="https://cds.cern.ch/collection/Videos?ln=es">https://cds.cern.ch/collection/Videos?ln=es</a> more text https://cds..cern/search?p=%27CERN+News'
        self.assertEqual(self.washer.wash(html_buffer=body_input,
                                          automatic_link_transformation=True),
                         body_expected)

    def test_not_transform_link(self):
        """htmlutils - not transforming a link"""
        body_input = '<a href="https://cds.cern.ch/collection/Multimedia%20%26%20Outreach?ln=es">Multimedia</a>'
        body_expected = '<a href="https://cds.cern.ch/collection/Multimedia%20%26%20Outreach?ln=es">Multimedia</a>'
        self.assertEqual(self.washer.wash(html_buffer=body_input,
                                          automatic_link_transformation=True),
                         body_expected)
class HTMLWashingTest(InvenioTestCase):
    """Test functions related to general washing of HTML source"""

    def __init__(self, methodName='test'):
        self.washer = HTMLWasher()
        InvenioTestCase.__init__(self, methodName)

    def test_wash_html(self):
        """htmlutils - washing HTML tags"""

        # Simple test case
        test_str = 'Spam and <b><blink>eggs</blink></b>'
        self.assertEqual(self.washer.wash(html_buffer=test_str),
                         'Spam and <b>eggs</b>')

        # Show 'escaped' tags
        test_str = 'Spam and <b><blink>eggs</blink></b>'
        self.assertEqual(self.washer.wash(html_buffer=test_str,
                                          render_unallowed_tags=True),
                         'Spam and <b>&lt;blink&gt;eggs&lt;/blink&gt;</b>')

        # Keep entity and character references
        test_str = '<b> a &lt; b &gt; c </b> &#247;'
        self.assertEqual(self.washer.wash(html_buffer=test_str),
                         '<b> a &lt; b &gt; c </b> &#247;')

        # Remove content of <script> tags
        test_str = '<script type="text/javascript">alert("foo")</script>bar'
        self.assertEqual(self.washer.wash(html_buffer=test_str),
                         'bar')
        test_str = '<script type="text/javascript"><!--alert("foo")--></script>bar'
        self.assertEqual(self.washer.wash(html_buffer=test_str),
                         'bar')

        # Remove content of <style> tags
        test_str = '<style>.myclass {color:#f00}</style><span class="myclass">styled text</span>'
        self.assertEqual(self.washer.wash(html_buffer=test_str),
                         'styled text')
        test_str = '<style><!-- .myclass {color:#f00} --></style><span class="myclass">styled text</span>'
        self.assertEqual(self.washer.wash(html_buffer=test_str),
                         'styled text')
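# Usage sketch (illustrative, not part of the test suite): a minimal example
# of the HTMLWasher calls exercised by the tests above. It assumes HTMLWasher
# is already imported in this module, and uses only the keyword arguments that
# appear in the tests (html_buffer, render_unallowed_tags,
# allowed_tag_whitelist, allowed_attribute_whitelist,
# automatic_link_transformation).
def _htmlwasher_usage_sketch():
    washer = HTMLWasher()
    # Disallowed tags are stripped by default.
    stripped = washer.wash(html_buffer='Spam and <b><blink>eggs</blink></b>')
    # With render_unallowed_tags=True they are kept in escaped form instead.
    escaped = washer.wash(html_buffer='<blink>eggs</blink>',
                          render_unallowed_tags=True)
    # Whitelists restrict which tags and attributes survive the washing.
    linked = washer.wash(html_buffer='<a href="http://cds.cern.ch" onclick="x()">CDS</a>',
                         allowed_tag_whitelist=['a'],
                         allowed_attribute_whitelist=['href'])
    # Bare URLs can be turned into links on request.
    auto = washer.wash(html_buffer='see https://cds.cern.ch/record/1',
                       automatic_link_transformation=True)
    return stripped, escaped, linked, auto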
def format_element(bfo, separator='<br/>', max_chars=""):
    """
    Display article body

    @param separator: separator between each body
    @param max_chars: if defined, limit the output to given char length
    """
    ln = bfo.lang
    _ = gettext_set_language(ln)

    if ln == "fr":
        article = bfo.fields('590__b')
        if not article or \
               (len(article) == 1 and \
                (article[0].strip() in ['', '<br />', '<!--HTML--><br />'])):
            article = bfo.fields('520__b')
    else:
        article = bfo.fields('520__b')
        if not article or \
               (len(article) == 1 and \
                (article[0].strip() in ['', '<br />', '<!--HTML--><br />'])):
            article = bfo.fields('590__b')

    if not CFG_CERN_SITE or \
           not bfo.field('980__a').startswith('BULLETIN'):
        output = separator.join(article)
        if max_chars.isdigit() and \
               int(max_chars) > 0 and len(output) > int(max_chars):
            output = output[:int(max_chars)] + ' [...]'
        return output

    ################################################################
    #                 CERN Bulletin-specific code                  #
    ################################################################

    # We need a compatibility layer for old CERN Bulletin
    # articles. Identify them and process them if needed.
    is_old_cern_bulletin_article = False
    if bfo.field('980__a').startswith('BULLETIN'):
        try:
            year = int(bfo.fields('260__c')[0])
        except IndexError:
            year = 2000
        if year < 2009 or \
               (bfo.field('980__a').startswith('BULLETINSTAFF') and \
                ("CERN EDS" in bfo.field('595__a'))):
            is_old_cern_bulletin_article = True

    header_out = ''

    if not is_old_cern_bulletin_article:
        # Return the same as any other journal article
        output = separator.join(article)
        if max_chars.isdigit() and \
               int(max_chars) > 0 and len(output) > int(max_chars):
            output = output[:int(max_chars)] + ' [...]'
        return output

    # Old CERN articles
    if year < 2007 or bfo.field('980__a').startswith('BULLETINSTAFF'):
        # Really old CERN articles
        if len(article) > 0:
            # CERN-only: old CERN Bulletin articles
            return __backward_compatible_HTML(article[0]) + \
                   (bfo.field('980__a').startswith('BULLETINSTAFF') and \
                    ('<br/><br/>' + bfe_fulltext.format_element(bfo, style="", show_icons='yes')) \
                    or '')
        else:
            return ''

    # Not-so-old CERN articles follow:

    # 2. prepare regex's for the elements
    #=====================================================
    from invenio.legacy.webjournal.utils import \
         image_pattern, \
         para_pattern, \
         header_pattern

    page_elements = {}

    # 3. get the header (either from marc xml or regex)
    #=====================================================
    if bfo.lang == "fr":
        header = bfo.field('590__a')
        if header == '':
            header = bfo.field('520__a')
    else:
        header = bfo.field('520__a')
        if header == '':
            header = bfo.field('590__a')

    if not header:
        try:
            header_obj = re.search(header_pattern, article[0])
            header_text = header_obj.group("header")
        except:
            header_text = ""
    else:
        header_text = header

    washer = HTMLWasher()
    header_text_clean = washer.wash(html_buffer=header_text,
                                    allowed_tag_whitelist=['a'],
                                    allowed_attribute_whitelist=['href'])

    header_out = '<p class="articleHeader">' + header_text_clean + '</p>'

    # strip out all empty p tags and the header
    try:
        article = article[0].replace("<p/>", "")
        article = article.replace(header_text, "")
        article = article.replace(header_text_clean, "")
    except IndexError:
        article = ""

    image_iter = image_pattern.finditer(article)

    difference_from_original = 0
    for image in image_iter:
        page_elements[image.start()] = {"link": image.group("hyperlink"),
                                        "image": image.group("image"),
                                        "caption": image.group("caption")}
        # make sure we delete the image from the article
        # (else might be used twice)
        start_index = image.span()[0] - difference_from_original
        end_index = image.span()[1] - difference_from_original
        article = article.replace(article[start_index:end_index], "")
        difference_from_original += image.span()[1] - image.span()[0]

    # replace <center> by <p><center>
    article = article.replace("<center>", "<p><center>")
    article = article.replace("</center>", "</center></p>")

    para_iter = para_pattern.finditer(article)

    for paragraph in para_iter:
        page_elements[paragraph.start()] = paragraph.group("paragraph")

    # TODO: find a way to do this inline in the dict
    ordered_keys = page_elements.keys()
    ordered_keys.sort()

    article_out = ""
    left_right_lever = True
    did_you_know_box = False
    for key in ordered_keys:
        if type(page_elements[key]) == types.DictType:
            if left_right_lever == True:
                article_out += '<div class="phrwithcaption"><div class="imageScale">'
            else:
                article_out += '<div class="phlwithcaption"><div class="imageScale">'
            if page_elements[key]["link"] != None:
                article_out += '<a href="' + page_elements[key]["link"] + '">'
            article_out += '<img class="featureImageScaleHolder" src="' + \
                           page_elements[key]["image"] + '" border="0" />' + \
                           '</a>' + \
                           '</div>'
            if page_elements[key]["caption"] != None:
                article_out += '<p>' + page_elements[key]["caption"] + \
                               '</p>'
            article_out += '</div>'
        elif type(page_elements[key]) == types.StringType:
            left_right_lever = not left_right_lever
            if (page_elements[key].lower().find("did you know") != -1) or \
                   (page_elements[key].lower().find("le saviez-vous ?") != -1):
                did_you_know_box = True
                continue
            if did_you_know_box == True:
                did_you_know_box = False
                article_out += __did_you_know_box(page_elements[key],
                                                  left_right_lever,
                                                  bfo.lang)
                continue
            article_out += '<p>'
            article_out += page_elements[key]
            article_out += '</p>'

    output = header_out + article_out

    if max_chars.isdigit() and \
           int(max_chars) > 0 and len(output) > int(max_chars):
        output = output[:int(max_chars)] + ' [...]'

    return output
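# Illustration only: the real image_pattern / para_pattern / header_pattern
# used by format_element() are defined in invenio.legacy.webjournal.utils.
# The simplified stand-in below is a hypothetical pattern, not the Invenio
# one; it only demonstrates the technique used above of collecting matches
# keyed by their start offset and then walking them in document order.
import re

_demo_para_pattern = re.compile(r'<p>(?P<paragraph>.*?)</p>', re.DOTALL)

def _demo_collect_paragraphs(html_buffer):
    """Return paragraph bodies in the order they appear in html_buffer."""
    page_elements = {}
    for match in _demo_para_pattern.finditer(html_buffer):
        page_elements[match.start()] = match.group("paragraph")
    return [page_elements[key] for key in sorted(page_elements)]

# Example: _demo_collect_paragraphs('<p>one</p><p>two</p>') == ['one', 'two']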
def _get_feature_text(record, language):
    """
    Looks for a text (header) that can be featured on the article overview
    page.
    """
    washer = HTMLWasher()
    header_text = ""
    # Check if there is a header
    if language == "fr":
        header = record.field('590__a')
        if header.strip() in \
               ['', '<br/>', '<!--HTML--><br />', '<!--HTML-->']:
            header = record.field('520__a')
    else:
        header = record.field('520__a')
        if header.strip() in \
               ['', '<br/>', '<!--HTML--><br />', '<!--HTML-->']:
            header = record.field('590__a')
    header = washer.wash(html_buffer=header,
                         allowed_tag_whitelist=[],
                         allowed_attribute_whitelist=[])
    if header != "":
        header_text = header
    else:
        if language == "fr":
            article = record.fields('590__b')
            if not article or \
                   (len(article) == 1 and \
                    article[0].strip() in \
                    ['', '<br />', '<!--HTML--><br />', '<!--HTML-->']):
                article = record.fields('520__b')
        else:
            article = record.fields('520__b')
            if not article or \
                   (len(article) == 1 and \
                    article[0].strip() in \
                    ['', '<br />', '<!--HTML--><br />', '<!--HTML-->']):
                article = record.fields('590__b')
        try:
            article = article[0]
        except:
            return ''

        match_obj = re.search(header_pattern, article)
        if not match_obj:
            match_obj = re.search(header_pattern2, article)
        try:
            header_text = match_obj.group("header")
            header_text = washer.wash(html_buffer=header_text,
                                      allowed_tag_whitelist=['a'],
                                      allowed_attribute_whitelist=['href',
                                                                   'target',
                                                                   'class'])
            if header_text == "":
                raise Exception
        except:
            article = article.replace(header_text, '')
            article = article.replace('<p/>', '')
            article = article.replace('<p>&nbsp;</p>', '')
            match_obj = re.search(para_pattern, article)
            try:
                # get the first paragraph
                header_text = match_obj.group("paragraph")
                try:
                    header_text = washer.wash(html_buffer=header_text,
                                              allowed_tag_whitelist=[],
                                              allowed_attribute_whitelist=[])
                except:
                    # could not parse the HTML correctly; fall back to
                    # this safer function, which produces poorer results
                    header_text = remove_html_markup(header_text)
                if header_text.strip() == "":
                    raise Exception
                else:
                    if len(header_text) > 250:
                        header_text = _get_first_sentence_or_part(header_text)
            except:
                # as a last resort, get the first sentence
                try:
                    article = washer.wash(article,
                                          allowed_tag_whitelist=[],
                                          allowed_attribute_whitelist=[])
                except:
                    # could not parse the HTML correctly; fall back to
                    # this safer function, which produces poorer results
                    article = remove_html_markup(article)

                header_text = _get_first_sentence_or_part(article)

    return header_text
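# _get_first_sentence_or_part() is defined elsewhere in this module and is not
# shown here. The sketch below is a hypothetical stand-in (different name, not
# the actual Invenio implementation) illustrating the kind of truncation the
# fallback above relies on: prefer the first sentence, otherwise cut the text
# near a length limit on a word boundary.
def _first_sentence_or_part_sketch(text, max_length=250):
    """Hypothetical helper: return the first sentence of text if it fits,
    otherwise truncate near max_length on a word boundary."""
    period = text.find('. ')
    if 0 < period < max_length:
        return text[:period + 1]
    if len(text) <= max_length:
        return text
    return text[:max_length].rsplit(' ', 1)[0] + ' [...]'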