Beispiel #1
0
    def test_unquote_markup(self):
        self.assertEqual(unquote_markup(self.sample_txt1), u"""<node1>hi, this is sample text with entities: & \xa9
although this is inside a cdata! &amp; &quot;</node1>""")

        self.assertEqual(unquote_markup(self.sample_txt2), u'<node2>blah&blahblahblahblah!&pound;moreblah<></node2>')

        self.assertEqual(unquote_markup(self.sample_txt1 + self.sample_txt2), u"""<node1>hi, this is sample text with entities: & \xa9
although this is inside a cdata! &amp; &quot;</node1><node2>blah&blahblahblahblah!&pound;moreblah<></node2>""")

        self.assertEqual(unquote_markup(self.sample_txt3), u'something\xa3&more<node3>things, stuff, and suchwhat"ever</node3><node4')
Beispiel #2
0
    def test_unquote_markup(self):
        self.assertEqual(unquote_markup(self.sample_txt1), u"""<node1>hi, this is sample text with entities: & \xa9
although this is inside a cdata! &amp; &quot;</node1>""")

        self.assertEqual(unquote_markup(self.sample_txt2), u'<node2>blah&blahblahblahblah!&pound;moreblah<></node2>')

        self.assertEqual(unquote_markup(self.sample_txt1 + self.sample_txt2), u"""<node1>hi, this is sample text with entities: & \xa9
although this is inside a cdata! &amp; &quot;</node1><node2>blah&blahblahblahblah!&pound;moreblah<></node2>""")

        self.assertEqual(unquote_markup(self.sample_txt3), u'something\xa3&more<node3>things, stuff, and suchwhat"ever</node3><node4')
Beispiel #3
0
 def __call__(self, values, loader_context=None):
     """Turn extracted values into URLs joined against the context base URL.

     ``values`` is first run through the parent class's ``__call__``. Each
     resulting value is then handled as follows:

     * ``dict`` and ``list`` values are appended to the result unchanged;
     * every other value is passed through ``unquote_markup`` and
       ``_strip_url`` and then joined with the base URL via ``urljoin``.

     :param values: values handed to the parent processor.
     :param loader_context: optional mapping; its ``'baseurl'`` entry (empty
         string when absent) is used as the base for ``urljoin``. When it is
         ``None`` an empty base is used.
     :return: list holding one entry per processed value.
     """
     values = super(Url, self).__call__(values)
     # The base URL is the same for every value, so resolve it once.
     # loader_context defaults to None, which has no ``get``: fall back to
     # an empty base instead of raising AttributeError.
     if loader_context is None:
         base = ''
     else:
         base = loader_context.get('baseurl', '')
     urls = []
     for value in values:
         if isinstance(value, (dict, list)):
             # Structured values are kept as they are; without skipping the
             # rest of the loop body they would be fed to the string-only
             # helpers below.
             urls.append(value)
             continue
         value = _strip_url(unquote_markup(value))
         urls.append(urljoin(base, value))
     return urls
Beispiel #4
0
    def test_unquote_markup(self):
        sample_txt1 = u"""<node1>hi, this is sample text with entities: &amp; &copy;
<![CDATA[although this is inside a cdata! &amp; &quot;]]></node1>"""
        sample_txt2 = u'<node2>blah&amp;blah<![CDATA[blahblahblah!&pound;]]>moreblah&lt;&gt;</node2>'
        sample_txt3 = u'something&pound;&amp;more<node3><![CDATA[things, stuff, and such]]>what&quot;ever</node3><node4'

        # make sure it always return unicode
        assert isinstance(unquote_markup(sample_txt1.encode('latin-1')),
                          unicode)
        assert isinstance(unquote_markup(sample_txt2), unicode)

        self.assertEqual(
            unquote_markup(sample_txt1),
            u"""<node1>hi, this is sample text with entities: & \xa9
although this is inside a cdata! &amp; &quot;</node1>""")

        self.assertEqual(
            unquote_markup(sample_txt2),
            u'<node2>blah&blahblahblahblah!&pound;moreblah<></node2>')

        self.assertEqual(
            unquote_markup(sample_txt1 + sample_txt2),
            u"""<node1>hi, this is sample text with entities: & \xa9
although this is inside a cdata! &amp; &quot;</node1><node2>blah&blahblahblahblah!&pound;moreblah<></node2>"""
        )

        self.assertEqual(
            unquote_markup(sample_txt3),
            u'something\xa3&more<node3>things, stuff, and suchwhat"ever</node3><node4'
        )
Beispiel #5
0
    def test_unquote_markup(self):
        sample_txt1 = u"""<node1>hi, this is sample text with entities: &amp; &copy;
<![CDATA[although this is inside a cdata! &amp; &quot;]]></node1>"""
        sample_txt2 = u"<node2>blah&amp;blah<![CDATA[blahblahblah!&pound;]]>moreblah&lt;&gt;</node2>"
        sample_txt3 = u"something&pound;&amp;more<node3><![CDATA[things, stuff, and such]]>what&quot;ever</node3><node4"

        # make sure it always return unicode
        assert isinstance(unquote_markup(sample_txt1.encode("latin-1")), unicode)
        assert isinstance(unquote_markup(sample_txt2), unicode)

        self.assertEqual(
            unquote_markup(sample_txt1),
            u"""<node1>hi, this is sample text with entities: & \xa9
although this is inside a cdata! &amp; &quot;</node1>""",
        )

        self.assertEqual(unquote_markup(sample_txt2), u"<node2>blah&blahblahblahblah!&pound;moreblah<></node2>")

        self.assertEqual(
            unquote_markup(sample_txt1 + sample_txt2),
            u"""<node1>hi, this is sample text with entities: & \xa9
although this is inside a cdata! &amp; &quot;</node1><node2>blah&blahblahblahblah!&pound;moreblah<></node2>""",
        )

        self.assertEqual(
            unquote_markup(sample_txt3), u'something\xa3&more<node3>things, stuff, and suchwhat"ever</node3><node4'
        )
Beispiel #6
0
    def clean_content(self, text):
        """
        Reduce ``text`` to plain, single-spaced text.

        The input is passed, in order, through ``remove_tags_with_content``
        (for ``style``, ``script`` and ``figcaption``), ``remove_tags``,
        ``remove_entities``, ``replace_escape_chars`` and ``unquote_markup``.
        Every run of whitespace in the result is then collapsed to a single
        space, and leading/trailing whitespace is dropped.
        """
        without_blocks = remove_tags_with_content(
            text, which_ones=('style', 'script', 'figcaption'))
        without_markup = remove_entities(remove_tags(without_blocks))
        unquoted = unquote_markup(replace_escape_chars(without_markup))
        # split() with no argument splits on any whitespace run, so joining
        # the pieces with one space normalises all spacing.
        words = unquoted.split()
        return " ".join(words)
Beispiel #7
0
 def test_returns_unicode(self):
     # unquote_markup must return text (str) for latin-1 encoded input as
     # well as for the plain fixture string.
     encoded_sample = self.sample_txt1.encode('latin-1')
     assert isinstance(unquote_markup(encoded_sample), str)
     assert isinstance(unquote_markup(self.sample_txt2), str)
Beispiel #8
0
 def test_returns_unicode(self):
     # The result must be the text type (six.text_type) whether the input is
     # latin-1 encoded or the plain fixture string.
     text_type = six.text_type
     encoded_sample = self.sample_txt1.encode('latin-1')
     assert isinstance(unquote_markup(encoded_sample), text_type)
     assert isinstance(unquote_markup(self.sample_txt2), text_type)