def test_unquote_markup(self):
    """Compare ``unquote_markup`` output with the expected text.

    The inputs are the ``sample_txt1``, ``sample_txt2`` and ``sample_txt3``
    fixtures stored on the test case, plus the concatenation of the first
    two.
    """
    cases = (
        (
            self.sample_txt1,
            u"""<node1>hi, this is sample text with entities: & \xa9 although this is inside a cdata! & "</node1>""",
        ),
        (
            self.sample_txt2,
            u'<node2>blah&blahblahblahblah!£moreblah<></node2>',
        ),
        (
            self.sample_txt1 + self.sample_txt2,
            u"""<node1>hi, this is sample text with entities: & \xa9 although this is inside a cdata! & "</node1><node2>blah&blahblahblahblah!£moreblah<></node2>""",
        ),
        (
            self.sample_txt3,
            u'something\xa3&more<node3>things, stuff, and suchwhat"ever</node3><node4',
        ),
    )
    for markup, expected in cases:
        self.assertEqual(unquote_markup(markup), expected)
def __call__(self, values, loader_context=None):
    """Turn processed values into URLs joined onto the loader's base URL.

    ``values`` is first run through the parent class's ``__call__``. Each
    resulting ``dict`` or ``list`` is kept unchanged; every other value is
    passed through ``unquote_markup`` and ``_strip_url`` and then joined
    onto the base URL with ``urljoin``.

    :param values: values forwarded to the parent ``__call__``.
    :param loader_context: optional mapping whose ``'baseurl'`` entry is
        used as the base for ``urljoin``; when it is ``None`` or has no
        such entry the base is the empty string.
    :return: list holding the untouched ``dict``/``list`` values and the
        joined URLs, in input order.
    """
    values = super(Url, self).__call__(values)
    # The base does not depend on the value, so resolve it once. The
    # parameter defaults to None, which has no ``.get``.
    if loader_context is not None:
        base = loader_context.get('baseurl', '')
    else:
        base = ''
    urls = []
    for value in values:
        if isinstance(value, (dict, list)):
            # Structured values are not URL strings: keep them as they are
            # and skip the string-only processing below.
            urls.append(value)
            continue
        value = _strip_url(unquote_markup(value))
        urls.append(urljoin(base, value))
    return urls
def test_unquote_markup(self):
    """Exercise ``unquote_markup`` on three inline markup samples.

    First checks that the result is a ``unicode`` object for both an
    encoded and an unencoded input, then compares the output for each
    sample (and for the first two concatenated) with the expected text.
    """
    entities_and_cdata = u"""<node1>hi, this is sample text with entities: & © <![CDATA[although this is inside a cdata! & "]]></node1>"""
    cdata_mid_text = u'<node2>blah&blah<![CDATA[blahblahblah!£]]>moreblah<></node2>'
    unclosed_tag = u'something£&more<node3><![CDATA[things, stuff, and such]]>what"ever</node3><node4'

    # The return type must be unicode whatever the input type is.
    from_encoded = unquote_markup(entities_and_cdata.encode('latin-1'))
    assert isinstance(from_encoded, unicode)
    from_text = unquote_markup(cdata_mid_text)
    assert isinstance(from_text, unicode)

    expectations = [
        (
            entities_and_cdata,
            u"""<node1>hi, this is sample text with entities: & \xa9 although this is inside a cdata! & "</node1>""",
        ),
        (
            cdata_mid_text,
            u'<node2>blah&blahblahblahblah!£moreblah<></node2>',
        ),
        (
            entities_and_cdata + cdata_mid_text,
            u"""<node1>hi, this is sample text with entities: & \xa9 although this is inside a cdata! & "</node1><node2>blah&blahblahblahblah!£moreblah<></node2>""",
        ),
        (
            unclosed_tag,
            u'something\xa3&more<node3>things, stuff, and suchwhat"ever</node3><node4',
        ),
    ]
    for markup, expected in expectations:
        self.assertEqual(unquote_markup(markup), expected)
def test_unquote_markup(self):
    """Check ``unquote_markup`` against three inline markup samples.

    The test first asserts that the result is a ``unicode`` object for
    both an encoded and an unencoded input, then compares the output for
    each sample (and for the first two concatenated) with the expected
    text.
    """
    sample_txt1 = u"""<node1>hi, this is sample text with entities: & © <![CDATA[although this is inside a cdata! & "]]></node1>"""
    sample_txt2 = u"<node2>blah&blah<![CDATA[blahblahblah!£]]>moreblah<></node2>"
    # Single-quoted on purpose: the text contains a double quote, which
    # would otherwise terminate the literal early.
    sample_txt3 = u'something£&more<node3><![CDATA[things, stuff, and such]]>what"ever</node3><node4'

    # make sure it always returns unicode
    assert isinstance(unquote_markup(sample_txt1.encode("latin-1")), unicode)
    assert isinstance(unquote_markup(sample_txt2), unicode)

    self.assertEqual(
        unquote_markup(sample_txt1),
        u"""<node1>hi, this is sample text with entities: & \xa9 although this is inside a cdata! & "</node1>""",
    )
    self.assertEqual(
        unquote_markup(sample_txt2),
        u"<node2>blah&blahblahblahblah!£moreblah<></node2>",
    )
    self.assertEqual(
        unquote_markup(sample_txt1 + sample_txt2),
        u"""<node1>hi, this is sample text with entities: & \xa9 although this is inside a cdata! & "</node1><node2>blah&blahblahblahblah!£moreblah<></node2>""",
    )
    self.assertEqual(
        unquote_markup(sample_txt3),
        u'something\xa3&more<node3>things, stuff, and suchwhat"ever</node3><node4',
    )
def clean_content(self, text):
    """Run ``text`` through the cleaning helpers and normalise its whitespace.

    ``text`` is first handed to ``remove_tags_with_content`` for the
    ``style``, ``script`` and ``figcaption`` tags, then passed in order
    through ``remove_tags``, ``remove_entities``, ``replace_escape_chars``
    and ``unquote_markup``. Finally every run of whitespace is collapsed
    to a single space and leading/trailing whitespace is dropped.
    """
    cleaned = remove_tags_with_content(
        text, which_ones=('style', 'script', 'figcaption')
    )
    # Order matters: each cleaner receives the previous cleaner's output.
    for cleaner in (remove_tags, remove_entities, replace_escape_chars, unquote_markup):
        cleaned = cleaner(cleaned)
    return " ".join(cleaned.split())
def test_returns_unicode(self):
    """``unquote_markup`` must return ``str`` for encoded and text input."""
    from_encoded = unquote_markup(self.sample_txt1.encode('latin-1'))
    assert isinstance(from_encoded, str)

    from_text = unquote_markup(self.sample_txt2)
    assert isinstance(from_text, str)
def test_returns_unicode(self):
    """``unquote_markup`` must return ``six.text_type`` for encoded and text input."""
    from_encoded = unquote_markup(self.sample_txt1.encode('latin-1'))
    assert isinstance(from_encoded, six.text_type)

    from_text = unquote_markup(self.sample_txt2)
    assert isinstance(from_text, six.text_type)