def test_decode_to_unicode(self):
    """textutils - decode_to_unicode.

    Exercises decode_to_unicode on three inputs: a string of high
    (0x82-0x85) characters decoded with an explicit ``latin1``
    default encoding, a string of accented Latin vowels and a Greek
    word, the last two relying on the function's default behaviour.
    """
    # Octal escapes 202..205 are the same values as hex 82..85.
    high_chars = '\202\203\204\205'
    decoded_high_chars = decode_to_unicode(
        high_chars, default_encoding='latin1')
    self.assertEqual(decoded_high_chars, u'\x82\x83\x84\x85')

    decoded_vowels = decode_to_unicode('àèéìòù')
    self.assertEqual(decoded_vowels, u'\xe0\xe8\xe9\xec\xf2\xf9')

    decoded_greek = decode_to_unicode('Ιθάκη')
    self.assertEqual(decoded_greek, u'\u0399\u03b8\u03ac\u03ba\u03b7')
def _sort_alphanumerically_remove_leading_articles(self, val):
    """
    Build a sort key: lower-case the value and drop a leading article.

    Convert (for leading words listed in LEADING_ARTICLES):
    'The title' => 'title'
    'A title' => 'title'
    'Title' => 'title'
    '  The title' => 'title'

    @param val: the value to wash; any falsy value yields ''
    @return: the lower-cased, UTF-8 encoded value with surrounding
        whitespace removed and, when its first space-separated word
        is found in LEADING_ARTICLES, without that word
    """
    if not val:
        return ''
    # Strip *before* splitting: otherwise leading whitespace produces an
    # empty first token and the article that follows is never removed.
    val = decode_to_unicode(val).lower().encode('UTF-8').strip()
    # Split into leading_word, phrase_without_leading_word.
    val_tokens = val.split(" ", 1)
    if len(val_tokens) == 2 and val_tokens[0].strip() in LEADING_ARTICLES:
        return val_tokens[1].strip()
    return val
def to_unicode(s):
    """Return ``s`` as a unicode object.

    unicode input is returned unchanged, any other string is passed
    through decode_to_unicode, and every non-string object is
    converted with ``unicode()``.
    """
    if not isinstance(s, basestring):
        return unicode(s)
    # unicode is itself a basestring, so it has to be singled out here
    # to be handed back untouched instead of being decoded again.
    return s if isinstance(s, unicode) else decode_to_unicode(s)
def test_decode_to_unicode(self):
    """textutils - decode_to_unicode.

    Runs decode_to_unicode over a small table of cases; only the first
    case passes an explicit ``failover_encoding``.
    """
    # (input, keyword arguments, expected unicode result)
    cases = (
        ('\202\203\204\205', {'failover_encoding': 'latin1'},
         u'\x82\x83\x84\x85'),
        ('àèéìòù', {}, u'\xe0\xe8\xe9\xec\xf2\xf9'),
        ('Ιθάκη', {}, u'\u0399\u03b8\u03ac\u03ba\u03b7'),
    )
    for raw, options, expected in cases:
        self.assertEqual(decode_to_unicode(raw, **options), expected)
def test_decode_to_unicode(self):
    """textutils - decode_to_unicode.

    Compares the output of decode_to_unicode with the expected unicode
    strings for high (0x82-0x85) characters decoded with
    ``default_encoding="latin1"``, for accented Latin vowels and for a
    Greek word.
    """
    expected_high_chars = u"\x82\x83\x84\x85"
    expected_vowels = u"\xe0\xe8\xe9\xec\xf2\xf9"
    expected_greek = u"\u0399\u03b8\u03ac\u03ba\u03b7"

    self.assertEqual(
        decode_to_unicode("\202\203\204\205", default_encoding="latin1"),
        expected_high_chars,
    )
    self.assertEqual(decode_to_unicode("àèéìòù"), expected_vowels)
    self.assertEqual(decode_to_unicode("Ιθάκη"), expected_greek)
def _sort_case_insensitive(self, val):
    """Conversion to lower case.

    Falsy input yields ''; anything else is passed through
    decode_to_unicode, lower-cased and encoded as UTF-8.
    """
    if val:
        lowered = decode_to_unicode(val).lower()
        return lowered.encode('UTF-8')
    return ''