def test_strip_accents():
    # Each pair is (input text, text expected back from strip_accents).
    samples = (
        # classical accented Latin letters
        (u'\xe0\xe1\xe2\xe3\xe4\xe5\xe7\xe8\xe9\xea\xeb', u'aaaaaaceeee'),
        (u'\xec\xed\xee\xef\xf1\xf2\xf3\xf4\xf5\xf6\xf9\xfa\xfb\xfc\xfd',
         u'iiiinooooouuuuy'),
        # Arabic: alef with a hamza below -> plain alef
        (u'\u0625', u'\u0627'),
        # accented and unaccented letters mixed in one string
        (u"this is \xe0 test", u'this is a test'),
    )
    # Checked in order, so the first mismatch is the one reported.
    for accented, stripped in samples:
        assert_equal(strip_accents(accented), stripped)
def to_lower_ascii(d):
    """Return a lower-cased, accent-stripped copy of a DataFrame.

    The input is copied and passed through ``convert_objects`` (dates and
    numerics). The column labels, and the values of every column whose
    dtype is neither ``'float'`` nor ``'int'``, are then converted to
    unicode, stripped of accents with ``strip_accents`` and lower-cased.
    Missing values in object columns are replaced by ``''`` beforehand.

    Parameters
    ----------
    d : pandas.DataFrame
        Frame to normalise. It is copied, so the caller's frame is left
        untouched.

    Returns
    -------
    pandas.DataFrame
        The normalised copy.

    Raises
    ------
    NotImplementedError
        If ``d`` is not a ``pandas.DataFrame``.
    """
    # NOTE(review): an identical definition of this function appears again
    # later in this file and will shadow this one - consider removing one.
    if not isinstance(d, pd.DataFrame):
        raise NotImplementedError(
            "the input format '{}' is not handled".format(type(d)))

    def lower_ascii(x):
        return strip_accents(x).lower()

    d = d.copy()
    # NOTE(review): convert_objects is deprecated in later pandas releases -
    # confirm the pandas version this module is pinned to.
    d = d.convert_objects(convert_dates=True, convert_numeric=True)
    d.columns = [lower_ascii(label) for label in d.columns]

    for c in d.columns:
        if d[c].dtype == 'O':
            # Assign the result back instead of calling fillna(inplace=True)
            # on the selection: a chained in-place update is not guaranteed
            # to reach the parent frame.
            d[c] = d[c].fillna('')
        if d[c].dtype != 'float' and d[c].dtype != 'int':
            try:
                d[c] = [lower_ascii(unicode(value)) for value in d[c]]
            except TypeError as e:
                # Best effort: report the problem and keep the column as is.
                print(e.message)
    return d
def to_lower_ascii(d):
    """Return a lower-cased, accent-stripped copy of a DataFrame.

    The input is copied and passed through ``convert_objects`` (dates and
    numerics). The column labels, and the values of every column whose
    dtype is neither ``'float'`` nor ``'int'``, are then converted to
    unicode, stripped of accents with ``strip_accents`` and lower-cased.
    Missing values in object columns are replaced by ``''`` beforehand.

    Parameters
    ----------
    d : pandas.DataFrame
        Frame to normalise. It is copied, so the caller's frame is left
        untouched.

    Returns
    -------
    pandas.DataFrame
        The normalised copy.

    Raises
    ------
    NotImplementedError
        If ``d`` is not a ``pandas.DataFrame``.
    """
    if not isinstance(d, pd.DataFrame):
        raise NotImplementedError(
            "the input format '{}' is not handled".format(type(d)))

    def lower_ascii(x):
        return strip_accents(x).lower()

    d = d.copy()
    # NOTE(review): convert_objects is deprecated in later pandas releases -
    # confirm the pandas version this module is pinned to.
    d = d.convert_objects(convert_dates=True, convert_numeric=True)
    d.columns = [lower_ascii(label) for label in d.columns]

    for c in d.columns:
        if d[c].dtype == 'O':
            # Assign the result back instead of calling fillna(inplace=True)
            # on the selection: a chained in-place update is not guaranteed
            # to reach the parent frame.
            d[c] = d[c].fillna('')
        if d[c].dtype != 'float' and d[c].dtype != 'int':
            try:
                d[c] = [lower_ascii(unicode(value)) for value in d[c]]
            except TypeError as e:
                # Best effort: report the problem and keep the column as is.
                print(e.message)
    return d