コード例 #1
0
ファイル: test_text.py プロジェクト: smidm/scikit-learn
def test_strip_accents():
    """Check strip_accents against a table of known input/output pairs."""
    cases = [
        # classical accented Latin symbols
        (u'\xe0\xe1\xe2\xe3\xe4\xe5\xe7\xe8\xe9\xea\xeb',
         u'aaaaaaceeee'),
        (u'\xec\xed\xee\xef\xf1\xf2\xf3\xf4\xf5\xf6\xf9\xfa\xfb\xfc\xfd',
         u'iiiinooooouuuuy'),
        # Arabic: alef with a hamza below -> plain alef
        (u'\u0625', u'\u0627'),
        # mix of accented and unaccented letters
        (u"this is \xe0 test", u'this is a test'),
    ]
    # Cases run in order; the first mismatch aborts the test.
    for raw, wanted in cases:
        assert_equal(strip_accents(raw), wanted)
コード例 #2
0
ファイル: test_text.py プロジェクト: aravindgd/scikit-learn
def test_strip_accents():
    """Verify that strip_accents maps each sample string to its expected form."""
    # classical accented Latin symbols
    latin_lower_a_to_e = (
        u'\xe0\xe1\xe2\xe3\xe4\xe5\xe7\xe8\xe9\xea\xeb',
        u'aaaaaaceeee',
    )
    latin_lower_i_to_y = (
        u'\xec\xed\xee\xef\xf1\xf2\xf3\xf4\xf5\xf6\xf9\xfa\xfb\xfc\xfd',
        u'iiiinooooouuuuy',
    )
    # Arabic: alef with a hamza below should become a plain alef
    arabic_alef = (u'\u0625', u'\u0627')
    # a sentence mixing accented and unaccented letters
    mixed_sentence = (u"this is \xe0 test", u'this is a test')

    samples = (latin_lower_a_to_e, latin_lower_i_to_y,
               arabic_alef, mixed_sentence)
    for source_text, stripped_text in samples:
        assert_equal(strip_accents(source_text), stripped_text)
コード例 #3
0
ファイル: sutils.py プロジェクト: yz-/ut
def to_lower_ascii(d):
    """Return a copy of DataFrame ``d`` with accent-stripped, lower-cased text.

    Column labels, and the values of every column whose dtype is neither
    'float' nor 'int', are converted with ``unicode``, passed through
    ``strip_accents`` and lower-cased.  The input frame itself is not
    modified; all work happens on a copy.

    Raises NotImplementedError when ``d`` is not a pandas DataFrame.
    """
    if not isinstance(d, pd.DataFrame):
        raise NotImplementedError(
            "the input format '{}' is not handled".format(type(d)))

    def _fold(text):
        return strip_accents(text).lower()

    frame = d.copy().convert_objects(convert_dates=True, convert_numeric=True)
    frame.columns = [_fold(label) for label in frame.columns]

    for col in frame.columns:
        if frame[col].dtype == 'O':
            # Replace missing values in object columns with empty strings.
            frame[col].fillna('', inplace=True)
        col_dtype = frame[col].dtype
        if col_dtype != 'float' and col_dtype != 'int':
            try:
                # Two passes on purpose: every value is converted to
                # unicode before any of them is folded.
                as_text = [unicode(value) for value in frame[col]]
                frame[col] = [_fold(text) for text in as_text]
            except TypeError as e:
                # Report the failure and leave this column as it is.
                print(e.message)
    return frame
コード例 #4
0
ファイル: sutils.py プロジェクト: SRHerzog/ut
def to_lower_ascii(d):
    """Lower-case and accent-strip the labels and text columns of a DataFrame.

    Works on a copy of ``d``.  Column labels are run through
    ``strip_accents(...).lower()``; so is every value (after conversion with
    ``unicode``) in each column whose dtype is not 'float' and not 'int'.
    A TypeError raised while converting a column is printed and that column
    is left as it was.

    Raises NotImplementedError for any input that is not a pandas DataFrame.
    """
    if not isinstance(d, pd.DataFrame):
        raise NotImplementedError(
            "the input format '{}' is not handled".format(type(d)))

    def _normalize(text):
        return strip_accents(text).lower()

    result = d.copy()
    result = result.convert_objects(convert_dates=True, convert_numeric=True)
    result.columns = [_normalize(label) for label in result.columns]

    for column_name in result.columns:
        if result[column_name].dtype == 'O':
            # Object columns get their missing values blanked out first.
            result[column_name].fillna('', inplace=True)

        current_dtype = result[column_name].dtype
        if current_dtype != 'float' and current_dtype != 'int':
            try:
                # Convert everything to unicode first, then normalize, so
                # the order of operations matches a nested map().
                unicode_values = [unicode(item) for item in result[column_name]]
                result[column_name] = [_normalize(item) for item in unicode_values]
            except TypeError as e:
                print(e.message)
    return result