Example #1
import unicodedata

import lingpy as lp
from segments.tokenizer import Tokenizer
# `cddb_path` and `clpa` (a CLPA object, cf. `get_clpa()` in Example #2)
# are helpers from the surrounding project; their imports are not shown
# in this excerpt.


def get_transformer(profile, exception=None):
    # Load the orthography profile and normalize every cell to NFD so
    # that grapheme matching operates on decomposed code points.
    profile = lp.csv2list(cddb_path('profiles', profile), strip_lines=False)
    for i, line in enumerate(profile):
        profile[i] = [unicodedata.normalize('NFD', clpa.normalize(x)) for x in line]
    # Unknown graphemes are wrapped in guillemets instead of raising an error.
    tokenizer = Tokenizer(profile, errors_replace=lambda x: "«{0}»".format(x))
    # Return a closure that segments a string against the given profile
    # column and recomposes the result to NFC.
    return lambda x, y: unicodedata.normalize(
        'NFC',
        tokenizer.transform(clpa.normalize(x), column=y, separator=' + '))
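
A minimal usage sketch for the factory above; the profile file name and the
'IPA' column are invented for illustration, not taken from the original
project:

# Hypothetical usage of get_transformer(); the file name and the column
# name are illustrative assumptions.
transform = get_transformer('example-profile.tsv')
# Graphemes should come back space-separated; ' + ' joins word boundaries.
print(transform('ba aa', 'IPA'))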
Example #2
import unicodedata
from glob import glob

from lingpy import csv2list
from segments.tokenizer import Tokenizer


def inventories(dataset):
    files = glob(dataset.get_path('raw', 'inventories.tsv'))
    t = Tokenizer(dataset.get_path('raw', 'profile.prf'))
    # Segment a value with profile column `y`, recomposed to NFC.
    transform = lambda x, y: unicodedata.normalize('NFC', t.transform(x, y))
    invs = {l: [] for l in dataset.languages}
    for f in files:
        data = csv2list(f)
        for i, line in enumerate(data, start=1):
            number, dialect, page, sound, value, *rest = line
            if not rest:
                rest = ['']
            cddb = transform(value, 'CDDB')
            src = transform(value, 'SOURCE')
            # Space out each character of the STRUCTURE transform.
            struct = ' '.join(t.transform(value, 'STRUCTURE'))
            invs[dialect].append([
                src.replace(' ', ''), cddb, struct, ', '.join(rest)
            ])
            # Warn when the CV template and the segmented form disagree
            # in their number of segments.
            if len(struct.split()) != len(cddb.split()):
                print(i, 'warn', struct, '   |   ', cddb)
    dataset.write_inventories(invs)
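
Both snippets lean on the same API: Tokenizer.transform segments a string
against the orthography profile and replaces each grapheme with the value of
a named profile column. A self-contained sketch with an inline Profile (the
column names and values below are invented for illustration, mirroring the
Profile usage in Example #3):

from segments import Profile, Tokenizer

prf = Profile(
    dict(Grapheme='aa', CDDB='aː', STRUCTURE='v'),
    dict(Grapheme='b', CDDB='b', STRUCTURE='c'),
)
t = Tokenizer(profile=prf)
print(t('aab', column='CDDB'))       # -> 'aː b'
print(t('aab', column='STRUCTURE'))  # -> 'v c'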
Example #3
import unittest
from io import StringIO

from segments import Profile, Tokenizer
# `_test_path` and `printMultigraphs` are test helpers from the
# surrounding package; their imports are not shown in this excerpt.


class TokenizerTestCase(unittest.TestCase):
    """ Tests for tokenizer.py """
    maxDiff = None  # for printing large output

    def setUp(self):
        self.t = Tokenizer(_test_path('test.prf'))

    def test_errors(self):
        t = Tokenizer(_test_path('test.prf'), errors_replace=lambda c: '<{0}>'.format(c))
        self.assertEqual(t('habe'), '<h> a b <e>')

        with self.assertRaises(ValueError):
            t('habe', form='xyz')

        with self.assertRaises(ValueError):
            t('habe', errors='strict')

        self.assertEqual(t('habe', errors='ignore'), 'a b')

    def test_boundaries(self):
        self.assertEqual(self.t('aa aa', separator=' _ '), ' b _  b')

    def test_normalization(self):
        t = Tokenizer()
        s = 'n\u0303a'
        self.assertEqual(t(s), 'n\u0303 a')
        self.assertEqual(t('\xf1a'), 'n\u0303 a')
        self.assertEqual(t(s, form='NFC'), '\xf1 a')

    def test_ipa(self):
        t = Tokenizer()
        self.assertEqual(t('\u02b0ello', ipa=True), '\u02b0e l l o')

    def test_tokenize_with_profile(self):
        self.assertEqual(self.t('aa'), ' b')

    def test_tokenize_with_profile_from_object(self):
        prf = Profile(dict(Grapheme='aa', mapping='xy'), dict(Grapheme='b', mapping='z'))
        self.assertEqual(Tokenizer(profile=prf)('aab', column='mapping'), 'xy z')

    def test_tokenize_without_profile(self):
        self.assertEqual(Tokenizer()('aa', form='NFC'), 'a a')

    def test_printTree(self):
        stream = StringIO()
        self.t.op.tree.printTree(self.t.op.tree.root, stream=stream)
        stream.seek(0)
        self.assertIn('a* -- a*', stream.read().split('\n'))
        # Smoke tests: walking the multigraph trie should not raise.
        printMultigraphs(self.t.op.tree.root, '', '')
        printMultigraphs(self.t.op.tree.root, 'abcd', '')

    def test_characters(self):
        t = Tokenizer()
        result = t.characters("ĉháɾã̌ctʼɛ↗ʐː| k͡p")
        self.assertEqual(result, "c ̂ h a ́ ɾ a ̃ ̌ c t ʼ ɛ ↗ ʐ ː | # k ͡ p")

    def test_grapheme_clusters(self):
        t = Tokenizer()
        result = t.grapheme_clusters("ĉháɾã̌ctʼɛ↗ʐː| k͡p")
        self.assertEqual(result, "ĉ h á ɾ ã̌ c t ʼ ɛ ↗ ʐ ː | # k͡ p")

    def test_graphemes(self):
        t = Tokenizer()
        self.assertEqual(t.graphemes("aabchonn-ih"), "a a b c h o n n - i h")
        self.assertEqual(self.t.graphemes("aabchonn-ih"), "aa b ch on n - ih")

    def test_transform1(self):
        self.assertEqual(self.t.transform("aabchonn-ih"), "aa b ch on n - ih")

        with self.assertRaises(ValueError):
            Tokenizer().transform('abc')

        with self.assertRaises(ValueError):
            self.t.transform("aabchonn-ih", 'xx')  # unknown profile column

    def test_transform2(self):
        result = self.t.transform("aabchonn-ih", "IPA")
        self.assertEqual(result, "aː b tʃ õ n í")

    def test_transform3(self):
        result = self.t.transform("aabchonn-ih", "XSAMPA")
        self.assertEqual(result, "a: b tS o~ n i_H")

    def test_rules(self):
        self.assertEqual(Tokenizer().rules('abc'), 'abc')
        result = self.t.rules("aabchonn-ih")
        self.assertEqual(result, "  ii-ii")

    def test_transform_rules(self):
        result = self.t.transform_rules("aabchonn-ih")
        self.assertEqual(result, " b b ii - ii")

    def test_find_missing_characters(self):
        result = self.t.find_missing_characters("aa b ch on n - ih x y z")
        self.assertEqual(result, "aa b ch on n - ih \ufffd \ufffd \ufffd")

        t = Tokenizer(_test_path('test.prf'), errors_replace=lambda c: '?')
        result = t.find_missing_characters("aa b ch on n - ih x y z")
        self.assertEqual(result, "aa b ch on n - ih ? ? ?")
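
To run the test module directly, the standard unittest entry point can be
appended at module level:

if __name__ == '__main__':
    unittest.main()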