Code example #1
0
File: base.py  Project: glottobank/clpa
    def check_sequence(self, seq, sounds=None, errors=None):
        """Check a sequence of tokens against the CLPA whitelist.

        Parameters
        ----------
        seq : str or list or tuple
            The segmented sequence; a string is split on single spaces.
        sounds : dict, optional
            Accumulator mapping each token to a record with keys
            ``frequency``, ``clpa`` and ``id``; updated in place, so the
            same dict may be passed across calls to aggregate counts.
        errors : collections.Counter, optional
            Accumulator counting 'convertable' and 'non-convertable'
            tokens that were not found in the whitelist directly.

        Returns
        -------
        tuple
            ``(new_tokens, sounds, errors)`` where ``new_tokens`` is the
            converted sequence ('?' marks tokens that could not be
            converted, with any leading accent re-attached).
        """
        if isinstance(seq, (list, tuple)):
            new_seq = list(seq)
        else:
            new_seq = seq.split(' ')
        # Apply the configured replacement rules before any lookup.
        if self.rules:
            new_seq = [self.rules.get(t, t) for t in new_seq]

        new_tokens = []
        sounds = sounds or {}
        errors = errors or Counter({'convertable': 0, 'non-convertable': 0})

        for token in new_seq:
            # BUGFIX: splitting a string on ' ' produces empty tokens for
            # leading, trailing, or doubled spaces; token[0] would then
            # raise IndexError. Skip such empty tokens.
            if not token:
                continue
            accent = ''
            # Strip a leading accent marker; it is re-attached to the
            # converted form when building the output.
            if token[0] in self.accents:
                accent, token = token[0], token[1:]

            if token in self.whitelist or token in sounds:
                if token in sounds:
                    sounds[token]['frequency'] += 1
                else:
                    sounds[token] = dict(
                        frequency=1, clpa=token, id=self.whitelist[token]['ID'])
            else:
                # Not whitelisted and not seen before: try to convert it.
                check = find_token(
                    token, self.whitelist, self.alias, self.explicit,
                    self.patterns, self.delete)
                sounds[token] = dict(
                    frequency=1,
                    clpa=check if check else '?',
                    id=self.whitelist[check]['ID'] if check else '?')
                errors.update(['convertable' if check else 'non-convertable'])

            new_tokens.append(accent + sounds[token]['clpa'])
        return new_tokens, sounds, errors
Code example #2
0
    def test_find_token(self):
        """Exercise find_token with whitelist, alias, explicit, pattern
        and delete arguments in turn."""
        from pyclpa.util import find_token, load_whitelist, load_alias

        whitelist = load_whitelist()
        pattern_map = load_alias('patterns.tsv')

        # Nothing resolves against an empty whitelist.
        assert not find_token('t', {}, {}, {}, {}, [])
        # A direct whitelist hit is returned unchanged.
        assert find_token('t', whitelist, {}, {}, {}, []) == 't'
        # Alias substitution: 'h' rewritten before lookup.
        assert find_token('th', whitelist, {'h': 'ʰ'}, {}, {}, []) == 'tʰ'
        # Explicit mapping takes the token to its replacement ...
        assert find_token('th', whitelist, {}, {'th': 'x'}, {}, []) == 'x'
        # ... but an explicit target outside the whitelist raises.
        with self.assertRaises(ValueError):
            find_token('th', whitelist, {}, {'th': 'X'}, {}, [])
        # Pattern-based conversion from the bundled patterns table.
        assert find_token('th', whitelist, {}, {}, pattern_map, []) == 'tʰ'
        # Deletable characters are stripped before matching.
        assert find_token('th', whitelist, {}, {}, {}, ['h']) == 't'
Code example #3
0
File: base.py  Project: LinguList/clpa
    def check_sequence(self, seq, sounds=None, errors=None):
        """Check a sequence of tokens against the CLPA whitelist.

        Parameters
        ----------
        seq : str or list or tuple
            The segmented sequence; a string is split on single spaces.
        sounds : dict, optional
            Accumulator mapping each token to a record with keys
            ``frequency``, ``clpa`` and ``id``; updated in place, so the
            same dict may be passed across calls to aggregate counts.
        errors : collections.Counter, optional
            Accumulator counting 'convertable', 'non-convertable' and
            'custom' tokens.

        Returns
        -------
        tuple
            ``(new_tokens, sounds, errors)``. Custom tokens (a trailing
            '/') are passed through with a '*' prefix; tokens that could
            not be converted appear as '?'.
        """
        if isinstance(seq, (list, tuple)):
            new_seq = list(seq)
        else:
            new_seq = seq.split(' ')
        # Apply the configured replacement rules before any lookup.
        if self.rules:
            new_seq = [self.rules.get(t, t) for t in new_seq]

        new_tokens = []
        sounds = sounds or {}
        errors = errors or Counter({
            'convertable': 0,
            'non-convertable': 0,
            'custom': 0
        })

        for token in new_seq:
            # BUGFIX: splitting a string on ' ' produces empty tokens for
            # leading, trailing, or doubled spaces; token[0] / token[-1]
            # would then raise IndexError. Skip such empty tokens.
            if not token:
                continue
            accent = ''
            # Strip a leading accent marker; it is re-attached to the
            # converted form when building the output.
            if token[0] in self.accents:
                accent, token = token[0], token[1:]

            if token in self.whitelist or token in sounds:
                if token in sounds:
                    sounds[token]['frequency'] += 1
                else:
                    sounds[token] = dict(frequency=1,
                                         clpa=token,
                                         id=self.whitelist[token]['ID'])
            elif token.endswith('/') and len(token) > 1:
                # Custom token 'xyz/': pass it through as '*xyz' and count
                # it once. (A repeated custom token is already caught by
                # the `token in sounds` test above, so no repeat check is
                # needed here.)
                tkn = token[:-1]
                sounds[token] = dict(frequency=1,
                                     clpa='*' + tkn,
                                     id='custom:{0}'.format(tkn))
                errors.update(['custom'])
            else:
                # Not whitelisted and not seen before: try to convert it.
                check = find_token(token, self.whitelist, self.alias,
                                   self.explicit, self.patterns, self.delete)
                sounds[token] = dict(
                    frequency=1,
                    clpa=check if check else '?',
                    id=self.whitelist[check]['ID'] if check else '?')
                errors.update(['convertable' if check else 'non-convertable'])

            new_tokens.append(accent + sounds[token]['clpa'])
        return new_tokens, sounds, errors