def check_sequence(self, seq, sounds=None, errors=None):
    """Check each token of a sequence and collect per-sound statistics.

    :param seq: The sequence to check, either a string whose tokens are
        separated by single spaces or a list/tuple of tokens.
    :param sounds: Optional dict of already seen tokens. It is updated in
        place, so the same dict can be passed to successive calls in order
        to accumulate frequencies. A new dict is created when ``None``.
    :param errors: Optional ``Counter`` of conversion outcomes. When it is
        ``None`` or empty, a counter seeded with zero-valued
        ``'convertable'`` and ``'non-convertable'`` keys is used instead.
    :return: A ``(new_tokens, sounds, errors)`` tuple. ``new_tokens`` holds,
        for every non-empty input token, its accent (if any) followed by the
        resolved form, or ``'?'`` when the token could not be resolved.

    Empty tokens (produced by repeated, leading or trailing spaces, by an
    empty string, or present in a list) are skipped: they carry no sound
    and indexing into them would raise ``IndexError``.
    """
    if isinstance(seq, (list, tuple)):
        new_seq = list(seq)
    else:
        new_seq = seq.split(' ')

    # Apply the token-level replacement rules before any lookup.
    if self.rules:
        new_seq = [self.rules[t] if t in self.rules else t for t in new_seq]

    new_tokens = []
    # ``is None`` (not ``or``) so that an empty dict handed in by the caller
    # is filled in place instead of being silently replaced.
    if sounds is None:
        sounds = {}
    # An empty counter is replaced so the zero-valued keys are always present.
    if not errors:
        errors = Counter({'convertable': 0, 'non-convertable': 0})

    for token in new_seq:
        if not token:
            continue

        # A leading accent mark is set aside and re-attached to the output.
        accent = ''
        if token[0] in self.accents:
            accent, token = token[0], token[1:]

        if token in sounds:
            # Seen before (whatever its status): only bump the frequency.
            sounds[token]['frequency'] += 1
        elif token in self.whitelist:
            sounds[token] = dict(
                frequency=1,
                clpa=token,
                id=self.whitelist[token]['ID'])
        else:
            # Unknown token: let find_token try to resolve it with the
            # alias / explicit / patterns / delete tables.
            check = find_token(
                token, self.whitelist, self.alias, self.explicit,
                self.patterns, self.delete)
            sounds[token] = dict(
                frequency=1,
                clpa=check if check else '?',
                id=self.whitelist[check]['ID'] if check else '?')
            # Counted once per distinct token, on its first occurrence.
            errors.update(['convertable' if check else 'non-convertable'])

        new_tokens.append(accent + sounds[token]['clpa'])

    return new_tokens, sounds, errors
def test_find_token(self):
    """Run ``find_token`` once per kind of lookup table it accepts."""
    from pyclpa.util import find_token, load_whitelist, load_alias

    whitelist = load_whitelist()
    pattern_table = load_alias('patterns.tsv')

    # Nothing can be found when every table, including the whitelist, is empty.
    unresolved = find_token('t', {}, {}, {}, {}, [])
    assert not unresolved

    # A token that is itself whitelisted comes back unchanged.
    direct = find_token('t', whitelist, {}, {}, {}, [])
    assert direct == 't'

    # Alias table: 'h' is replaced by the aspiration mark.
    via_alias = find_token('th', whitelist, {'h': 'ʰ'}, {}, {}, [])
    assert via_alias == 'tʰ'

    # Explicit table: the whole token is mapped to 'x'.
    via_explicit = find_token('th', whitelist, {}, {'th': 'x'}, {}, [])
    assert via_explicit == 'x'

    # An explicit mapping to 'X' is expected to raise ValueError.
    self.assertRaises(
        ValueError, find_token, 'th', whitelist, {}, {'th': 'X'}, {}, [])

    # Pattern table loaded from patterns.tsv.
    via_pattern = find_token('th', whitelist, {}, {}, pattern_table, [])
    assert via_pattern == 'tʰ'

    # Delete list: 'h' is dropped, leaving the plain 't'.
    via_delete = find_token('th', whitelist, {}, {}, {}, ['h'])
    assert via_delete == 't'
def check_sequence(self, seq, sounds=None, errors=None):
    """Check each token of a sequence and collect per-sound statistics.

    :param seq: The sequence to check, either a string whose tokens are
        separated by single spaces or a list/tuple of tokens.
    :param sounds: Optional dict of already seen tokens. It is updated in
        place, so the same dict can be passed to successive calls in order
        to accumulate frequencies. A new dict is created when ``None``.
    :param errors: Optional ``Counter`` of conversion outcomes. When it is
        ``None`` or empty, a counter seeded with zero-valued
        ``'convertable'``, ``'non-convertable'`` and ``'custom'`` keys is
        used instead.
    :return: A ``(new_tokens, sounds, errors)`` tuple. ``new_tokens`` holds,
        for every non-empty input token, its accent (if any) followed by the
        resolved form: the whitelisted token, ``'*'`` plus the name for a
        custom token, or ``'?'`` when the token could not be resolved.

    A token longer than one character that ends in ``'/'`` is treated as a
    custom sound: it is not looked up, and is reported with the id
    ``'custom:<name>'``.

    Empty tokens (produced by repeated, leading or trailing spaces, by an
    empty string, or present in a list) are skipped: they carry no sound
    and indexing into them would raise ``IndexError``.
    """
    if isinstance(seq, (list, tuple)):
        new_seq = list(seq)
    else:
        new_seq = seq.split(' ')

    # Apply the token-level replacement rules before any lookup.
    if self.rules:
        new_seq = [self.rules[t] if t in self.rules else t for t in new_seq]

    new_tokens = []
    # ``is None`` (not ``or``) so that an empty dict handed in by the caller
    # is filled in place instead of being silently replaced.
    if sounds is None:
        sounds = {}
    # An empty counter is replaced so the zero-valued keys are always present.
    if not errors:
        errors = Counter({
            'convertable': 0,
            'non-convertable': 0,
            'custom': 0,
        })

    for token in new_seq:
        if not token:
            continue

        # A leading accent mark is set aside and re-attached to the output.
        accent = ''
        if token[0] in self.accents:
            accent, token = token[0], token[1:]

        if token in sounds:
            # Seen before (whitelisted, custom or looked up): only bump the
            # frequency. This also covers repeated custom tokens.
            sounds[token]['frequency'] += 1
        elif token in self.whitelist:
            sounds[token] = dict(
                frequency=1,
                clpa=token,
                id=self.whitelist[token]['ID'])
        elif len(token) > 1 and token[-1] == '/':
            # Length is tested first: the token is empty when the input was
            # a bare accent mark, and ``token[-1]`` would raise IndexError.
            tkn = token[:-1]
            sounds[token] = dict(
                frequency=1,
                clpa='*' + tkn,
                id='custom:{0}'.format(tkn))
            errors.update(['custom'])
        else:
            # Unknown token: let find_token try to resolve it with the
            # alias / explicit / patterns / delete tables.
            check = find_token(
                token, self.whitelist, self.alias, self.explicit,
                self.patterns, self.delete)
            sounds[token] = dict(
                frequency=1,
                clpa=check if check else '?',
                id=self.whitelist[check]['ID'] if check else '?')
            # Counted once per distinct token, on its first occurrence.
            errors.update(['convertable' if check else 'non-convertable'])

        new_tokens.append(accent + sounds[token]['clpa'])

    return new_tokens, sounds, errors