def _get_variant_config(rules, normalization_rules): """ Convert the variant definition from the configuration into replacement sets. """ immediate = defaultdict(list) chars = set() if rules: vset = set() rules = flatten_config_list(rules, 'variants') vmaker = _VariantMaker(normalization_rules) for section in rules: for rule in (section.get('words') or []): vset.update(vmaker.compute(rule)) # Intermediate reorder by source. Also compute required character set. for variant in vset: if variant.source[-1] == ' ' and variant.replacement[-1] == ' ': replstr = variant.replacement[:-1] else: replstr = variant.replacement immediate[variant.source].append(replstr) chars.update(variant.source) return list(immediate.items()), ''.join(chars)
def test_flatten_config_list_nested():
    """ Nested lists are unrolled depth-first into one flat list, while
        dictionaries (including empty ones) are kept as single items.
    """
    nested = [
        34,
        [{'first': '1st', 'second': '2nd'}, {}],
        [[2, 3], [45, [56, 78], 66]],
        'end',
    ]
    expected = [
        34,
        {'first': '1st', 'second': '2nd'},
        {},
        2, 3, 45, 56, 78, 66,
        'end',
    ]

    assert flatten_config_list(nested) == expected
def _cfg_to_icu_rules(rules, section):
    """ Assemble the ICU rule string for the given section.

        The section content is fetched with `_get_section()`. When there
        is no content (`None`), an empty string is returned. Otherwise the
        content is flattened with `flatten_config_list()` and the items
        are joined with ';', with one more ';' appended at the end.

        NOTE(review): an earlier description stated that a plain string
        section is treated as a file name relative to the tokenizer rule
        file and loaded verbatim. Nothing in this function does that;
        presumably it is handled by `_get_section()` - confirm.
    """
    section_content = _get_section(rules, section)
    if section_content is None:
        return ''

    rule_lines = flatten_config_list(section_content, section)
    return ';'.join(rule_lines) + ';'
def test_flatten_config_list_allready_flat():
    """ A list without nested lists comes back with equal content. """
    flat = [1, 2, 456]
    flattened = flatten_config_list(flat)

    assert flattened == [1, 2, 456]
def test_flatten_config_list_no_list(content):
    """ Flattening `content` must fail with a UsageError.

        NOTE(review): `content` is presumably supplied by a pytest
        parametrization or fixture that is not visible here - confirm.
    """
    with pytest.raises(UsageError):
        flatten_config_list(content)
def test_flatten_config_list_empty(content):
    """ Flattening `content` must produce an empty list.

        NOTE(review): `content` is presumably supplied by a pytest
        parametrization or fixture that is not visible here - confirm.
    """
    flattened = flatten_config_list(content)

    assert flattened == []