def test_mix_detected(self):
    # Each row: (sub-test label, input text, second argument for categorize, expected result).
    # The two inputs are deliberately kept exactly as they were written (they differ in one word).
    cases = (
        ('summary', '일=one\n이=two', False, LangCat.MIX),
        ('detail', '일=two\n이=two', True, {LangCat.HAN, LangCat.ENG}),
    )
    for label, text, detailed, expected in cases:
        with self.subTest(label):
            self.assertEqual(expected, LangCat.categorize(text, detailed))
def test_punc_num_ignored_for_categorize(self):
    # The digits, '=' and newline in the input must not change the reported category.
    text = '일=1\n이=2'
    # Each row: (sub-test label, second argument for categorize, expected result).
    cases = (
        ('summary', False, LangCat.HAN),
        ('detail', True, {LangCat.HAN}),
    )
    for label, detailed, expected in cases:
        with self.subTest(label):
            self.assertEqual(expected, LangCat.categorize(text, detailed))
def from_parts(cls, parts: Iterable[str], **kwargs) -> 'Name':
    """
    Construct a Name from separate pieces of text.

    Falsy parts are skipped.  Each remaining part is examined in order:

    - While no non-English value has been chosen, a part containing any ``LangCat.non_eng_cats`` characters
      becomes the non-English value.
    - Otherwise, while no English value has been chosen, a part containing ``LangCat.ENG`` characters becomes
      the English value.
    - Once both are chosen, a part categorized as ``LangCat.ENG`` is tested with ``has_romanization``: it is
      either stored as the romanization, or it replaces the English value while the previous English value is
      stored as the romanization.
    - Any other part, and every part that follows a successfully completed name, is collected and stored under
      the ``'unknown'`` key of the resulting name's ``extra``.

    :param parts: The pieces of text to examine
    :param kwargs: Keyword arguments passed through to the class constructor
    :return: The constructed Name
    :raises ValueError: If neither an English nor a non-English value was found, and there was not exactly one
      unclassified part to fall back on
    """
    eng = non_eng = name = None
    # NOTE: this local must stay named ``extra`` - the error message below uses the ``{extra=!r}`` form,
    # which embeds the variable name in the rendered text.
    extra = []
    for part in parts:
        if not part:
            continue
        if name is not None:
            # A complete name was already assembled; everything after it is surplus
            extra.append(part)
            continue
        if not non_eng and LangCat.contains_any(part, LangCat.non_eng_cats):
            non_eng = part
            continue
        if not eng and LangCat.contains_any(part, LangCat.ENG):
            eng = part
            continue
        if not (eng and non_eng and LangCat.categorize(part) == LangCat.ENG):
            extra.append(part)
            continue

        # Both values are known and this part is English: see whether it fits as a romanization
        candidate = cls(eng, non_eng, **kwargs)
        if candidate.has_romanization(part):
            candidate.romanized = part
            name = candidate
        elif candidate.has_romanization(eng) and not is_english(eng) and is_english(part):
            # The value chosen earlier was really the romanization; this part is the English value
            candidate._english = part
            candidate.romanized = eng
            name = candidate
        else:
            extra.append(part)

    if name is None:
        if eng or non_eng:
            name = cls(eng, non_eng, **kwargs)
        elif len(extra) == 1:
            # Nothing was classified, but a lone leftover part can still be used as the name
            name = cls(extra[0], **kwargs)
            extra = None

    if name is None:
        raise ValueError(f'Unable to find any valid name parts from {parts!r}; found {extra=!r}')

    if extra:
        if name.extra:
            name.extra['unknown'] = extra  # noqa
        else:
            name.extra = {'unknown': extra}
    return name
def _split_non_eng_lit(name_parts_str: str): # log.debug(f'Splitting: {name_parts_str!r}') non_eng, lit_translation = None, None if name_parts_str.startswith('('): name_parts_str = parenthesized(name_parts_str) if name_parts_str and LangCat.contains_any(name_parts_str, LangCat.asian_cats): name_parts = tuple(map(str.strip, name_parts_str.split(';'))) if len(name_parts) == 1: non_eng = name_parts[0] elif len(name_parts) == 2: non_eng, lit_translation = name_parts else: raise ValueError(f'Unexpected name parts format: {name_parts_str!r}') return non_eng, lit_translation
def sort_name_parts(parts: Iterable[str]) -> list[Optional[str]]:
    """
    Sort the given name parts according to the ordering defined by ``_NamePart``.

    When the first sorted part contains no ``LangCat.ENG`` characters, ``None`` is inserted in front of it
    (presumably as a placeholder for a missing English value - confirm against callers).

    :param parts: The name parts to sort
    :return: The sorted part values, possibly preceded by None; an empty list when no parts were given
    """
    ordered = sorted(_NamePart(index, text) for index, text in enumerate(parts))
    values = [item.value for item in ordered]
    if values and not LangCat.contains_any(values[0], LangCat.ENG):
        values.insert(0, None)
    return values
def __init__(self, pos: int, value: str):
    """
    Store the given position and text, together with the text's language category.

    :param pos: The position to store as ``self.pos``
    :param value: The text to store as ``self.value``
    """
    self.pos, self.value = pos, value
    # Categorized once here so the result can be read back from ``self.cat``
    self.cat = LangCat.categorize(value)
def split(self) -> 'Name':
    """
    Build a Name from the pieces that ``LangCat.split`` produces for this name's ``english`` value.

    The ``versions`` given to ``from_parts`` are this object and a Name whose ``non_eng`` is the same text.

    :return: The Name built by ``from_parts``
    """
    pieces = LangCat.split(self.english)
    versions = {self, Name(non_eng=self.english)}
    return self.from_parts(pieces, versions=versions)
def from_enclosed(cls, name: str, **kwargs) -> 'Name':
    """
    Construct a Name from text that may contain one version of the name enclosed next to another.

    Text categorized as ``LangCat.MIX`` is divided with ``split_enclosed`` (at most one split, in reverse);
    any other text is used as a single part.

    :param name: The text to parse
    :param kwargs: Keyword arguments passed through to ``from_parts``
    :return: The Name built by ``from_parts``
    """
    is_mixed = LangCat.categorize(name) == LangCat.MIX
    parts = split_enclosed(name, reverse=True, maxsplit=1) if is_mixed else (name, )
    return cls.from_parts(parts, **kwargs)
def non_eng_langs(self) -> set[LangCat]:
    """Return the set that ``LangCat.categorize`` yields for ``non_eng`` when its second argument is True."""
    text = self.non_eng
    return LangCat.categorize(text, True)
def non_eng_lang(self) -> LangCat:
    """Return the category that a single-argument ``LangCat.categorize`` call assigns to ``non_eng``."""
    text = self.non_eng
    return LangCat.categorize(text)
def eng_langs(self) -> set[LangCat]:
    """Return the set that ``LangCat.categorize`` yields for ``english`` when its second argument is True."""
    text = self.english
    return LangCat.categorize(text, True)
def eng_lang(self) -> LangCat:
    """Return the category that a single-argument ``LangCat.categorize`` call assigns to ``english``."""
    text = self.english
    return LangCat.categorize(text)
def test_spaces_ignored_for_categorize(self):
    # The space between the two characters must not change the reported category.
    text = '일 이'
    # Each row: (sub-test label, second argument for categorize, expected result).
    cases = (
        ('summary', False, LangCat.HAN),
        ('detail', True, {LangCat.HAN}),
    )
    for label, detailed, expected in cases:
        with self.subTest(label):
            self.assertEqual(expected, LangCat.categorize(text, detailed))