def _to_paradigm(lexeme): """ Extract (stem, paradigm) pair from lexeme (which is a list of (word_form, tag) tuples). Paradigm is a list of suffixes with associated tags and prefixes. """ forms, tags = list(zip(*lexeme)) prefixes = [''] * len(tags) if len(forms) == 1: stem = forms[0] else: stem = longest_common_substring(forms) prefixes = [form[:form.index(stem)] for form in forms] # only allow prefixes from PARADIGM_PREFIXES if any(pref not in PARADIGM_PREFIXES for pref in prefixes): stem = "" prefixes = [''] * len(tags) suffixes = ( form[len(pref)+len(stem):] for form, pref in zip(forms, prefixes) ) return stem, tuple(zip(suffixes, tags, prefixes))
def _to_paradigm(lexeme, paradigm_prefixes): """ Extract (stem, paradigm) pair from lexeme (which is a list of (word_form, tag) tuples). Paradigm is a list of suffixes with associated tags and prefixes. """ forms, tags = list(zip(*lexeme)) if len(forms) == 1: stem = forms[0] prefixes = [''] else: stem = longest_common_substring(forms) prefixes = [form[:form.index(stem)] for form in forms] # only allow prefixes from PARADIGM_PREFIXES if any(pref not in paradigm_prefixes for pref in prefixes): # With right PARADIGM_PREFIXES empty stem is fine; # os.path.commonprefix doesn't return anything useful # for prediction. # stem = os.path.commonprefix(forms) stem = "" prefixes = [''] * len(tags) suffixes = ( form[len(pref)+len(stem):] for form, pref in zip(forms, prefixes) ) return stem, tuple(zip(suffixes, tags, prefixes))
def _to_paradigm(lexeme, paradigm_prefixes): """ Extract (stem, paradigm) pair from lexeme (which is a list of (word_form, tag) tuples). Paradigm is a list of suffixes with associated tags and prefixes. """ forms, tags = list(zip(*lexeme)) if len(forms) == 1: stem = forms[0] prefixes = [''] else: stem = longest_common_substring(forms) prefixes = [form[:form.index(stem)] for form in forms] # only allow prefixes from PARADIGM_PREFIXES if any(pref not in paradigm_prefixes for pref in prefixes): # With right PARADIGM_PREFIXES empty stem is fine; # os.path.commonprefix doesn't return anything useful # for prediction. # stem = os.path.commonprefix(forms) stem = "" prefixes = [''] * len(tags) suffixes = (form[len(pref) + len(stem):] for form, pref in zip(forms, prefixes)) return stem, tuple(zip(suffixes, tags, prefixes))