コード例 #1
0
ファイル: affixes.py プロジェクト: florianmai/ldt
    def _decompose_suffix_blend(self, word, res=None):
        """Decomposing suffixes with replacements (typically due to the
        blending of identical sounds at the affix border).

        Example:

                >>> test_dict._decompose_suffix_blend("historic")
                {'suffixes': ['-ic'], 'prefixes': [], 'roots': ['history'],
                'other': [], 'original_word': []}

        Args:
            word (str): a potential nonce-word
            res (dict): if present, this dictionary will be updated

        Returns:
            (dict): updated or newly created dictionary with derivational data

        """
        res = _check_res(res)

        for suffix in self.suffixes:

            # require a stem of more than 2 characters once the suffix is cut
            if word.endswith(suffix) and len(word[:-len(suffix)]) > 2:

                for pair in self.replacements_in_suffixes:
                    # presumably pair = (stem-final sound, suffix-initial
                    # sound) — TODO confirm against the yaml resources
                    if suffix[0] == pair[1]:
                        # BUGFIX: str.strip() treats its argument as a set of
                        # characters and removes them from BOTH ends of the
                        # word; slice the suffix off the end instead.
                        candidate = word[:-len(suffix)] + pair[0]
                        if self.dictionary.is_a_word(candidate):
                            res["suffixes"].append(dash_suffix(suffix))
                            res["roots"].append(candidate)
        return res
コード例 #2
0
ファイル: en.py プロジェクト: danielbis/ldt
    def _decompose_suffix_e(self, word, res=None):
        """Decomposing vowel-initial suffixes before which a final "e" of
        the stem was dropped (e.g. imagine + -able > imaginable).

        Example:

                >>> test_dict._decompose_suffix_e("imaginable")
                {'suffixes': ['-able'], 'prefixes': [], 'roots': ['imagine'],
                'other': [], 'original_word': []}

        Args:
            word (str): a potential nonce-word
            res (dict): if present, this dictionary will be updated

        Returns:
            (dict): updated or newly created dictionary with derivational data

        """
        res = _check_res(res)

        for suffix in self.suffixes:
            if not word.endswith(suffix):
                continue
            # "e"-dropping is only attempted before vowel-initial suffixes
            if not self.is_a_vowel(suffix[0]):
                continue
            # restore the dropped final "e" and check the resulting stem
            stem = word[:-len(suffix)] + "e"
            if self.dictionary.is_a_word(stem):
                res["suffixes"].append(dash_suffix(suffix))
                res["roots"].append(stem)
        return res
コード例 #3
0
ファイル: affixes.py プロジェクト: florianmai/ldt
    def _decompose_suffix_doubling(self, word, res=None):
        """Decomposing vowel-initial suffixes whose attachment doubled the
        final consonant of the root (e.g. stop + -er > stopper).

        Example:

                >>> test_dict._decompose_suffix_doubling("stopper")
                {'suffixes': ['-er'], 'prefixes': [], 'roots': ['stop'],
                'other': [], 'original_word': []}

        Args:
            word (str): a potential nonce-word
            res (dict): if present, this dictionary will be updated

        Returns:
            (dict): updated or newly created dictionary with derivational data

        """
        res = _check_res(res)

        for suffix in self.suffixes:

            # doubling only occurs before vowel-initial suffixes
            if self.is_a_vowel(suffix[0]):
                # require a stem of more than 2 characters
                if word.endswith(suffix) and len(word[:-len(suffix)]) > 2:
                    # the two characters right before the suffix must match
                    if word[-(len(suffix) + 1)] == word[-(len(suffix) + 2)]:
                        # drop the suffix plus one of the doubled consonants
                        candidate = word[:-len(suffix) - 1]
                        if self.dictionary.is_a_word(candidate):
                            # BUGFIX: the suffix used to be appended twice
                            # (once dashed, once raw); keep only the dashed
                            # form, consistent with the other
                            # _decompose_suffix_* methods.
                            res["suffixes"].append(dash_suffix(suffix))
                            res["roots"].append(candidate)
        return res
コード例 #4
0
ファイル: affixes.py プロジェクト: florianmai/ldt
    def _decompose_suffix_simple(self, word, res=None):
        """The most basic decomposition of suffixes: plain stripping of the
        suffix, with no change to the stem.

        Example:

                >>> test_dict._decompose_suffix_simple("kingdom")
                {'suffixes': ['-dom'], 'prefixes': [], 'roots': ['king'],
                'other': [], 'original_word': []}

        Args:
            word (str): a potential nonce-word
            res (dict): if present, this dictionary will be updated

        Returns:
            (dict): updated or newly created dictionary with derivational data

        """
        res = _check_res(res)

        for suffix in self.suffixes:
            if not word.endswith(suffix):
                continue
            stem = word[:-len(suffix)]
            # require a stem longer than 2 characters
            if len(stem) > 2 and self.dictionary.is_a_word(stem):
                res["suffixes"].append(dash_suffix(suffix))
                res["roots"].append(stem)
        return res
コード例 #5
0
ファイル: affixes.py プロジェクト: florianmai/ldt
    def decompose_prefixes(self, word, res=None):
        """Basic check of the word against a list of productive prefixes.

        Args:
            word (str): a potential nonce-word
            res (dict): if present, this dictionary will be updated

        Returns:
            (dict): updated or newly created dictionary with derivational data

        """

        res = _check_res(res)

        for prefix in self.prefixes:
            if not word.startswith(prefix):
                continue
            dashed = prefix + "-"
            if word.startswith(dashed) and len(word) > len(dashed):
                # dashed prefix, e.g. "anti-intellectual"
                remainder = word[len(dashed):]
            elif len(word) > len(prefix) + 2:
                # undashed prefix; require a remainder longer than 2 chars
                remainder = word[len(prefix):]
            else:
                continue
            if self.dictionary.is_a_word(remainder):
                # prefixes are always recorded in the dashed form
                res["prefixes"].append(dashed)
                res["roots"].append(remainder)
        return res
コード例 #6
0
ファイル: affixes.py プロジェクト: florianmai/ldt
    def analyze_affixes(self, word, depth=2):
        """Combining the analysis of prefixes, suffixes and exceptions.

        Example:
            >>> test_dict.analyze_affixes("anti-intellectual")
            {'suffixes': ['-ual'], 'prefixes': ['anti-'],
             'roots': ['intellect', 'intellectual'], 'other': [],
             'original_word': ['anti-intellectual']}

        Args:
            word (str): a potential nonce-word
            depth (int): how many passes of analysis to make

        Returns:
            (dict): updated or newly created dictionary with derivational data
        """
        res = _check_res(res=None)
        res["original_word"] = [word]
        # start the list of stems to process

        processed_stems = set()
        stems_to_process = [word]
        if "-" in word:
            stems_to_process.append(word.replace("-", ""))

        # stems already run through each analysis: exceptions (r),
        # prefixes (p) and suffixes (s)
        processed_r = set()
        processed_p = set()
        processed_s = set()

        to_process = [k for k in stems_to_process if not k in processed_stems]

        def _dedup(data):
            """Deduplicate every value list in the result dictionary."""
            for key in data:
                data[key] = list(set(data[key]))
            return data

        # hard cap of 10 passes prevents runaway recursion over new roots
        for counter in range(10):

            # safety guard: never index past the end of the work list
            if counter >= len(to_process):
                break

            subword = to_process[counter]

            if not subword in processed_r:
                processed_r.add(subword)
                res = self.check_exceptions(subword, res)
                to_process = to_process + \
                             [i for i in res["roots"] if not i in to_process]

            if not subword in processed_p:
                processed_p.add(subword)
                res = self.decompose_prefixes(subword, res)
                to_process = to_process + \
                             [i for i in res["roots"] if not i in to_process]

            if not subword in processed_s:
                processed_s.add(subword)
                res = self.decompose_suffixes(subword, res)
                to_process = to_process + \
                             [i for i in res["roots"] if not i in to_process]

            processed_stems.add(subword)
            if counter == depth or counter == len(to_process) - 1:
                return _dedup(res)
        # BUGFIX: the original fell off the loop and implicitly returned
        # None when neither stop condition fired within 10 iterations
        # (e.g. depth > 9 with a steadily growing work list).
        return _dedup(res)
コード例 #7
0
ファイル: affixes.py プロジェクト: florianmai/ldt
    def decompose_suffixes(self, word, res=None):
        """The basic method for decomposing words with suffixes.

        The language-specific lists of affixes and replacement patterns are
        provided as yaml files in the corresponding submodules of
        `ldt.dicts.derivation.custom`.

        The currently implemented patterns include:

            * simple appending of suffixes (kingdom > king + -dom)
                (:meth:`_decompose_suffix_simple`)
            * doubling of final consonants (stopper > stop + -er)
                (:meth:`_decompose_suffix_doubling`)
            * replacements before vocalic and consonantal suffixes (happily >
                happy + -ly) (:meth:`_decompose_suffix_replacements`)
            * insertions before vocalic and consonantal suffixes (imaginable >
                imagine + -able) (:meth:`_decompose_suffix_insertions`)
            * blending of the end of the stem with the beginning of the suffix
                (historic > history + -ic) (:meth:`_decompose_suffix_blend`)

        In addition to that, :meth:`_decompose_language_specific_suffixes`
        binds any additional language-specific methods, which will be
        processed before the above generic ones.

        The basic abstract class just includes the simple suffix
        addition. Override this method with any combination of the above for
        your language.

        Example:

                >>> test_dict.decompose_suffixes("kingdom")
                {'suffixes': ['-dom'], 'prefixes': [], 'roots': ['king'],
                'other': [], 'original_word': []}

        Args:
            word (str): a potential nonce-word
            res (dict): if present, this dictionary will be updated

        Returns:
            (dict): updated or newly created dictionary with derivational data

        """
        # the base implementation only performs the simple decomposition;
        # the commented-out lines below are the template for subclasses
        res = self._decompose_suffix_simple(word, _check_res(res))
        # res = self._decompose_suffix_doubling(word, res)
        # res = self._decompose_suffix_replacements(word, res)
        # res = self._decompose_suffix_insertions(word, res)
        # res = self._decompose_suffix_blend(word, res)
        return res
コード例 #8
0
ファイル: en.py プロジェクト: danielbis/ldt
    def _suffix_sion(self, word, res=None):
        """ Custom processing of -sion suffix, which has several idiosyncratic
        patterns and exceptions.

        Example:

                >>> test_dict._suffix_sion("corrosion")
                {'suffixes': ['-sion'], 'prefixes': [], 'roots': ['corrode'],
                'other': [], 'original_word': []}

        Args:
            word (str): a potential nonce-word
            res (dict): if present, this dictionary will be updated

        Returns:
            (dict): updated or newly created dictionary with derivational data

        """

        res = _check_res(res)

        # non-derived words that merely happen to end in -sion
        if word in ["mission", "passion", "session", "pension"]:
            return res

        # candidate root endings that -sion may have replaced
        endings = ["d", "t", "de", "se"]

        if word.endswith("ssion"):
            if self.dictionary.is_a_word(word[:-3]):
                # e.g. discussion > discuss; record it like every other match.
                # BUGFIX: the original returned the bare string root here,
                # violating the documented dict return contract and losing
                # the suffix information.
                res["suffixes"].append("-sion")
                res["roots"].append(word[:-3])
            else:
                for end in endings:
                    for candidate in [word[:-4] + end, word[:-5] + end]:
                        if self.dictionary.is_a_word(candidate):
                            res["suffixes"].append("-sion")
                            res["roots"].append(candidate)
        elif word.endswith("sion"):
            for end in endings:
                for candidate in [word[:-3] + end, word[:-4] + end]:
                    if self.dictionary.is_a_word(candidate):
                        res["suffixes"].append("-sion")
                        res["roots"].append(candidate)
        return res
コード例 #9
0
ファイル: affixes.py プロジェクト: florianmai/ldt
    def _decompose_suffix_replacements(self, word, res=None):
        """Decomposing suffixes with phonetic changes before vocalic or
        consonantal suffixes.

        Example:

                >>> test_dict._decompose_suffix_replacements("happily")
                {'suffixes': ['-ly'], 'prefixes': [], 'roots': ['happy'],
                'other': [], 'original_word': []}

        Args:
            word (str): a potential nonce-word
            res (dict): if present, this dictionary will be updated

        Returns:
            (dict): updated or newly created dictionary with derivational data

        """
        res = _check_res(res)

        for suffix in self.suffixes:

            # require a stem of more than 2 characters once the suffix is cut
            if word.endswith(suffix) and len(word[:-len(suffix)]) > 2:

                # the applicable replacement patterns depend on whether the
                # suffix starts with a vowel
                if self.is_a_vowel(suffix[0]):
                    replacements = self.replacements_before_vowels
                else:
                    replacements = self.replacements_before_consonants

                for pair in replacements:

                    # pair = (original stem ending, ending found before the
                    # suffix), e.g. ("y", "i") for happy > happily
                    if word.endswith(pair[1] + suffix):
                        # BUGFIX: str.strip() treats its argument as a set of
                        # characters and removes them from BOTH ends of the
                        # word (e.g. "lately".strip("ly") == "ate"); slice
                        # the suffix off the end instead.
                        candidate = rreplace(word[:-len(suffix)], pair[1],
                                             pair[0])
                        if self.dictionary.is_a_word(candidate):
                            res["suffixes"].append(dash_suffix(suffix))
                            res["roots"].append(candidate)
        return res
コード例 #10
0
ファイル: compounds.py プロジェクト: florianmai/ldt
    def decompose_compound(self, word, res=None, split_known_words=True):
        """ Combined analysis of compounds: dashed words and recursive
        splitting, with lemmatization and optional language-specific
        replacement and insertion patterns (see :meth:`_in_vocab`).

        Example:
            >>> test_dict.decompose_compound("god-knows-what")
            {'suffixes': [], 'prefixes': [], 'roots': ['god', 'know', 'what'],
            'other': [], 'original_word': 'god-knows-what'}

        Args:
            word (str): the word to analyze
            res (dict): if present, this dictionary will be updated
            split_known_words (bool): if False, words whose lemma is already
                in the dictionary are not split any further

        Returns:
            (dict): updated or newly created dictionary with derivational data

        """

        res = _check_res(res)

        # dashed compounds are handled by a dedicated method
        if "-" in word:
            return self.split_on_dash(word, res)

        # too short to consist of two stems
        if len(word) < 6:
            return res

        if not split_known_words:
            # if any lemma of the word is a dictionary word, keep it whole
            for lemma in self.morph_dictionary.lemmatize(word):
                if self.dictionary.is_a_word(lemma):
                    res["original_word"] = word
                    return res

        # recursive splitting; split_compound may return nothing
        for parts in self.split_compound(word) or []:
            res["roots"] += parts
        return res
コード例 #11
0
ファイル: affixes.py プロジェクト: florianmai/ldt
    def check_exceptions(self, word, res=None):
        """Method for retrieving derivational info that requires only simple
        lookup in the `DerivationCustomDict.exceptions`.

        Args:
            word (str): the word to analyze
            res (dict): if present, this dictionary will be updated

        Returns:
            (dict): updated or newly created dictionary with derivational data

        """

        res = _check_res(res)

        def _record(affix, root):
            """File the affix under "other" or "suffixes", and add the root."""
            if "root_vowel" in affix:
                res["other"].append(affix)
            else:
                res["suffixes"].append(affix)
            res["roots"].append(root)

        if not self.equidistant_patterns:
            for affix, pairs in self.exceptions.items():
                for key, value in pairs.items():
                    if key == word:
                        res["suffixes"].append(affix)
                        res["roots"].append(value)
        else:
            for affix, pairs in self.exceptions.items():
                for key, value in pairs.items():
                    if key == word:
                        _record(affix, value)
                    elif affix in self.equidistant_patterns and value == word:
                        # equidistant patterns may match in either direction
                        _record(affix, key)
        return res
コード例 #12
0
ファイル: affixes.py プロジェクト: florianmai/ldt
    def _decompose_by_suffix_family(self, word, res=None):
        """Simple suffix replacements in the complex > simple direction.

        Args:
            word (str): a potential nonce-word
            res (dict): if present, this dictionary will be updated

        Returns:
            (dict): updated or newly created dictionary with derivational data

        """
        res = _check_res(res)

        for base_suffix, family in self.suffix_families.items():
            for suffix in family:
                if not word.endswith(suffix):
                    continue
                stem = word[:-len(suffix)]
                # require a stem longer than 2 characters
                if len(stem) <= 2:
                    continue
                # swap the complex family suffix for the base one
                candidate = stem + base_suffix
                if self.dictionary.is_a_word(candidate):
                    res["suffixes"].append(dash_suffix(suffix))
                    res["roots"].append(candidate)
        return res
コード例 #13
0
ファイル: compounds.py プロジェクト: florianmai/ldt
    def split_on_dash(self, word, res=None):
        """Splitting dashed compounds, attempting to lemmatize all the parts.

        Example:
            >>> test_dict.split_on_dash("god-knows-what")
            {'suffixes': [], 'prefixes': [], 'roots': ['god', 'know', 'what'],
            'other': [], 'original_word': 'god-knows-what'}

        Args:
            word (str): a potential nonce-word
            res (dict): if present, this dictionary will be updated

        Returns:
            (dict): updated or newly created dictionary with derivational data
        """

        if not res:
            res = _check_res(res)

        # nothing to do for words without a dash
        if "-" not in word:
            return res

        parts = word.split("-")
        recognized = []
        for part in parts:
            lemmas = self.morph_dictionary.lemmatize(part)
            if lemmas:
                recognized += lemmas
            elif self.dictionary.is_a_word(part):
                # no lemmas found, but the part itself is a dictionary word
                recognized.append(part)

        # record the split only if every part was recognized (lemmatization
        # may yield more than one lemma per part, hence >=)
        if len(recognized) >= len(parts):
            res["roots"] += recognized
            res["original_word"] = word
        return res
コード例 #14
0
ファイル: affixes.py プロジェクト: florianmai/ldt
    def _decompose_suffix_insertions(self, word, res=None):
        """Decomposing suffixes with insertions before vocalic or consonantal
        suffixes.

        Example:

                >>> test_dict._decompose_suffix_insertions("imaginable")
                {'suffixes': ['-able'], 'prefixes': [], 'roots': ['imagine'],
                'other': [], 'original_word': []}

        Args:
            word (str): a potential nonce-word
            res (dict): if present, this dictionary will be updated

        Returns:
            (dict): updated or newly created dictionary with derivational data

        """
        res = _check_res(res)

        for suffix in self.suffixes:
            if not word.endswith(suffix):
                continue
            stem = word[:-len(suffix)]
            # require a stem longer than 2 characters
            if len(stem) <= 2:
                continue

            # the candidate insertions depend on whether the suffix starts
            # with a vowel
            if self.is_a_vowel(suffix[0]):
                inserts = self.insertions_before_vowels
            else:
                inserts = self.insertions_before_consonants

            for insert in inserts:
                restored = stem + insert
                if self.dictionary.is_a_word(restored):
                    res["suffixes"].append(dash_suffix(suffix))
                    res["roots"].append(restored)
        return res