コード例 #1
0
ファイル: huncheck.py プロジェクト: rasimuvaikas/stanza
class Hunchecker():
    """
    Retrieve morphosyntactic information from a hunspell morphological analyser.

    The class-level tables translate the tags found in the analyser output
    (``po:`` part-of-speech tags and ``is:`` inflection labels) into UPOS
    values and into the dot-terminated abbreviations that are concatenated
    into an XPOS string.
    """

    # Inflection label -> abbreviation appended (followed by '.') to the XPOS.
    # NOTE(review): the value stored for 'Pass' is the literal '******'; it
    # looks like a masked string -- confirm the intended abbreviation.
    _FEATURE_CODES = {
        'Fem': 'mot', 'Masc': 'vyr', 'Neut': 'bev', 'Sg': 'vns', 'Pl': 'dgs',
        'Nom': 'V', 'Gen': 'K', 'Dat': 'N', 'Acc': 'G', 'Inst': 'Įn',
        'Loc': 'Vt', 'Loc_short': 'Vt', 'Voc': 'Š', 'Il': 'Il', 'Pres': 'es',
        'Past': 'būt-k', 'PastFreq': 'būt-d', 'Fut': 'būs',
        'Indic': 'tiesiog', 'Subj': 'tar', 'Imper': 'liep', 'Nec': 'reik',
        'Pass': '******', 'Act': 'veik', 'Part': 'dlv', 'Gerund': 'pad',
        'HalfPart': 'pusd', 'Inf': 'bndr', 'Vadv': 'būdn', 'Def': 'įvardž',
        'Comp': 'aukšt', 'Super': 'aukšč',
        'I': '1', 'II': '2', 'III': '3',
    }

    # Analyser part-of-speech tag -> UPOS
    # (PROPN, NOUN, VERB, ADJ, ADV, X, ADP, PART).
    _UPOS_BY_TAG = {
        'noun_family_name': 'PROPN', 'noun_proper_name': 'PROPN',
        'noun_first_name_substandard': 'PROPN',
        'noun_family_name_substandard': 'PROPN',
        'noun_geographic_name': 'PROPN',
        'noun_geographic_name_obscene': 'PROPN',
        'noun_proper_name_substandard': 'PROPN',
        'noun_first_name': 'PROPN',
        'noun_reflexive': 'NOUN', 'noun_reflexive_substandard': 'NOUN',
        'noun': 'NOUN', 'noun_reflexive_obscene': 'NOUN',
        'noun_substandard': 'NOUN', 'noun_obscene': 'NOUN',
        'verb_reflexive_substandard': 'VERB',
        'verb_reflexive_negative_substandard': 'VERB',
        'verb_reflexive': 'VERB', 'verb_reflexive_negative': 'VERB',
        'verb': 'VERB', 'verb_substandard': 'VERB', 'verb_negative': 'VERB',
        'verb_negative_substandard': 'VERB', 'verb_obscene': 'VERB',
        'verb_reflexive_obscene': 'VERB', 'verb_negative_obscene': 'VERB',
        'verb_reflexive_negative_obscene': 'VERB',
        'adjective': 'ADJ', 'adjective_substandard': 'ADJ',
        'adjective_obscene': 'ADJ',
        'adverb_obscene': 'ADV', 'adverb': 'ADV', 'adverb_substandard': 'ADV',
        'abbreviation_substandard': 'X', 'abbreviation_obscene': 'X',
        'abbreviation': 'X', 'acronym': 'X', 'acronym_substandard': 'X',
        'preposition': 'ADP',
        'particle': 'PART',
    }

    # Tags that add the 'sngr.' marker to the XPOS.
    _REFLEXIVE_TAGS = frozenset([
        'noun_reflexive', 'noun_reflexive_substandard',
        'noun_reflexive_obscene', 'verb_reflexive_substandard',
        'verb_reflexive_negative_substandard', 'verb_reflexive',
        'verb_reflexive_negative', 'verb_reflexive_obscene',
        'verb_reflexive_negative_obscene',
    ])

    # Tags that add the 'neig.' marker to the XPOS (negative polarity).
    _NEGATIVE_TAGS = frozenset([
        'verb_reflexive_negative_substandard', 'verb_reflexive_negative',
        'verb_negative', 'verb_negative_substandard',
        'verb_negative_obscene', 'verb_reflexive_negative_obscene',
    ])

    # Tags whose XPOS is 'sutr.'.
    _ABBREVIATION_TAGS = frozenset([
        'abbreviation_substandard', 'abbreviation_obscene', 'abbreviation',
    ])

    # Tags whose XPOS is 'akr.'.
    _ACRONYM_TAGS = frozenset(['acronym', 'acronym_substandard'])

    def __init__(self, dict_file, dict_data_dir=None):
        """
        Create the underlying Hunspell analyser.

        :param dict_file: dictionary name passed to ``Hunspell``,
            e.g. ``'lt-LT_morphology'``
        :param dict_data_dir: directory passed as ``hunspell_data_dir``
        """
        self.h = Hunspell(dict_file, hunspell_data_dir=dict_data_dir)

    def hunspell_to_conll(self, input):
        """
        Retrieve morphosyntactic features of a single word

        Each analysis returned by the analyser is parsed for its ``st:``
        (stem), ``po:`` (part of speech) and ``is:`` (inflection labels)
        fields; analyses are processed in reverse order.  Analyses without a
        stem or part of speech, with an unknown part of speech, or carrying
        the ``Supine`` label on a verb are skipped.

        The helpers ``nominalise``, ``adjectivise``, ``adverbialise``,
        ``verbalise``, ``*_to_xpos`` and ``xpos_to_feats`` are not defined in
        this class and must be available in the enclosing module.

        :param input:  the word
        :return: a tuple of four parallel lists: possible lemmas (stems),
            UPOS, XPOS, and UFeats tags; ``(None, None, None, None)`` when
            nothing usable was produced
        """
        analyses = self.h.analyze(input)
        if len(analyses) < 1:
            return None, None, None, None

        # Step 1: parse every analysis string into (stem, pos tag, labels).
        candidates = []
        for analysis in analyses[::-1]:
            # Reset per analysis so a missing field never silently reuses the
            # value parsed from the previous analysis.
            stem = None
            tag = None
            feats = set()
            for field in analysis.strip().split():
                if field.startswith('st:'):
                    stem = field[3:]
                elif field.startswith('po:'):
                    tag_parts = field[3:].split('_')
                    if tag_parts[0] == 'preposition':
                        # Prepositions carry the governed case in the tag
                        # itself (preposition_<Case>).
                        tag = tag_parts[0]
                        if len(tag_parts) > 1:
                            feats.add(tag_parts[1])
                    else:
                        tag = field[3:]
                elif field.startswith('is:'):
                    feats.update(field[3:].split('_'))
            if stem is None or tag is None:
                continue
            candidates.append((stem, tag, feats))

        # Step 2: turn every candidate into [lemma, UPOS, XPOS, UFeats].
        results = []
        for stem, tag, feats in candidates:
            upos = self._UPOS_BY_TAG.get(tag)
            if upos is None:
                continue
            negative = tag in self._NEGATIVE_TAGS
            features = '_'

            xpos = ''
            for feat in feats:
                if feat not in self._FEATURE_CODES:
                    continue
                if feat == 'Past' and 'Part' in feats and 'Pass' in feats:
                    # Past passive participles use the short 'būt.' code.
                    xpos += 'būt.'
                else:
                    xpos += self._FEATURE_CODES[feat] + '.'
            if tag in self._REFLEXIVE_TAGS:
                xpos += 'sngr.'
            if negative:
                xpos += 'neig.'

            if upos == 'PART':
                features = '_'
                xpos = 'dll.'
            if upos == 'ADP':
                case_feats = list(feats)
                if case_feats:
                    case = 'Ins' if case_feats[0] == 'Inst' else case_feats[0]
                    features = 'AdpType=Prep|Case=' + case
                else:
                    # No case suffix on the tag: emit the type only instead
                    # of failing with an IndexError.
                    features = 'AdpType=Prep'
                xpos = 'prl.' + xpos
            if upos == 'X':
                features = 'Abbr=Yes'
                if tag in self._ACRONYM_TAGS:
                    xpos = 'akr.'
                if tag in self._ABBREVIATION_TAGS:
                    xpos = 'sutr.'
            if upos == 'PROPN' and len(feats) < 1:  # naive assumption
                upos = 'X'
                xpos = 'užs.'
                features = 'Foreign=Yes'
            if upos == 'NOUN' or upos == 'PROPN':
                xpos += 'dktv.'
                if upos == 'PROPN':
                    xpos += 'tikr.'
                n = nominalise(input, stem, xpos)
                xpos = noun_to_xpos(n)
                features = xpos_to_feats(n)[1]
            if upos == 'ADJ' or upos == 'ADV':
                if 'Comp' not in feats and 'Super' not in feats:
                    xpos += 'nelygin.'
                if upos == 'ADJ':
                    xpos += 'bdv.'
                    a = adjectivise(input, stem, xpos)
                    xpos = adj_to_xpos(a)
                    features = xpos_to_feats(a)[1]
                else:
                    xpos += 'prv.'
                    a = adverbialise(input, stem, xpos)
                    xpos = adv_to_xpos(a)
                    features = xpos_to_feats(a)[1]
            if upos == 'VERB':
                if 'Supine' in feats:
                    continue
                xpos += 'vksm.'
                if 'Vadv' in feats:
                    # Parenthesised on purpose: a bare conditional expression
                    # binds looser than '+', which previously dropped either
                    # the prefix or the suffix of these strings.
                    xpos = 'vksm.' + ('neig.' if negative else '') + 'būdn.'
                    features = ('Polarity=' + ('Neg' if negative else 'Pos')
                                + '|VerbForm=Conv')
                    upos = 'ADV'
                else:
                    xpos_plural = None
                    if 'III' in feats:
                        # Third person is emitted twice: once as singular and
                        # once as plural.
                        xpos_plural = xpos + 'dgs.'
                        xpos += 'vns.'
                    if 'Indic' in feats or 'Subj' in feats or 'Imper' in feats:
                        xpos += 'asm.'
                        if xpos_plural:
                            xpos_plural += 'asm.'

                    if xpos_plural:
                        v = verbalise(input, stem, xpos_plural)
                        xpos_plural = verb_to_xpos(v)
                        features = xpos_to_feats(v)[1]
                        results.append([stem, upos, xpos_plural, features])

                    v = verbalise(input, stem, xpos)
                    xpos = verb_to_xpos(v)
                    features = xpos_to_feats(v)[1]

            results.append([stem, upos, xpos, features])

        if len(results) < 1:
            return None, None, None, None

        lemma = [row[0] for row in results]
        upos = [row[1] for row in results]
        xpos = [row[2] for row in results]
        feats = [row[3] for row in results]
        return lemma, upos, xpos, feats
コード例 #2
0
class Stem:
    """

    The Stem module deals with various tasks, mainly through the following functions:
        - `check_spelling`: spell error detection
        - `correct_spelling`: spell error correction
        - `analyze`: morphological analysis

    Spell checking and correction are only available for Sorani in the Arabic script. Morphological analysis is additionally available for Kurmanji in the Latin script. The Sorani part is based on the [Kurdish Hunspell project](https://github.com/sinaahmadi/KurdishHunspell).

    Example:
    ```python
    >>> from klpt.stem import Stem
    >>> stemmer = Stem("Sorani", "Arabic")
    >>> stemmer.check_spelling("سوتاندبووت")
    False
    >>> stemmer.correct_spelling("سوتاندبووت")
    (False, ['ستاندبووت', 'سووتاندبووت', 'سووڕاندبووت', 'ڕووتاندبووت', 'فەوتاندبووت', 'بووژاندبووت'])
    >>> stemmer.analyze("دیتبامن")
    [{'pos': 'verb', 'description': 'past_stem_transitive_active', 'base': 'دیت', 'terminal_suffix': 'بامن'}]
    ```

    """

    def __init__(self, dialect, script):
        """Store the dialect/script pair and load the Hunspell dictionary for Sorani-Arabic.

        Args:
            dialect (str): "Sorani" or "Kurmanji"
            script (str): "Arabic" (with Sorani) or "Latin" (with Kurmanji)

        Raises:
            Exception: for any other dialect/script combination
        """
        self.dialect = dialect
        self.script = script

        # Hunspell morphological field name -> key used in the dictionaries returned by `analyze`
        self.hunspell_flags = {"po": "pos", "is": "description", "ts": "terminal_suffix", "ds": "formation"}
        if self.dialect == "Sorani" and self.script == "Arabic":
            self.huns = Hunspell("ckb-Arab", hunspell_data_dir=klpt.get_data("data/"))
        elif not (self.dialect == "Kurmanji" and self.script == "Latin"):
            raise Exception("Sorry, only Sorani dialect in the Arabic script is supported now. Stay tuned for other dialects and scripts!")
        # Kurmanji-Latin needs no Hunspell instance: `analyze` uses `Analysis` for it.

    # def stem(self, word):
    #     """A function for stemming a single word"""
    #     pass

    # def lemmatize(self, word):
    #     """A function for lemmatization of a single word"""
    #     pass

    def check_spelling(self, word):
        """Check spelling of a word

        Args:
            word (str): input word to be spell-checked

        Raises:
            TypeError: if `word` is not a string, or if the instance is not Sorani in the Arabic script

        Returns:
            bool: True if the spelling is correct, False if the spelling is incorrect
        """
        if not isinstance(word, str):
            raise TypeError("Only a word (str) is allowed.")
        if not (self.dialect == "Sorani" and self.script == "Arabic"):
            # TypeError is kept for backward compatibility with existing callers.
            raise TypeError("Not supported yet.")
        return self.huns.spell(word)

    def correct_spelling(self, word):
        """
        Correct spelling errors if the input word is incorrect. It returns a tuple where the first element indicates the correctness of the word (True if correct, False if incorrect).
            If the input word is correct, no suggestions are computed and (True, []) is returned.
            If the input word is incorrect, the suggestions are provided in a list as the second element of the tuple; the list is empty, as in (False, []), when no suggestion is available.

        Args:
            word (str): input word to be spell-checked

        Raises:
            TypeError: if `word` is not a string, or if the instance is not Sorani in the Arabic script

        Returns:
            tuple (boolean, list)

        """
        if not isinstance(word, str):
            raise TypeError("Only a word (str) is allowed.")
        if not (self.dialect == "Sorani" and self.script == "Arabic"):
            # TypeError is kept for backward compatibility with existing callers.
            raise TypeError("Not supported yet.")
        if self.check_spelling(word):
            return (True, [])
        return (False, list(self.huns.suggest(word)))

    def analyze(self, word_form):
        """
        Morphological analysis of a given word.
        
        It returns morphological analyses. Each morphological analysis is returned as a dictionary as follows:
        
        - "pos": the part-of-speech of the word-form according to [the Universal Dependency tag set](https://universaldependencies.org/u/pos/index.html). 
        - "description": is flag
        - "terminal_suffix": the word-form with every occurrence of the base removed, or "0" (zero morpheme) if nothing is left or no ts flag is present
        - "formation": if ds flag is set, its value is assigned to description and the value of formation is set to derivational. Although the majority of our morphological rules cover inflectional forms, it is not accurate to say all of them are inflectional. Therefore, we only set this value to derivational wherever we are sure.
        - "base": `ts` flag. The definition of terminal suffix is a bit tricky in Hunspell. According to [the Hunspell documentation](http://manpages.ubuntu.com/manpages/trusty/en/man4/hunspell.4.html), "Terminal suffix fields are inflectional suffix fields "removed" by additional (not terminal) suffixes". In other words, the ts flag in Hunspell represents whatever is left after stripping all affixes. Therefore, it is the morphological base.

        As in [{'pos': 'verb', 'description': 'past_stem_transitive_active', 'base': 'دیت', 'terminal_suffix': 'بامن'}]
        If the input cannot be analyzed morphologically, an empty list is returned.

        Sorani: 
        More details regarding Sorani Kurdish morphological analysis can be found at [https://github.com/sinaahmadi/KurdishHunspell](https://github.com/sinaahmadi/KurdishHunspell).

        Kurmanji:
        Regarding Kurmanji, we use the morphological analyzer provided by the [Kurmanji part](https://github.com/apertium/apertium-kmr)

        Please note that there are subtle differences between how the analyzers work in Hunspell and Apertium. For instance, the `base` in the Kurmanji analysis refers to the lemma while in Sorani (from Hunspell), it refers to the morphological base.

        Args:
            word_form (str): a single word-form

        Raises:
            TypeError: only string as input

        Returns:
            (list(dict)): a list of all possible morphological analyses according to the defined morphological rules
            
        """
        if not isinstance(word_form, str):
            raise TypeError("Only a word (str) is allowed.")

        word_analysis = list()
        if self.dialect == "Sorani" and self.script == "Arabic":
            # Given the morphological analysis of a word-form with Hunspell flags, extract relevant information and return a dictionary
            for analysis in self.huns.analyze(word_form):
                analysis_dict = dict()
                for item in analysis.split():
                    if ":" not in item:
                        continue
                    parts = item.split(":")
                    left, right = parts[0], parts[1]
                    if right == "ts":
                        # ts flag exceptionally appears after the value as value:key in the Hunspell output
                        analysis_dict["base"] = left
                        # whatever remains of the word-form once the base is removed is the terminal suffix
                        # NOTE(review): str.replace removes every occurrence of the base, not only the first -- confirm this is intended
                        analysis_dict[self.hunspell_flags[right]] = word_form.replace(left, "")
                    elif left in self.hunspell_flags:
                        # assign the key:value pairs from the Hunspell string output to the dictionary output of the current function
                        # for ds flag, add derivation as the formation type and store its value as the description
                        if left == "ds":
                            analysis_dict[self.hunspell_flags[left]] = "derivational"
                            analysis_dict[self.hunspell_flags["is"]] = right
                        else:
                            analysis_dict[self.hunspell_flags[left]] = right

                # if there is no value assigned to the ts flag, the terminal suffix is a zero-morpheme 0
                suffix_key = self.hunspell_flags["ts"]
                if suffix_key not in analysis_dict or analysis_dict[suffix_key] == "":
                    analysis_dict[suffix_key] = "0"

                word_analysis.append(analysis_dict)

        elif self.dialect == "Kurmanji" and self.script == "Latin":
            att_analysis = Analysis("Kurmanji", "Latin").analyze(word_form)
            # check if the word-form is analyzed or no
            if not len(att_analysis[1]):
                # the word-form could not be analyzed
                return []

            for form_analysis in att_analysis[-1]:
                for analysis in form_analysis:
                    analysis_dict = dict()
                    # keep what follows the last '@' and split it at the first '<' into the base and its tag string
                    structure = analysis[0].rsplit('@', 1)[1].split("<", 1)
                    analysis_dict["base"], analysis_dict["description"] = structure[0], structure[1].replace("><", "_").replace(">", "").strip()
                    analysis_dict["pos"] = ""
                    analysis_dict["terminal_suffix"] = ""
                    analysis_dict["formation"] = ""
                    # TODO: the description needs further information extraction in such a way that some values should be assigned to the "pos" key 
                    # analysis_dict["terminal_suffix"] = word_form.replace(analysis_dict["base"], "")
                    word_analysis.append(analysis_dict)

        return word_analysis
コード例 #3
0
class Stem():
    """Spell checking and morphological analysis for Kurdish.

    The class offers:
        - spell error detection (`check_spelling`) and correction (`correct_spelling`)
        - morphological analysis (`analyze`)

    It relies on the dictionaries of the `Kurdish Hunspell project <https://github.com/sinaahmadi/KurdishHunspell>`_.

    """
    def __init__(self, dialect, script):
        """Load the Hunspell dictionary for the requested dialect and script.

        Args:
            dialect (str): only "Sorani" is accepted
            script (str): only "Arabic" is accepted

        Raises:
            Exception: for any other dialect/script combination
        """
        # Hunspell morphological field name -> key of the dictionaries built by `analyze`
        self.hunspell_flags = dict(
            po="pos",
            ds="formation",
            ts="terminal_suffix",
        )
        self.hunspell_flags["is"] = "description"

        if not (dialect == "Sorani" and script == "Arabic"):
            raise Exception(
                "Sorry, only Sorani dialect in the Arabic script is supported now. Stay tuned for other dialects and scripts!"
            )
        self.huns = Hunspell("ckb-Arab",
                             hunspell_data_dir=klpt.get_data("data/"))

    # def stem(self, word):
    #     """A function for stemming a single word"""
    #     pass

    # def lemmatize(self, word):
    #     """A function for lemmatization of a single word"""
    #     pass

    def check_spelling(self, word):
        """Tell whether a word is spelled correctly.

        Args:
            word (str): the word to look up

        Raises:
            TypeError: if `word` is not a string

        Returns:
            bool: the verdict of the Hunspell dictionary for `word`
        """
        if isinstance(word, str):
            return self.huns.spell(word)
        raise TypeError("Only a word (str) is allowed.")

    def correct_spelling(self, word):
        """Check a word and, when it is misspelled, collect suggestions.

        Args:
            word (str): the word to look up

        Raises:
            TypeError: if `word` is not a string

        Returns:
            tuple (boolean, list): `(True, [])` when the word is spelled correctly;
            otherwise `(False, suggestions)`, where `suggestions` is a possibly empty list.
        """
        if not isinstance(word, str):
            raise TypeError("Only a word (str) is allowed.")

        if self.check_spelling(word):
            return (True, [])
        suggestions = list(self.huns.suggest(word))
        return (False, suggestions)

    def analyze(self, word_form):
        """Return every morphological analysis Hunspell finds for a word-form.

        More details regarding Kurdish morphological analysis can be found at https://github.com/sinaahmadi/KurdishHunspell

        Args:
            word_form (str): a single word-form

        Raises:
            TypeError: if `word_form` is not a string

        Returns:
            (list(dict)): one dictionary per analysis; the list is empty when the word-form cannot be analyzed.

            Keys that may appear in a dictionary:
             - "pos": value of the po flag
             - "description": value of the is flag, or of the ds flag when that one is present
             - "formation": "derivational" when the ds flag is present, otherwise absent
             - "base": the value attached to the ts flag, i.e. the morphological base
             - "terminal_suffix": the word-form with every occurrence of the base removed; "0" when nothing is left or no ts flag is present
        """
        if not isinstance(word_form, str):
            raise TypeError("Only a word (str) is allowed.")

        flags = self.hunspell_flags
        results = []
        for raw in list(self.huns.analyze(word_form)):
            entry = {}
            for token in raw.split():
                if ":" not in token:
                    # not a flag:value pair
                    continue
                pieces = token.split(":")
                head, tail = pieces[0], pieces[1]

                if tail == "ts":
                    # the ts flag is written value:key, unlike every other flag
                    entry["base"] = head
                    entry[flags[tail]] = word_form.replace(head, "")
                    continue

                if head not in flags.keys():
                    continue

                if head == "ds":
                    # a ds flag marks derivation; its value is used as the description
                    entry[flags[head]] = "derivational"
                    entry[flags["is"]] = tail
                else:
                    entry[flags[head]] = tail

            # an absent or empty terminal suffix is recorded as the zero morpheme "0"
            suffix_key = flags["ts"]
            has_suffix = suffix_key in entry and entry[suffix_key] != ""
            if not has_suffix:
                entry[flags["ts"]] = "0"

            results.append(entry)

        return results