Example #1
    def _expectation_satisfied(cls, phonetic_expectation, form_str):
        if phonetic_expectation == PhoneticExpectation.VowelStart:
            first_char = form_str[0]
            if first_char == '+':
                return cls._expectation_satisfied(
                    phonetic_expectation,
                    form_str[1:]) or cls._expectation_satisfied(
                        phonetic_expectation, form_str[2:])
            else:
                return TurkishAlphabet.get_letter_for_char(first_char).vowel

        elif phonetic_expectation == PhoneticExpectation.ConsonantStart:
            first_char = form_str[0]
            if first_char == '+':
                return cls._expectation_satisfied(
                    phonetic_expectation,
                    form_str[1:]) or cls._expectation_satisfied(
                        phonetic_expectation, form_str[2:])
            else:
                return not TurkishAlphabet.get_letter_for_char(
                    first_char).vowel

        else:
            raise Exception('Unknown phonetic_expectation',
                            phonetic_expectation)
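
A leading '+' in a suffix form marks the following letter as optional, which is why the method above recurses on both form_str[1:] (keep the optional letter) and form_str[2:] (drop it). Below is a minimal standalone sketch of the same rule; the VOWELS set and the plain-string expectation values are simplifications introduced here, not the library's API.

VOWELS = set(u'aeıioöuüAEIİOÖUÜ')

def expectation_satisfied(expectation, form_str):
    # expectation is 'VowelStart' or 'ConsonantStart' in this sketch
    first_char = form_str[0]
    if first_char == '+':
        # '+yA' can surface as 'yA' or as 'A', so the expectation holds
        # if either variant satisfies it
        return (expectation_satisfied(expectation, form_str[1:]) or
                expectation_satisfied(expectation, form_str[2:]))
    is_vowel = first_char in VOWELS
    return is_vowel if expectation == 'VowelStart' else not is_vowel

assert expectation_satisfied('VowelStart', u'+yA')      # the 'A' variant starts with a vowel
assert expectation_satisfied('ConsonantStart', u'+yA')  # the 'yA' variant starts with a consonant
assert not expectation_satisfied('VowelStart', u'dIk')
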
Example #2
    def _get_voicing_and_doubling_roots(self, partial_input, last_char, first_char_after_partial_input,
                                        no_orthographics_root):
        last_letter = TurkishAlphabet.get_letter_for_char(last_char)
        first_letter_after_partial_input = TurkishAlphabet.get_letter_for_char(first_char_after_partial_input)

        no_voicing_rule_applies = last_letter in TurkishAlphabet.Voicing_Map and first_letter_after_partial_input.vowel
        voicing_might_have_happened = last_letter in TurkishAlphabet.Inverse_Voicing_Map and first_letter_after_partial_input.vowel
        doubling_might_have_happened = len(partial_input) > 2 and\
                                       not last_letter.vowel and\
                                       partial_input[-1] == partial_input[-2] and\
                                       first_letter_after_partial_input.vowel

        if doubling_might_have_happened:
            if no_voicing_rule_applies:
                doubling_root = self._create_doubling_root(no_orthographics_root, last_char)
                no_orthographics_root.lexeme.attributes = {LexemeAttribute.NoVoicing}
                doubling_root.lexeme.attributes.add(LexemeAttribute.NoVoicing)
                return [no_orthographics_root, doubling_root]
            elif voicing_might_have_happened:
                inverse_devoicing_roots = self._inverse_devoice_last_letter(no_orthographics_root, last_letter)
                devoicing_doubling_roots = [self._create_doubling_root(r, r.lexeme.root[-1]) for r in
                                            inverse_devoicing_roots]
                doubling_root = self._create_doubling_root(no_orthographics_root, last_char)
                return [no_orthographics_root] + [doubling_root] + devoicing_doubling_roots
            else:
                return [no_orthographics_root] + [self._create_doubling_root(no_orthographics_root, last_char)]
        else:
            if no_voicing_rule_applies:
                no_orthographics_root.lexeme.attributes = {LexemeAttribute.NoVoicing}
                return [no_orthographics_root]
            elif voicing_might_have_happened:
                return [no_orthographics_root] + self._inverse_devoice_last_letter(no_orthographics_root, last_letter)
            else:
                return [no_orthographics_root]
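
The doubling branch above covers surfaces like 'hakkı', where the partial input ends in a doubled consonant followed by a vowel and the undoubled form ('hak') must also be proposed as a root. A rough standalone illustration of just the candidate test, with a plain VOWELS set standing in for TurkishAlphabet and a hypothetical helper name:

VOWELS = set(u'aeıioöuü')

def doubling_root_candidates(partial_input, first_char_after):
    # Candidate lemma strings for a partial surface such as 'hakk' + 'ı'
    candidates = [partial_input]
    doubling_might_have_happened = (
        len(partial_input) > 2 and
        partial_input[-1] not in VOWELS and
        partial_input[-1] == partial_input[-2] and
        first_char_after in VOWELS)
    if doubling_might_have_happened:
        # 'hakk' -> also propose the undoubled root 'hak'
        candidates.append(partial_input[:-1])
    return candidates

assert doubling_root_candidates(u'hakk', u'ı') == [u'hakk', u'hak']
assert doubling_root_candidates(u'kitap', u'a') == [u'kitap']
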
Example #3
    def application_matches(cls, word, applied_str, voicing_allowed):
        """
        Checks whether a suffix-applied word is matched by a surface.

            >>> Phonetics.application_matches(u'armudunu', u'armut', True)
            True
            >>> Phonetics.application_matches(u'armudunu', u'armut', False)
            False
            >>> Phonetics.application_matches(u'armudunu', u'armudu', True)
            True
            >>> Phonetics.application_matches(u'armudunu', u'armudu', False)
            True

        @param word: The full word (surface)
        @param applied_str: Suffix applied part of the word
        @param voicing_allowed: If voicing should be considered or ignored
        @type word: unicode
        @type applied_str: unicode
        @type voicing_allowed: bool
        @rtype: L{bool}
        """
        if not applied_str or len(applied_str) > len(word):
            return False

        elif word == applied_str or word.startswith(applied_str):
            return True

        if voicing_allowed and word.startswith(applied_str[:-1]):
            last_letter_of_application = TurkishAlphabet.get_letter_for_char(applied_str[-1])
            last_letter_of_word_part = TurkishAlphabet.get_letter_for_char(word[len(applied_str) - 1])
            return TurkishAlphabet.voice(last_letter_of_application) == last_letter_of_word_part

        else:
            return False
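
The doctests above show the key behaviour: 'armut' matches the surface 'armudunu' only when voicing is allowed, because the final 't' surfaces as 'd'. The sketch below reproduces that rule over plain strings; the small VOICING_MAP is an approximation introduced here, not TurkishAlphabet.voice.

VOICING_MAP = {u't': u'd', u'k': u'ğ', u'p': u'b', u'ç': u'c'}

def application_matches_sketch(word, applied_str, voicing_allowed):
    if not applied_str or len(applied_str) > len(word):
        return False
    if word == applied_str or word.startswith(applied_str):
        return True
    if voicing_allowed and word.startswith(applied_str[:-1]):
        # the applied part may surface with its last consonant voiced
        voiced = VOICING_MAP.get(applied_str[-1])
        return voiced is not None and voiced == word[len(applied_str) - 1]
    return False

assert application_matches_sketch(u'armudunu', u'armut', True)
assert not application_matches_sketch(u'armudunu', u'armut', False)
assert application_matches_sketch(u'armudunu', u'armudu', False)
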
Example #4
    def apply(cls,
              word,
              phonetic_attributes,
              form_str,
              lexeme_attributes=None):
        """
        Applies a suffix form to a word, considering the phonetics and root attributes given.
        @param word: Surface
        @type word: unicode
        @param phonetic_attributes: Provided phonetics of the surface
        @type phonetic_attributes: set of unicode
        @param form_str: Suffix form
        @type form_str: unicode
        @param lexeme_attributes: Provided lexeme attributes of the root of surface
        @type lexeme_attributes: set of unicode
        @return: Tuple (word, applied suffix form)
        @rtype: tuple
        """
        if not form_str or not form_str.strip():
            return word, u''

        if not word or not word.strip():
            return None, None

        # ci, dik, +yacak, +iyor, +ar, +yi, +im, +yla

        first_form_letter = TurkishAlphabet.get_letter_for_char(form_str[0])
        if first_form_letter.char_value == '+':
            # +yacak, +iyor, +ar, +yi, +im, +yla

            optional_letter = TurkishAlphabet.get_letter_for_char(form_str[1])
            if optional_letter.vowel:
                #+iyor, +ar, +im
                if PhoneticAttributes.LastLetterVowel in phonetic_attributes:
                    # ata, dana
                    return cls.apply(word, phonetic_attributes, form_str[2:],
                                     lexeme_attributes)
                else:
                    # yap, kitap
                    return cls._handle_phonetics(word, phonetic_attributes,
                                                 form_str[1:],
                                                 lexeme_attributes)

            else:
                # +yacak, +yi, +yla
                if PhoneticAttributes.LastLetterVowel in phonetic_attributes:
                    #ata, dana
                    return cls._handle_phonetics(word, phonetic_attributes,
                                                 form_str[1:],
                                                 lexeme_attributes)
                else:
                    # yap, kitap
                    return cls.apply(word, phonetic_attributes, form_str[2:],
                                     lexeme_attributes)

        else:
            return cls._handle_phonetics(word, phonetic_attributes, form_str,
                                         lexeme_attributes)
Example #5
    def _seems_like_a_valid_verb_root(self, seq):
        last_char = seq[-1]
        last_letter = TurkishAlphabet.get_letter_for_char(last_char)

        previous_char = seq[-2]
        previous_letter = TurkishAlphabet.get_letter_for_char(previous_char)

        return last_letter.vowel or previous_letter.vowel or\
               (any([previous_letter == l for l in [TurkishAlphabet.L_l, TurkishAlphabet.L_r, TurkishAlphabet.L_n]])
                and not last_letter.continuant)
Example #6
    def _seems_like_a_valid_verb_root(self, seq):
        last_char = seq[-1]
        last_letter = TurkishAlphabet.get_letter_for_char(last_char)

        previous_char = seq[-2]
        previous_letter = TurkishAlphabet.get_letter_for_char(previous_char)

        return last_letter.vowel or previous_letter.vowel or\
               (any([previous_letter == l for l in [TurkishAlphabet.L_l, TurkishAlphabet.L_r, TurkishAlphabet.L_n]])
                and not last_letter.continuant)
Example #7
    def is_suffix_form_applicable(cls, word, form_str):
        """
        Calculates the phonetics of the word and the suffix form, and determines whether the suffix form is applicable.
        @type word: unicode or None
        @type form_str: unicode or None
        @rtype: bool
        """
        if not form_str or not form_str.strip():
            return True

        if not word or not word.strip():
            return False

        word = word.strip()
        form_str = form_str.strip()

        phonetic_attributes = cls.calculate_phonetic_attributes_of_plain_sequence(
            word)

        # ci, dik, +yacak, +iyor, +ar, +yi, +im, +yla

        first_form_letter = TurkishAlphabet.get_letter_for_char(form_str[0])
        if first_form_letter.char_value == '+':
            # +yacak, +iyor, +ar, +yi, +im, +yla

            optional_letter = TurkishAlphabet.get_letter_for_char(form_str[1])
            if optional_letter.vowel:
                #+iyor, +ar, +im
                if PhoneticAttributes.LastLetterVowel in phonetic_attributes:
                    # ata, dana
                    return cls.is_suffix_form_applicable(word, form_str[2:])
                else:
                    # yap, kitap
                    return True

            else:
                # +yacak, +yi, +yla
                if PhoneticAttributes.LastLetterVowel in phonetic_attributes:
                    #ata, dana
                    return True
                else:
                    # yap, kitap
                    return cls.is_suffix_form_applicable(word, form_str[2:])

        else:
            if first_form_letter.vowel:
                return PhoneticAttributes.LastLetterVowel not in phonetic_attributes
            else:
                return True
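
The branches above encode one rule: the letter right after '+' is kept when it repairs a vowel/consonant clash with the stem ending, and dropped otherwise. A standalone sketch of that decision, using a plain VOWELS set and a hypothetical helper name:

VOWELS = set(u'aeıioöuü')

def resolve_optional_prefix(word, form_str):
    # Drop or keep the optional letter after '+' depending on the stem ending
    if not form_str.startswith(u'+'):
        return form_str
    optional_is_vowel = form_str[1].lower() in VOWELS
    stem_ends_in_vowel = word[-1] in VOWELS
    if optional_is_vowel == stem_ends_in_vowel:
        # vowel meets vowel, or consonant meets consonant: drop the optional letter
        return form_str[2:]
    # otherwise the optional letter is needed as a buffer
    return form_str[1:]

assert resolve_optional_prefix(u'ata', u'+yA') == u'yA'    # buffer 'y' kept
assert resolve_optional_prefix(u'kitap', u'+yA') == u'A'   # 'y' dropped
assert resolve_optional_prefix(u'ata', u'+Im') == u'm'     # optional vowel dropped
assert resolve_optional_prefix(u'kitap', u'+Im') == u'Im'  # optional vowel kept
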
Example #8
    def is_suffix_form_applicable(cls, word, form_str):
        """
        Calculates the phonetics of the word and the suffix form, and determines whether the suffix form is applicable.
        @type word: unicode or None
        @type form_str: unicode or None
        @rtype: bool
        """
        if not form_str or not form_str.strip():
            return True

        if not word or not word.strip():
            return False

        word = word.strip()
        form_str = form_str.strip()

        phonetic_attributes = cls.calculate_phonetic_attributes_of_plain_sequence(word)

        # ci, dik, +yacak, +iyor, +ar, +yi, +im, +yla

        first_form_letter = TurkishAlphabet.get_letter_for_char(form_str[0])
        if first_form_letter.char_value == '+':
            # +yacak, +iyor, +ar, +yi, +im, +yla

            optional_letter = TurkishAlphabet.get_letter_for_char(form_str[1])
            if optional_letter.vowel:
                #+iyor, +ar, +im
                if PhoneticAttributes.LastLetterVowel in phonetic_attributes:
                    # ata, dana
                    return cls.is_suffix_form_applicable(word, form_str[2:])
                else:
                    # yap, kitap
                    return True

            else:
                # +yacak, +yi, +yla
                if PhoneticAttributes.LastLetterVowel in phonetic_attributes:
                    #ata, dana
                    return True
                else:
                    # yap, kitap
                    return cls.is_suffix_form_applicable(word, form_str[2:])

        else:
            if first_form_letter.vowel:
                return PhoneticAttributes.LastLetterVowel not in phonetic_attributes
            else:
                return True
Example #9
    def _get_first_vowel(self, seq):
        for s in seq:
            letter = TurkishAlphabet.get_letter_for_char(s)
            if letter and letter.vowel:
                return letter

        return None
Example #10
    def _vowel_count(cls, seq):
        vowel_count = 0
        for c in seq:
            if TurkishAlphabet.get_letter_for_char(c).vowel:
                vowel_count += 1

        return vowel_count
Example #11
    def _vowel_count(cls, seq):
        vowel_count = 0
        for c in seq:
            if TurkishAlphabet.get_letter_for_char(c).vowel:
                vowel_count += 1

        return vowel_count
Example #12
    def _get_first_vowel(self, seq):
        for s in seq:
            letter = TurkishAlphabet.get_letter_for_char(s)
            if letter and letter.vowel:
                return letter

        return None
Example #13
    def apply(cls, word, phonetic_attributes, form_str, lexeme_attributes=None):
        """
        Applies a suffix form to a word, considering the phonetics and root attributes given.
        @param word: Surface
        @type word: unicode
        @param phonetic_attributes: Provided phonetics of the surface
        @type phonetic_attributes: set of unicode
        @param form_str: Suffix form
        @type form_str: unicode
        @param lexeme_attributes: Provided lexeme attributes of the root of surface
        @type lexeme_attributes: set of unicode
        @return: Tuple (word, applied suffix form)
        @rtype: tuple
        """
        if not form_str or not form_str.strip():
            return word, u''

        if not word or not word.strip():
            return None, None

        # ci, dik, +yacak, +iyor, +ar, +yi, +im, +yla

        first_form_letter = TurkishAlphabet.get_letter_for_char(form_str[0])
        if first_form_letter.char_value == '+':
            # +yacak, +iyor, +ar, +yi, +im, +yla

            optional_letter = TurkishAlphabet.get_letter_for_char(form_str[1])
            if optional_letter.vowel:
                #+iyor, +ar, +im
                if PhoneticAttributes.LastLetterVowel in phonetic_attributes:
                    # ata, dana
                    return cls.apply(word, phonetic_attributes, form_str[2:], lexeme_attributes)
                else:
                    # yap, kitap
                    return cls._handle_phonetics(word, phonetic_attributes, form_str[1:], lexeme_attributes)

            else:
                # +yacak, +yi, +yla
                if PhoneticAttributes.LastLetterVowel in phonetic_attributes:
                    #ata, dana
                    return cls._handle_phonetics(word, phonetic_attributes, form_str[1:], lexeme_attributes)
                else:
                    # yap, kitap
                    return cls.apply(word, phonetic_attributes, form_str[2:], lexeme_attributes)

        else:
            return cls._handle_phonetics(word, phonetic_attributes, form_str, lexeme_attributes)
Example #14
    def _infer_morphemic_attributes(cls, lexeme):
        """
        @type lexeme: Lexeme
        """
        item_root = lexeme.root
        root_vowel_count = cls._vowel_count(item_root)
        last_letter = TurkishAlphabet.get_letter_for_char(item_root[-1])

        if lexeme.syntactic_category==SyntacticCategory.VERB:
            if last_letter.vowel:
                lexeme.attributes.add(LexemeAttribute.ProgressiveVowelDrop)
                lexeme.attributes.add(LexemeAttribute.Passive_In)

            if root_vowel_count>1 and LexemeAttribute.Aorist_A not in lexeme.attributes:
                lexeme.attributes.add(LexemeAttribute.Aorist_I)

            if root_vowel_count==1 and LexemeAttribute.Aorist_I not in lexeme.attributes:
                lexeme.attributes.add(LexemeAttribute.Aorist_A)

            if last_letter==TurkishAlphabet.L_l:
                lexeme.attributes.add(LexemeAttribute.Passive_In)

            if all(a not in lexeme.attributes for a in LexemeAttribute.CAUSATIVES):
                if last_letter.vowel or (last_letter in [TurkishAlphabet.L_l, TurkishAlphabet.L_r]) and root_vowel_count>1:
                    lexeme.attributes.add(LexemeAttribute.Causative_t)
                elif last_letter==TurkishAlphabet.L_t and root_vowel_count<2:
                    lexeme.attributes.add(LexemeAttribute.Causative_Ir)
                else:
                    lexeme.attributes.add(LexemeAttribute.Causative_dIr)

            if LexemeAttribute.ProgressiveVowelDrop in lexeme.attributes:
                lexeme.attributes.add(LexemeAttribute.NoVoicing)

            if LexemeAttribute.Voicing not in lexeme.attributes and LexemeAttribute.NoVoicing not in lexeme.attributes:
                lexeme.attributes.add(LexemeAttribute.NoVoicing)

        elif lexeme.syntactic_category==SyntacticCategory.NOUN and LexemeAttribute.CompoundP3sg in lexeme.attributes:
            if LexemeAttribute.VoicingOpt in lexeme.attributes:
                if LexemeAttribute.Voicing in lexeme.attributes:
                    lexeme.attributes.remove(LexemeAttribute.Voicing)
                if LexemeAttribute.NoVoicing in lexeme.attributes:
                    lexeme.attributes.remove(LexemeAttribute.NoVoicing)
            elif LexemeAttribute.Voicing not in lexeme.attributes:
                lexeme.attributes.add(LexemeAttribute.NoVoicing)

        elif lexeme.syntactic_category in [SyntacticCategory.NOUN, SyntacticCategory.ADJECTIVE]:
            if LexemeAttribute.VoicingOpt in lexeme.attributes:
                if LexemeAttribute.Voicing in lexeme.attributes:
                    lexeme.attributes.remove(LexemeAttribute.Voicing)
                if LexemeAttribute.NoVoicing in lexeme.attributes:
                    lexeme.attributes.remove(LexemeAttribute.NoVoicing)
            else:
                if root_vowel_count>1 and last_letter.voiceless and not last_letter.continuant and LexemeAttribute.NoVoicing not in lexeme.attributes \
                and LexemeAttribute.InverseHarmony not in lexeme.attributes:
                    lexeme.attributes.add(LexemeAttribute.Voicing)
                elif item_root.endswith('nk') or item_root.endswith('og') or item_root.endswith('rt'):
                    lexeme.attributes.add(LexemeAttribute.Voicing)
                elif LexemeAttribute.Voicing not in lexeme.attributes:
                    lexeme.attributes.add(LexemeAttribute.NoVoicing)
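
For verbs, the aorist attribute above is keyed to the vowel count of the root: polysyllabic roots default to Aorist_I (-ır/-ir/-ur/-ür) and monosyllabic roots to Aorist_A (-ar/-er), unless the dictionary already says otherwise. A minimal sketch of just that default; the VOWELS set and the attribute strings are stand-ins introduced here.

VOWELS = set(u'aeıioöuü')

def default_aorist_attribute(verb_root):
    # Polysyllabic roots default to Aorist_I, monosyllabic ones to Aorist_A
    vowel_count = sum(1 for c in verb_root if c in VOWELS)
    return u'Aorist_I' if vowel_count > 1 else u'Aorist_A'

assert default_aorist_attribute(u'yap') == u'Aorist_A'     # yap-ar
assert default_aorist_attribute(u'kazan') == u'Aorist_I'   # kazan-ır
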
Example #15
    def parse(self, input):
        parse_results = super(
            UpperCaseSupportingContextlessMorphologicalParser,
            self).parse(input)
        if input[0].isupper():
            parse_results += super(
                UpperCaseSupportingContextlessMorphologicalParser,
                self).parse(TurkishAlphabet.lower(input[0]) + input[1:])

        return parse_results
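
The wrapper parses the input as given and, when the first character is uppercase, also parses a variant with only that character lowered, merging both result lists. A sketch of the same pattern against a hypothetical parse callable; note that plain str.lower is not Turkish-aware (it maps 'I' to 'i' rather than 'ı'), which is why the real code uses TurkishAlphabet.lower.

def parse_with_uppercase_support(parse, surface):
    # parse is any callable returning a list of parse results (hypothetical)
    results = list(parse(surface))
    if surface[0].isupper():
        results += parse(surface[0].lower() + surface[1:])
    return results

# Usage with a stub parser that only knows the lowercase form:
stub_parse = lambda s: [s + u'+Noun'] if s == u'elma' else []
assert parse_with_uppercase_support(stub_parse, u'Elma') == [u'elma+Noun']
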
Example #16
    def _expectation_satisfied(cls, phonetic_expectation, form_str):
        if phonetic_expectation == PhoneticExpectation.VowelStart:
            first_char = form_str[0]
            if first_char == '+':
                return cls._expectation_satisfied(phonetic_expectation, form_str[1:]) or cls._expectation_satisfied(
                    phonetic_expectation, form_str[2:])
            else:
                return TurkishAlphabet.get_letter_for_char(first_char).vowel

        elif phonetic_expectation == PhoneticExpectation.ConsonantStart:
            first_char = form_str[0]
            if first_char == '+':
                return cls._expectation_satisfied(phonetic_expectation, form_str[1:]) or cls._expectation_satisfied(
                    phonetic_expectation, form_str[2:])
            else:
                return not TurkishAlphabet.get_letter_for_char(first_char).vowel

        else:
            raise Exception('Unknown phonetic_expectation', phonetic_expectation)
Example #17
def print_verbs_with_double_consonant_ending():
    dictionary_file_path = os.path.join(os.path.dirname(__file__),
                                        '../resources/master_dictionary.txt')
    with codecs.open(dictionary_file_path, mode='r',
                     encoding='utf-8') as dictionary_file:
        for line in dictionary_file:
            line = line.strip()
            if line.startswith('#'):
                continue
            item = line
            if u'[' in line:
                item, meta = line.split(u'[')
            item = item.strip()
            if item.endswith(u'mak') or item.endswith(u'mek'):
                verb_root = item[:-3]
                if not TurkishAlphabet.get_letter_for_char(
                        verb_root[-1]
                ).vowel and not TurkishAlphabet.get_letter_for_char(
                        verb_root[-2]).vowel:
                    print verb_root
Example #18
    def _handle_phonetics(cls,
                          word,
                          phonetic_attributes,
                          form_str,
                          lexeme_attributes=None):
        lexeme_attributes = lexeme_attributes or []
        phonetic_attributes = phonetic_attributes or []

        first_letter_of_form = TurkishAlphabet.get_letter_for_char(form_str[0])

        # first apply voicing if possible
        if LexemeAttribute.NoVoicing not in lexeme_attributes and PhoneticAttributes.LastLetterVoicelessStop in phonetic_attributes and first_letter_of_form.vowel:
            voiced_letter = TurkishAlphabet.voice(
                TurkishAlphabet.get_letter_for_char(word[-1]))
            if voiced_letter:
                word = word[:-1] + voiced_letter.char_value

        # then try devoicing
        if PhoneticAttributes.LastLetterVoiceless in phonetic_attributes and TurkishAlphabet.devoice(
                first_letter_of_form):
            form_str = TurkishAlphabet.devoice(
                first_letter_of_form).char_value + form_str[1:]

        applied = u''

        for i in range(len(form_str)):
            c = form_str[i]
            next_c = form_str[i + 1] if i + 1 < len(form_str) else None

            if c == '!':
                continue

            letter = TurkishAlphabet.get_letter_for_char(c)
            if letter.vowel and letter.upper_case_char_value == c:
                if c == u'A':
                    if PhoneticAttributes.LastVowelBack in phonetic_attributes:
                        applied += u'a'
                    else:
                        applied += u'e'
                elif c == u'I':
                    if PhoneticAttributes.LastVowelBack in phonetic_attributes:
                        if PhoneticAttributes.LastVowelUnrounded in phonetic_attributes or next_c == '!':
                            applied += u'ı'
                        else:
                            applied += u'u'
                    else:
                        if PhoneticAttributes.LastVowelUnrounded in phonetic_attributes or next_c == '!':
                            applied += u'i'
                        else:
                            applied += u'ü'
                elif c == u'O':
                    if PhoneticAttributes.LastVowelBack in phonetic_attributes:
                        applied += u'o'
                    else:
                        applied += u'ö'

            else:
                applied = applied + c

        return word, applied
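
The loop above resolves the meta-vowels 'A', 'I' and 'O' in a suffix form against the last vowel of the stem: two-way front/back harmony for 'A' and 'O', four-way harmony (also using roundedness) for 'I'. A compact standalone sketch of the resolution table, with plain character sets replacing the PhoneticAttributes flags; the '!' escape handled in the original is ignored here.

BACK_VOWELS = set(u'aıou')
ROUNDED_VOWELS = set(u'ouöü')

def resolve_meta_vowel(meta, last_stem_vowel):
    # Resolve 'A' / 'I' / 'O' against the stem's last vowel (sketch)
    back = last_stem_vowel in BACK_VOWELS
    rounded = last_stem_vowel in ROUNDED_VOWELS
    if meta == u'A':          # two-way harmony: a / e
        return u'a' if back else u'e'
    if meta == u'I':          # four-way harmony: ı / i / u / ü
        if back:
            return u'u' if rounded else u'ı'
        return u'ü' if rounded else u'i'
    if meta == u'O':          # two-way harmony with rounding: o / ö
        return u'o' if back else u'ö'
    return meta

assert resolve_meta_vowel(u'I', u'a') == u'ı'   # kitap -> kitab-ı
assert resolve_meta_vowel(u'I', u'ü') == u'ü'   # gül -> gül-ü
assert resolve_meta_vowel(u'A', u'e') == u'e'   # ev -> ev-e
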
Example #19
    def __init__(self, abbr):
        root = abbr
        lexeme = DynamicLexeme(abbr, abbr, SyntacticCategory.NOUN, SecondarySyntacticCategory.ABBREVIATION, None)
        phonetic_attributes = None

        last_letter = TurkishAlphabet.get_letter_for_char(abbr[-1])
        if last_letter.vowel:
            phonetic_attributes = Phonetics.calculate_phonetic_attributes_of_plain_sequence(abbr)
        else:
            phonetic_attributes = Phonetics.calculate_phonetic_attributes_of_plain_sequence(abbr + u"E")

        phonetic_expectations = None
        super(AbbreviationRoot, self).__init__(root, lexeme, phonetic_expectations, phonetic_attributes)
Example #20
    def application_matches(cls, word, applied_str, voicing_allowed):
        """
        Checks whether a suffix-applied word is matched by a surface.

            >>> Phonetics.application_matches(u'armudunu', u'armut', True)
            True
            >>> Phonetics.application_matches(u'armudunu', u'armut', False)
            False
            >>> Phonetics.application_matches(u'armudunu', u'armudu', True)
            True
            >>> Phonetics.application_matches(u'armudunu', u'armudu', False)
            True

        @param word: The full word (surface)
        @param applied_str: Suffix applied part of the word
        @param voicing_allowed: If voicing should be considered or ignored
        @type word: unicode
        @type applied_str: unicode
        @type voicing_allowed: bool
        @rtype: L{bool}
        """
        if not applied_str or len(applied_str) > len(word):
            return False

        elif word == applied_str or word.startswith(applied_str):
            return True

        if voicing_allowed and word.startswith(applied_str[:-1]):
            last_letter_of_application = TurkishAlphabet.get_letter_for_char(
                applied_str[-1])
            last_letter_of_word_part = TurkishAlphabet.get_letter_for_char(
                word[len(applied_str) - 1])
            return TurkishAlphabet.voice(
                last_letter_of_application) == last_letter_of_word_part

        else:
            return False
Example #21
def print_verbs_with_double_consonant_ending():
    dictionary_file_path = os.path.join(os.path.dirname(__file__), '../resources/master_dictionary.txt')
    with codecs.open(dictionary_file_path, mode='r', encoding='utf-8') as dictionary_file:
        for line in dictionary_file:
            line = line.strip()
            if line.startswith('#'):
                continue
            item = line
            if u'[' in line:
                item,meta = line.split(u'[')
            item = item.strip()
            if item.endswith(u'mak') or item.endswith(u'mek'):
                verb_root = item[:-3]
                if not TurkishAlphabet.get_letter_for_char(verb_root[-1]).vowel and not TurkishAlphabet.get_letter_for_char(verb_root[-2]).vowel:
                    print verb_root
Example #22
    def save_parse_result_for_word(self, word_id, parse_result_uuid):
        """
        @type word_id: ObjectId
        @type parse_result_uuid: str or unicode
        """
        parse_result = self.sessionmanager.get_parse_result(parse_result_uuid)
        assert parse_result, "No parse result found with id {}".format(parse_result_uuid)

        word = self.dbmanager.get_word(word_id)
        if not word:
            raise Exception("Word not found for setting the correct parse result! {}".format(word_id))

        # check if the parse result belongs to the given word
        assert word['surface'] == parse_result.get_surface() or TurkishAlphabet.lower(word['surface']) == parse_result.get_surface()

        self.dbmanager.set_parse_result_for_word(word, formatter.format_morpheme_container_for_parseset(parse_result), parse_result)
Example #23
    def _handle_phonetics(cls, word, phonetic_attributes, form_str, lexeme_attributes=None):
        lexeme_attributes = lexeme_attributes or []
        phonetic_attributes = phonetic_attributes or []

        first_letter_of_form = TurkishAlphabet.get_letter_for_char(form_str[0])

        # first apply voicing if possible
        if LexemeAttribute.NoVoicing not in lexeme_attributes and PhoneticAttributes.LastLetterVoicelessStop in phonetic_attributes and first_letter_of_form.vowel:
            voiced_letter = TurkishAlphabet.voice(TurkishAlphabet.get_letter_for_char(word[-1]))
            if voiced_letter:
                word = word[:-1] + voiced_letter.char_value

        # then try devoicing
        if PhoneticAttributes.LastLetterVoiceless in phonetic_attributes and TurkishAlphabet.devoice(first_letter_of_form):
            form_str = TurkishAlphabet.devoice(first_letter_of_form).char_value + form_str[1:]

        applied = u''

        for i in range(len(form_str)):
            c = form_str[i]
            next_c = form_str[i + 1] if i + 1 < len(form_str) else None

            if c == '!':
                continue

            letter = TurkishAlphabet.get_letter_for_char(c)
            if letter.vowel and letter.upper_case_char_value == c:
                if c == u'A':
                    if PhoneticAttributes.LastVowelBack in phonetic_attributes:
                        applied += u'a'
                    else:
                        applied += u'e'
                elif c == u'I':
                    if PhoneticAttributes.LastVowelBack in phonetic_attributes:
                        if PhoneticAttributes.LastVowelUnrounded in phonetic_attributes or next_c == '!':
                            applied += u'ı'
                        else:
                            applied += u'u'
                    else:
                        if PhoneticAttributes.LastVowelUnrounded in phonetic_attributes or next_c == '!':
                            applied += u'i'
                        else:
                            applied += u'ü'
                elif c == u'O':
                    if PhoneticAttributes.LastVowelBack in phonetic_attributes:
                        applied += u'o'
                    else:
                        applied += u'ö'

            else:
                applied = applied + c

        return word, applied
Example #24
    def __init__(self, abbr):
        root = abbr
        lexeme = DynamicLexeme(abbr, abbr, SyntacticCategory.NOUN,
                               SecondarySyntacticCategory.ABBREVIATION, None)
        phonetic_attributes = None

        last_letter = TurkishAlphabet.get_letter_for_char(abbr[-1])
        if last_letter.vowel:
            phonetic_attributes = Phonetics.calculate_phonetic_attributes_of_plain_sequence(
                abbr)
        else:
            phonetic_attributes = Phonetics.calculate_phonetic_attributes_of_plain_sequence(
                abbr + u'E')

        phonetic_expectations = None
        super(AbbreviationRoot,
              self).__init__(root, lexeme, phonetic_expectations,
                             phonetic_attributes)
Example #25
    def calculate_phonetic_attributes_of_plain_sequence(cls, seq):
        """
        Calculates the phonetic attributes of a word, without considering its root attributes.
        @type seq: unicode
        @rtype: set
        """
        attrs = []

        last_vowel = cls.get_last_vowel(seq)
        last_letter = TurkishAlphabet.get_letter_for_char(seq[-1])
        if last_vowel:
            if last_vowel.rounded:
                attrs.append(PhoneticAttributes.LastVowelRounded)
            else:
                attrs.append(PhoneticAttributes.LastVowelUnrounded)

            if last_vowel.frontal:
                attrs.append(PhoneticAttributes.LastVowelFrontal)
            else:
                attrs.append(PhoneticAttributes.LastVowelBack)

        if last_letter.vowel:
            attrs.append(PhoneticAttributes.LastLetterVowel)
        else:
            attrs.append(PhoneticAttributes.LastLetterConsonant)

        if last_letter.voiceless:
            attrs.append(PhoneticAttributes.LastLetterVoiceless)
            if not last_letter.continuant:
                attrs.append(PhoneticAttributes.LastLetterVoicelessStop)
        else:
            attrs.append(PhoneticAttributes.LastLetterNotVoiceless)
            if not last_letter.continuant and not last_letter.vowel:
                attrs.append(PhoneticAttributes.LastLetterVoicedStop)

        if last_letter.continuant:
            attrs.append(PhoneticAttributes.LastLetterContinuant)
        else:
            attrs.append(PhoneticAttributes.LastLetterNotContinuant)

        return set(attrs)
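
The attribute set computed above is what the suffix-application code keys on (LastLetterVowel, LastVowelBack, LastLetterVoicelessStop, and so on). The toy version below computes a subset of those attributes over plain strings; the character classes are rough approximations introduced here in place of TurkishAlphabet.

VOWELS = set(u'aeıioöuü')
BACK_VOWELS = set(u'aıou')
ROUNDED_VOWELS = set(u'ouöü')
VOICELESS = set(u'çfhkpsşt')
STOPS = set(u'bcçdgkpt')   # non-continuants, roughly

def plain_phonetic_attributes(seq):
    attrs = set()
    last_vowel = next((c for c in reversed(seq) if c in VOWELS), None)
    last = seq[-1]
    if last_vowel:
        attrs.add(u'LastVowelRounded' if last_vowel in ROUNDED_VOWELS else u'LastVowelUnrounded')
        attrs.add(u'LastVowelBack' if last_vowel in BACK_VOWELS else u'LastVowelFrontal')
    attrs.add(u'LastLetterVowel' if last in VOWELS else u'LastLetterConsonant')
    if last in VOICELESS:
        attrs.add(u'LastLetterVoiceless')
        if last in STOPS:
            attrs.add(u'LastLetterVoicelessStop')
    else:
        attrs.add(u'LastLetterNotVoiceless')
    return attrs

attrs = plain_phonetic_attributes(u'kitap')
assert u'LastLetterVoicelessStop' in attrs and u'LastVowelBack' in attrs
assert u'LastLetterVowel' in plain_phonetic_attributes(u'elma')
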
Example #26
    def calculate_phonetic_attributes_of_plain_sequence(cls, seq):
        """
        Calculates the phonetic attributes of a word, without considering its root attributes.
        @type seq: unicode
        @rtype: set
        """
        attrs = []

        last_vowel = cls.get_last_vowel(seq)
        last_letter = TurkishAlphabet.get_letter_for_char(seq[-1])
        if last_vowel:
            if last_vowel.rounded:
                attrs.append(PhoneticAttributes.LastVowelRounded)
            else:
                attrs.append(PhoneticAttributes.LastVowelUnrounded)

            if last_vowel.frontal:
                attrs.append(PhoneticAttributes.LastVowelFrontal)
            else:
                attrs.append(PhoneticAttributes.LastVowelBack)

        if last_letter.vowel:
            attrs.append(PhoneticAttributes.LastLetterVowel)
        else:
            attrs.append(PhoneticAttributes.LastLetterConsonant)

        if last_letter.voiceless:
            attrs.append(PhoneticAttributes.LastLetterVoiceless)
            if not last_letter.continuant:
                attrs.append(PhoneticAttributes.LastLetterVoicelessStop)
        else:
            attrs.append(PhoneticAttributes.LastLetterNotVoiceless)
            if not last_letter.continuant and not last_letter.vowel:
                attrs.append(PhoneticAttributes.LastLetterVoicedStop)

        if last_letter.continuant:
            attrs.append(PhoneticAttributes.LastLetterContinuant)
        else:
            attrs.append(PhoneticAttributes.LastLetterNotContinuant)

        return set(attrs)
Example #27
    def create_word_binding_from_morpheme_container(self, word_str, morpheme_container):
        assert (word_str == morpheme_container.get_surface_so_far()) or (TurkishAlphabet.lower(word_str[0])+word_str[1:] == morpheme_container.get_surface_so_far())

        root_str = morpheme_container.get_root().str
        lemma = morpheme_container.get_root().lexeme.lemma
        lemma_root = morpheme_container.get_root().lexeme.root
        root_syntactic_category = morpheme_container.get_root().lexeme.syntactic_category
        root_secondary_syntactic_category = morpheme_container.get_root().lexeme.secondary_syntactic_category
        root = RootBinding(root_str, lemma, lemma_root, root_syntactic_category, root_secondary_syntactic_category)

        word_syntactic_category = morpheme_container.get_surface_syntactic_category()
        word_secondary_syntactic_category = morpheme_container.get_surface_secondary_syntactic_category()

        parse_result = formatter.format_morpheme_container_for_parseset(morpheme_container)
        word = WordBinding(word_str, parse_result, root, word_syntactic_category, word_secondary_syntactic_category)

        if morpheme_container.get_transitions():
            so_far = root_str
            for transition in morpheme_container.get_transitions():
                if isinstance(transition.suffix_form_application.suffix_form.suffix, FreeTransitionSuffix):
                    continue

                suffix_name = transition.suffix_form_application.suffix_form.suffix.name
                suffix_pretty_name = transition.suffix_form_application.suffix_form.suffix.pretty_name
                suffix_form = transition.suffix_form_application.suffix_form.form
                suffix_application = transition.suffix_form_application.fitting_suffix_form
                suffix_actual_application = transition.suffix_form_application.actual_suffix_form
                word_with_suffix_application = None
                if (so_far + suffix_actual_application)==root_str:
                    word_with_suffix_application = morpheme_container.get_root().lexeme.root + suffix_application
                else:
                    word_with_suffix_application = so_far + suffix_application
                so_far += suffix_actual_application
                if transition.is_derivational():
                    suffix = DerivationalSuffixBinding(suffix_name, suffix_pretty_name, suffix_form, suffix_application, suffix_actual_application, word_with_suffix_application, so_far, transition.to_state.syntactic_category)
                    word.suffixes.append(suffix)
                else:
                    suffix = InflectionalSuffixBinding(suffix_name, suffix_pretty_name, suffix_form, suffix_application, suffix_actual_application, word_with_suffix_application, so_far, transition.to_state.syntactic_category)
                    word.suffixes.append(suffix)
        return word
Example #28
    def save_parse_result_for_word(self, word_id, parse_result_uuid):
        """
        @type word_id: ObjectId
        @type parse_result_uuid: str or unicode
        """
        parse_result = self.sessionmanager.get_parse_result(parse_result_uuid)
        assert parse_result, "No parse result found with id {}".format(
            parse_result_uuid)

        word = self.dbmanager.get_word(word_id)
        if not word:
            raise Exception(
                "Word not found for setting the correct parse result! {}".
                format(word_id))

        # check if the parse result belongs to the given word
        assert word['surface'] == parse_result.get_surface(
        ) or TurkishAlphabet.lower(
            word['surface']) == parse_result.get_surface()

        self.dbmanager.set_parse_result_for_word(
            word,
            formatter.format_morpheme_container_for_parseset(parse_result),
            parse_result)
Example #29
    def get_last_vowel(cls, seq):
        for s in reversed(seq):
            turkish_letter = TurkishAlphabet.get_letter_for_char(s)
            if turkish_letter.vowel:
                return turkish_letter
Example #30
    def find_roots_for_partial_input(self, partial_input, whole_surface=None):
        """
        @type partial_input: unicode
        @type whole_surface: unicode
        @rtype: list of Root
        """
        assert partial_input and whole_surface
        assert len(partial_input) <= len(whole_surface)
        assert whole_surface.startswith(partial_input)
        if len(whole_surface) == len(partial_input):
            assert whole_surface == partial_input

        if len(partial_input) < 2:      # not possible except (d, diyor) and (y, yiyor), but those are already in the dictionary
            return []

        last_vowel = Phonetics.get_last_vowel(partial_input)

        if not last_vowel:
            return []

        root = partial_input
        lemma = root
        lemma_root = lemma
        syntactic_category = SyntacticCategory.VERB
        secondary_syntactic_category = None
        lexeme_attributes = set()

        lexeme = DynamicLexeme(lemma, lemma_root, syntactic_category, secondary_syntactic_category,
            lexeme_attributes)

        phonetic_expectations = set()
        phonetic_attributes = Phonetics.calculate_phonetic_attributes_of_plain_sequence(partial_input)

        no_attr_root = DynamicRoot(root, lexeme, phonetic_expectations, phonetic_attributes)

        self._set_lexeme_and_phonetic_attributes([no_attr_root])
        self._set_lemma([no_attr_root])

        last_char = partial_input[-1]
        last_letter = TurkishAlphabet.get_letter_for_char(last_char)

        partial_surface_can_be_root_of_a_verb = self._seems_like_a_valid_verb_root(partial_input)

        if whole_surface==partial_input:
            return [no_attr_root] if partial_surface_can_be_root_of_a_verb else []


        first_char_after_partial_input = whole_surface[len(partial_input)]

        if first_char_after_partial_input.isupper():
            return []

        first_letter_after_partial_input = TurkishAlphabet.get_letter_for_char(first_char_after_partial_input)


        might_have_ProgressiveVowelDrop = not last_letter.vowel and\
                                          any([whole_surface.startswith(partial_input+s) for s in [u'iyor', u'ıyor', u'uyor', u'üyor']])

        might_have_Aorist_A = not last_letter.vowel and \
                              (whole_surface.startswith(partial_input + u'ar') or whole_surface.startswith(partial_input + u'er'))

        # no Aorist_I for -ur, -ür
        might_have_Aorist_I = not last_letter.vowel and\
                              (whole_surface.startswith(partial_input + u'ır') or whole_surface.startswith(partial_input + u'ir'))

        # for other letters, no voicing in verbs. {git+er->gider} vs {yapar, açar, diker}
        voicing_might_have_happened = last_letter==TurkishAlphabet.L_d and first_letter_after_partial_input.vowel

        possible_progressive_vowel_drop_roots = self._get_progressive_vowel_drop_roots(partial_input, whole_surface, no_attr_root, last_vowel) if might_have_ProgressiveVowelDrop else set()
        possible_aorist_A_roots = self._get_aorist_A_roots(no_attr_root) if might_have_Aorist_A else set()
        possible_aorist_I_roots = self._get_aorist_I_roots(no_attr_root) if might_have_Aorist_I else set()
        possible_causative_roots = self._get_possible_causative_roots(partial_input, whole_surface, no_attr_root)
        possible_passive_roots = self._get_possible_passive_roots(last_letter, partial_input, whole_surface, no_attr_root)


        if voicing_might_have_happened:
            possible_progressive_vowel_drop_roots = possible_progressive_vowel_drop_roots.union(set([self._get_possible_voicing_root(r) for r in possible_progressive_vowel_drop_roots]))
            possible_aorist_A_roots = possible_aorist_A_roots.union(set([self._get_possible_voicing_root(r) for r in possible_aorist_A_roots]))
            possible_aorist_I_roots = possible_aorist_I_roots.union(set([self._get_possible_voicing_root(r) for r in possible_aorist_I_roots]))
            possible_causative_roots = possible_causative_roots.union(set([self._get_possible_voicing_root(r) for r in possible_causative_roots]))
            possible_passive_roots = possible_passive_roots.union(set([self._get_possible_voicing_root(r) for r in possible_passive_roots]))

        generated_roots = set()

        generated_roots.add(no_attr_root)

        if voicing_might_have_happened:
            generated_roots.add(self._get_possible_voicing_root(no_attr_root))

        generated_roots = generated_roots.union(possible_progressive_vowel_drop_roots)
        generated_roots = generated_roots.union(possible_aorist_A_roots)
        generated_roots = generated_roots.union(possible_aorist_I_roots)
        generated_roots = generated_roots.union(possible_causative_roots)
        generated_roots = generated_roots.union(possible_passive_roots)

        self._set_lexeme_and_phonetic_attributes(generated_roots)
        self._set_lemma(generated_roots)

        generated_roots = list(generated_roots)

        generated_roots = filter(lambda r: self._seems_like_a_valid_verb_root(r.lexeme.root), generated_roots)

        return generated_roots
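
A central case above is verb-final voicing: for a surface like 'gidiyor', the partial input 'gid' ends in 'd' before a vowel, so the devoiced root 'git' must also be proposed ({git+er -> gider}, as the comment notes, while 'yapar' and 'açar' keep their final consonant). A tiny sketch of just that candidate step, with hypothetical names:

VOWELS = set(u'aeıioöuü')

def verb_root_candidates(partial_input, first_char_after):
    # Propose the surface root itself and, for a final 'd' before a vowel,
    # the devoiced 'git'-style root
    candidates = [partial_input]
    if partial_input.endswith(u'd') and first_char_after in VOWELS:
        candidates.append(partial_input[:-1] + u't')
    return candidates

assert verb_root_candidates(u'gid', u'i') == [u'gid', u'git']
assert verb_root_candidates(u'yap', u'a') == [u'yap']
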
Example #31
    def parse(self, input):
        parse_results = super(UpperCaseSupportingContextlessMorphologicalParser, self).parse(input)
        if input[0].isupper():
            parse_results += super(UpperCaseSupportingContextlessMorphologicalParser, self).parse(TurkishAlphabet.lower(input[0]) + input[1:])

        return parse_results
Example #32
    def get_last_vowel(cls, seq):
        for s in reversed(seq):
            turkish_letter = TurkishAlphabet.get_letter_for_char(s)
            if turkish_letter.vowel:
                return turkish_letter
Example #33
    def find_roots_for_partial_input(self, partial_input, whole_surface=None):
        """
        @type partial_input: unicode
        @type whole_surface: unicode
        @rtype: list of Root
        """
        assert partial_input and whole_surface
        assert len(partial_input) <= len(whole_surface)
        assert whole_surface.startswith(partial_input)
        if len(whole_surface) == len(partial_input):
            assert whole_surface == partial_input

        # no compound should be found for an input shorter than something like "atsu-yu"; even that doesn't make sense
        if len(partial_input) < 5:
            return []

        if whole_surface == partial_input:
            return []

        last_char = partial_input[-1]
        previous_char = partial_input[-2]

        if last_char.isupper() or previous_char.isupper():
            return []

        last_letter = TurkishAlphabet.get_letter_for_char(last_char)

        if last_letter!=TurkishAlphabet.L_i and last_letter!=TurkishAlphabet.L_u and\
           last_letter!=TurkishAlphabet.L_ii and last_letter!=TurkishAlphabet.L_uu:
            return []

        first_char_after_partial_input = whole_surface[len(partial_input)]

        if first_char_after_partial_input.isupper():
            return []

        first_letter_after_partial_input = TurkishAlphabet.get_letter_for_char(first_char_after_partial_input)

        if first_letter_after_partial_input != TurkishAlphabet.L_n:
            return []

        if len(whole_surface) < len(partial_input) + 2: # need a char after char 'n'
            return []

        compound_results = []

        results_with_partial_input_one_char_missing = self.brute_force_noun_root_finder.find_roots_for_partial_input(partial_input[:-1], whole_surface)

        # illustrate:
        # partial_input = suborusu, whole_surface = suborusuna
        # results_with_partial_input_one_char_missing : <'suborus','suborus'>
        # partial_input = bacakkalemi, whole_surface = bacakkalemini
        # results_with_partial_input_one_char_missing : <'bacakkalem','bacakkalem'>

        for normal_noun_result in results_with_partial_input_one_char_missing:
            clone_result = normal_noun_result._clone(True)
            clone_result.str = clone_result.lexeme.root
            clone_result.lexeme.root = partial_input
            clone_result.lexeme.lemma = partial_input

            compound_results.append(clone_result)


        previous_letter = TurkishAlphabet.get_letter_for_char(previous_char)

        if previous_letter==TurkishAlphabet.L_s:
            results_with_partial_input_two_chars_missing = self.brute_force_noun_root_finder.find_roots_for_partial_input(partial_input[:-2], whole_surface)

            # illustrate:
            # partial_input = suborusu, whole_surface = suborusuna
            # results_with_partial_input_two_chars_missing : <'suboru','suboru'>

            for normal_noun_result in results_with_partial_input_two_chars_missing:
                clone_result = normal_noun_result._clone(True)
                clone_result.lexeme.root = partial_input
                clone_result.lexeme.lemma = partial_input

                compound_results.append(clone_result)


        for compound_result in compound_results:
            compound_result.lexeme.attributes.add(LexemeAttribute.CompoundP3sg)

        return compound_results
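
The illustrations in the comments ('suborusu' / 'bacakkalemi') show the two shortened strings that get handed to the plain noun root finder: one character is always stripped, and a second one as well when the dropped vowel was preceded by 's' (the buffer consonant that appears before the possessive when the stem ends in a vowel). A small sketch of only that key-generation step, with hypothetical names:

I_TYPE_VOWELS = set(u'iıuü')

def compound_p3sg_search_keys(partial_input, first_char_after):
    # Shortened strings handed to the plain noun root finder (sketch)
    if len(partial_input) < 5 or partial_input[-1] not in I_TYPE_VOWELS:
        return []
    if first_char_after != u'n':
        return []
    keys = [partial_input[:-1]]              # 'suborusu'    -> 'suborus'
    if partial_input[-2] == u's':
        keys.append(partial_input[:-2])      # 'suborusu'    -> 'suboru'
    return keys                              # 'bacakkalemi' -> ['bacakkalem']

assert compound_p3sg_search_keys(u'suborusu', u'n') == [u'suborus', u'suboru']
assert compound_p3sg_search_keys(u'bacakkalemi', u'n') == [u'bacakkalem']
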
Example #34
    def _test_should_parse_simple_parse_set(self, set_number, start_index=0):
        path = os.path.join(
            os.path.dirname(__file__),
            '../../../../testresources/simpleparsesets/simpleparseset{}.txt'.
            format(set_number))
        logger.info("Parsing simple parse set {}".format(path))
        skipped = 0
        unparsable = 0
        comment = 0
        with codecs.open(path, 'r', 'utf-8-sig') as parse_set_file:
            index = 0
            for line in parse_set_file:
                if start_index > index:
                    index += 1
                    continue

                if line.startswith('#'):
                    comment += 1
                    index += 1
                    continue

                line = line.strip()
                (word, parse_result) = line.split('=')
                if any([
                        case_to_skip in parse_result
                        for case_to_skip in cases_to_skip
                ]) or word in words_to_skip:
                    if self.LOG_SKIPPED:
                        logger.info(u'Skipped : {} {} {}'.format(
                            index, word, parse_result))
                    skipped += 1
                    index += 1
                    continue

                #TODO
                parse_result = parse_result.replace('Prog1', 'Prog')
                parse_result = parse_result.replace('Prog2', 'Prog')
                parse_result = parse_result.replace('Inf1', 'Inf')
                parse_result = parse_result.replace('Inf2', 'Inf')
                parse_result = parse_result.replace('Inf3', 'Inf')
                parse_result = parse_result.replace('WithoutHavingDoneSo1',
                                                    'WithoutHavingDoneSo')
                parse_result = parse_result.replace('WithoutHavingDoneSo2',
                                                    'WithoutHavingDoneSo')

                #TODO
                parse_result = parse_result.replace('Hastily', 'Hastily+Pos')

                parse_result = parse_result.replace('Postp+PCNom', 'Part')
                parse_result = parse_result.replace('Postp+PCDat', 'Postp')
                parse_result = parse_result.replace('Postp+PCAcc', 'Postp')
                parse_result = parse_result.replace('Postp+PCLoc', 'Postp')
                parse_result = parse_result.replace('Postp+PCAbl', 'Postp')
                parse_result = parse_result.replace('Postp+PCIns', 'Postp')
                parse_result = parse_result.replace('Postp+PCGen', 'Postp')

                if self.STATS_MODE:
                    try:
                        self.assert_parse_correct(word, index, parse_result)
                    except Exception:
                        unparsable += 1
                        logger.info(u'Unparsable : {} {} {}'.format(
                            index, word, parse_result))
                else:
                    self.assert_parse_correct(TurkishAlphabet.lower(word),
                                              index, parse_result)

                index += 1

        if self.STATS_MODE:
            logger.info("Finished simple parse set {}".format(path))
            logger.info("Found {} lines, with {} lines of comments".format(
                index, comment))
            logger.info("Skipped {}, unparsable {}".format(
                skipped, unparsable))
            logger.info("Words that should be parsable : {}".format(index -
                                                                    comment))
            logger.info("Parse success rate : {}".format(
                float(index - comment - skipped - unparsable) /
                float(index - comment)))
Example #35
    def _infer_morphemic_attributes(cls, lexeme):
        """
        @type lexeme: Lexeme
        """
        item_root = lexeme.root
        root_vowel_count = cls._vowel_count(item_root)
        last_letter = TurkishAlphabet.get_letter_for_char(item_root[-1])

        if lexeme.syntactic_category == SyntacticCategory.VERB:
            if last_letter.vowel:
                lexeme.attributes.add(LexemeAttribute.ProgressiveVowelDrop)
                lexeme.attributes.add(LexemeAttribute.Passive_In)

            if root_vowel_count > 1 and LexemeAttribute.Aorist_A not in lexeme.attributes:
                lexeme.attributes.add(LexemeAttribute.Aorist_I)

            if root_vowel_count == 1 and LexemeAttribute.Aorist_I not in lexeme.attributes:
                lexeme.attributes.add(LexemeAttribute.Aorist_A)

            if last_letter == TurkishAlphabet.L_l:
                lexeme.attributes.add(LexemeAttribute.Passive_In)

            if all(a not in lexeme.attributes
                   for a in LexemeAttribute.CAUSATIVES):
                if last_letter.vowel or (last_letter in [
                        TurkishAlphabet.L_l, TurkishAlphabet.L_r
                ]) and root_vowel_count > 1:
                    lexeme.attributes.add(LexemeAttribute.Causative_t)
                elif last_letter == TurkishAlphabet.L_t and root_vowel_count < 2:
                    lexeme.attributes.add(LexemeAttribute.Causative_Ir)
                else:
                    lexeme.attributes.add(LexemeAttribute.Causative_dIr)

            if LexemeAttribute.ProgressiveVowelDrop in lexeme.attributes:
                lexeme.attributes.add(LexemeAttribute.NoVoicing)

            if LexemeAttribute.Voicing not in lexeme.attributes and LexemeAttribute.NoVoicing not in lexeme.attributes:
                lexeme.attributes.add(LexemeAttribute.NoVoicing)

        elif lexeme.syntactic_category == SyntacticCategory.NOUN and LexemeAttribute.CompoundP3sg in lexeme.attributes:
            if LexemeAttribute.VoicingOpt in lexeme.attributes:
                if LexemeAttribute.Voicing in lexeme.attributes:
                    lexeme.attributes.remove(LexemeAttribute.Voicing)
                if LexemeAttribute.NoVoicing in lexeme.attributes:
                    lexeme.attributes.remove(LexemeAttribute.NoVoicing)
            elif LexemeAttribute.Voicing not in lexeme.attributes:
                lexeme.attributes.add(LexemeAttribute.NoVoicing)

        elif lexeme.syntactic_category in [
                SyntacticCategory.NOUN, SyntacticCategory.ADJECTIVE
        ]:
            if LexemeAttribute.VoicingOpt in lexeme.attributes:
                if LexemeAttribute.Voicing in lexeme.attributes:
                    lexeme.attributes.remove(LexemeAttribute.Voicing)
                if LexemeAttribute.NoVoicing in lexeme.attributes:
                    lexeme.attributes.remove(LexemeAttribute.NoVoicing)
            else:
                if root_vowel_count>1 and last_letter.voiceless and not last_letter.continuant and LexemeAttribute.NoVoicing not in lexeme.attributes \
                and LexemeAttribute.InverseHarmony not in lexeme.attributes:
                    lexeme.attributes.add(LexemeAttribute.Voicing)
                elif item_root.endswith('nk') or item_root.endswith(
                        'og') or item_root.endswith('rt'):
                    lexeme.attributes.add(LexemeAttribute.Voicing)
                elif LexemeAttribute.Voicing not in lexeme.attributes:
                    lexeme.attributes.add(LexemeAttribute.NoVoicing)
Example #36
    def _has_vowel(cls, seq):
        for s in seq:
            if TurkishAlphabet.get_letter_for_char(s).vowel:
                return True

        return False
Example #37
    def _generate_modified_root_nodes(cls, lexeme):
        if LexemeAttribute.RootChange in lexeme.attributes:
            special_roots = cls._handle_special_roots(lexeme)
            if special_roots:
                return special_roots

        modified_seq = lexeme.root

        original_attributes = Phonetics.calculate_phonetic_attributes_of_plain_sequence(
            lexeme.root)
        modified_attributes = Phonetics.calculate_phonetic_attributes_of_plain_sequence(
            lexeme.root)
        original_phonetic_expectations = set()
        modified_phonetic_expectations = set()

        if LexemeAttribute.Voicing in lexeme.attributes or LexemeAttribute.VoicingOpt in lexeme.attributes:
            last_letter = TurkishAlphabet.get_letter_for_char(modified_seq[-1])
            modified_letter = TurkishAlphabet.voice(last_letter)
            assert modified_letter is not None
            if lexeme.lemma.endswith(u"nk"):
                modified_letter = TurkishAlphabet.L_g
            modified_seq = modified_seq[:-1] + modified_letter.char_value
            if PhoneticAttributes.LastLetterVoicelessStop in modified_attributes:
                modified_attributes.remove(
                    PhoneticAttributes.LastLetterVoicelessStop)
            if modified_letter.continuant:
                if PhoneticAttributes.LastLetterNotContinuant in modified_attributes:
                    modified_attributes.remove(
                        PhoneticAttributes.LastLetterNotContinuant)
                modified_attributes.add(
                    PhoneticAttributes.LastLetterContinuant)
            else:
                if PhoneticAttributes.LastLetterContinuant in modified_attributes:
                    modified_attributes.remove(
                        PhoneticAttributes.LastLetterContinuant)
                modified_attributes.add(
                    PhoneticAttributes.LastLetterNotContinuant)
            if LexemeAttribute.VoicingOpt not in lexeme.attributes:
                original_phonetic_expectations.add(
                    PhoneticExpectation.ConsonantStart)
            modified_phonetic_expectations.add(PhoneticExpectation.VowelStart)

        if LexemeAttribute.Doubling in lexeme.attributes:
            modified_seq = modified_seq + modified_seq[-1]
            original_phonetic_expectations.add(
                PhoneticExpectation.ConsonantStart)
            modified_phonetic_expectations.add(PhoneticExpectation.VowelStart)

        if LexemeAttribute.LastVowelDrop in lexeme.attributes:
            modified_seq = modified_seq[:-2] + modified_seq[-1]
            if lexeme.syntactic_category != SyntacticCategory.VERB:
                original_phonetic_expectations.add(
                    PhoneticExpectation.ConsonantStart)
            modified_phonetic_expectations.add(PhoneticExpectation.VowelStart)

        if LexemeAttribute.InverseHarmony in lexeme.attributes:
            original_attributes.add(PhoneticAttributes.LastVowelFrontal)
            if PhoneticAttributes.LastVowelBack in original_attributes:
                original_attributes.remove(PhoneticAttributes.LastVowelBack)
            modified_attributes.add(PhoneticAttributes.LastVowelFrontal)
            if PhoneticAttributes.LastVowelBack in modified_attributes:
                modified_attributes.remove(PhoneticAttributes.LastVowelBack)

        if LexemeAttribute.ProgressiveVowelDrop in lexeme.attributes:
            modified_seq = modified_seq[:-1]
            if RootGenerator._has_vowel(modified_seq):
                modified_attributes = Phonetics.calculate_phonetic_attributes_of_plain_sequence(
                    modified_seq)
            modified_phonetic_expectations.add(PhoneticExpectation.VowelStart)

        original_phonetic_expectations = original_phonetic_expectations or None
        modified_phonetic_expectations = modified_phonetic_expectations or None

        original = Root(lexeme.root, lexeme, original_phonetic_expectations,
                        original_attributes)
        modified = Root(modified_seq, lexeme, modified_phonetic_expectations,
                        modified_attributes)

        if original == modified:
            return [original]
        else:
            return [original, modified]
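_generate_modified_root_nodes produces the unchanged root plus a second root whose surface reflects voicing, doubling, last-vowel drop and similar changes. The sketch below shows only the surface-string side of those transformations; the function and letter map are illustrative assumptions and ignore the attribute/expectation bookkeeping done above:

# Simplified sketch of the surface changes; assumes the last letter is voiceable when voicing=True.
VOICING_MAP = {u'p': u'b', u'ç': u'c', u't': u'd', u'k': u'ğ'}

def modified_root_surface(root, lemma, voicing=False, doubling=False, last_vowel_drop=False):
    modified = root
    if voicing:
        voiced = u'g' if lemma.endswith(u'nk') else VOICING_MAP[modified[-1]]
        modified = modified[:-1] + voiced
    if doubling:
        modified = modified + modified[-1]
    if last_vowel_drop:
        modified = modified[:-2] + modified[-1]
    return modified

# modified_root_surface(u'kitap', u'kitap', voicing=True)          -> u'kitab'
# modified_root_surface(u'renk', u'renk', voicing=True)            -> u'reng'
# modified_root_surface(u'hak', u'hak', doubling=True)             -> u'hakk'
# modified_root_surface(u'burun', u'burun', last_vowel_drop=True)  -> u'burn'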
Example #39
0
    def create_word_binding_from_morpheme_container(self, word_str,
                                                    morpheme_container):
        assert (word_str == morpheme_container.get_surface_so_far()) or (
            TurkishAlphabet.lower(word_str[0]) + word_str[1:]
            == morpheme_container.get_surface_so_far())

        root_str = morpheme_container.get_root().str
        lemma = morpheme_container.get_root().lexeme.lemma
        lemma_root = morpheme_container.get_root().lexeme.root
        root_syntactic_category = morpheme_container.get_root(
        ).lexeme.syntactic_category
        root_secondary_syntactic_category = morpheme_container.get_root(
        ).lexeme.secondary_syntactic_category
        root = RootBinding(root_str, lemma, lemma_root,
                           root_syntactic_category,
                           root_secondary_syntactic_category)

        word_syntactic_category = morpheme_container.get_surface_syntactic_category(
        )
        word_secondary_syntactic_category = morpheme_container.get_surface_secondary_syntactic_category(
        )

        parse_result = formatter.format_morpheme_container_for_parseset(
            morpheme_container)
        word = WordBinding(word_str, parse_result, root,
                           word_syntactic_category,
                           word_secondary_syntactic_category)

        if morpheme_container.get_transitions():
            so_far = root_str
            for transition in morpheme_container.get_transitions():
                if isinstance(
                        transition.suffix_form_application.suffix_form.suffix,
                        FreeTransitionSuffix):
                    continue

                suffix_name = transition.suffix_form_application.suffix_form.suffix.name
                suffix_pretty_name = transition.suffix_form_application.suffix_form.suffix.pretty_name
                suffix_form = transition.suffix_form_application.suffix_form.form
                suffix_application = transition.suffix_form_application.fitting_suffix_form
                suffix_actual_application = transition.suffix_form_application.actual_suffix_form
                if (so_far + suffix_actual_application) == root_str:
                    word_with_suffix_application = morpheme_container.get_root(
                    ).lexeme.root + suffix_application
                else:
                    word_with_suffix_application = so_far + suffix_application
                so_far += suffix_actual_application
                if transition.is_derivational():
                    suffix = DerivationalSuffixBinding(
                        suffix_name, suffix_pretty_name, suffix_form,
                        suffix_application, suffix_actual_application,
                        word_with_suffix_application, so_far,
                        transition.to_state.syntactic_category)
                    word.suffixes.append(suffix)
                else:
                    suffix = InflectionalSuffixBinding(
                        suffix_name, suffix_pretty_name, suffix_form,
                        suffix_application, suffix_actual_application,
                        word_with_suffix_application, so_far,
                        transition.to_state.syntactic_category)
                    word.suffixes.append(suffix)
        return word
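Inside the loop above, so_far grows by the actual surface form of each applied suffix. A toy, dependency-free illustration of that accumulation (the helper name and inputs are assumptions; binding construction and FreeTransitionSuffix skipping are omitted):

def accumulate_surfaces(root_str, actual_suffix_forms):
    # Returns the intermediate surfaces produced as each suffix is applied.
    so_far = root_str
    surfaces = []
    for actual in actual_suffix_forms:
        so_far += actual
        surfaces.append(so_far)
    return surfaces

# accumulate_surfaces(u'kitab', [u'ım', u'da']) -> [u'kitabım', u'kitabımda']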
Example #40
0
    def _generate_modified_root_nodes(cls, lexeme):
        if LexemeAttribute.RootChange in lexeme.attributes:
            special_roots = cls._handle_special_roots(lexeme)
            if special_roots:
                return special_roots

        modified_seq = lexeme.root

        original_attributes = Phonetics.calculate_phonetic_attributes_of_plain_sequence(lexeme.root)
        modified_attributes = Phonetics.calculate_phonetic_attributes_of_plain_sequence(lexeme.root)
        original_phonetic_expectations = set()
        modified_phonetic_expectations = set()

        if LexemeAttribute.Voicing in lexeme.attributes or LexemeAttribute.VoicingOpt in lexeme.attributes:
            last_letter = TurkishAlphabet.get_letter_for_char(modified_seq[-1])
            modified_letter = TurkishAlphabet.voice(last_letter)
            assert modified_letter is not None
            if lexeme.lemma.endswith(u"nk"):
                modified_letter = TurkishAlphabet.L_g
            modified_seq = modified_seq[:-1] + modified_letter.char_value
            if PhoneticAttributes.LastLetterVoicelessStop in modified_attributes:
                modified_attributes.remove(PhoneticAttributes.LastLetterVoicelessStop)
            if modified_letter.continuant:
                if PhoneticAttributes.LastLetterNotContinuant in modified_attributes:
                    modified_attributes.remove(PhoneticAttributes.LastLetterNotContinuant)
                modified_attributes.add(PhoneticAttributes.LastLetterContinuant)
            else:
                if PhoneticAttributes.LastLetterContinuant in modified_attributes:
                    modified_attributes.remove(PhoneticAttributes.LastLetterContinuant)
                modified_attributes.add(PhoneticAttributes.LastLetterNotContinuant)
            if LexemeAttribute.VoicingOpt not in lexeme.attributes:
                original_phonetic_expectations.add(PhoneticExpectation.ConsonantStart)
            modified_phonetic_expectations.add(PhoneticExpectation.VowelStart)

        if LexemeAttribute.Doubling in lexeme.attributes:
            modified_seq = modified_seq + modified_seq[-1]
            original_phonetic_expectations.add(PhoneticExpectation.ConsonantStart)
            modified_phonetic_expectations.add(PhoneticExpectation.VowelStart)

        if LexemeAttribute.LastVowelDrop in lexeme.attributes:
            modified_seq = modified_seq[:-2] + modified_seq[-1]
            if lexeme.syntactic_category != SyntacticCategory.VERB:
                original_phonetic_expectations.add(PhoneticExpectation.ConsonantStart)
            modified_phonetic_expectations.add(PhoneticExpectation.VowelStart)

        if LexemeAttribute.InverseHarmony in lexeme.attributes:
            original_attributes.add(PhoneticAttributes.LastVowelFrontal)
            if PhoneticAttributes.LastVowelBack in original_attributes:
                original_attributes.remove(PhoneticAttributes.LastVowelBack)
            modified_attributes.add(PhoneticAttributes.LastVowelFrontal)
            if PhoneticAttributes.LastVowelBack in modified_attributes:
                modified_attributes.remove(PhoneticAttributes.LastVowelBack)

        if LexemeAttribute.ProgressiveVowelDrop in lexeme.attributes:
            modified_seq = modified_seq[:-1]
            if RootGenerator._has_vowel(modified_seq):
                modified_attributes = Phonetics.calculate_phonetic_attributes_of_plain_sequence(modified_seq)
            modified_phonetic_expectations.add(PhoneticExpectation.VowelStart)

        original_phonetic_expectations = original_phonetic_expectations or None
        modified_phonetic_expectations = modified_phonetic_expectations or None

        original = Root(lexeme.root, lexeme, original_phonetic_expectations, original_attributes)
        modified = Root(modified_seq, lexeme, modified_phonetic_expectations, modified_attributes)

        if original == modified:
            return [original]
        else:
            return [original, modified]
Example #41
0
    def find_roots_for_partial_input(self, partial_input, whole_surface=None):
        """
        @type partial_input: unicode
        @type whole_surface: unicode
        @rtype: list of Root
        """
        assert partial_input and whole_surface
        assert len(partial_input) <= len(whole_surface)
        assert whole_surface.startswith(partial_input)
        if len(whole_surface) == len(partial_input):
            assert whole_surface == partial_input

        # not possible except (d, diyor) and (y, yiyor), but those are already in the dictionary
        if len(partial_input) < 2:
            return []

        last_vowel = Phonetics.get_last_vowel(partial_input)

        if not last_vowel:
            return []

        root = partial_input
        lemma = root
        lemma_root = lemma
        syntactic_category = SyntacticCategory.VERB
        secondary_syntactic_category = None
        lexeme_attributes = set()

        lexeme = DynamicLexeme(lemma, lemma_root, syntactic_category,
                               secondary_syntactic_category, lexeme_attributes)

        phonetic_expectations = set()
        phonetic_attributes = Phonetics.calculate_phonetic_attributes_of_plain_sequence(
            partial_input)

        no_attr_root = DynamicRoot(root, lexeme, phonetic_expectations,
                                   phonetic_attributes)

        self._set_lexeme_and_phonetic_attributes([no_attr_root])
        self._set_lemma([no_attr_root])

        last_char = partial_input[-1]
        last_letter = TurkishAlphabet.get_letter_for_char(last_char)

        partial_surface_can_be_root_of_a_verb = self._seems_like_a_valid_verb_root(
            partial_input)

        if whole_surface == partial_input:
            return [no_attr_root
                    ] if partial_surface_can_be_root_of_a_verb else []

        first_char_after_partial_input = whole_surface[len(partial_input)]

        if first_char_after_partial_input.isupper():
            return []

        first_letter_after_partial_input = TurkishAlphabet.get_letter_for_char(
            first_char_after_partial_input)

        might_have_ProgressiveVowelDrop = not last_letter.vowel and\
                                          any([whole_surface.startswith(partial_input+s) for s in [u'iyor', u'ıyor', u'uyor', u'üyor']])

        might_have_Aorist_A = not last_letter.vowel and \
                              (whole_surface.startswith(partial_input + u'ar') or whole_surface.startswith(partial_input + u'er'))

        # no Aorist_I for -ur, -ür
        might_have_Aorist_I = not last_letter.vowel and\
                              (whole_surface.startswith(partial_input + u'ır') or whole_surface.startswith(partial_input + u'ir'))

        # for other letters, no voicing in verbs. {git+er->gider} vs {yapar, açar, diker}
        voicing_might_have_happened = last_letter == TurkishAlphabet.L_d and first_letter_after_partial_input.vowel

        possible_progressive_vowel_drop_roots = self._get_progressive_vowel_drop_roots(
            partial_input, whole_surface, no_attr_root,
            last_vowel) if might_have_ProgressiveVowelDrop else set()
        possible_aorist_A_roots = self._get_aorist_A_roots(
            no_attr_root) if might_have_Aorist_A else set()
        possible_aorist_I_roots = self._get_aorist_I_roots(
            no_attr_root) if might_have_Aorist_I else set()
        possible_causative_roots = self._get_possible_causative_roots(
            partial_input, whole_surface, no_attr_root)
        possible_passive_roots = self._get_possible_passive_roots(
            last_letter, partial_input, whole_surface, no_attr_root)

        if voicing_might_have_happened:
            possible_progressive_vowel_drop_roots = possible_progressive_vowel_drop_roots.union(
                set([
                    self._get_possible_voicing_root(r)
                    for r in possible_progressive_vowel_drop_roots
                ]))
            possible_aorist_A_roots = possible_aorist_A_roots.union(
                set([
                    self._get_possible_voicing_root(r)
                    for r in possible_aorist_A_roots
                ]))
            possible_aorist_I_roots = possible_aorist_I_roots.union(
                set([
                    self._get_possible_voicing_root(r)
                    for r in possible_aorist_I_roots
                ]))
            possible_causative_roots = possible_causative_roots.union(
                set([
                    self._get_possible_voicing_root(r)
                    for r in possible_causative_roots
                ]))
            possible_passive_roots = possible_passive_roots.union(
                set([
                    self._get_possible_voicing_root(r)
                    for r in possible_passive_roots
                ]))

        generated_roots = set()

        generated_roots.add(no_attr_root)

        if voicing_might_have_happened:
            generated_roots.add(self._get_possible_voicing_root(no_attr_root))

        generated_roots = generated_roots.union(
            possible_progressive_vowel_drop_roots)
        generated_roots = generated_roots.union(possible_aorist_A_roots)
        generated_roots = generated_roots.union(possible_aorist_I_roots)
        generated_roots = generated_roots.union(possible_causative_roots)
        generated_roots = generated_roots.union(possible_passive_roots)

        self._set_lexeme_and_phonetic_attributes(generated_roots)
        self._set_lemma(generated_roots)

        generated_roots = [
            r for r in generated_roots
            if self._seems_like_a_valid_verb_root(r.lexeme.root)
        ]

        return generated_roots
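A typical caller of find_roots_for_partial_input tries every prefix of the surface; a hypothetical driver loop is sketched below (the finder and surface parameter names are assumptions, not a documented API). The test helper that follows then exercises the whole parser against the simple parse sets.

def find_all_candidate_roots(finder, surface):
    # Hypothetical usage sketch: collect brute-force root candidates for every prefix.
    roots = []
    for end in range(1, len(surface) + 1):
        roots.extend(finder.find_roots_for_partial_input(surface[:end], surface))
    return roots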
    def _test_should_parse_simple_parse_set(self, set_number, start_index=0):
        path = os.path.join(
            os.path.dirname(__file__),
            "../../../../testresources/simpleparsesets/simpleparseset{}.txt".format(set_number),
        )
        logger.info("Parsing simple parse set {}".format(path))
        skipped = 0
        unparsable = 0
        comment = 0
        with codecs.open(path, "r", "utf-8-sig") as parse_set_file:
            index = 0
            for line in parse_set_file:
                if start_index > index:
                    index += 1
                    continue

                if line.startswith("#"):
                    comment += 1
                    index += 1
                    continue

                line = line.strip()
                (word, parse_result) = line.split("=")
                if any([case_to_skip in parse_result for case_to_skip in cases_to_skip]) or word in words_to_skip:
                    if self.LOG_SKIPPED:
                        logger.info("Skipped : {} {} {}".format(index, word, parse_result))
                    skipped += 1
                    index += 1
                    continue

                # TODO
                parse_result = parse_result.replace("Prog1", "Prog")
                parse_result = parse_result.replace("Prog2", "Prog")
                parse_result = parse_result.replace("Inf1", "Inf")
                parse_result = parse_result.replace("Inf2", "Inf")
                parse_result = parse_result.replace("Inf3", "Inf")
                parse_result = parse_result.replace("WithoutHavingDoneSo1", "WithoutHavingDoneSo")
                parse_result = parse_result.replace("WithoutHavingDoneSo2", "WithoutHavingDoneSo")

                # TODO
                parse_result = parse_result.replace("Hastily", "Hastily+Pos")

                parse_result = parse_result.replace("Postp+PCNom", "Part")
                parse_result = parse_result.replace("Postp+PCDat", "Postp")
                parse_result = parse_result.replace("Postp+PCAcc", "Postp")
                parse_result = parse_result.replace("Postp+PCLoc", "Postp")
                parse_result = parse_result.replace("Postp+PCAbl", "Postp")
                parse_result = parse_result.replace("Postp+PCIns", "Postp")
                parse_result = parse_result.replace("Postp+PCGen", "Postp")

                if self.STATS_MODE:
                    try:
                        self.assert_parse_correct(word, index, parse_result)
                    except Exception:
                        unparsable += 1
                        logger.info("Unparsable : {} {} {}".format(index, word, parse_result))
                else:
                    self.assert_parse_correct(TurkishAlphabet.lower(word), index, parse_result)

                index += 1

        if self.STATS_MODE:
            logger.info("Finished simple parse set {}".format(path))
            logger.info("Found {} lines, with {} lines of comments".format(index, comment))
            logger.info("Skipped {}, unparsable {}".format(skipped, unparsable))
            logger.info("Words that should be parsable : {}".format(index - comment))
            logger.info(
                "Parse success rate : {}".format(float(index - comment - skipped - unparsable) / float(index - comment))
            )
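The chain of replace() calls above normalizes alternative tag spellings before the comparison. Purely as an illustrative refactoring sketch (not part of the test), the same normalization can be table-driven so the rules live in one place:

# Illustrative sketch; the pairs below copy the replacements from the test verbatim, in order.
PARSE_RESULT_NORMALIZATIONS = [
    ("Prog1", "Prog"), ("Prog2", "Prog"),
    ("Inf1", "Inf"), ("Inf2", "Inf"), ("Inf3", "Inf"),
    ("WithoutHavingDoneSo1", "WithoutHavingDoneSo"),
    ("WithoutHavingDoneSo2", "WithoutHavingDoneSo"),
    ("Hastily", "Hastily+Pos"),
    ("Postp+PCNom", "Part"),
    ("Postp+PCDat", "Postp"), ("Postp+PCAcc", "Postp"),
    ("Postp+PCLoc", "Postp"), ("Postp+PCAbl", "Postp"),
    ("Postp+PCIns", "Postp"), ("Postp+PCGen", "Postp"),
]

def normalize_parse_result(parse_result):
    for old, new in PARSE_RESULT_NORMALIZATIONS:
        parse_result = parse_result.replace(old, new)
    return parse_result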