def split_ssangbatchim(self, char):
        """Look up the components of a double final consonant.

        ``char`` is first tried as a key of the ``ssangbatchim`` table. When
        it is not a key, the final consonant extracted from it with
        ``kr.get_final`` is looked up instead.

        :param char: char; a final consonant, or a syllable containing one
        :return: the table entry for the double consonant (presumably a tuple
            of its two parts -- confirm against the table), or None when the
            table has no matching entry"""
        table = self.__hangul['ssangbatchim']
        # Anything that is not itself a table key is treated as a syllable.
        key = char if char in table else kr.get_final(char)
        return table.get(key)
    def has_ssangbatchim(self, syllable):
        """Tell whether a syllable ends in a double batchim.

        :param syllable: char; a syllable
        :return: False when ``kr.is_hangul`` rejects the input; otherwise
            whatever ``self.is_ssangbatchim`` reports for the syllable's
            final consonant"""
        if not kr.is_hangul(syllable):
            return False
        final = kr.get_final(syllable)
        return self.is_ssangbatchim(final)
    def __get_final_prior(self, syllables, i):
        """Return the batchim of the syllable that precedes position ``i``.

        :param syllables: list; syllables in Hangul
        :param i: int; the current index of the syllable in the sentence
        :return: the final consonant of ``syllables[i - 1]`` as given by
            ``kr.get_final``, or '' when there is no previous syllable, it is
            empty, or it is not Hangul"""
        if i <= 0:
            # First position: nothing comes before it.
            return ''

        previous = syllables[i - 1]
        if previous == '' or not kr.is_hangul(previous):
            return ''

        return kr.get_final(previous)
Example #4
0
File: josa.py Project: sublee/josa
def has_jongseong(word, lang='eng'):
    """Tell whether the reading of ``word`` ends in a final consonant.

    :param word: the word to inspect
    :param lang: language code; 'kor' wraps the word in ``Noun``, any other
        value wraps it in ``Loanword`` with that code ('eng' is passed on
        as 'nld')
    :return: bool; truthiness of ``hangul.get_final`` for the last character
        of ``word.read()``
    :raises ValueError: when an IndexError occurs while taking the last
        character of the reading (e.g. the reading is empty)
    """
    if lang == 'kor':
        word = Noun(word)
    else:
        # NOTE(review): 'eng' is remapped to 'nld'; the reason is not visible
        # in this file -- confirm before changing.
        word = Loanword(unicode(word), 'nld' if lang == 'eng' else lang)

    try:
        last_char = word.read()[-1]
        final = hangul.get_final(last_char)
        return bool(final)
    except IndexError:
        raise ValueError
    def batchim(self, syllable, next=''):
        """Convert a final consonant to latin script following grammatical rules

        The romanization depends on the initial consonant of the following
        syllable. When no special rule applies, the plain entry of the
        ``batchim`` table is returned.

        :param syllable: char; a hangul syllable
        :param next: char; the initial consonant of the next syllable, if it
            exists ('' otherwise)
        :return: the romanized final consonant; '' when the syllable has no
            final consonant or the consonant is dropped by a rule"""
        final = kr.get_final(syllable)
        if final == '':
            return ''

        if next == 'ㅇ':
            # ㅇ is handled first because it's a special case: any final other
            # than ㅇ itself is rendered through the jaeum table.
            if final != 'ㅇ' and final in self.__hangul['jaeum']:
                return self.jaeum(final)
        elif next != '':
            if self.is_ssangbatchim(final):
                # Double final: romanize its first component on its own.
                first_part = self.split_ssangbatchim(final)[0]
                return self.batchim(first_part)

            if next in ('ㄴ', 'ㅁ'):
                if final == 'ㅂ':
                    return self.jaeum('ㅁ')
            elif next in ('ㄱ', 'ㄷ', 'ㅂ'):
                if final == 'ㅎ':
                    return ''
            elif next == 'ㄹ':
                if final in ('ㄴ', 'ㄹ'):
                    return self.jaeum('ㄹㄹ')
            elif next == 'ㅎ':
                aspirated = {'ㄱ': 'ㅋ', 'ㄷ': 'ㅌ', 'ㅂ': 'ㅍ'}
                if final in aspirated:
                    return self.jaeum(aspirated[final])

        # No contextual rule matched: fall back to the plain batchim table.
        return self.__hangul['batchim'].get(final)
Example #6
0
    def search_particle(self, i, branch, nextbranch, trunc=0):
        """Collect grammar links whose text matches the particle at ``branch[i]``.

        Two regular expressions are built from the particle and its context
        and passed to ``self.supplementary_matches_search`` and
        ``self.matches_search``; their results are concatenated in that order.

        Each morph is indexed as ``morph[0]`` (text) and ``morph[1]`` (tag);
        a tag whose first character is "S" is treated as a symbol.

        :param i: int; index of the particle within ``branch``
        :param branch: sequence of morphs containing the particle
        :param nextbranch: the following branch, or None/empty when there is
            none; its first morph's text must follow the particle unless a
            symbol was hit inside ``branch``
        :param trunc: int; number of leading characters to cut off the
            particle (0 disables truncation). Used as a backup lookup.
        :return: list of grammar links; empty when truncation is requested
            but the particle is too short or what remains is an obvious
            ending
        """
        # Particle we're focusing on here.
        particle = branch[i][0]

        if trunc:
            if len(particle) <= trunc:
                # Too short to truncate; this mode is only a backup, so skip.
                return []
            # The preceding character is the last one of the removed head, so
            # it must be read BEFORE the particle is shortened.
            preceeding_char = particle[trunc - 1]
            particle = particle[trunc:]
            # When truncating, skip any obvious endings.
            if particle in ("요", "은", "을", "는", "를", "다"):
                return []
        elif (i > 0 and len(branch[i - 1]) > 1 and len(branch[i - 1][1]) > 0
                and branch[i - 1][1][0] != "S"):
            # Normal case: last character of the preceding non-symbol morph.
            preceeding_char = branch[i - 1][0][-1]
        else:
            preceeding_char = ""

        # Final element of the preceding character.
        preceeding_char_final = hangul.get_final(preceeding_char)

        # Take the rest of the branch up to the first symbol, or to the end.
        hit_a_symbol = False
        stop = len(branch)
        for j in range(i + 1, len(branch)):
            morph = branch[j]
            if len(morph) > 1 and len(morph[1]) > 0 and morph[1][0] == "S":
                hit_a_symbol = True
                stop = j
                break
        remaining_morphs = branch[i + 1:stop]
        remaining_morphs_text = "".join(
            morph[0] for morph in remaining_morphs if morph)

        # The next branch only constrains the match when no symbol intervenes.
        nextmorph_text = ""
        if not hit_a_symbol and nextbranch and len(nextbranch[0]) > 0:
            nextmorph_text = nextbranch[0][0]

        pattern = r'^\s*'
        # Pattern for supplementary links.
        suppattern = '^'

        if preceeding_char != "":
            pattern += r'[-~]?\s*'
            suppattern += r'[-~]?\s*'
            if preceeding_char_final != '':
                # Optionally allow the preceding final consonant to be
                # written out before the particle.
                pattern += '(?:' + re.escape(preceeding_char_final) + r'\s*)?'

        # Optional single bracketed character before the particle.
        pattern += r'(?:\(\w\))?\s*'
        suppattern += r'(?:\(\w\))?\s*'

        escaped_particle = re.escape(particle)
        pattern += escaped_particle
        suppattern += escaped_particle

        if remaining_morphs_text != "":
            # Optionally match the remaining morphs, with or without brackets.
            remaining = r'(?:\(?' + re.escape(remaining_morphs_text) + r'\)?)?'
            pattern += remaining
            suppattern += remaining

        # Bracketed trailing characters and an optional digit.
        pattern += r'(?:\([요는를은을요이]\))?[0-9]?(?:\s+|$)'

        # To also accept trailing English text, add `|[a-zA-Z0-9].*$|` to the
        # alternatives below.
        if nextmorph_text != "":
            # Must also match the next word (or end of text).
            pattern += '(?:' + re.escape(nextmorph_text) + r'.*$|\s*$)'
        else:
            # No later word, so the match must reach the end.
            pattern += r'\s*$'
        suppattern += '$'

        grammarlinks = []
        grammarlinks.extend(self.supplementary_matches_search(suppattern))
        grammarlinks.extend(self.matches_search(pattern))
        return grammarlinks