def split_ssangbatchim(self, char):
    """Look up the components of a double final consonant.

    :param char: char; either a key of the ``ssangbatchim`` table itself
        or a syllable whose final (as given by ``kr.get_final``) is used
        as the key
    :return: the value stored in the ``ssangbatchim`` table for the
        resolved key (the original documentation describes it as a
        tuple), or None when the key is not in the table
    """
    doubles = self.__hangul['ssangbatchim']
    # Use the argument directly when it is already a table key; otherwise
    # fall back to the final letter extracted from it.
    key = char if char in doubles else kr.get_final(char)
    return doubles.get(key)
def has_ssangbatchim(self, syllable):
    """Tell whether a syllable ends in a double batchim.

    :param syllable: char; a syllable
    :return: False when ``kr.is_hangul`` rejects the input; otherwise the
        result of ``self.is_ssangbatchim`` applied to the syllable's final
    """
    if not kr.is_hangul(syllable):
        return False
    return self.is_ssangbatchim(kr.get_final(syllable))
def __get_final_prior(self, syllables, i):
    """Return the batchim of the syllable preceding position ``i``.

    :param syllables: list; syllables in Hangul
    :param i: int; the current index of the syllable in the sentence
    :return: the final of ``syllables[i - 1]`` as given by
        ``kr.get_final``, or '' when there is no previous syllable, the
        previous entry is empty, or it is not Hangul
    """
    # The first syllable has nothing before it.
    if i <= 0:
        return ''
    previous = syllables[i - 1]
    if previous == '' or not kr.is_hangul(previous):
        return ''
    return kr.get_final(previous)
def has_jongseong(word, lang='eng'):
    """Report whether the last character of the word's reading has a final.

    :param word: the word to inspect
    :param lang: str; 'kor' wraps the word in ``Noun``; any other value
        wraps it in ``Loanword`` (with 'eng' passed on as 'nld')
    :return: bool; truthiness of ``hangul.get_final`` for the last element
        of ``read()``
    :raises ValueError: when an IndexError occurs while doing so (e.g. the
        reading is empty)
    """
    if lang == 'kor':
        reader = Noun(word)
    else:
        # 'eng' is mapped to 'nld' before being handed to Loanword.
        source_lang = 'nld' if lang == 'eng' else lang
        # NOTE(review): `unicode` is the Python 2 builtin -- confirm the
        # target interpreter before porting.
        reader = Loanword(unicode(word), source_lang)
    try:
        last_char = reader.read()[-1]
        final = hangul.get_final(last_char)
        return bool(final)
    except IndexError:
        raise ValueError
def batchim(self, syllable, next=''):
    """Convert a final consonant to latin script following grammatical rules.

    The final of ``syllable`` is looked up in the ``batchim`` table unless
    one of the context rules below, driven by ``next``, applies first.

    :param syllable: char; a hangul syllable
    :param next: char; the initial consonant of the next syllable, if it
        exists ('' otherwise)
    :return: '' when the syllable has no final or the final is dropped;
        otherwise the result of ``self.jaeum`` for the adjusted consonant,
        or the ``batchim`` table entry for the final (None if absent)
    """
    final = kr.get_final(syllable)
    if final == '':
        return ''

    if next == 'ㅇ':
        # ㅇ comes first because it's a special case. A final ㅇ falls
        # through to the plain table lookup at the bottom.
        if final != 'ㅇ':
            if final in self.__hangul['jaeum']:
                return self.jaeum(final)
            if self.is_ssangbatchim(final):
                # Only the first component of the double final is used.
                return self.batchim(self.split_ssangbatchim(final)[0])
    elif next in ('ㄴ', 'ㅁ'):
        if final == 'ㅂ':
            return self.jaeum('ㅁ')
    elif next in ('ㄱ', 'ㄷ', 'ㅂ'):
        if final == 'ㅎ':
            return ''
    elif next == 'ㄹ':
        if final in ('ㄴ', 'ㄹ'):
            return self.jaeum('ㄹㄹ')
    elif next == 'ㅎ':
        replacements = {'ㄱ': 'ㅋ', 'ㄷ': 'ㅌ', 'ㅂ': 'ㅍ'}
        if final in replacements:
            return self.jaeum(replacements[final])

    # No context rule applied: use the default table.
    return self.__hangul['batchim'].get(final)
def search_particle(self, i, branch, nextbranch, trunc=0):
    """Search for grammar links matching the particle at ``branch[i]``.

    Two regular expressions are assembled from the particle text, the
    character that precedes it, the morphemes that follow it in the same
    branch (up to the first entry whose second element starts with "S",
    which the code treats as a symbol) and, when no symbol was hit, the
    first morpheme of ``nextbranch``. The results of
    ``self.supplementary_matches_search`` and ``self.matches_search`` are
    concatenated, in that order.

    :param i: int; index of the particle within ``branch``
    :param branch: sequence of morphemes; each entry is indexed as
        ``entry[0]`` (text) and ``entry[1]`` (presumably a tag -- confirm
        against the caller)
    :param nextbranch: the following branch, or None/empty if there is none
    :param trunc: int; number of leading characters to cut from the
        particle before searching (0 means no truncation)
    :return: list; combined search results, or [] when the particle is too
        short to truncate or the truncated particle is an obvious ending
    """
    particle = branch[i][0]

    if trunc:
        if len(particle) <= trunc:
            # Too short to truncate; this mode is only used as a backup.
            return []
        # Read the character just before the cut BEFORE truncating:
        # indexing the truncated string would pick the wrong character
        # (or run past its end when len(particle) < 2 * trunc).
        preceeding_char = particle[trunc - 1]
        particle = particle[trunc:]
        # When truncating, skip any obvious endings.
        if particle in ("요", "은", "을", "는", "를", "다"):
            return []
    else:
        # Normal case: take the last character of the previous morpheme,
        # unless there is none or it is a symbol.
        if (i > 0 and len(branch[i - 1]) > 1 and len(branch[i - 1][1]) > 0
                and branch[i - 1][1][0] != "S"):
            preceeding_char = branch[i - 1][0][-1]
        else:
            preceeding_char = ""

    preceeding_char_final = hangul.get_final(preceeding_char)

    # Collect the rest of the branch up to a symbol or the end.
    hit_a_symbol = False
    last_morph_index = -1
    for j in range(i + 1, len(branch)):
        morph = branch[j]
        if len(morph) > 1 and len(morph[1]) > 0 and morph[1][0] == "S":
            hit_a_symbol = True
            last_morph_index = j - 1
            break
        last_morph_index = j

    remaining_morphs = []
    if last_morph_index >= i + 1:
        remaining_morphs = branch[i + 1: last_morph_index + 1]
    remaining_morphs_text = "".join(
        morph[0] for morph in remaining_morphs if morph)

    # The next branch only matters when this one ran to its end.
    nextmorph_text = ""
    if not hit_a_symbol and nextbranch and len(nextbranch[0]) > 0:
        nextmorph_text = nextbranch[0][0]

    pattern = r'^\s*'
    # pattern for supplementary links
    suppattern = '^'
    if preceeding_char != "":
        pattern += r'[-~]?\s*'
        suppattern += r'[-~]?\s*'
    if preceeding_char_final != '':
        pattern += '(?:' + re.escape(preceeding_char_final) + r'\s*)?'

    # Optional single bracketed character before the particle itself.
    bracketed_char = r'(?:\(\w\))?\s*'
    escaped_particle = re.escape(particle)
    pattern += bracketed_char + escaped_particle
    suppattern += bracketed_char + escaped_particle

    if remaining_morphs_text != "":
        # Optionally match the remaining morphemes, bracketed or not.
        remaining = r'(?:\(?' + re.escape(remaining_morphs_text) + r'\)?)?'
        pattern += remaining
        suppattern += remaining

    # matching bracketed trailing characters
    pattern += r'(?:\([요는를은을요이]\))?[0-9]?(?:\s+|$)'
    # To also accept trailing English text, add the alternative
    # |[a-zA-Z0-9].*$| to the group/anchor appended below.
    if nextmorph_text != "":
        # then must also match next word
        pattern += '(?:' + re.escape(nextmorph_text) + r'.*$|\s*$)'
    else:
        # then must match end as no later word
        pattern += r'\s*$'
    suppattern += '$'

    grammarlinks = []
    grammarlinks.extend(self.supplementary_matches_search(suppattern))
    grammarlinks.extend(self.matches_search(pattern))
    return grammarlinks