def get_next_char(infile):
    """Yield the characters of ``infile`` one by one, splitting Hangul.

    Every line is decoded as UTF-8 (undecodable bytes are dropped).  A
    character accepted by ``hangul.ishangul`` is replaced by the non-empty
    components returned from ``hangul.split``; any other character is
    yielded unchanged.

    :param infile: iterable of byte strings (e.g. a file opened for reading)
    :return: generator of single characters / jamo
    """
    for rawline in infile:
        decoded_line = rawline.decode("utf-8", "ignore")
        for rawchar in decoded_line:
            if not hangul.ishangul(rawchar):
                yield rawchar
                continue
            # Hangul syllable: emit its parts, skipping empty slots
            # (presumably a missing final consonant -- confirm in hangul.split).
            for jamo in hangul.split(rawchar):
                if jamo == '':
                    continue
                yield jamo
def get_next_char(infile):
    """Yield one encoded symbol for every character read from ``infile``.

    Every line is decoded as UTF-8 (undecodable bytes are dropped).  For a
    character accepted by ``hangul.ishangul``, each non-empty component from
    ``hangul.split`` is passed through ``encode_jamo`` and yielded.  Any
    other character is re-encoded to UTF-8; if that takes more than one
    byte, or the byte value exceeds the upper bound of 179, a single space
    is yielded in its place.

    :param infile: iterable of byte strings (e.g. a file opened for reading)
    :return: generator of encoded symbols
    """
    char_upperbound = 179
    for rawline in infile:
        for rawchar in rawline.decode("utf-8", "ignore"):
            if not hangul.ishangul(rawchar):
                encoded = rawchar.encode("utf-8")
                fits_one_byte = len(encoded) <= 1
                # ord() is only evaluated for single-byte encodings.
                if not fits_one_byte or ord(encoded) > char_upperbound:
                    encoded = ' '
                yield encoded
                continue
            for jamo in hangul.split(rawchar):
                if jamo == '':
                    continue
                yield encode_jamo(jamo)
def decompose(uni_word):
    """
    break Hangul syllables of a word into their individual jamo
    :param  uni_word:  word, either a unicode object or a UTF-8 byte string
    :return:  UTF-8 encoded word in which every character accepted by
              hangul.ishangul is replaced by its non-empty hangul.split
              components, each looked up in _JONG_TO_CHO (unmapped
              components are kept as they are)
    """
    if isinstance(uni_word, str):
        uni_word = unicode(uni_word, "UTF-8")
    pieces = []
    for syllable in uni_word:
        if hangul.ishangul(syllable):
            for jamo in hangul.split(syllable):
                if jamo:
                    # presumably maps final-consonant forms to initial-consonant
                    # forms -- confirm against the _JONG_TO_CHO definition
                    pieces.append(_JONG_TO_CHO.get(jamo, jamo))
        else:
            pieces.append(syllable)
    joined = u"".join(pieces)
    return joined.encode("UTF-8")
def decompose(uni_word):
    """
    split every Hangul syllable of a word into consonant and vowel jamo
    :param  uni_word:  word (unicode object or UTF-8 byte string)
    :return:  decomposed word, encoded as UTF-8
    """
    def _expand(uni_char):
        # non-Hangul characters pass through as a one-element list
        if not hangul.ishangul(uni_char):
            return [uni_char]
        # drop empty components; translate the rest through _JONG_TO_CHO,
        # falling back to the component itself when it has no entry
        return [_JONG_TO_CHO.get(alpha, alpha)
                for alpha in hangul.split(uni_char) if alpha]

    if isinstance(uni_word, str):
        uni_word = unicode(uni_word, 'UTF-8')
    chars = [piece for uni_char in uni_word for piece in _expand(uni_char)]
    return u''.join(chars).encode('UTF-8')
def get_joined_char(infile):
    """Yield the items of ``infile`` with jamo runs re-joined into syllables.

    Each list produced by ``get_next_char_list(infile)`` is scanned left to
    right.  When the current item satisfies ``hangul.isJaeum`` and the next
    one satisfies ``hangul.isMoeum`` (presumably consonant followed by vowel
    -- confirm against the ``hangul`` module), they are combined with
    ``hangul.join``; a third item may be attached as the last component:

    * 2 items follow the current one: never attach a third component;
    * 3 items follow: attach the third item if it is a Jaeum;
    * 4 or more follow: attach the third item only if it is a Jaeum and the
      item after it is a Jaeum or not Hangul at all (otherwise the third
      item is left to start the next syllable).

    Every other item is yielded unchanged.  The last two items of a list
    are never joined; this looks designed for lists that end in a
    terminator such as a newline -- TODO confirm with get_next_char_list.

    :param infile: passed straight to ``get_next_char_list``
    :return: generator of joined syllables and pass-through items
    """
    for charlist in get_next_char_list(infile):
        i = 0
        while i < len(charlist):
            # Number of items that come after position ``i``.
            leftchar = len(charlist) - 1 - i

            # With at most one item following there is nothing to join.
            # ``leftchar == 0`` used to fall through to the general branch and
            # index ``charlist[i+1]``, raising IndexError whenever the very
            # last item was a Jaeum (e.g. input without a trailing newline).
            if leftchar <= 1 or not (hangul.isJaeum(charlist[i])
                                     and hangul.isMoeum(charlist[i+1])):
                yield charlist[i]
                i += 1
                continue

            # Decide whether the item at i+2 belongs to this syllable.
            if leftchar == 2:
                has_final = False
            elif leftchar == 3:
                has_final = hangul.isJaeum(charlist[i+2])
            else:
                has_final = hangul.isJaeum(charlist[i+2]) and \
                    (hangul.isJaeum(charlist[i+3])
                     or (not hangul.ishangul(charlist[i+3])))

            if has_final:
                yield hangul.join((charlist[i], charlist[i+1], charlist[i+2]))
                i += 3
            else:
                yield hangul.join((charlist[i], charlist[i+1], ''))
                i += 2
def print_aligned(fout, sent_pairs):
    """
    write one tab-separated feature line per morpheme, then a blank line
    :param  fout:  output file
    :param  sent_pairs:  list (one entry per word) of lists of
                         (surface, morphemes) pairs; each morpheme is read
                         through its ``lex`` and ``tag`` attributes

    Each line is ``<tag>\\t<feature>\\t<feature>...`` with these features:
    L_0/S_0 (current morpheme / surface), CIC, L-1 and PFC or BOS,
    L+1 or EOS, S-1, S+1, LSP, RSP.
    """
    def _utf8(text):
        # unicode objects are byte-encoded, anything else passes through
        return text.encode('UTF-8') if isinstance(text, unicode) else text

    for word_idx, pairs in enumerate(sent_pairs):
        # neighbouring words, used when a pair sits at a word boundary
        prev_word = None
        if word_idx > 0:
            prev_word = sent_pairs[word_idx-1]
        next_word = None
        if word_idx < len(sent_pairs)-1:
            next_word = sent_pairs[word_idx+1]

        for pair_idx, (surface, morphs) in enumerate(pairs):
            first_pair = pair_idx == 0
            last_pair = pair_idx == len(pairs)-1

            # neighbouring pairs may come from the adjacent word
            if first_pair:
                prev_pair = prev_word[-1] if prev_word else None
            else:
                prev_pair = pairs[pair_idx-1]
            if last_pair:
                next_pair = next_word[0] if next_word else None
            else:
                next_pair = pairs[pair_idx+1]

            surface = _utf8(surface)

            for morph_idx, morph in enumerate(morphs):
                first_morph = morph_idx == 0
                last_morph = morph_idx == len(morphs)-1

                # neighbouring morphemes may come from the adjacent pair
                if first_morph:
                    prev_morph = prev_pair[1][-1] if prev_pair else None
                else:
                    prev_morph = morphs[morph_idx-1]
                if last_morph:
                    next_morph = next_pair[1][0] if next_pair else None
                else:
                    next_morph = morphs[morph_idx+1]

                features = [
                    'L_0=%s' % morph.lex,    # current morpheme
                    'S_0=%s' % surface,    # current surface
                ]

                morph_uni = unicode(morph.lex, 'UTF-8')
                head_char = morph_uni[0]
                if hangul.ishangul(head_char):
                    initial_consonant = hangul.split(head_char)[0]
                    if initial_consonant and initial_consonant != u'ㅇ':
                        features.append('CIC')    # current initial consonant
                        logging.debug(u'Current Initial Consonant: (%s)%s',
                                      initial_consonant, morph_uni)

                if not prev_morph:
                    features.append('BOS')    # begin of sentence
                else:
                    features.append('L-1=%s' % prev_morph.lex)    # previous morpheme
                    prev_morph_uni = unicode(prev_morph.lex, 'UTF-8')
                    tail_char = prev_morph_uni[-1]
                    if hangul.ishangul(tail_char):
                        final_consonant = hangul.split(tail_char)[-1]
                        if final_consonant:
                            pfc = 'PFC'
                            if final_consonant == u'ㄹ':
                                pfc += '=ㄹ'
                            features.append(pfc)    # previous final consonant
                            logging.debug(u'Previous Final Consonant: %s(%s)',
                                          prev_morph_uni, final_consonant)

                if not next_morph:
                    features.append('EOS')    # end of sentence
                else:
                    features.append('L+1=%s' % next_morph.lex)    # next morpheme

                if prev_pair:
                    features.append('S-1=%s' % _utf8(prev_pair[0]))    # previous surface
                if next_pair:
                    features.append('S+1=%s' % _utf8(next_pair[0]))    # next surface

                if first_pair and first_morph:
                    features.append('LSP')    # left space (space at left)
                if last_pair and last_morph:
                    features.append('RSP')    # right space (space at right)

                line = '%s\t%s' % (morph.tag, '\t'.join(features))
                print >> fout, line
    # blank line after the last morpheme line
    print >> fout
def convertToEng(s):
    """Convert Hangul text into Latin letters, jamo by jamo.

    Every character of ``s`` accepted by ``hangul.ishangul`` is split with
    ``hangul.split`` and each non-empty component is replaced by its entry
    in the table below.  The values match the Latin keys of the Korean
    two-set (Dubeolsik) keyboard layout -- compound jamo are written as the
    keys of their parts, e.g. RIEUL-KIYEOK -> ``fr``.  Characters that are
    not Hangul are copied through unchanged; components without a table
    entry are skipped.

    :param s: iterable of unicode characters
    :return: list of unicode strings, one per emitted jamo or
             pass-through character (the pieces are not joined)
    """
    phone = {
        # jaum (consonants), commented with their Unicode jamo names
        u'\u3131': u'r',    # KIYEOK
        u'\u3132': u'R',    # SSANGKIYEOK
        u'\u3133': u'rt',   # KIYEOK-SIOS
        u'\u3134': u's',    # NIEUN
        u'\u3135': u'sw',   # NIEUN-CIEUC
        u'\u3136': u'sg',   # NIEUN-HIEUH
        u'\u3137': u'e',    # TIKEUT
        u'\u3138': u'E',    # SSANGTIKEUT
        u'\u3139': u'f',    # RIEUL
        u'\u313a': u'fr',   # RIEUL-KIYEOK
        # Was u'fv', which duplicated RIEUL-PHIEUPH below; RIEUL (f) +
        # MIEUM (a) is u'fa', consistent with every other cluster here.
        u'\u313b': u'fa',   # RIEUL-MIEUM
        u'\u313c': u'fq',   # RIEUL-PIEUP
        u'\u313d': u'ft',   # RIEUL-SIOS
        u'\u313e': u'fx',   # RIEUL-THIEUTH
        u'\u313f': u'fv',   # RIEUL-PHIEUPH
        u'\u3140': u'fg',   # RIEUL-HIEUH
        u'\u3141': u'a',    # MIEUM
        u'\u3142': u'q',    # PIEUP
        u'\u3143': u'Q',    # SSANGPIEUP
        u'\u3144': u'qt',   # PIEUP-SIOS
        u'\u3145': u't',    # SIOS
        u'\u3146': u'T',    # SSANGSIOS
        u'\u3147': u'd',    # IEUNG
        u'\u3148': u'w',    # CIEUC
        u'\u3149': u'W',    # SSANGCIEUC
        u'\u314a': u'c',    # CHIEUCH
        u'\u314b': u'z',    # KHIEUKH
        u'\u314c': u'x',    # THIEUTH
        u'\u314d': u'v',    # PHIEUPH
        u'\u314e': u'g',    # HIEUH
        # moum (vowels)
        u'\u314f': u'k',    # A
        u'\u3150': u'o',    # AE
        u'\u3151': u'i',    # YA
        u'\u3152': u'O',    # YAE
        u'\u3153': u'j',    # EO
        u'\u3154': u'p',    # E
        u'\u3155': u'u',    # YEO
        u'\u3156': u'P',    # YE
        u'\u3157': u'h',    # O
        u'\u3158': u'hk',   # WA
        u'\u3159': u'ho',   # WAE
        u'\u315a': u'hl',   # OE
        u'\u315b': u'y',    # YO
        u'\u315c': u'n',    # U
        u'\u315d': u'nj',   # WEO
        u'\u315e': u'np',   # WE
        u'\u315f': u'nl',   # WI
        u'\u3160': u'b',    # YU
        u'\u3161': u'm',    # EU
        u'\u3162': u'ml',   # YI
        u'\u3163': u'l',    # I
    }
    result = []
    for char in s:
        if not hangul.ishangul(char):
            result.append(char)
            continue
        for jamo in hangul.split(char):
            if jamo == u'':
                continue
            # An initial IEUNG is a real keystroke ('d'), so unlike a
            # pronunciation table nothing is dropped by position here.  The
            # former "phone[g] == 'ng'" test could never match a value of
            # this table and only risked a KeyError on unmapped input.
            key = phone.get(jamo)
            if key is not None:
                result.append(key)
    return result
def getPhone(s):
    """Convert Hangul text into a list of romanized phone symbols.

    Every character of ``s`` accepted by ``hangul.ishangul`` is split with
    ``hangul.split`` and each non-empty component is replaced by its entry
    in the table below.  An IEUNG (``ng``) in the first position of a
    syllable is dropped, because it is silent there.  Characters that are
    not Hangul are copied through unchanged; components without a table
    entry are skipped.

    :param s: iterable of unicode characters
    :return: list of unicode strings, one per emitted phone or
             pass-through character (the pieces are not joined)
    """
    phone = {
        # jaum (consonants), commented with their Unicode jamo names
        u'\u3131': u'g',    # KIYEOK
        u'\u3132': u'gg',   # SSANGKIYEOK
        u'\u3133': u'gs',   # KIYEOK-SIOS
        u'\u3134': u'n',    # NIEUN
        u'\u3135': u'nj',   # NIEUN-CIEUC
        u'\u3136': u'nh',   # NIEUN-HIEUH
        u'\u3137': u'd',    # TIKEUT
        u'\u3138': u'dd',   # SSANGTIKEUT
        u'\u3139': u'r',    # RIEUL
        u'\u313a': u'lg',   # RIEUL-KIYEOK
        u'\u313b': u'lm',   # RIEUL-MIEUM
        u'\u313c': u'lb',   # RIEUL-PIEUP
        u'\u313d': u'ls',   # RIEUL-SIOS
        u'\u313e': u'lt',   # RIEUL-THIEUTH
        u'\u313f': u'lp',   # RIEUL-PHIEUPH
        u'\u3140': u'lh',   # RIEUL-HIEUH
        u'\u3141': u'm',    # MIEUM
        u'\u3142': u'b',    # PIEUP
        u'\u3143': u'bb',   # SSANGPIEUP
        u'\u3144': u'bs',   # PIEUP-SIOS
        u'\u3145': u's',    # SIOS
        u'\u3146': u'ss',   # SSANGSIOS
        u'\u3147': u'ng',   # IEUNG
        u'\u3148': u'j',    # CIEUC
        u'\u3149': u'jj',   # SSANGCIEUC
        u'\u314a': u'ch',   # CHIEUCH
        u'\u314b': u'k',    # KHIEUKH
        u'\u314c': u't',    # THIEUTH
        u'\u314d': u'p',    # PHIEUPH
        u'\u314e': u'h',    # HIEUH
        # moum (vowels)
        u'\u314f': u'a',    # A
        u'\u3150': u'ae',   # AE
        u'\u3151': u'ya',   # YA
        u'\u3152': u'yae',  # YAE
        u'\u3153': u'eo',   # EO
        u'\u3154': u'e',    # E
        u'\u3155': u'yeo',  # YEO
        u'\u3156': u'ye',   # YE
        u'\u3157': u'o',    # O
        u'\u3158': u'wa',   # WA
        u'\u3159': u'wae',  # WAE
        u'\u315a': u'oe',   # OE
        u'\u315b': u'yo',   # YO
        u'\u315c': u'u',    # U
        u'\u315d': u'weo',  # WEO
        u'\u315e': u'we',   # WE
        u'\u315f': u'wi',   # WI
        u'\u3160': u'yu',   # YU
        u'\u3161': u'eu',   # EU
        u'\u3162': u'yi',   # YI
        u'\u3163': u'i',    # I
    }
    result = []
    for char in s:
        if not hangul.ishangul(char):
            result.append(char)
            continue
        for position, jamo in enumerate(hangul.split(char)):
            if jamo == u'':
                continue
            # Look the jamo up once, before testing its value: the old code
            # subscripted the table ahead of its membership check and raised
            # KeyError for an unmapped jamo in first position.
            ph = phone.get(jamo)
            if ph is None:
                continue
            # An IEUNG in the initial position is silent, so it is dropped.
            if position == 0 and ph == 'ng':
                continue
            result.append(ph)
    return result