def get_next_char(infile):
    """Yield the characters of a byte-line stream, splitting Hangul into jamo.

    Every line of ``infile`` is decoded as UTF-8 (undecodable bytes are
    dropped).  A character for which ``hangul.ishangul`` is true is replaced
    by the non-empty parts returned by ``hangul.split``; any other character
    is yielded unchanged.

    :param infile: iterable of byte strings (e.g. an open file)
    :return:       generator of characters / jamo
    """
    for rawline in infile:
        decoded = rawline.decode("utf-8", "ignore")
        for symbol in decoded:
            if not hangul.ishangul(symbol):
                yield symbol
                continue
            for part in hangul.split(symbol):
                # split() pads missing positions with '' -- skip those
                if part != '':
                    yield part
def get_next_char(infile):
    """Yield an encoded symbol stream for the byte lines of ``infile``.

    Each line is decoded as UTF-8 (undecodable bytes are dropped).  Hangul
    characters are split with ``hangul.split`` and every non-empty part is
    passed through ``encode_jamo``.  Any other character is re-encoded to
    UTF-8 and yielded as-is when it is a single byte whose value does not
    exceed 179; otherwise a single space is yielded in its place.

    :param infile: iterable of byte strings (e.g. an open file)
    :return:       generator of encoded jamo / single-byte characters
    """
    char_upperbound = 179

    for rawline in infile:
        decoded = rawline.decode("utf-8", "ignore")
        for symbol in decoded:
            if not hangul.ishangul(symbol):
                encoded = symbol.encode("utf-8")
                # multi-byte characters (and bytes above the bound) collapse
                # to a blank
                if (len(encoded) > 1) or (ord(encoded) > char_upperbound):
                    yield ' '
                else:
                    yield encoded
                continue
            for part in hangul.split(symbol):
                if part != '':
                    yield encode_jamo(part)
Beispiel #3
0
def decompose(uni_word):
    """
  Break the Hangul syllables of a word into their individual jamo.

  Non-Hangul characters are kept unchanged.  Empty parts returned by
  ``hangul.split`` are dropped, and every remaining part that has an entry in
  ``_JONG_TO_CHO`` is replaced by the mapped value.

  :param    uni_word:  word (a ``str`` is first decoded as UTF-8)
  :return:             decomposed word, encoded as UTF-8
  """
    if isinstance(uni_word, str):
        uni_word = unicode(uni_word, "UTF-8")
    pieces = []
    for symbol in uni_word:
        if hangul.ishangul(symbol):
            for jamo in hangul.split(symbol):
                if jamo:
                    pieces.append(_JONG_TO_CHO.get(jamo, jamo))
        else:
            pieces.append(symbol)
    joined = u"".join(pieces)
    return joined.encode("UTF-8")
Beispiel #4
0
def decompose(uni_word):
    """
  Split each Hangul syllable of a word into consonant and vowel jamo.

  Characters that are not Hangul pass through untouched; jamo present in
  ``_JONG_TO_CHO`` are substituted by their mapped value.

  :param    uni_word:  word (a ``str`` is first decoded as UTF-8)
  :return:             decomposed word, encoded as UTF-8
  """
    if isinstance(uni_word, str):
        uni_word = unicode(uni_word, 'UTF-8')

    def _expand(symbol):
        # one input character -> list of output characters
        if not hangul.ishangul(symbol):
            return [symbol]
        return [
            _JONG_TO_CHO.get(jamo, jamo)
            for jamo in hangul.split(symbol) if jamo
        ]

    flattened = [part for symbol in uni_word for part in _expand(symbol)]
    return (u''.join(flattened)).encode('UTF-8')
def get_joined_char(infile):
    """Re-assemble jamo sequences into Hangul syllables, one item at a time.

    Every list produced by ``get_next_char_list(infile)`` is scanned from left
    to right.  At each position the generator yields either

    * ``hangul.join((consonant, vowel, final))`` when a consonant
      (``hangul.isJaeum``) is followed by a vowel (``hangul.isMoeum``), or
    * the element itself when no syllable starts there.

    A third element is taken as the final consonant only if it is a consonant
    and is not itself followed by something that could make it the start of
    the next syllable (see the look-ahead rules in the body).

    The last element of a list is never consumed as a vowel or as a final
    consonant -- presumably it is a line terminator; confirm against
    ``get_next_char_list``.

    :param infile: passed through unchanged to ``get_next_char_list``
    :return:       generator of joined syllables / untouched elements
    """
    for charlist in get_next_char_list(infile):
        i = 0
        while i < len(charlist):
            # number of elements that come after position i
            leftchar = len(charlist) - 1 - i

            # With fewer than two followers no syllable is attempted.  This
            # also covers leftchar == 0, which previously fell through to the
            # generic branch and indexed charlist[i+1] past the end of the
            # list whenever the last element was a consonant.
            if leftchar < 2 or not (hangul.isJaeum(charlist[i])
                                    and hangul.isMoeum(charlist[i+1])):
                yield charlist[i]
                i += 1
                continue

            if leftchar == 2:
                # the only element after the vowel is the last one: no final
                has_final = False
            elif leftchar == 3:
                # nothing usable follows i+2, so no further look-ahead
                has_final = hangul.isJaeum(charlist[i+2])
            else:
                # i+2 is a final consonant only if i+3 is another consonant
                # or is not Hangul at all
                has_final = hangul.isJaeum(charlist[i+2]) and \
                    (hangul.isJaeum(charlist[i+3]) or (not hangul.ishangul(charlist[i+3])))

            if has_final:
                yield hangul.join((charlist[i], charlist[i+1], charlist[i+2]))
                i += 3
            else:
                yield hangul.join((charlist[i], charlist[i+1], ''))
                i += 2
Beispiel #6
0
def print_aligned(fout, sent_pairs):
  """
  Write one tab-separated line per morpheme of a sentence, then a blank line.

  Each line is ``<morph.tag>`` followed by these features, in this order:
  ``L_0`` / ``S_0`` (current morpheme / surface), ``CIC`` (the morpheme starts
  with a Hangul syllable whose initial consonant is non-empty and not ieung),
  ``L-1`` plus optional ``PFC`` (previous morpheme / it ends in a final
  consonant) or ``BOS``, ``L+1`` or ``EOS``, ``S-1``, ``S+1``, ``LSP`` (first
  morpheme of the word) and ``RSP`` (last morpheme of the word).

  :param  fout:        output file
  :param  sent_pairs:  list (one entry per word) of lists of
                       (surface, morphemes) pairs; morphemes expose ``lex``
                       (UTF-8 byte string) and ``tag``
  """
  for word_idx, pairs in enumerate(sent_pairs):
    # neighbouring words; None at the sentence boundaries
    prev_word = None
    if word_idx > 0:
      prev_word = sent_pairs[word_idx-1]
    next_word = None
    if word_idx < len(sent_pairs)-1:
      next_word = sent_pairs[word_idx+1]

    for pair_idx, (surface, morphs) in enumerate(pairs):
      # neighbouring pairs, reaching into the adjacent word at the edges
      if pair_idx > 0:
        prev_pair = pairs[pair_idx-1]
      elif prev_word:
        prev_pair = prev_word[-1]
      else:
        prev_pair = None
      if pair_idx < len(pairs)-1:
        next_pair = pairs[pair_idx+1]
      elif next_word:
        next_pair = next_word[0]
      else:
        next_pair = None

      if isinstance(surface, unicode):
        surface = surface.encode('UTF-8')

      for morph_idx, morph in enumerate(morphs):
        # neighbouring morphemes, reaching into the adjacent pair at the edges
        if morph_idx > 0:
          prev_morph = morphs[morph_idx-1]
        elif prev_pair:
          prev_morph = prev_pair[1][-1]
        else:
          prev_morph = None
        if morph_idx < len(morphs)-1:
          next_morph = morphs[morph_idx+1]
        elif next_pair:
          next_morph = next_pair[1][0]
        else:
          next_morph = None

        # current morpheme and current surface
        features = ['L_0=%s' % morph.lex, 'S_0=%s' % surface]

        morph_uni = unicode(morph.lex, 'UTF-8')
        head_char = morph_uni[0]
        if hangul.ishangul(head_char):
          initial_consonant = hangul.split(head_char)[0]
          if initial_consonant and initial_consonant != u'ㅇ':
            features.append('CIC')    # current initial consonant
            logging.debug(u'Current Initial Consonant: (%s)%s', initial_consonant, morph_uni)

        if not prev_morph:
          features.append('BOS')   # begin of sentence
        else:
          features.append('L-1=%s' % prev_morph.lex)    # previous morpheme
          prev_morph_uni = unicode(prev_morph.lex, 'UTF-8')
          tail_char = prev_morph_uni[-1]
          if hangul.ishangul(tail_char):
            final_consonant = hangul.split(tail_char)[-1]
            if final_consonant:
              # previous final consonant; rieul gets its own feature value
              pfc = 'PFC=ㄹ' if final_consonant == u'ㄹ' else 'PFC'
              features.append(pfc)
              logging.debug(u'Previous Final Consonant: %s(%s)', prev_morph_uni, final_consonant)

        if not next_morph:
          features.append('EOS')    # end of sentence
        else:
          features.append('L+1=%s' % next_morph.lex)    # next morpheme

        # previous / next surface
        for label, neighbour in (('S-1', prev_pair), ('S+1', next_pair)):
          if neighbour:
            neighbour_surface = neighbour[0]
            if isinstance(neighbour_surface, unicode):
              neighbour_surface = neighbour_surface.encode('UTF-8')
            features.append('%s=%s' % (label, neighbour_surface))

        at_word_start = pair_idx == 0 and morph_idx == 0
        at_word_end = pair_idx == len(pairs)-1 and morph_idx == len(morphs)-1
        if at_word_start:
          features.append('LSP')    # left space (space at left)
        if at_word_end:
          features.append('RSP')    # right space (space at right)

        print >> fout, '%s\t%s' % (morph.tag, '\t'.join(features))
  print >> fout
# Hangul compatibility jamo -> Latin key(s) that type it.  The values look like
# the standard two-set (Dubeolsik) keyboard layout; every consonant cluster and
# compound vowel is the concatenation of the keys of its parts.
_JAMO_TO_ENG_KEYS = {
    # jaum (consonants)
    u'\u3131': u'r',    # kiyeok
    u'\u3132': u'R',    # ssangkiyeok
    u'\u3133': u'rt',   # kiyeok-sios
    u'\u3134': u's',    # nieun
    u'\u3135': u'sw',   # nieun-cieuc
    u'\u3136': u'sg',   # nieun-hieuh
    u'\u3137': u'e',    # tikeut
    u'\u3138': u'E',    # ssangtikeut
    u'\u3139': u'f',    # rieul
    u'\u313a': u'fr',   # rieul-kiyeok
    u'\u313b': u'fa',   # rieul-mieum (was 'fv', a duplicate of rieul-phieuph)
    u'\u313c': u'fq',   # rieul-pieup
    u'\u313d': u'ft',   # rieul-sios
    u'\u313e': u'fx',   # rieul-thieuth
    u'\u313f': u'fv',   # rieul-phieuph
    u'\u3140': u'fg',   # rieul-hieuh
    u'\u3141': u'a',    # mieum
    u'\u3142': u'q',    # pieup
    u'\u3143': u'Q',    # ssangpieup
    u'\u3144': u'qt',   # pieup-sios
    u'\u3145': u't',    # sios
    u'\u3146': u'T',    # ssangsios
    u'\u3147': u'd',    # ieung
    u'\u3148': u'w',    # cieuc
    u'\u3149': u'W',    # ssangcieuc
    u'\u314a': u'c',    # chieuch
    u'\u314b': u'z',    # khieukh
    u'\u314c': u'x',    # thieuth
    u'\u314d': u'v',    # phieuph
    u'\u314e': u'g',    # hieuh

    # moum (vowels)
    u'\u314f': u'k',    # a
    u'\u3150': u'o',    # ae
    u'\u3151': u'i',    # ya
    u'\u3152': u'O',    # yae
    u'\u3153': u'j',    # eo
    u'\u3154': u'p',    # e
    u'\u3155': u'u',    # yeo
    u'\u3156': u'P',    # ye
    u'\u3157': u'h',    # o
    u'\u3158': u'hk',   # wa
    u'\u3159': u'ho',   # wae
    u'\u315a': u'hl',   # oe
    u'\u315b': u'y',    # yo
    u'\u315c': u'n',    # u
    u'\u315d': u'nj',   # weo
    u'\u315e': u'np',   # we
    u'\u315f': u'nl',   # wi
    u'\u3160': u'b',    # yu
    u'\u3161': u'm',    # eu
    u'\u3162': u'ml',   # yi
    u'\u3163': u'l',    # i
}


def convertToEng(s):
    """Convert Hangul text to the Latin keys that type it.

    Each character for which ``hangul.ishangul`` is true is split with
    ``hangul.split`` and every non-empty jamo is replaced by its entry in
    ``_JAMO_TO_ENG_KEYS``; jamo without an entry are skipped.  All other
    characters are copied to the result unchanged.

    An initial ieung is emitted like any other consonant (as ``d``): the
    original silent-initial test compared against ``'ng'``, a value this table
    never contains, so it had no effect and has been dropped.

    :param s: iterable of unicode characters
    :return:  list with one entry per input character or jamo
    """
    keys = []
    for char in s:
        if not hangul.ishangul(char):
            keys.append(char)
            continue
        for jamo in hangul.split(char):
            if jamo == u'':
                continue
            # .get() so that an unknown jamo is skipped instead of raising
            key = _JAMO_TO_ENG_KEYS.get(jamo)
            if key is not None:
                keys.append(key)
    return keys
# Hangul compatibility jamo -> Latin phone label used by getPhone().
_JAMO_TO_PHONE = {
    # jaum (consonants)
    u'\u3131': u'g',    # kiyeok
    u'\u3132': u'gg',   # ssangkiyeok
    u'\u3133': u'gs',   # kiyeok-sios
    u'\u3134': u'n',    # nieun
    u'\u3135': u'nj',   # nieun-cieuc
    u'\u3136': u'nh',   # nieun-hieuh
    u'\u3137': u'd',    # tikeut
    u'\u3138': u'dd',   # ssangtikeut
    u'\u3139': u'r',    # rieul
    u'\u313a': u'lg',   # rieul-kiyeok
    u'\u313b': u'lm',   # rieul-mieum
    u'\u313c': u'lb',   # rieul-pieup
    u'\u313d': u'ls',   # rieul-sios
    u'\u313e': u'lt',   # rieul-thieuth
    u'\u313f': u'lp',   # rieul-phieuph
    u'\u3140': u'lh',   # rieul-hieuh
    u'\u3141': u'm',    # mieum
    u'\u3142': u'b',    # pieup
    u'\u3143': u'bb',   # ssangpieup
    u'\u3144': u'bs',   # pieup-sios
    u'\u3145': u's',    # sios
    u'\u3146': u'ss',   # ssangsios
    u'\u3147': u'ng',   # ieung
    u'\u3148': u'j',    # cieuc
    u'\u3149': u'jj',   # ssangcieuc
    u'\u314a': u'ch',   # chieuch
    u'\u314b': u'k',    # khieukh
    u'\u314c': u't',    # thieuth
    u'\u314d': u'p',    # phieuph
    u'\u314e': u'h',    # hieuh

    # moum (vowels)
    u'\u314f': u'a',    # a
    u'\u3150': u'ae',   # ae
    u'\u3151': u'ya',   # ya
    u'\u3152': u'yae',  # yae
    u'\u3153': u'eo',   # eo
    u'\u3154': u'e',    # e
    u'\u3155': u'yeo',  # yeo
    u'\u3156': u'ye',   # ye
    u'\u3157': u'o',    # o
    u'\u3158': u'wa',   # wa
    u'\u3159': u'wae',  # wae
    u'\u315a': u'oe',   # oe
    u'\u315b': u'yo',   # yo
    u'\u315c': u'u',    # u
    u'\u315d': u'weo',  # weo
    u'\u315e': u'we',   # we
    u'\u315f': u'wi',   # wi
    u'\u3160': u'yu',   # yu
    u'\u3161': u'eu',   # eu
    u'\u3162': u'yi',   # yi
    u'\u3163': u'i',    # i
}


def getPhone(s):
    """Convert Hangul text to a list of Latin phone labels.

    Each character for which ``hangul.ishangul`` is true is split with
    ``hangul.split`` and every non-empty jamo is replaced by its entry in
    ``_JAMO_TO_PHONE``; jamo without an entry are skipped.  A jamo in the
    first position of the split whose label is ``ng`` (ieung) is silent and
    produces no output.  All other characters are copied to the result
    unchanged.

    :param s: iterable of unicode characters
    :return:  list with one entry per input character or sounded jamo
    """
    phones = []
    for char in s:
        if not hangul.ishangul(char):
            phones.append(char)
            continue
        for position, jamo in enumerate(hangul.split(char)):
            if jamo == u'':
                continue
            # .get() so that an unknown jamo is skipped instead of raising
            # (the table used to be indexed before the membership test)
            label = _JAMO_TO_PHONE.get(jamo)
            if label is None:
                continue
            # ieung in the initial position is silent
            if position == 0 and label == u'ng':
                continue
            phones.append(label)
    return phones