Example #1
0
def do_words_suit(a, b):
    """Tell whether words ``a`` and ``b`` start with matching sounds.

    Identical first characters always match.  Otherwise both first
    characters are decomposed with ``jamo.decompose``: when both begin
    with the dummy initial 'ᄋ' their second jamo are compared, in every
    other case their first jamo are compared.
    """
    first_a, first_b = a[0], b[0]
    if first_a == first_b:
        return True

    jamo_a = jamo.decompose(first_a)
    jamo_b = jamo.decompose(first_b)
    # Skip the dummy initial only when both syllables carry it.
    both_dummy = jamo_a[0] == 'ᄋ' and jamo_b[0] == 'ᄋ'
    position = 1 if both_dummy else 0
    return jamo_a[position] == jamo_b[position]
Example #2
0
def vocab(conll_path):
    """Collect vocabulary statistics from the CoNLL file at ``conll_path``.

    Returns an 8-tuple:
    (jamo counts, jamo -> index, char counts, char -> index,
     word counts, word -> index, POS tag keys, relation keys).

    Characters and jamos are counted for every node except the
    "*root*" node; words, POS tags and relations are counted for all
    nodes, root included.
    """

    def index_of(counter):
        # Number the keys in the counter's own iteration order.
        return {key: position for position, key in enumerate(counter.keys())}

    jamo_freq = Counter()
    char_freq = Counter()
    word_freq = Counter()
    pos_freq = Counter()
    rel_freq = Counter()

    with open(conll_path, 'r') as conll_file:
        for sentence in read_conll(conll_file, True):
            sentence_chars = []
            for node in sentence:
                if node.norm != "*root*":  # the root node has no morphology
                    for char in unicode(node.norm, "utf-8"):
                        jamo_freq.update(decompose(char))
                        sentence_chars.append(char)
            char_freq.update(sentence_chars)
            word_freq.update([entry.norm for entry in sentence])
            pos_freq.update([entry.pos for entry in sentence])
            rel_freq.update([entry.relation for entry in sentence])

    return (jamo_freq, index_of(jamo_freq),
            char_freq, index_of(char_freq),
            word_freq, index_of(word_freq),
            pos_freq.keys(), rel_freq.keys())
def get_present_determiner(word):
    """Return the first stem of ``word`` followed by '는'.

    When the last stem syllable ends in ``stem2.final_l`` that final
    consonant is removed before the suffix is attached.
    """
    base = stem2.get_stem1(word)
    parts = jamo.decompose(base[-1])
    ends_in_l = len(parts) == 3 and parts[2] == stem2.final_l
    if ends_in_l:
        # Rebuild the last syllable without its final consonant.
        base = base[:-1] + jamo.compose(parts[0], parts[1], None)
    return base + '는'
def get_plain_interrogative(word: str):
    """Return the first stem of ``word`` followed by '니'.

    A stem whose last syllable ends in ``stem2.final_l`` loses that
    final consonant before '니' is appended.
    """
    base = stem2.get_stem1(word)
    parts = jamo.decompose(base[-1])
    ends_in_l = len(parts) == 3 and parts[2] == stem2.final_l
    if ends_in_l:
        # Rebuild the last syllable without its final consonant.
        base = base[:-1] + jamo.compose(parts[0], parts[1], None)
    # Alternative endings noted by the original author: '냐' / '으' + '냐'.
    return base + '니'
Example #5
0
def get_leu_irregular_stem2(prefix):
    """Build the second stem of a 르-irregular verb from what precedes '르'.

    ``final_l`` is added to the last syllable of ``prefix`` and '라'
    (when ``is_bright_vowel`` accepts that syllable's vowel) or '러' is
    appended.

    Raises:
        RuntimeError: if ``prefix`` is empty, or its last syllable
            already has a final consonant.
    """
    if len(prefix) == 0:
        raise RuntimeError('르다 is not a verb')

    parts = jamo.decompose(prefix[-1])
    if len(parts) == 3:
        raise RuntimeError(f'{prefix}르다 is not a 르 verb')

    initial, vowel = parts[0], parts[1]
    closed_syllable = jamo.compose(initial, vowel, final_l)
    suffix = '라' if is_bright_vowel(vowel) else '러'
    return prefix[:-1] + closed_syllable + suffix
def get_eupsi(word):
    """Return the first stem of ``word`` extended with the '읍시' / 'ㅂ시' piece.

    A stem ending in a final consonant other than ``stem2.final_l``
    gets '읍시' appended.  Otherwise ``stem2.final_p`` becomes the final
    consonant of the last syllable (replacing ``final_l`` if present)
    and '시' follows.
    """
    base = stem2.get_stem1(word)
    parts = jamo.decompose(base[-1])
    keeps_own_final = len(parts) == 3 and parts[2] != stem2.final_l
    if keeps_own_final:
        return base + '읍시'
    merged = jamo.compose(parts[0], parts[1], stem2.final_p)
    return base[:-1] + merged + '시'
def get_seumni(word):
    """Return the first stem of ``word`` with the polite formal piece attached.

    A stem ending in a final consonant other than ``stem2.final_l``
    gets ``polite_formal_suffix`` appended.  Otherwise ``stem2.final_p``
    becomes the final consonant of the last syllable (replacing
    ``final_l`` if present) and '니' follows.
    """
    base = stem2.get_stem1(word)
    parts = jamo.decompose(base[-1])
    keeps_own_final = len(parts) == 3 and parts[2] != stem2.final_l
    if keeps_own_final:
        return base + polite_formal_suffix
    merged = jamo.compose(parts[0], parts[1], stem2.final_p)
    return base[:-1] + merged + '니'
Example #8
0
def get_eu_stem2(stem1, initial):
    """Return the second stem of a stem whose last syllable has the vowel ㅡ.

    Args:
        stem1: first stem; its last syllable is the ㅡ syllable.
        initial: initial consonant jamo of that last syllable.

    Returns:
        ``stem1`` with its last syllable replaced by ``initial`` + 'ᅡ'
        when the preceding syllable's vowel is 'ᅡ', and by
        ``initial`` + 'ᅥ' otherwise.  A single-syllable stem has no
        preceding syllable and therefore takes 'ᅥ'; previously that
        case fell off the end of the function and returned ``None``.
    """
    if stem1[-1] == '쓰':  # 쓰다 derivatives always conjugate as 쓰다
        return stem1[:-1] + '써'

    vowel = 'ᅥ'
    if len(stem1) > 1:
        # The replacement vowel follows the vowel of the preceding syllable.
        # NOTE(review): only 'ᅡ' selects the 'ᅡ' form here; confirm whether
        # 'ᅩ' (e.g. 고프다) should be treated the same way.
        preceding = jamo.decompose(stem1[-2])
        if preceding[1] == 'ᅡ':
            vowel = 'ᅡ'
    return stem1[:-1] + jamo.compose(initial, vowel, None)
def get_plain(word: str, adjective: bool):
    """Return the plain form of ``word``.

    Adjectives are returned unchanged.  For other words, a first stem
    ending in a vowel or in ``stem2.final_l`` gets ``stem2.final_n`` as
    the final consonant of its last syllable; any other stem gets '는'
    appended.  ``word_ending`` is added in both cases.
    """
    if adjective:
        return word

    base = stem2.get_stem1(word)
    parts = jamo.decompose(base[-1])
    if len(parts) != 2 and parts[2] != stem2.final_l:
        return base + '는' + word_ending

    # Open syllable, or final l -> n.
    closed = jamo.compose(parts[0], parts[1], stem2.final_n)
    return base[:-1] + closed + word_ending
 def get(word, tense: int, irregular: bool):
     """Return the two noun forms of ``word`` for ``tense``.

     Args:
         word: the word to nominalize.
         tense: ``NounForm.PRESENT`` or ``NounForm.PAST``.
         irregular: forwarded to ``get_past`` for the past tense.

     Returns:
         A two-element list: the '음'-style form followed by the '기'
         form.  In the present tense a stem ending in ``stem2.final_l``
         has that final replaced by 'ᆱ' instead of taking '음'.
         ``None`` is returned for any other ``tense`` value.
     """
     if tense == NounForm.PRESENT:
         # Compute the stem once and reuse it for both forms.
         st1 = stem2.get_stem1(word)
         letters = jamo.decompose(st1[-1])
         if len(letters) == 3 and letters[2] == stem2.final_l:
             nominalization = (st1[:-1] +
                               jamo.compose(letters[0], letters[1], 'ᆱ'))
         else:
             nominalization = st1 + '음'
         return [nominalization, st1 + '기']
     if tense == NounForm.PAST:
         past = get_past(word, irregular)
         return [past + '음', past + '기']
     # Unsupported tense: keep the historical behaviour of returning None.
     return None
Example #11
0
def get_regular_stem2(stem1):
    """Return the second stem of a regularly conjugated first stem.

    * Last syllable has a final consonant: append '아' when
      ``is_bright_vowel`` accepts its vowel, else '어'.
    * Last syllable ends in 'ᅳ': delegate to ``get_eu_stem2``.
    * Last syllable ends in 'ᅩ', 'ᅮ' or 'ᅵ': contract the vowel to
      'ᅪ', 'ᅯ' or 'ᅧ' respectively.
    * Anything else: the stem is returned unchanged.
    """
    parts = jamo.decompose(stem1[-1])
    initial, vowel = parts[0], parts[1]

    if len(parts) == 3:
        return stem1 + ('아' if is_bright_vowel(vowel) else '어')

    if vowel == 'ᅳ':  # consider not irregular
        return get_eu_stem2(stem1, initial)

    contracted = {'ᅩ': 'ᅪ', 'ᅮ': 'ᅯ', 'ᅵ': 'ᅧ'}.get(vowel)
    if not contracted:
        return stem1
    return stem1[:-1] + jamo.compose(initial, contracted, None)
Example #12
0
def get_irregular_stem3(stem1):
    """Return the third stem of an irregular first stem.

    The last syllable must have one of four final consonants:

    * ``stem2.final_s``: final removed, ``stem3_final`` appended.
    * ``stem2.final_t``: final replaced by ``stem2.final_l``,
      ``stem3_final`` appended.
    * ``stem2.final_p``: final removed, '우' appended.
    * ``stem2.final_l``: final removed.

    Raises:
        RuntimeError: for every other stem.
    """
    parts = jamo.decompose(stem1[-1])
    if len(parts) == 3:
        head = stem1[:-1]
        initial, vowel, final = parts[0], parts[1], parts[2]
        if final == stem2.final_s:
            return head + jamo.compose(initial, vowel, None) + stem3_final
        if final == stem2.final_t:
            return (head + jamo.compose(initial, vowel, stem2.final_l) +
                    stem3_final)
        if final == stem2.final_p:
            return head + jamo.compose(initial, vowel, None) + '우'
        if final == stem2.final_l:
            return head + jamo.compose(initial, vowel, None)
    raise RuntimeError(
        f'{stem2.stem1_to_word(stem1)} cannot be irregular verb')
Example #13
0
def get_irregular_stem2(stem1):
    """Return the second stem of an irregularly conjugated first stem.

    Dispatches on the last syllable of ``stem1``:

    * '르'                       -> ``get_leu_irregular_stem2``
    * open syllable with 'ᅳ'     -> ``get_eu_stem2``
    * final ``final_t``          -> ``get_t_irregular_stem2``
    * final ``final_l``          -> ``get_regular_stem2``
    * final ``final_p``          -> ``get_p_irregular_stem2``
    * final ``final_s``          -> ``get_s_irregular_stem2``
    * final ``final_h``          -> ``get_h_irregular_stem2``

    Raises:
        RuntimeError: if the stem matches none of the cases above.
            This includes syllables with any other final consonant,
            which previously fell through and returned ``None``.
    """
    letters = jamo.decompose(stem1[-1])
    if stem1[-1] == '르':
        return get_leu_irregular_stem2(stem1[:-1])
    if len(letters) == 2 and letters[1] == 'ᅳ':
        return get_eu_stem2(stem1, letters[0])
    if len(letters) == 3:
        head = stem1[:-1]
        final = letters[2]
        if final == final_t:
            return get_t_irregular_stem2(head, letters[0], letters[1])
        if final == final_l:
            return get_regular_stem2(stem1)
        if final == final_p:
            return get_p_irregular_stem2(head, letters[0], letters[1])
        if final == final_s:
            return get_s_irregular_stem2(head, letters[0], letters[1])
        if final == final_h:
            return get_h_irregular_stem2(head, letters[0])
    # Reached for every unsupported stem shape, not only non-3-jamo ones.
    raise RuntimeError(f'{stem1}다 cannot be irregular')
def get_past_and_future_determiner(word, irregular, regular_ending,
                                   p_irregular_ending, ending_final):
    """Shared builder for the past and future determiner forms of ``word``.

    Args:
        word: the word to conjugate.
        irregular: whether the irregular s/t/p rules apply.
        regular_ending: syllable appended to stems that keep (or, for
            irregular s/t stems, adjust) their own final consonant.
        p_irregular_ending: syllable appended to irregular stems ending
            in ``stem2.final_p``.
        ending_final: final consonant merged into the last syllable of
            stems ending in a vowel, ``stem2.final_l`` or
            ``stem2.final_h``.
    """
    base = stem2.get_stem1(word)
    parts = jamo.decompose(base[-1])
    head = base[:-1]

    if irregular and len(parts) == 3:
        final = parts[2]
        if final == stem2.final_s:
            # s removed
            return head + jamo.compose(parts[0], parts[1],
                                       None) + regular_ending
        if final == stem2.final_t:
            # t -> l
            return head + jamo.compose(parts[0], parts[1],
                                       stem2.final_l) + regular_ending
        if final == stem2.final_p:
            # p -> un/ul
            return head + jamo.compose(parts[0], parts[1],
                                       None) + p_irregular_ending

    merges_ending = (len(parts) == 2
                     or parts[2] in (stem2.final_l, stem2.final_h))
    if merges_ending:
        return head + jamo.compose(parts[0], parts[1], ending_final)
    return base + regular_ending
Example #15
0
    def getJamoVec(self, char, train):
        """Return the jamo-level vector for a single character.

        Decompositions are memoised in ``self.jamo_cache``.  A character
        that decomposes into a single symbol (non-Hangul, e.g. ``@`` or
        ``Q``) is looked up directly in ``self.jamoLookup``; during
        training it is randomly replaced by index 0 (unknown symbol)
        with a probability that shrinks as the symbol's count grows.
        A Hangul character has the vectors of its two or three jamos
        concatenated and passed through the jamo layer.
        """
        cache = self.jamo_cache
        if char not in cache:
            cache[char] = decompose(char)
        jamos = cache[char]

        if len(jamos) == 1:  # Non-Hangul (ex: @, Q)
            symbol = jamos[0]
            count = float(self.jamosCount.get(symbol, 0))
            keep = True
            if train:
                # Only draw a random number while training.
                keep = random.random() < (count / (0.25 + count))
            # 0: unknown symbol
            index = int(self.jvocab.get(symbol, 0)) if keep else 0
            return self.jamoLookup[index]

        # Hangul character
        initial_vec = self.keepOrDropJamo(jamos[0], train)
        vowel_vec = self.keepOrDropJamo(jamos[1], train)
        if len(jamos) > 2:
            final_vec = self.keepOrDropJamo(jamos[2], train)
        else:
            final_vec = self.jamoLookup[2]  # 2: empty consonant
        stacked = concatenate([initial_vec, vowel_vec, final_vec])
        return self.activation(
            self.jamoLayer.expr() * stacked + self.jamoBias.expr())
Example #16
0
}

# Script entry point: build the vocabulary from the file named by the first
# command-line argument and print summary statistics (Python 2 print syntax).
if __name__ == '__main__':
    jamos, j2i, chars, c2i, words, w2i, pos, rels = vocab(sys.argv[1])
    print
    # Number of distinct words, then up to the last 100 word keys.
    print '# words: ', len(w2i)
    print ' '.join(words.keys()[-min(100, len(words)):])
    print

    # Number of distinct characters, then up to the last 100 character keys.
    print '# chars: ', len(c2i)
    print ' '.join(chars.keys()[-min(100, len(chars)):])
    print

    # A character counts as Hangul when decompose() yields more than one
    # element.
    hangul_chars = {}
    for char in chars:
        if len(decompose(char)) > 1:
            hangul_chars[char] = True

    print '# Hangul chars: ', len(hangul_chars)
    print ' '.join(hangul_chars.keys()[-min(100, len(hangul_chars)):])
    print

    # Number of distinct jamo-level symbols, then up to the last 100 keys.
    print '# jamos: ', len(j2i)
    print ' '.join(jamos.keys()[-min(100, len(jamos)):])
    print

    # Keep only the symbols that is_jamo() accepts.
    hangul_jamos = {}
    for jamo in jamos:
        if is_jamo(jamo):
            hangul_jamos[jamo] = True
Example #17
0
# Build vocabularies for the training file (first command-line argument) and
# the development file (second command-line argument).
jamos_train, j2i_train, chars_train, c2i_train, words_train, w2i_train, pos_train, rels_train = utils.vocab(
    sys.argv[1])
jamos_dev, j2i_dev, chars_dev, c2i_dev, words_dev, w2i_dev, pos_dev, rels_dev = utils.vocab(
    sys.argv[2])

# Count development words that never occur in training and print the count,
# the total and the percentage.
oov_word = 0
for word in words_dev:
    if not word in words_train:
        oov_word += 1
print 'OOV word: ', oov_word, ' / ', len(
    words_dev), ' ', float(oov_word) / len(words_dev) * 100

# Restrict both character vocabularies to Hangul, i.e. characters for which
# jpack.decompose() yields more than one element.
hangul_chars_train = {}
for char in chars_train:
    if len(jpack.decompose(char)) > 1:
        hangul_chars_train[char] = True
hangul_chars_dev = {}
for char in chars_dev:
    if len(jpack.decompose(char)) > 1:
        hangul_chars_dev[char] = True

# Same out-of-vocabulary report, this time over Hangul characters only.
oov_char = 0
for char in hangul_chars_dev:
    if not char in hangul_chars_train:
        oov_char += 1
print 'OOV char: ', oov_char, ' / ', len(
    hangul_chars_dev), ' ', float(oov_char) / len(hangul_chars_dev) * 100

# Jamo symbols seen in training; filled by the loop that follows.
hangul_jamos_train = {}
for jamo in jamos_train:
Example #18
0
            num_new_jamos = 0
            for word in external_embedding:  # expand word vocab
                if not word in words:
                    num_new_words += 1
                    words[word] = 1
                    new_w = len(w2i)
                    w2i[word] = new_w

                for char in unicode(word, "utf-8"):  # expand char vocab
                    if not char in chars:
                        num_new_chars += 1
                        chars[char] = 1
                        new_c = len(c2i)
                        c2i[char] = new_c

                    for jamo in decompose(char):  # expand jamo vocab
                        if not jamo in jamos:
                            num_new_jamos += 1
                            jamos[jamo] = 1
                            new_j = len(j2i)
                            j2i[jamo] = new_j
            print 'Have {0} new words, {1} new chars, {2} new jamos from pretrained embeddings'.format(
                num_new_words, num_new_chars, num_new_jamos)

        if not os.path.exists(options.output):
            os.makedirs(options.output)  # Make directory if needed

        with open(os.path.join(options.output, "params.pickle"),
                  'w') as paramsfp:
            pickle.dump(
                (jamos, j2i, chars, c2i, words, w2i, pos, rels, options),
Example #19
0
def get_regular_stem3(stem1):
    """Return the third stem of a regular first stem.

    ``stem3_final`` is appended when the last syllable has a final
    consonant; a stem ending in a vowel is returned unchanged.
    """
    has_final = len(jamo.decompose(stem1[-1])) == 3
    return stem1 + stem3_final if has_final else stem1
Example #20
0
def recover_particle(word):
    """Return 'ᆻ' when the last syllable of ``word`` ends in that final
    consonant, otherwise return the last character of ``word`` itself.

    The decomposition is no longer unpacked into exactly three names,
    so a last character that decomposes into fewer elements (a syllable
    without a final consonant, or a non-Hangul symbol) is returned
    as-is instead of raising ``ValueError``.
    """
    letters = jamo.decompose(word[-1])
    if len(letters) == 3 and letters[2] == 'ᆻ':
        return 'ᆻ'
    return word[-1]
def get_past(word, irregular):
    """Return the past stem of ``word``.

    Takes the second stem from ``stem2.get_stem2`` and adds ``final_ss``
    as the final consonant of its last syllable.  That syllable is
    asserted to decompose into exactly two jamos beforehand.
    """
    base = stem2.get_stem2(word, irregular)
    last_parts = jamo.decompose(base[-1])
    assert len(last_parts) == 2
    closed = jamo.compose(last_parts[0], last_parts[1], final_ss)
    return base[:-1] + closed