def trans_symbol(symbol, prev_term, next_term):
    """Return the replacement text for ``symbol`` given its neighbouring terms.

    Lookup order:
      1. A symbol listed in ``count_symbols`` maps to the entry of
         ``count_sym_han`` at the same index, regardless of context.
      2. If the previous term is in ``puncs``, the symbol is dropped ('').
      3. If either neighbour is Hangul or a digit string, the entry of
         ``sym_han`` at the symbol's index in ``symbols`` is used.
      4. If either neighbour satisfies ``real_latin``, the entry of
         ``sym_pro`` at that index is used.
      5. Otherwise the symbol is dropped ('').

    Raises ``ValueError`` (from ``.index``) when steps 3/4 apply but
    ``symbol`` is not in ``symbols``.
    """
    if symbol in count_symbols:
        return count_sym_han[count_symbols.index(symbol)]

    if prev_term in puncs:
        return ''

    # Hangul and numeric neighbours select the same table, so they are
    # tested together (short-circuit order: Hangul first, then digits).
    hangul_or_digit_context = (
        hgtk.checker.is_hangul(prev_term)
        or hgtk.checker.is_hangul(next_term)
        or prev_term.isdigit()
        or next_term.isdigit()
    )
    if hangul_or_digit_context:
        return sym_han[symbols.index(symbol)]

    if real_latin(prev_term) or real_latin(next_term):
        return sym_pro[symbols.index(symbol)]

    return ''
def trans_number(n, prev_term, next_term):  ## Context-given number reading
    """Pick a number-reading helper for ``n`` based on the surrounding terms.

    * Both neighbours Hangul -> ``readNumberKor(n, next_term)``.
    * Either neighbour satisfies ``real_latin`` -> ``readNumberKor`` when the
      next term is Hangul and ``n > 10``, else ``readNumberEng(n)``.
    * Otherwise (the original author notes "maybe hanja") ->
      ``readOnlyNum(n)`` when a neighbour is in ``symbols``,
      ``readBigNum(n)`` when ``n > 99999``, else ``readNumber(n)``.

    ``n`` is compared against integers, so it is expected to be numeric.
    """
    if hgtk.checker.is_hangul(prev_term) and hgtk.checker.is_hangul(next_term):
        return readNumberKor(n, next_term)

    if real_latin(prev_term) or real_latin(next_term):
        korean_reading = hgtk.checker.is_hangul(next_term) and n > 10
        return readNumberKor(n, next_term) if korean_reading else readNumberEng(n)

    ## Maybe hanja
    if prev_term in symbols or next_term in symbols:
        return readOnlyNum(n)
    return readBigNum(n) if n > 99999 else readNumber(n)
def leftword(chunks):
    """Clean up the remaining terms of ``chunks`` in place and return it.

    ``chunks`` is a sequence of mutable sequences of terms. For each term:
      * if ``real_latin(term)`` holds, it is replaced by ``read_acronym(term)``;
      * else, if it is neither Hangul nor a member of ``puncs``, it is
        replaced by the empty string;
      * Hangul and punctuation terms are left untouched.
    """
    for eojeol in chunks:
        for pos, term in enumerate(eojeol):
            if real_latin(term):
                eojeol[pos] = read_acronym(term)
            elif not (hgtk.checker.is_hangul(term) or term in puncs):
                eojeol[pos] = ''
    return chunks
# Example #4
# 0
def trans_eojeol(chunks,
                 chunks_4num,
                 metadata,
                 if_num=True,
                 if_sym=True,
                 if_han=True,
                 if_eng=True,
                 if_puncs=True,
                 if_else=True):
    """Rewrite each term of each eojeol in ``chunks`` in place, by category.

    Every term is classified by the first matching rule below; the matching
    flag decides whether it is converted or left/removed:

      1. decimal number          -> ``trans_number`` (context from
                                    ``decide_context``) when ``if_num``
      2. symbol / count symbol   -> ``trans_symbol`` when ``if_sym``; skipped
                                    for the very first term (``i + j == 0``)
      3. hanja                   -> ``trans_hanja`` when ``if_han``
      4. latin (``real_latin``)  -> ``trans_latin`` when ``if_eng``
      5. punctuation             -> kept when ``if_puncs``, else ''
      6. Hangul                  -> always kept
      7. anything else           -> kept when ``if_else``, else ''

    Args:
        chunks: sequence of mutable sequences of string terms; modified in
            place.
        chunks_4num: passed through to ``decide_context`` unchanged.
        metadata: currently unused by this function; kept so existing
            callers keep working.
        if_num, if_sym, if_han, if_eng, if_puncs, if_else: per-category
            switches described above.

    Returns:
        The same ``chunks`` object, after modification.
    """
    # Loop-invariant: build the combined symbol collection once rather than
    # once per term.
    all_symbols = symbols + count_symbols

    for i, eojeol in enumerate(chunks):
        for j, term in enumerate(eojeol):
            # isdecimal(), not isdigit(): isdigit() also accepts characters
            # such as superscript or circled digits that int() cannot parse,
            # which would raise ValueError here. Such characters now fall
            # through to the later categories instead.
            if term.isdecimal():
                if if_num:
                    number = int(term)
                    prev_term, next_term = decide_context(
                        number, chunks_4num, eojeol, i, j)
                    eojeol[j] = trans_number(number, prev_term,
                                             next_term)  ## Reflects context
            elif term in all_symbols and i + j > 0:  ## Symbols not sentence-first
                if if_sym:
                    prev_term, next_term = decide_context(
                        term, chunks_4num, eojeol, i, j)
                    eojeol[j] = trans_symbol(term, prev_term, next_term)
            elif hgtk.checker.is_hanja(term):
                if if_han:
                    eojeol[j] = trans_hanja(term)  ## Double check
            elif real_latin(term):
                if if_eng:
                    eojeol[j] = trans_latin(
                        term)  ## Transliteration (or bypassing)
            elif term in puncs:
                if not if_puncs:
                    eojeol[j] = ''  ## Kept as-is by default
            elif hgtk.checker.is_hangul(term):
                continue  ## Hangul is always kept as-is
            elif not if_else:
                eojeol[j] = ''  ## Unclassified terms are kept unless disabled
    return chunks