Example #1
0
def sameCheck(input_string1, input_string2):
    """Return a 0-3 similarity score for two characters.

    Both arguments are decomposed with ``hangul.separate`` into an
    (initial, medial, final) index triple and the triples are compared.

    Scores:
        0 - initial, medial and final all differ
        1 - one component matches
        2 - two components match
        3 - same character

    For Hangul syllables a shared *empty* final (index 0) is not counted as
    a match, so two identical syllables without a final consonant score 2.
    NOTE(review): this contradicts "3 - same character" but is kept because
    callers may depend on the existing scale - confirm the intent.

    :param input_string1: reference character
    :param input_string2: character compared against the reference
    :return: int in the range 0..3
    """
    # A standalone consonant is treated as a final consonant: this table maps
    # the final index of ``input_string1`` to the third component expected
    # from ``hangul.separate`` for the matching standalone consonant.
    convert_dictionary = {
        1: 5,
        4: 8,
        7: 11,
        8: 13,
        16: 21,
        17: 22,
        19: 25,
        21: 27,
        22: 0,
        23: 2,
        24: 3,
        25: 4,
        26: 5,
        27: 6,
        0: 0
    }

    var1 = hangul.separate(input_string1)
    var2 = hangul.separate(input_string2)

    if var1[0] == -75:
        # Not Hangul (a symbol): only an exact match counts. All three
        # components are compared directly; the previous "final != 0" guard
        # made identical symbols whose third component is 0 score 0.
        if all(var1[index] == var2[index] for index in range(3)):
            return 3
        return 0

    if var2[0] == -54 and var2[1] == 11:  # standalone consonant
        # .get() instead of [] so that finals missing from the table fall
        # through to the generic comparison instead of raising KeyError.
        if convert_dictionary.get(var1[2]) == var2[2]:
            return 1

    same_point = 0
    for index in range(3):  # Hangul comparison
        if var1[index] != var2[index]:
            continue
        if index == 2 and var1[index] == 0:
            # Both syllables lack a final consonant: not counted as a match.
            continue
        same_point += 1

    return same_point
Example #2
0
def get_onehot_vector(sent):
    """
    Convert a sentence to one 120-dimensional vector per character.

    Every row is four 30-wide segments laid end to end:
    initial, medial and final indices from ``hangul.separate`` (values that
    are not positive are stored at position 0), followed by a segment for
    digits, '-' and space. A character that is neither Hangul nor one of
    those symbols yields an all-zero row.

    :param sent: a str, or a list whose first element is used as the sentence
    :return: numpy array with one row per character; None when an error was
             caught (the error is printed)
    """
    try:
        width = 30
        symbols = list("0123456789- ")
        symbol_pattern = re.compile("[0-9- ]")
        accepted_types = (type('str'), type([]))

        if type(sent) not in accepted_types:
            raise Exception("input must be str")
        if type(sent) is type([]):
            sent = sent[0]

        rows = []
        for char in sent:
            # segments[0..2]: initial / medial / final, segments[3]: symbols
            segments = [np.zeros([width]) for _ in range(4)]
            symbol_hit = symbol_pattern.match(char)

            if symbol_hit is None and hangul.is_hangul(char):
                parts = hangul.separate(char)
                for position in range(3):
                    value = parts[position]
                    segments[position][value if value > 0 else 0] = 1
            elif symbol_hit:
                segments[3][symbols.index(char)] = 1

            # np.append flattens both arguments, giving one 120-long row.
            rows.append(np.append(segments[0], segments[1:]))
        return np.array(rows)
    except Exception as e:
        print("error on get_onehot_vector : {0}".format(e))
Example #3
0
def text2label(string):
    """Convert each text sequence into a list of jamo-level labels.

    Every sequence is first cleaned with the module-level ``filter`` pattern
    (defined outside this block). Digits are replaced by their Sino-Korean
    reading, then each character is decomposed with ``hangul.separate`` and
    encoded with ``one_hot`` (also defined elsewhere): the initial as is, the
    medial shifted by 21, and the final shifted by 42 when it is non-zero.

    NOTE(review): a space is appended as the bare integer 70 while every
    other entry is whatever ``one_hot`` returns - confirm this mix is wanted.

    :param string: iterable of text sequences
    :return: numpy array built from the per-sequence label lists
    """
    digit_dict = dict(zip('0123456789', '영일이삼사오육칠팔구'))
    char_length = 72

    def encode(sequence):
        encoded = []
        for c in filter.sub('', sequence):
            if c == ' ':
                encoded.append(70)
                continue
            if c.isdigit():
                c = digit_dict[c]

            cho, joong, jong = hangul.separate(c)
            encoded.append(one_hot(cho, char_length))
            encoded.append(one_hot(joong + 21, char_length))
            if jong:
                encoded.append(one_hot(jong + 42, char_length))
        return encoded

    return np.array([encode(seq) for seq in string])
def String_to_Token_List(string):
    """Convert a Korean sentence into a list of integer tokens.

    Steps:
      1. Reject (print and return ``False``) any input that is not made
         solely of Hangul syllables, digits, whitespace and ``. , ? !``.
      2. Spell out numbers: isolated four-digit groups through
         ``Number_to_String``, numbers followed by a counter word through
         ``Count_Number``, all remaining numbers through ``Read_Number``
         (these three helpers are defined outside this block).
      3. Emit tokens: 0 first and 1 last, 2 for a space, 71-74 for
         ``. , ? !``, and three tokens (onset, nucleus, coda) per syllable.

    Thousands separators ("1,000") and punctuation directly after a number
    ("3." or "3,") are handled: the integer part is spelled out and the
    punctuation is kept. Previously these raised ValueError from ``int()``.

    :param string: sentence to tokenize
    :return: list of int tokens, or ``False`` when the input is rejected
    :raises ValueError: for decimal numbers such as "3.14"
    :raises Exception: for an allowed character that is neither handled
        punctuation nor Hangul (e.g. a tab or newline)
    """
    # English or special characters: refuse the whole input.
    allowed = re.search(r'[가-힣\d\s\.,\?!]+', string)
    if allowed is None or allowed.group() != string:
        print(string)
        return False

    def _spell(match, reader):
        """Spell the integer of *match* with *reader*, keeping trailing punctuation."""
        grouped = match.group(1)
        digits = grouped.rstrip(",")
        # Whatever the pattern swallowed after the integer: a comma that was
        # sentence punctuation and/or the optional period with decimals.
        tail = grouped[len(digits):] + match.group(2)
        if any(ch.isdigit() for ch in tail):
            raise ValueError(
                "decimal numbers are not supported: %r" % match.group())
        return reader(int(digits.replace(",", ""))) + tail

    # Isolated four-digit groups are read through Number_to_String.
    regex_DtoS = r'(?:^|[^\d])(\d{4})(?:$|[^\d])'
    string = re.sub(
        regex_DtoS,
        lambda x: re.sub(r'\d{4}', lambda y: Number_to_String(y.group()),
                         x.group()), string)

    # Same number pattern as before, with the optional period/decimals
    # captured separately so they can be told apart from the integer part.
    regex_number = r"([+-]?\d[\d,]*)([\.]?\d*)"
    regex_counter = r"(시|명|가지|살|마리|포기|송이|수|톨|통|점|개|벌|척|채|다발|그루|자루|줄|켤레|그릇|잔|마디|상자|사람|곡|병|판)"

    # Numbers directly followed by a counter word use Count_Number.
    string = re.sub(
        regex_number + regex_counter,
        lambda x: _spell(x, Count_Number) + x.group(3), string)

    # Every remaining number uses Read_Number.
    string = re.sub(regex_number, lambda x: _spell(x, Read_Number), string)

    punctuation_tokens = {" ": 2, ".": 71, ",": 72, "?": 73, "!": 74}

    # 0 opens the sequence (labelled <EOS> in the original comments).
    token_List = [0]
    for char in string:
        if char in punctuation_tokens:
            token_List.append(punctuation_tokens[char])
        elif hangul.is_hangul(char):
            onset, nucleus, coda = hangul.separate(char)
            # Shift each component into its own range: 3 ids are reserved
            # above, then 19 onset ids, then 21 nucleus ids.
            token_List.extend(
                [onset + 3, nucleus + 3 + 19, coda + 3 + 19 + 21])
        else:
            raise Exception("Not handled letter")

    # 1 closes the sequence (labelled <EOE> in the original comments).
    token_List.append(1)

    return token_List
Example #5
0
def convert_hangul_to_index(string):
	"""Map every Hangul character of ``string`` to a single class index.

	Characters rejected by ``hangul.is_hangul`` are skipped. The three
	components returned by ``hangul.separate`` are combined as
	``c0 + c1 * FirNum + c2 * FirNum * SecNum``. ``FirNum``, ``SecNum`` and
	``ClassNum`` are module-level values defined outside this block.

	:param string: text to convert
	:return: numpy array with one single-element row per Hangul character;
		``[[ClassNum - 1]]`` when no Hangul character was found
	"""
	def flatten(parts):
		return parts[0] + parts[1] * FirNum + parts[2] * FirNum * SecNum

	indices = [
		[flatten(hangul.separate(char))]
		for char in string
		if hangul.is_hangul(char)
	]
	if not indices:
		# Nothing convertible: fall back to the last class id.
		indices.append([ClassNum - 1])

	return np.array(indices)
Example #6
0
    def get_onehot_vector(self, sent):
        """
        Convert a sentence to one 160-dimensional vector per character.

        Every row is four 40-wide segments laid end to end:
        initial, medial and final indices from ``hangul.separate`` (values
        that are not positive are stored at position 0), followed by a
        segment for digits, '-', space and the letters a-z. Any other
        character sets position 39 of the last segment.

        :param sent: a str, or a list whose first element is used as the sentence
        :return: numpy array with one row per character; None when an error
                 was caught (the error is printed)
        """
        try:
            width = 40
            symbols = list("0123456789- abcdefghijklmnopqrstuvwxyz")
            symbol_pattern = re.compile("[a-z0-9- ]")
            accepted_types = (type('str'), type([]))

            if type(sent) not in accepted_types:
                raise Exception("input must be str")
            if type(sent) is type([]):
                sent = sent[0]

            rows = []
            for char in sent:
                # segments[0..2]: initial / medial / final, segments[3]: symbols
                segments = [np.zeros([width]) for _ in range(4)]
                symbol_hit = symbol_pattern.match(char)

                if symbol_hit is None and hangul.is_hangul(char):
                    parts = hangul.separate(char)
                    for position in range(3):
                        value = parts[position]
                        segments[position][value if value > 0 else 0] = 1
                elif symbol_hit:
                    segments[3][symbols.index(char)] = 1
                else:
                    # Unknown character bucket.
                    segments[3][39] = 1

                # np.append flattens both arguments, giving one 160-long row.
                rows.append(np.append(segments[0], segments[1:]))
            return np.array(rows)
        except Exception as e:
            print("error on get_onehot_vector : {0}".format(e))
Example #7
0
def komoran_processing(komoran, complex_verb_set):
    """Merge a morpheme analysis into one normalized, space-joined token string.

    Every phrase of ``komoran`` is walked morpheme by morpheme and collapsed
    into a single word (``tokenWord``): verb/adjective stems get a trailing
    "다", the dependent nouns 수/지/때문 and negation words are attached with
    "_", and connective endings (EC) trigger the contraction rules below.
    State that must survive into the next phrase is kept in
    ``formerTokenTag`` / ``original_token``; after each handled morpheme the
    ``ending_check`` helper (defined outside this block) returns the updated
    ``(new_check, formerTokenTag)`` pair.

    :param komoran: sequence of phrases; each phrase is a sequence of items
        exposing ``getFirst()`` (morpheme text) and ``getSecond()`` (POS tag).
        Presumably KOMORAN analyzer output, one phrase per word - confirm
        against the caller.
    :param complex_verb_set: container of verb/adjective stems; when a phrase
        follows a word that ended in a connective ending and does not start
        with one of these stems, the previous word is restored to its
        uncontracted form.
    :return: the processed tokens joined by single spaces (str)
    """
    tokenList = list()
    formerTokenTag = ""
    original_token = ""
    word_cnt = len(komoran)
    word_idx = 0
    for phrase in komoran:
        phrase_cnt = len(phrase)
        idx = 0
        word_idx += 1
        tokenWord = ""
        formerTag = ""
        new_check = False
        while idx < phrase_cnt:
            token = phrase[idx].getFirst().replace(" ", "") #remove white space in proper nouns
            # Spaces were just removed, so only the length test can fail here:
            # morphemes longer than 9 characters are skipped.
            if " " not in token and len(token) > 9:
                idx += 1
                new_check, formerTokenTag = ending_check(new_check, formerTokenTag, idx, word_idx, phrase_cnt, word_cnt)
                continue
            tag = phrase[idx].getSecond()
            ########## FIRST PHONEME
            if idx == 0 or (idx == 1 and formerTag == "XPN"):       #XPN: noun prefix
                if formerTokenTag == "REMOVE_EC":
                    tokenList[-1] = "%s " % tokenList[-1]
                # To combine the former token and the current token
                elif formerTokenTag == "MAG":
                    tokenWord = tokenList[-1]
                    del tokenList[-1]
                elif formerTokenTag == "EC":
                    if tag not in {"VV", "VA", "VX"} or (tag in {"VV", "VA", "VX"} and token not in complex_verb_set):
                        tokenList[-1] = original_token
                        formerTokenTag = ""
                        original_token = ""
                # prefix (noun prefix, XPN)
                if tag == "XPN":
                    tokenWord = token
                    formerTag = tag
                    idx += 1
                    new_check, formerTokenTag = ending_check(new_check, formerTokenTag, idx, word_idx, phrase_cnt, word_cnt)
                    continue
                else:
                    # common noun, proper noun, root adverb and foreign language
                    if tag in {"NNG", "NNP", "SL", "XR", "MAG"}:
                        if tag == "SL":
                            token = token.lower()
                        if tag == "MAG":
                            if token in {"안", "못", "잘못"}:
                                formerTokenTag = "MAG"
                                new_check = True
                            else:
                                idx += 1
                                new_check, formerTokenTag = ending_check(new_check, formerTokenTag, idx, word_idx, phrase_cnt, word_cnt)
                                continue
                        elif tag == "NNP":
                            token = "%s " % token
                        elif formerTokenTag == "NNB" and token == "밖":
                            token = "밖에"
                            tokenWord = tokenWord.rstrip("_")
                            formerTokenTag = "JX"
                            new_check = True
                        else:
                            pass
                    # dependent nouns
                    elif tag == "NNB" and token in {"수", "지", "때문"}:
                        if token == "수" and formerTokenTag in {"VV", "VA", "VX"}:
                            formerTokenTag = "NNB1"
                        elif token == "수" and formerTokenTag not in {"VV", "VA", "VX"}:
                            formerTokenTag = ""
                            pass            # morphological-analysis error, e.g. the '수' in "사람의 수"
                        elif token == "지" and formerTokenTag in {"VV", "VA", "VX"}:
                            formerTokenTag = "NNB2"
                        elif token == "때문":
                            formerTokenTag = "NNB3"
                        else:
                            formerTokenTag = ""
                            new_check = False
                            idx += 1
                            continue
                        new_check = True
                        try:
                            if formerTokenTag.startswith("NNB"):
                                token = "%s_%s" % (tokenList[-1], token)
                                del tokenList[-1]
                            else:
                                pass
                        except IndexError:      # not dependent nouns
                            formerTokenTag = ""
                            new_check = False
                            idx += 1
                            continue
                    # verb, adjective and auxiliary predicate
                    elif tag in ["VV", "VA", "VX"]:
                        if formerTokenTag == "EC":
                            token = "%s%s" % (tokenList[-1], "%s다" % token)
                            del tokenList[-1]
                        elif formerTokenTag == "JX":
                            if token == "없":        # e.g. 수밖에_없다
                                token = "_%s다" % token
                            else:
                                token = " %s다" % token
                        elif formerTokenTag.startswith("NNB"):
                            if formerTokenTag.endswith("1") and token in {"있", "없"}:    # e.g. 하다_수_있다
                                tokenWord = tokenList[-1]
                                token = "_%s다" % token
                                del tokenList[-1]
                            elif formerTokenTag.endswith("2") and token in {"모르", "않", "말"}:    # e.g. 하다_지_모르다
                                tokenWord = tokenList[-1]
                                token = "_%s다" % token
                                del tokenList[-1]
                            elif formerTokenTag.endswith("3"):      # e.g. 사랑_때문_자라다
                                tokenWord = tokenList[-1]
                                token = "_%s다" % token
                                del tokenList[-1]
                            else:
                                token = "%s다" % token
                        # only in case negative words
                        elif token in ["않", "없", "못하", "말", "싫", "주"]:
                            try:
                                token = "%s%s" % (tokenList[-1], "_%s다" % token)
                                del tokenList[-1]
                            except IndexError:
                                token = "_%s다" % token
                        else:
                            token = "%s다" % token
                        formerTokenTag = tag
                        new_check = True
                    elif formerTokenTag == "NNB1" and tag == "JX":
                        if token == "밖에":   # "수 밖에" is written as "수밖에"
                            token = "%s%s" % (tokenList[-1].rstrip("_"), token)
                            del tokenList[-1]
                            formerTokenTag = "JX"
                            new_check = True
                        elif token == "도":  # "갈 수도 있다" becomes 가다_수_있다
                            new_check = True
                    else:
                        # just pass the rest
                        idx += 1
                        formerTag = tag
                        new_check, formerTokenTag = ending_check(new_check, formerTokenTag, idx, word_idx, phrase_cnt, word_cnt)
                        continue
                    tokenWord += token
                    formerTag = tag
                    # checking 'formerTokenTag' to be newly assigned
                    new_check, formerTokenTag = ending_check(new_check, formerTokenTag, idx, word_idx, phrase_cnt, word_cnt)
            ########## FROM SECOND PHONEME
            else:
                if formerTokenTag == "REMOVE_EC":
                    tokenWord = "%s " % tokenWord
                elif formerTokenTag.startswith("NNB") and tag not in {"VV", "VA", "VX"}:
                    tokenWord = "%s " % tokenWord
                if formerTag in ["ETM", "ETN"]:     # ETM: adnominal transformative ending, ETN: nominal transformative ending
                    tokenWord = "%s " % tokenWord
                # common noun, proper noun, root and foreign language, suffix(noun) and dependent noun
                if tag in {"NNG", "NNP", "XR", "SL", "NNB", "XSN"}:
                    if tag == "SL":
                        token = token.lower()
                    if tag == "NNP":
                        if formerTag == "NNP":
                            tokenWord = "%s%s " % (tokenWord, token)
                        else:
                            tokenWord = "%s %s " % (tokenWord, token)
                    elif tag == "NNB":
                        if token in {"수", "지", "때문"}:
                            if token == "수":
                                if formerTag in {"VV", "VA", "VX"}:
                                    tokenWord = "%s_%s" % (tokenWord.strip(), token)
                                    formerTokenTag = "NNB1"
                                else:
                                    tokenWord = "%s %s" % (tokenWord.strip(), token)
                            elif token == "지" and formerTag in {"VV", "VA", "VX"}:
                                tokenWord = "%s_%s" % (tokenWord.strip(), token)
                                formerTokenTag = "NNB2"
                            elif token == "때문":
                                tokenWord = "%s_%s" % (tokenWord.strip(), token)
                                formerTokenTag = "NNB3"
                            else:
                                pass
                            new_check = True
                        else:
                            pass
                    else:
                        tokenWord += token
                # negative copula '아니'
                elif tag == "VCN":
                    tokenWord = "%s_%s다" % (tokenWord, token)
                # "수 밖에" is written as "수밖에"
                elif (formerTokenTag == "NNB1" or formerTag == "NNB1") and tag == "JX":
                    if token == "밖에":
                        tokenWord = "%s%s" % (tokenWord.rstrip("_"), token)
                        formerTokenTag = "JX"
                        new_check = True
                    elif token == "도":
                        new_check = True
                # suffix(adjective and verb)
                elif tag in {"XSA", "XSV"}:
                    token = "%s다" % token
                    tokenWord += token
                # connective endings
                elif tag == "EC":
                    # Keep the uncontracted word so it can be restored later.
                    original_token = tokenWord
                    newToken = tokenWord.rstrip("다")
                    try:
                        jong = hangul.separate(newToken[-1])     # decompose the last character
                    except IndexError:
                        idx += 1
                        formerTag = tag
                        new_check, formerTokenTag = ending_check(new_check, formerTokenTag, idx, word_idx, phrase_cnt, word_cnt)
                        continue
                    if token in {"아", "어"}:
                        if formerTag == "VCP":
                            idx += 1
                            formerTag = tag
                            new_check, formerTokenTag = ending_check(new_check, formerTokenTag, idx, word_idx, phrase_cnt, word_cnt)
                            continue
                        # no final consonant
                        if jong[-1] == 0:
                            if jong[1] == 20 and token == "어":       # 달리+어=달려
                                newJong = hangul.build(jong[0], 6, jong[-1])
                                if len(newToken) == 1:
                                    newToken = newJong
                                else:
                                    newToken = "%s%s" % (newToken[:-1], newJong)
                                tokenWord = newToken
                            elif jong[1] == 18 and jong[0] == 5:       # '르' irregular: 마르+아=말라
                                tmp = hangul.separate(newToken[-2])
                                newJong = hangul.build(tmp[0], tmp[1], 8)
                                tmp2 = hangul.separate(token)
                                newJong2 = hangul.build(5, tmp2[1], tmp2[2])
                                if len(newToken) == 2:      # e.g. 마르, 오르, 바르
                                    newToken = "%s%s" % (newJong, newJong2)
                                else:
                                    newToken = "%s%s%s" % (newToken[:-2], newJong, newJong2)
                                tokenWord = newToken
                            elif jong[1] == 18 and token == "어":       # 'ㅡ' irregular: 쓰+어=써
                                newJong = hangul.build(jong[0], 4, jong[-1])
                                if len(newToken) == 1:
                                    newToken = newJong
                                else:
                                    newToken = "%s%s" % (newToken[:-1], newJong)
                                tokenWord = newToken
                            elif jong[1] == 13 and token == "어":       # 세우+어=세워
                                newJong = hangul.build(jong[0], 14, jong[-1])
                                if len(newToken) == 1:
                                    newToken = newJong
                                else:
                                    newToken = "%s%s" % (newToken[:-1], newJong)
                                tokenWord = newToken
                            elif jong[1] == 8 and token == "아":     # 따라오+아=따라와
                                newJong = hangul.build(jong[0], 9, jong[-1])
                                if len(newToken) == 1:
                                    newToken = newJong
                                else:
                                    newToken = "%s%s" % (newToken[:-1], newJong)
                                tokenWord = newToken
                            elif jong[1] == 11 and token == "어":       # 되+어=되어
                                newToken = "%s%s" % (newToken, "어")
                                tokenWord = newToken
                            elif jong == (18,0,0) and token in {"아", "어"}:
                                if len(newToken) == 1:
                                    tokenWord = "해"
                                else:
                                    tokenWord = "%s해" % newToken[:-1]
                            else:
                                tokenWord = newToken        # e.g. 펴, 자
                        # has a final consonant
                        else:
                            # 'ㅎ'-dropping forms such as '빨개야', '파래야' are not handled as irregular
                            # because the morphological analysis itself is unreliable for them.
                            # '묻다' is not handled as irregular because the senses "bury" (땅에 묻다)
                            # and "ask" (물어보다) cannot be told apart.
                            if jong[-1] == 7 and \
                                    (newToken[-1] in ("걷", "싣", "듣") or newToken[-2:] in ("깨닫", "일컫")):  # final consonant ㄷ
                                newJong = hangul.build(jong[0], jong[1], 8)
                                if len(newToken) == 1:
                                    newToken = newJong
                                else:
                                    newToken = "%s%s" % (newToken[:-1], newJong)
                            elif jong[-1] == 19 and \
                                    (newToken[-1] in ("긋", "낫", "붓", "잇", "젓", "짓")):  # final consonant ㅅ
                                newJong = hangul.build(jong[0], jong[1], 0)
                                if len(newToken) == 1:
                                    newToken = newJong
                                else:
                                    newToken = "%s%s" % (newToken[:-1], newJong)
                            elif jong[-1] == 17 \
                                    and newToken[-1] not in ("입", "잡", "씹", "좁", "접", "뽑"):    # final consonant ㅂ: 눕+어=누워
                                newJong = hangul.build(jong[0], jong[1], 0)
                                if len(newToken) == 1:
                                    newToken = newJong
                                else:
                                    newToken = "%s%s" % (newToken[:-1], newJong)
                                if token == "어":
                                    token = "워"
                                elif token == "아":
                                    token = "와"
                            tokenWord = "%s%s" % (newToken, token)
                        formerTokenTag = "EC"
                        new_check = True
                    elif token in {"어야", "아야", "어다", "아다"}:
                        # The last character of the ending is re-attached after contraction.
                        ending = token[-1]
                        if jong[-1] == 0:           # no final consonant
                            if jong[1] == 20 and token.startswith("어"):       # e.g. 달려야, 마셔야
                                newJong = hangul.build(jong[0], 6, jong[-1])
                                if len(newToken) == 1:
                                    newToken = newJong
                                else:
                                    newToken = "%s%s%s" % (newToken[:-1], newJong, ending)
                                tokenWord = newToken
                            elif jong[1] == 18 and jong[0] == 5:       # '르' irregular: 마르+아야=말라야
                                tmp = hangul.separate(newToken[-2])
                                newJong = hangul.build(tmp[0], tmp[1], 8)
                                tmp2 = hangul.separate(token[0])
                                newJong2 = hangul.build(5, tmp2[1], tmp2[2])
                                if len(newToken) == 2:      # e.g. 마르, 오르, 바르
                                    newToken = "%s%s%s" % (newJong, newJong2, ending)
                                else:
                                    newToken = "%s%s%s%s" % (newToken[:-2], newJong, newJong2, ending)
                                tokenWord = newToken
                            elif jong[1] == 18 and token.startswith("어"):       # 'ㅡ' irregular: 쓰+어야=써야
                                newJong = hangul.build(jong[0], 4, jong[-1])
                                if len(newToken) == 1:
                                    newToken = "%s%s" % (newJong, ending)
                                else:
                                    newToken = "%s%s%s" % (newToken[:-1], newJong, ending)
                                tokenWord = newToken
                            elif jong[1] == 13 and token.startswith("어"):       # 세우+어야=세워야
                                newJong = hangul.build(jong[0], 14, jong[-1])
                                if len(newToken) == 1:
                                    newToken = "%s%s" % (newJong, ending)
                                else:
                                    newToken = "%s%s%s" % (newToken[:-1], newJong, ending)
                                tokenWord = newToken
                            elif jong[1] == 8 and token.startswith("아"):       # 따라오+아야=따라와야
                                newJong = hangul.build(jong[0], 9, jong[-1])
                                if len(newToken) == 1:
                                    newToken = "%s%s" % (newJong, ending)
                                else:
                                    newToken = "%s%s%s" % (newToken[:-1], newJong, ending)
                                tokenWord = newToken
                            elif jong == (18,0,0) and (token.startswith("아") or token.startswith("어")):     # e.g. 해야
                                if len(newToken) == 1:
                                    tokenWord = "해%s" % ending
                                else:
                                    tokenWord = "%s해%s" % (newToken[:-1], ending)
                            else:
                                tokenWord = "%s%s" % (newToken, ending)        # e.g. 펴야, 자야
                        # has a final consonant
                        elif jong[-1] != 0:
                            if jong[-1] == 17 \
                                    and newToken[-1] not in ("입", "잡", "씹", "좁", "접", "뽑"):    # final consonant ㅂ: 눕+어=누워
                                newJong = hangul.build(jong[0], jong[1], 0)
                                if len(newToken) == 1:
                                    newToken = newJong
                                else:
                                    newToken = "%s%s" % (newToken[:-1], newJong)
                                if token.startswith("어"):       # e.g. 아름다워야
                                    tokenWord = "%s워%s" % (newToken, ending)
                                elif token.startswith("아"):     # e.g. 고와야
                                    tokenWord = "%s와%s" % (newToken, ending)
                            elif jong[-1] == 19 and \
                                    (newToken[-1] in ("긋", "낫", "붓", "잇", "젓", "짓")):  # final consonant ㅅ
                                newJong = hangul.build(jong[0], jong[1], 0)
                                if len(newToken) == 1:
                                    newToken = newJong
                                else:
                                    newToken = "%s%s" % (newToken[:-1], newJong)
                                tokenWord = "%s%s" % (newToken, token)
                            elif jong[-1] == 7 and \
                                    (newToken[-1] in ("걷", "싣", "듣") or newToken[-2:] in ("깨닫", "일컫")):      # final consonant ㄷ
                                newJong = hangul.build(jong[0], jong[1], 8)
                                if len(newToken) == 1:
                                    newToken = newJong
                                else:
                                    newToken = "%s%s" % (newToken[:-1], newJong)
                                tokenWord = "%s%s%s" % (newToken, token[0], ending)
                            else:
                                tokenWord = "%s%s" % (newToken, token)
                        formerTokenTag = "EC"
                        new_check = True
                    elif token.endswith("지") and formerTag in {"VV", "VA", "VX"}:       # e.g. 하다_지_모른다
                        tokenWord = "%s다_지_" % newToken
                        formerTokenTag = "EC"
                        new_check = True
                    else:
                        formerTokenTag = "REMOVE_EC"
                        new_check = True
                # verb and adjective
                elif tag in ["VV", "VA", "VX"]:
                    if formerTokenTag == "EC":
                        if tag == "VX" and token == "가지":  # 해가지고, 떠가지고 reduce to 하다, 뜨다
                            tokenWord = original_token
                        else:
                            tokenWord = "%s%s" % (tokenWord, token+"다")
                    elif formerTokenTag.startswith("NNB"):
                        if formerTokenTag.endswith("1") and token in {"있", "없"}:    # e.g. 하다_수_있다
                            tokenWord = "%s_%s다" % (tokenWord, token)
                        elif formerTokenTag.endswith("2") and token in {"모르", "않", "말"}:    # e.g. 하다_지_모르다
                            tokenWord = "%s_%s다" % (tokenWord, token)
                        elif formerTokenTag.endswith("3"):      # e.g. 사랑_때문_자라다
                            tokenWord = "%s_%s다" % (tokenWord, token)
                        else:
                            tokenWord += "%s다" % token
                    elif formerTokenTag == "JX":
                        if token == "없":
                            tokenWord += "_%s다" % token
                        else:
                            tokenWord += " %s다" % token
                    else:
                        if tag in ("VX", "VA") and token in ["않", "없", "못하", "말", "주"]:
                            tokenWord += "_%s다" % token
                        else:   # rest all tag including VX
                            # insert white space if V-V or VA-VA case(grammar error)
                            if formerTokenTag == tag:
                                tokenWord += " %s다" % token
                            else:
                                tokenWord += "%s다" % token
                    formerTokenTag = tag
                    new_check = True
                # adverb
                elif tag == "MAG" and token == "못":
                    tokenWord += "_%s" % token
                elif tag == "MAG" and token == "없이":
                    tokenWord = "%s없다" % tokenWord
                # JKB: adverbial case particle
                elif tag == "JKB" and token == "같이":
                    tokenWord = "%s같다" % tokenWord
                else:
                    idx += 1
                    formerTag = tag
                    new_check, formerTokenTag = ending_check(new_check, formerTokenTag, idx, word_idx, phrase_cnt, word_cnt)
                    continue
                formerTag = tag
                new_check, formerTokenTag = ending_check(new_check, formerTokenTag, idx, word_idx, phrase_cnt, word_cnt)
            idx += 1
        tokenList.append(tokenWord.strip())
    # if it's the last word in 'Komoran'
    if formerTokenTag == "EC":
        tokenList[-1] = original_token
    finalToken = " ".join([a for a in tokenList if a not in {"", "하다", "암트다"}])        # stop word
    if "안 하다" in finalToken:
        finalToken = finalToken.replace("안 하다", "안하다")
    # Attach each negation word to the preceding token with a single "_",
    # unless the negation word is the whole result.
    for neg in {"없다", "않다", "못하다", "안하다"}:
        if neg == finalToken:
            continue
        if " _%s" % neg in finalToken:
            finalToken = finalToken.replace(" _%s" % neg, " %s" % neg)
        if " %s" % neg in finalToken:
            finalToken = finalToken.replace(" %s" % neg, "_%s" % neg)
        if (" _%s" % neg) in finalToken:
            finalToken = finalToken.replace((" _%s" % neg), ("_%s" % neg))
    # Tidy stray or doubled underscores, then collapse runs of whitespace.
    finalToken = finalToken.strip("_").replace("__", "_").replace(" _", " ").replace("_ ", " ")
    return re.sub(r"\s+", " ", finalToken)
Example #8
0
def test_separation():
    """Verify the index triples returned by ``hangul.separate``.

    Each syllable is paired with the tuple the decomposition must equal;
    the triple is presumably (initial, medial, final) jamo indices.
    """
    expectations = (
        (u'가', (0, 0, 0)),
        (u'까', (1, 0, 0)),
        (u'갸', (0, 2, 0)),
        (u'각', (0, 0, 1)),
    )
    for syllable, wanted in expectations:
        assert hangul.separate(syllable) == wanted
Example #9
0
def test_separation():
    """Verify the index triples returned by ``hangul.separate``.

    Each syllable is paired with the tuple the decomposition must equal;
    the triple is presumably (initial, medial, final) jamo indices.
    """
    expectations = (
        (u"가", (0, 0, 0)),
        (u"까", (1, 0, 0)),
        (u"갸", (0, 2, 0)),
        (u"각", (0, 0, 1)),
    )
    for syllable, wanted in expectations:
        assert hangul.separate(syllable) == wanted
Example #10
0
def komoran_processing(komoran, complex_verb_set):
    """Merge Komoran morphemes into a normalised, space-separated string.

    Walks every phrase of the analysis result, glues selected morphemes
    (nouns, predicates, negations, dependent nouns, connective endings)
    into one word per phrase, and finally joins the words while attaching
    negative predicates to the preceding word with ``_``.

    Args:
        komoran: Sized iterable of phrases. Each phrase is an indexable,
            sized sequence of items exposing ``getFirst()`` (token text)
            and ``getSecond()`` (POS tag).
        complex_verb_set: Container of verb stems; a predicate that starts
            a phrase right after a connective ending is merged with the
            previous word only when its stem is in this container.

    Returns:
        The processed text with runs of whitespace collapsed to one space.

    Note:
        Relies on ``ending_check`` and the ``hangul`` module from the
        enclosing file. ``ending_check`` is treated as opaque here: it
        receives the current state and returns the updated
        ``(new_check, formerTokenTag)`` pair.
    """
    tokenList = list()
    formerTokenTag = ""  # tag-like state carried across tokens and phrases
    original_token = ""  # word as it was before a connective ending was fused
    word_cnt = len(komoran)
    word_idx = 0
    for phrase in komoran:
        phrase_cnt = len(phrase)
        idx = 0
        word_idx += 1
        tokenWord = ""  # word being assembled for this phrase
        formerTag = ""  # POS tag of the previous morpheme in this phrase
        new_check = False
        while idx < phrase_cnt:
            token = phrase[idx].getFirst().replace(
                " ", "")  # remove white space in proper nouns
            # Skip overly long tokens (more than 9 characters).
            if " " not in token and len(token) > 9:
                idx += 1
                new_check, formerTokenTag = ending_check(
                    new_check, formerTokenTag, idx, word_idx, phrase_cnt,
                    word_cnt)
                continue
            tag = phrase[idx].getSecond()
            ########## FIRST PHONEME
            if idx == 0 or (idx == 1 and formerTag == "XPN"):  # XPN: noun prefix
                if formerTokenTag == "REMOVE_EC":
                    tokenList[-1] = "%s " % tokenList[-1]
                # To combine the former token and the current token
                elif formerTokenTag == "MAG":
                    tokenWord = tokenList[-1]
                    del tokenList[-1]
                elif formerTokenTag == "EC":
                    # The fused "stem + ending" form is only kept when a
                    # complex-verb predicate follows; otherwise restore the
                    # word saved before the ending was applied.
                    if tag not in {"VV", "VA", "VX"
                                   } or (tag in {"VV", "VA", "VX"}
                                         and token not in complex_verb_set):
                        tokenList[-1] = original_token
                        formerTokenTag = ""
                        original_token = ""
                # prefix (noun prefix)
                if tag == "XPN":
                    tokenWord = token
                    formerTag = tag
                    idx += 1
                    new_check, formerTokenTag = ending_check(
                        new_check, formerTokenTag, idx, word_idx, phrase_cnt,
                        word_cnt)
                    continue
                else:
                    # common noun, proper noun, root adverb and foreign language
                    if tag in {"NNG", "NNP", "SL", "XR", "MAG"}:
                        if tag == "SL":
                            token = token.lower()
                        if tag == "MAG":
                            # Only negation-like adverbs are kept.
                            if token in {"안", "못", "잘못"}:
                                formerTokenTag = "MAG"
                                new_check = True
                            else:
                                idx += 1
                                new_check, formerTokenTag = ending_check(
                                    new_check, formerTokenTag, idx, word_idx,
                                    phrase_cnt, word_cnt)
                                continue
                        elif tag == "NNP":
                            token = "%s " % token
                        elif formerTokenTag == "NNB" and token == "밖":
                            token = "밖에"
                            tokenWord = tokenWord.rstrip("_")
                            formerTokenTag = "JX"
                            new_check = True
                        else:
                            pass
                    # dependent nouns
                    elif tag == "NNB" and token in {"수", "지", "때문"}:
                        if token == "수" and formerTokenTag in {
                                "VV", "VA", "VX"
                        }:
                            formerTokenTag = "NNB1"
                        elif token == "수" and formerTokenTag not in {
                                "VV", "VA", "VX"
                        }:
                            formerTokenTag = ""
                            pass  # analysis error, e.g. '수' in "사람의 수"
                        elif token == "지" and formerTokenTag in {
                                "VV", "VA", "VX"
                        }:
                            formerTokenTag = "NNB2"
                        elif token == "때문":
                            formerTokenTag = "NNB3"
                        else:
                            formerTokenTag = ""
                            new_check = False
                            idx += 1
                            continue
                        new_check = True
                        try:
                            if formerTokenTag.startswith("NNB"):
                                # Attach the dependent noun to the previous word.
                                token = "%s_%s" % (tokenList[-1], token)
                                del tokenList[-1]
                            else:
                                pass
                        except IndexError:  # not dependent nouns
                            formerTokenTag = ""
                            new_check = False
                            idx += 1
                            continue
                    # verb, adjective and auxiliary predicate
                    elif tag in ["VV", "VA", "VX"]:
                        if formerTokenTag == "EC":
                            token = "%s%s" % (tokenList[-1], "%s다" % token)
                            del tokenList[-1]
                        elif formerTokenTag == "JX":
                            if token == "없":  # e.g. "수밖에_없다"
                                token = "_%s다" % token
                            else:
                                token = " %s다" % token
                        elif formerTokenTag.startswith("NNB"):
                            if formerTokenTag.endswith("1") and token in {
                                    "있", "없"
                            }:  # e.g. "하다_수_있다"
                                tokenWord = tokenList[-1]
                                token = "_%s다" % token
                                del tokenList[-1]
                            elif formerTokenTag.endswith("2") and token in {
                                    "모르", "않", "말"
                            }:  # e.g. "하다_지_모르다"
                                tokenWord = tokenList[-1]
                                token = "_%s다" % token
                                del tokenList[-1]
                            elif formerTokenTag.endswith("3"):  # e.g. "사랑_때문_자라다"
                                tokenWord = tokenList[-1]
                                token = "_%s다" % token
                                del tokenList[-1]
                            else:
                                token = "%s다" % token
                        # only in case negative words
                        elif token in ["않", "없", "못하", "말", "싫", "주"]:
                            try:
                                token = "%s%s" % (tokenList[-1],
                                                  "_%s다" % token)
                                del tokenList[-1]
                            except IndexError:
                                # No previous word to attach to.
                                token = "_%s다" % token
                        else:
                            token = "%s다" % token
                        formerTokenTag = tag
                        new_check = True
                    elif formerTokenTag == "NNB1" and tag == "JX":
                        if token == "밖에":  # "수 밖에" -> "수밖에"
                            token = "%s%s" % (tokenList[-1].rstrip("_"), token)
                            del tokenList[-1]
                            formerTokenTag = "JX"
                            new_check = True
                        elif token == "도":  # "갈 수도 있다" -> "가다_수_있다"
                            new_check = True
                    else:
                        # just pass the rest
                        idx += 1
                        formerTag = tag
                        new_check, formerTokenTag = ending_check(
                            new_check, formerTokenTag, idx, word_idx,
                            phrase_cnt, word_cnt)
                        continue
                    tokenWord += token
                    formerTag = tag
                    # checking 'formerTokenTag' to be newly assigned
                    new_check, formerTokenTag = ending_check(
                        new_check, formerTokenTag, idx, word_idx, phrase_cnt,
                        word_cnt)
            ########## FROM SECOND PHONEME
            else:
                if formerTokenTag == "REMOVE_EC":
                    tokenWord = "%s " % tokenWord
                elif formerTokenTag.startswith("NNB") and tag not in {
                        "VV", "VA", "VX"
                }:
                    tokenWord = "%s " % tokenWord
                # ETM: adnominal transformative ending,
                # ETN: nominal transformative ending
                if formerTag in ["ETM", "ETN"]:
                    tokenWord = "%s " % tokenWord
                # common noun, proper noun, root and foreign language, suffix(noun) and dependent noun
                if tag in {"NNG", "NNP", "XR", "SL", "NNB", "XSN"}:
                    if tag == "SL":
                        token = token.lower()
                    if tag == "NNP":
                        if formerTag == "NNP":
                            tokenWord = "%s%s " % (tokenWord, token)
                        else:
                            tokenWord = "%s %s " % (tokenWord, token)
                    elif tag == "NNB":
                        if token in {"수", "지", "때문"}:
                            if token == "수":
                                if formerTag in {"VV", "VA", "VX"}:
                                    tokenWord = "%s_%s" % (tokenWord.strip(),
                                                           token)
                                    formerTokenTag = "NNB1"
                                else:
                                    tokenWord = "%s %s" % (tokenWord.strip(),
                                                           token)
                            elif token == "지" and formerTag in {
                                    "VV", "VA", "VX"
                            }:
                                tokenWord = "%s_%s" % (tokenWord.strip(),
                                                       token)
                                formerTokenTag = "NNB2"
                            elif token == "때문":
                                tokenWord = "%s_%s" % (tokenWord.strip(),
                                                       token)
                                formerTokenTag = "NNB3"
                            else:
                                pass
                            new_check = True
                        else:
                            pass
                    else:
                        tokenWord += token
                # negative copula '아니'
                elif tag == "VCN":
                    tokenWord = "%s_%s다" % (tokenWord, token)
                # "수 밖에" -> "수밖에"
                elif (formerTokenTag == "NNB1"
                      or formerTag == "NNB1") and tag == "JX":
                    if token == "밖에":
                        tokenWord = "%s%s" % (tokenWord.rstrip("_"), token)
                        formerTokenTag = "JX"
                        new_check = True
                    elif token == "도":
                        new_check = True
                # suffix(adjective and verb)
                elif tag in {"XSA", "XSV"}:
                    token = "%s다" % token
                    tokenWord += token
                # connective endings
                elif tag == "EC":
                    # Keep the dictionary form so it can be restored later if
                    # the fused form turns out not to be needed.
                    original_token = tokenWord
                    newToken = tokenWord.rstrip("다")
                    try:
                        jong = hangul.separate(newToken[-1])  # decompose the last character
                    except IndexError:
                        # Nothing left of the stem; ignore this ending.
                        idx += 1
                        formerTag = tag
                        new_check, formerTokenTag = ending_check(
                            new_check, formerTokenTag, idx, word_idx,
                            phrase_cnt, word_cnt)
                        continue
                    if token in {"아", "어"}:
                        if formerTag == "VCP":
                            idx += 1
                            formerTag = tag
                            new_check, formerTokenTag = ending_check(
                                new_check, formerTokenTag, idx, word_idx,
                                phrase_cnt, word_cnt)
                            continue
                        # no final consonant
                        if jong[-1] == 0:
                            if jong[1] == 20 and token == "어":  # 달리+어=달려
                                newJong = hangul.build(jong[0], 6, jong[-1])
                                if len(newToken) == 1:
                                    newToken = newJong
                                else:
                                    newToken = "%s%s" % (newToken[:-1],
                                                         newJong)
                                tokenWord = newToken
                            elif jong[1] == 18 and jong[
                                    0] == 5:  # '르' irregular: 마르+아=말라
                                tmp = hangul.separate(newToken[-2])
                                newJong = hangul.build(tmp[0], tmp[1], 8)
                                tmp2 = hangul.separate(token)
                                newJong2 = hangul.build(5, tmp2[1], tmp2[2])
                                if len(newToken) == 2:  # 마르, 오르, 바르..
                                    newToken = "%s%s" % (newJong, newJong2)
                                else:
                                    newToken = "%s%s%s" % (newToken[:-2],
                                                           newJong, newJong2)
                                tokenWord = newToken
                            elif jong[
                                    1] == 18 and token == "어":  # 'ㅡ' irregular: 쓰+어=써
                                newJong = hangul.build(jong[0], 4, jong[-1])
                                if len(newToken) == 1:
                                    newToken = newJong
                                else:
                                    newToken = "%s%s" % (newToken[:-1],
                                                         newJong)
                                tokenWord = newToken
                            elif jong[1] == 13 and token == "어":  # 세우+어=세워
                                newJong = hangul.build(jong[0], 14, jong[-1])
                                if len(newToken) == 1:
                                    newToken = newJong
                                else:
                                    newToken = "%s%s" % (newToken[:-1],
                                                         newJong)
                                tokenWord = newToken
                            elif jong[1] == 8 and token == "아":  # 따라오+아=따라와
                                newJong = hangul.build(jong[0], 9, jong[-1])
                                if len(newToken) == 1:
                                    newToken = newJong
                                else:
                                    newToken = "%s%s" % (newToken[:-1],
                                                         newJong)
                                tokenWord = newToken
                            elif jong[1] == 11 and token == "어":  # 되+어=되어
                                newToken = "%s%s" % (newToken, "어")
                                tokenWord = newToken
                            elif jong == (18, 0, 0) and token in {"아", "어"}:
                                # Stem ends in '하': contract to '해'.
                                if len(newToken) == 1:
                                    tokenWord = "해"
                                else:
                                    tokenWord = "%s해" % newToken[:-1]
                            else:
                                tokenWord = newToken  # 펴, 자
                        # has a final consonant
                        else:
                            # ㅎ-dropping forms such as '빨개야', '파래야' are not
                            # handled because the morphological analysis itself
                            # is unreliable for them.
                            # '묻다' is excluded from the irregular rule because
                            # its "bury" and "ask" senses cannot be told apart.
                            if jong[-1] == 7 and \
                                    (newToken[-1] in ("걷", "싣", "듣") or newToken[-2:] in ("깨닫", "일컫")):  # final ㄷ
                                newJong = hangul.build(jong[0], jong[1], 8)
                                if len(newToken) == 1:
                                    newToken = newJong
                                else:
                                    newToken = "%s%s" % (newToken[:-1],
                                                         newJong)
                            elif jong[-1] == 19 and \
                                    (newToken[-1] in ("긋", "낫", "붓", "잇", "젓", "짓")):  # final ㅅ
                                newJong = hangul.build(jong[0], jong[1], 0)
                                if len(newToken) == 1:
                                    newToken = newJong
                                else:
                                    newToken = "%s%s" % (newToken[:-1],
                                                         newJong)
                            elif jong[-1] == 17 \
                                    and newToken[-1] not in ("입", "잡", "씹", "좁", "접", "뽑"):    # final ㅂ: 눕+어=누워
                                newJong = hangul.build(jong[0], jong[1], 0)
                                if len(newToken) == 1:
                                    newToken = newJong
                                else:
                                    newToken = "%s%s" % (newToken[:-1],
                                                         newJong)
                                if token == "어":
                                    token = "워"
                                elif token == "아":
                                    token = "와"
                            tokenWord = "%s%s" % (newToken, token)
                        formerTokenTag = "EC"
                        new_check = True
                    elif token in {"어야", "아야", "어다", "아다"}:
                        # Second syllable of the ending ("야" or "다"); it is
                        # re-appended after the stem has been contracted.
                        ending = token[-1]
                        if jong[-1] == 0:  # no final consonant
                            if jong[1] == 20 and token.startswith(
                                    "어"):  # 달려야, 마셔야
                                newJong = hangul.build(jong[0], 6, jong[-1])
                                if len(newToken) == 1:
                                    # One-syllable stems must keep the ending
                                    # too (지+어야=져야), like every other
                                    # contraction in this branch.
                                    newToken = "%s%s" % (newJong, ending)
                                else:
                                    newToken = "%s%s%s" % (newToken[:-1],
                                                           newJong, ending)
                                tokenWord = newToken
                            elif jong[1] == 18 and jong[
                                    0] == 5:  # '르' irregular: 마르+아야=말라야
                                tmp = hangul.separate(newToken[-2])
                                newJong = hangul.build(tmp[0], tmp[1], 8)
                                tmp2 = hangul.separate(token[0])
                                newJong2 = hangul.build(5, tmp2[1], tmp2[2])
                                if len(newToken) == 2:  # 마르, 오르, 바르..
                                    newToken = "%s%s%s" % (newJong, newJong2,
                                                           ending)
                                else:
                                    newToken = "%s%s%s%s" % (newToken[:-2],
                                                             newJong, newJong2,
                                                             ending)
                                tokenWord = newToken
                            elif jong[1] == 18 and token.startswith(
                                    "어"):  # 'ㅡ' irregular: 쓰+어야=써야
                                newJong = hangul.build(jong[0], 4, jong[-1])
                                if len(newToken) == 1:
                                    newToken = "%s%s" % (newJong, ending)
                                else:
                                    newToken = "%s%s%s" % (newToken[:-1],
                                                           newJong, ending)
                                tokenWord = newToken
                            elif jong[1] == 13 and token.startswith(
                                    "어"):  # 세우+어야=세워야
                                newJong = hangul.build(jong[0], 14, jong[-1])
                                if len(newToken) == 1:
                                    newToken = "%s%s" % (newJong, ending)
                                else:
                                    newToken = "%s%s%s" % (newToken[:-1],
                                                           newJong, ending)
                                tokenWord = newToken
                            elif jong[1] == 8 and token.startswith(
                                    "아"):  # 따라오+아야=따라와야
                                newJong = hangul.build(jong[0], 9, jong[-1])
                                if len(newToken) == 1:
                                    newToken = "%s%s" % (newJong, ending)
                                else:
                                    newToken = "%s%s%s" % (newToken[:-1],
                                                           newJong, ending)
                                tokenWord = newToken
                            elif jong == (18, 0,
                                          0) and (token.startswith("아") or
                                                  token.startswith("어")):  # 해야
                                if len(newToken) == 1:
                                    tokenWord = "해%s" % ending
                                else:
                                    tokenWord = "%s해%s" % (newToken[:-1],
                                                           ending)
                            else:
                                tokenWord = "%s%s" % (newToken, ending
                                                      )  # 펴야, 자야
                        # has a final consonant
                        elif jong[-1] != 0:
                            if jong[-1] == 17 \
                                    and newToken[-1] not in ("입", "잡", "씹", "좁", "접", "뽑"):    # final ㅂ: 눕+어=누워
                                newJong = hangul.build(jong[0], jong[1], 0)
                                if len(newToken) == 1:
                                    newToken = newJong
                                else:
                                    newToken = "%s%s" % (newToken[:-1],
                                                         newJong)
                                if token.startswith("어"):  # 아름다워야
                                    tokenWord = "%s워%s" % (newToken, ending)
                                elif token.startswith("아"):  # 고와야
                                    tokenWord = "%s와%s" % (newToken, ending)
                            elif jong[-1] == 19 and \
                                    (newToken[-1] in ("긋", "낫", "붓", "잇", "젓", "짓")):  # final ㅅ
                                newJong = hangul.build(jong[0], jong[1], 0)
                                if len(newToken) == 1:
                                    newToken = newJong
                                else:
                                    newToken = "%s%s" % (newToken[:-1],
                                                         newJong)
                                tokenWord = "%s%s" % (newToken, token)
                            elif jong[-1] == 7 and \
                                    (newToken[-1] in ("걷", "싣", "듣") or newToken[-2:] in ("깨닫", "일컫")):      # final ㄷ
                                newJong = hangul.build(jong[0], jong[1], 8)
                                if len(newToken) == 1:
                                    newToken = newJong
                                else:
                                    newToken = "%s%s" % (newToken[:-1],
                                                         newJong)
                                tokenWord = "%s%s%s" % (newToken, token[0],
                                                        ending)
                            else:
                                tokenWord = "%s%s" % (newToken, token)
                        formerTokenTag = "EC"
                        new_check = True
                    elif token.endswith("지") and formerTag in {
                            "VV", "VA", "VX"
                    }:  # e.g. "하다_지_모른다"
                        tokenWord = "%s다_지_" % newToken
                        formerTokenTag = "EC"
                        new_check = True
                    else:
                        formerTokenTag = "REMOVE_EC"
                        new_check = True
                # verb and adjective
                elif tag in ["VV", "VA", "VX"]:
                    if formerTokenTag == "EC":
                        if tag == "VX" and token == "가지":  # 해가지고, 떠가지고 -> 하다, 뜨다
                            tokenWord = original_token
                        else:
                            tokenWord = "%s%s" % (tokenWord, token + "다")
                    elif formerTokenTag.startswith("NNB"):
                        if formerTokenTag.endswith("1") and token in {
                                "있", "없"
                        }:  # e.g. "하다_수_있다"
                            tokenWord = "%s_%s다" % (tokenWord, token)
                        elif formerTokenTag.endswith("2") and token in {
                                "모르", "않", "말"
                        }:  # e.g. "하다_지_모르다"
                            tokenWord = "%s_%s다" % (tokenWord, token)
                        elif formerTokenTag.endswith("3"):  # e.g. "사랑_때문_자라다"
                            tokenWord = "%s_%s다" % (tokenWord, token)
                        else:
                            tokenWord += "%s다" % token
                    elif formerTokenTag == "JX":
                        if token == "없":
                            tokenWord += "_%s다" % token
                        else:
                            tokenWord += " %s다" % token
                    else:
                        if tag in ("VX", "VA") and token in [
                                "않", "없", "못하", "말", "주"
                        ]:
                            tokenWord += "_%s다" % token
                        else:  # rest all tag including VX
                            # insert white space if V-V or VA-VA case(grammar error)
                            if formerTokenTag == tag:
                                tokenWord += " %s다" % token
                            else:
                                tokenWord += "%s다" % token
                    formerTokenTag = tag
                    new_check = True
                # adverb
                elif tag == "MAG" and token == "못":
                    tokenWord += "_%s" % token
                elif tag == "MAG" and token == "없이":
                    tokenWord = "%s없다" % tokenWord
                # JKB: adverbial case particle
                elif tag == "JKB" and token == "같이":
                    tokenWord = "%s같다" % tokenWord
                else:
                    idx += 1
                    formerTag = tag
                    new_check, formerTokenTag = ending_check(
                        new_check, formerTokenTag, idx, word_idx, phrase_cnt,
                        word_cnt)
                    continue
                formerTag = tag
                new_check, formerTokenTag = ending_check(
                    new_check, formerTokenTag, idx, word_idx, phrase_cnt,
                    word_cnt)
            idx += 1
        tokenList.append(tokenWord.strip())
    # if it's the last word in 'Komoran'
    if formerTokenTag == "EC":
        # A trailing fused ending was never consumed; restore the saved word.
        tokenList[-1] = original_token
    finalToken = " ".join([a for a in tokenList
                           if a not in {"", "하다", "암트다"}])  # stop word
    if "안 하다" in finalToken:
        finalToken = finalToken.replace("안 하다", "안하다")
    # Attach negative predicates to the preceding word with an underscore.
    for neg in {"없다", "않다", "못하다", "안하다"}:
        if neg == finalToken:
            continue
        if " _%s" % neg in finalToken:
            finalToken = finalToken.replace(" _%s" % neg, " %s" % neg)
        if " %s" % neg in finalToken:
            finalToken = finalToken.replace(" %s" % neg, "_%s" % neg)
        if (" _%s" % neg) in finalToken:
            finalToken = finalToken.replace((" _%s" % neg), ("_%s" % neg))
    # Clean up stray or doubled underscores left by the merging above.
    finalToken = finalToken.strip("_").replace("__",
                                               "_").replace(" _", " ").replace(
                                                   "_ ", " ")
    return re.sub(r"\s+", " ", finalToken)
Example #11
0
def test_separation():
    """Verify the index triples returned by ``hangul.separate``.

    Each syllable is paired with the tuple the decomposition must equal;
    the triple is presumably (initial, medial, final) jamo indices.
    """
    expectations = (
        (u'가', (0, 0, 0)),
        (u'까', (1, 0, 0)),
        (u'갸', (0, 2, 0)),
        (u'각', (0, 0, 1)),
    )
    for syllable, wanted in expectations:
        assert hangul.separate(syllable) == wanted