Example #1
from konlpy.tag import Kkma
from gensim.models.fasttext import load_facebook_model


class TagParser:
    def __init__(self):
        self.kkma = Kkma()
        # gensim's public load_facebook_model replaces the private _load_fasttext_format call
        self.model = load_facebook_model("../fasttext.bin")

    def parse_tag(self, sentence):
        '''
        Parse tags from a sentence.
        '''
        posed_sentence = self.kkma.pos(sentence)  # split the sentence into morphemes
        xr_list = []
        nng_va_list = []  # VA + NNG
        nng_va_concat = []  # NNG/VA pairs joined together

        # extract only XR (roots)
        for morph in posed_sentence:
            if morph[1] == 'XR':
                xr_list.append(morph[0])

        # keep only VA (adjectives) and NNG (common nouns)
        for morph in posed_sentence:
            if morph[1] == 'NNG' or morph[1] == 'VA':
                nng_va_list.append(morph)

        # when the item right before a VA is an NNG, store the joined pair in a separate list
        for index, item in enumerate(nng_va_list):
            if item[1] == 'VA':
                if index == 0:
                    continue
                elif nng_va_list[index - 1][1] == 'NNG':
                    nng_va_concat.append(nng_va_list[index - 1][0] + " " + item[0] + "음")

        # otherwise skip, e.g. 넓음
        # merge XR[] and the joined NNG/VA list
        return xr_list + nng_va_concat

    def pos(self, sentence):
        '''
        Tokenize a sentence into morphemes.
        '''
        return self.kkma.pos(sentence)

    def most_sim(self, word):
        '''
        Return the words most similar to the given word.
        '''
        return self.model.wv.most_similar(word)

    def wmd(self, tag1, tag2):
        '''
        Return the Word Mover's Distance between tag1 and tag2.
        '''
        return self.model.wv.wmdistance(tag1, tag2)
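
A minimal usage sketch (the fastText path comes from the snippet; the example sentence and outputs below are illustrative, not from the original code):

parser = TagParser()
# NNG + VA pairs are joined and suffixed with "음", e.g. 방/NNG + 넓/VA -> '방 넓음'
print(parser.parse_tag('방이 넓고 깨끗해요'))
print(parser.most_sim('깨끗'))    # nearest neighbours in the fastText space
print(parser.wmd('깨끗', '청결'))  # Word Mover's Distance between two tags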
Example #2
class UsecaseFinder:
    def __init__(self, keyword):  # `keyword` was undefined here; pass the module/object in explicitly
        self.keyword = keyword
        self.kkma = Kkma()
        self.kkma.pos(u'성능을 위한 더미 데이터')  # dummy call to warm up the analyzer
        self.usecase = []
        self.parsingNVLIST = []
        self.parsingLIST = []
        self.dic = {}

    def getNV(self, sentence):
        self.parsingNVLIST = []
        self.parsingLIST = []
        data = self.kkma.pos(sentence)
        for morph in data:  # renamed from `list`, which shadowed the builtin
            if 'N' in morph[1] or morph[1].count(
                    'V') > 1 or 'C' in morph[1] or 'JKM' in morph[1]:
                self.parsingNVLIST.append(morph[0])
                self.parsingLIST.append(morph)
        print("\nkkma_LOG %s" % self.parsingNVLIST)

    def setUserSetting(self):
        self.setUsecase("feed", ["먹이", "사료", "먹을 것", "밥", "배식"],
                        ["주", "줘", "급여", "배식", "먹"], util.GRAVITY_N)
        self.setUsecase("open", ["문", "입구"], ["열", "오픈", "개방"], util.GRAVITY_N)
        self.setUsecase("camera", ["사진", "상황", "모습", "얼굴", "현황"],
                        ["보여주", "찍", "알리"], util.GRAVITY_ALL)
        self.setUsecase("regist", ["등록"], ["등록"], util.GRAVITY_ANY)
        self.setUsecase("howToUse", ["사용법", '도우미', "도움말"],
                        ["사용법", '도우미', "도움말"], util.GRAVITY_ANY)

    def printList(self):
        for item in self.usecase:
            item._print()

    def analyzeSentence(self, sentence):
        request = []
        self.getNV(sentence)
        for item in self.usecase:
            if item.isMe(self.parsingNVLIST, self.parsingLIST):
                request.append(item.keyWord)

        return request

    def setUsecase(self, name, nList, vList, gravity):
        self.usecase.append(self.keyword.KeyWord(name))
        keyword = self.usecase[-1]
        keyword.setNouns(nList)
        keyword.setGravity(gravity)  # the duplicated setGravity call was removed
        keyword.setVerbs(vList)
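
A hypothetical driver, assuming a keyword module exposing the KeyWord class and a util module with the GRAVITY_* constants used above (both external to this snippet):

import keyword_module as keyword  # hypothetical; the original relies on an external keyword module
finder = UsecaseFinder(keyword)
finder.setUserSetting()
finder.printList()
print(finder.analyzeSentence('문 좀 열어 줘'))  # e.g. ['open'] when the noun/verb lists match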
Example #3
def run_kkma():
    kkma = Kkma()
    start_time = time.time()
    print('kkma start')
    kkma_morphs = kkma.morphs(news1)
    kkma_nouns = kkma.nouns(news1)
    kkma_pos = kkma.pos(news1)
    end_time = time.time()
    print('kkma done - %s s' % str(end_time - start_time))
    kkma_sentences = kkma.sentences(news1)

    with open('kkma.txt', 'w', encoding='utf-8') as fstream:
        fstream.write('kkma time : %s s\n' % str(end_time - start_time))
        fstream.write('kkma_morphs\n')
        write_list(kkma_morphs, fstream)
        fstream.write('\n\n')

        fstream.write('kkma_nouns\n')
        write_list(kkma_nouns, fstream)
        fstream.write('\n\n')

        fstream.write('kkma_pos\n')
        write_pos(kkma_pos, fstream)
        fstream.write('\n\n')

        fstream.write('kkma_sentences\n')
        write_list(kkma_sentences, fstream)
        fstream.write('\n')
Example #4
def tagPOS(filename):
    try:
        # Read file as UTF-8 (the original mixed Python 2 decode() with Python 3 prints)
        with open(filename, 'r', encoding='utf-8') as f:
            text = f.read()

        # tagging
        from konlpy.tag import Kkma
        kkma = Kkma()
        print('now tagging...')
        tagged = kkma.pos(text)

        # Write tagged file
        (path, fnameExt) = os.path.split(filename)
        (fname, fext) = os.path.splitext(fnameExt)
        tagged_file = fname + '_tagged' + fext
        with open(tagged_file, 'w', encoding='utf-8') as fw:
            for line in tagged:
                fw.write("\t".join(line) + "\n")
        print('%s is created' % tagged_file)
    except Exception:
        print('\nERROR:')
        print('"%s" is not a valid text\nCheck your text file\nor file path\n' % filename)
        sys.exit(1)
Example #5
    def get_kkma_token(self):
        kkma = Kkma()
        self.kkmaTag = kkma.pos(self.doc)

        for i in self.kkmaTag:
            if i[1] == 'NNG':
                self.kkma_tokens.append(i[0])
Example #6
def get_morphemes(post_detail_cont):
    kkma = Kkma()
    results = []
    except_word_list = []
    # remove special characters
    detail = remove_sc(str(post_detail_cont))
    # split into words (eojeol) > list
    # note: | and + are literal inside a character class, so they were dropped from the pattern
    origin_word_list = list(
        dict.fromkeys(
            regex.findall(r'[\p{Hangul}\p{Latin}\p{Han}\d]+', detail)))
    if len(origin_word_list) > 100:
        del origin_word_list[100:]

    # origin_word_list = ['안녕하세요', '문의사항이', '있습니다']
    for origin_word in origin_word_list:
        if origin_word not in except_word_list:
            for morpheme in kkma.pos(origin_word):
                results.append([origin_word, morpheme])

    # results = [['자동차', ('자동차', 'NNG')], ['고장', ('고장', 'NNG')], ['진단', ('진단', 'NNG')], ['APP', ('APP', 'OL')], ['개발', ('개발', 'NNG')], ['데이터', ('데이터', 'NNG')], ['수집', ('수집', 'NNG')]]
    return results
Example #7
class konlpy_test:
    def __init__(self):
        self.kkma = Kkma()

    def split_sen(self, text):
        words = [x[0] for x in self.kkma.pos(text) if x[1] == 'NNG']
        return words
Example #8
	def run(self) :
		print ('KonLpy Start')
		fileName	= os.path.basename(self.path)
		compFilePath	= os.path.join(self.FILE_PATH, fileName)
	
		fileTextList = []

		kkma	= Kkma()

		with open(self.path, 'r', encoding='utf-8') as readFile :
			fileTextList =	readFile.readlines()

		
		with open(compFilePath, 'w', encoding='utf-8') as comFile :
			for fileText in fileTextList :
				if fileText is None or fileText.strip() == '' :
					continue
				try :
					result = kkma.pos(fileText)
				except Exception as e :
					# skip the line on failure; the original reused the previous line's result
					print (e)
					continue
				for resultTuple in result :
					if resultTuple[1] in self.KKMA_TAG_DICT :
						comFile.write('%s : %s [%s]' % (resultTuple[0], resultTuple[1], self.KKMA_TAG_DICT[resultTuple[1]]) + '\n')
					else :
						comFile.write('%s : %s [%s]' % (resultTuple[0], resultTuple[1], 'Unknown') + '\n')
Example #9
def fileToMat(filename, w2vec, maxLen, label_set, train=True):
    kkma = Kkma()
    train_f = open(filename, 'r', encoding='utf-8')
    mat = []
    line_num = 0
    for line in train_f.read().splitlines():
        sen = {}
        line_splitted = line.split('\t')
        sbj = line_splitted[0].strip()
        obj = line_splitted[1].strip()
        relation = line_splitted[2].strip()
        sentence = line_splitted[3].strip()
        sentence_complete = re.sub('<< _obj_ >>', obj,
                                   re.sub('<< _sbj_ >>', sbj, sentence))
        sentence_complete = utils.clean_str(sentence_complete)
        tokens = [
            p[0] + '/' + p[1] for p in kkma.pos(sentence_complete)
            if p[0] + '/' + p[1] in w2vec.vocab
        ]
        if maxLen < len(tokens):
            if train:
                maxLen = len(tokens)
            else:
                tokens = tokens[:maxLen]
        label_set.add(relation)
        sen['sbj'] = sbj
        sen['obj'] = obj
        sen['relation'] = relation
        sen['sentence'] = sentence
        sen['tokens'] = tokens
        mat.append(sen)
        line_num += 1
    train_f.close()
    return mat, label_set, maxLen, line_num
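
A hypothetical call, assuming a tab-separated file whose columns are subject, object, relation and sentence, plus a loaded gensim KeyedVectors as w2vec (the file name is illustrative):

mat, labels, max_len, n = fileToMat('train.tsv', w2vec, 0, set(), train=True)
print(n, max_len, len(labels))  # rows read, longest token sequence, distinct relations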
Example #10
def extract_entity_relation_sent_ko(text):
    from konlpy.tag import Kkma
    kkma = Kkma()
    try:
        postags = kkma.pos(text)
    except Exception as e:
        print(e)
        return None

    head_entity = ''
    tail_entity = ''
    relation = ''
    head_type = 'KO'  #temporary to represent korean entity
    tail_type = 'KO'

    for tag in postags:
        if tag[1][0] == 'N' and head_entity == '':
            head_entity = tag[0]
        elif tag[1][0] == 'V' and relation == '':
            relation = tag[0]
        elif tag[1][0] == 'N' and tail_entity == '' and relation != '':
            tail_entity = tag[0]

    if head_entity != '' and tail_entity != '' and relation != '':
        return (head_entity, relation, tail_entity, head_type, tail_type)
    else:
        return None
Example #11
def word_end_check(txt):

    kkma = Kkma()
    # extract morphemes and their tags
    pos = kkma.pos(txt)
    # count frequencies into a dict
    count = Counter(pos)

    word_a = 0
    word_b = 0
    for i in count.keys():
        # count interrogative / propositive endings
        if i[1] in ('EFQ', 'EFA'):
            word_a += count[i]
        # count declarative / honorific endings
        elif i[1] in ('EFN', 'EFR'):
            word_b += count[i]

    total = word_a + word_b
    if total == 0:  # avoid division by zero when no sentence endings were found
        return {'formal_speak': 0.0, 'question_speak': 0.0}
    # rate of engagement-inducing speech
    rate1 = word_a / total * 100
    # rate of formal speech
    rate2 = word_b / total * 100

    return {'formal_speak': round(rate2, 2), 'question_speak': round(rate1, 2)}
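
An illustrative call (the exact percentages depend on how Kkma tags the sentence endings):

print(word_end_check('오늘 뭐 먹을까요? 같이 가요. 네, 알겠습니다.'))
# e.g. {'formal_speak': 33.33, 'question_speak': 66.67}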
Example #12
def count_seen_headwords(lines, predicate="VV", headword='NNG'):
    """
    Given a corpus, find the predicate (verb, VV) and headword (noun, NNG)
    in each sentence to build the Seen_R(w) function.
    """

    tagger = Kkma()
    seen_dict = {}

    for line in lines:
        pos_result = tagger.pos(line)

        word_h, word_p = None, None
        for word, pos in pos_result:
            if pos == predicate or pos[:3] == predicate + '+':
                word_p = word
                break

            if pos == headword:
                word_h = word

        # record the (predicate, headword) pair once per line; in the original
        # this block sat inside the inner loop after the break (so it never ran)
        # and the return was inside the outer loop
        if word_h is not None and word_p is not None:
            seen_dict[word_p] = [word_h] + seen_dict.get(word_p, [])

    return seen_dict
Example #13
def get_nouns(text, isPositive, keyword):
    positive_nouns = []
    negative_nouns = []
    nouns = []

    spliter = Kkma()
    isnouns = ['NNG', 'NNP']
    tags = spliter.pos(text)
    for word, tag in tags:
        if tag in isnouns and len(word) > 1 and word not in keyword:
            nouns.append(word)
            if isPositive == 1:     # positive
                positive_nouns.append(word)
            elif isPositive == -1:  # negative
                negative_nouns.append(word)

    return nouns, positive_nouns, negative_nouns
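
A quick illustrative call (the returned lists depend on Kkma's tagging; the keyword filter drops '서비스' here):

nouns, pos_nouns, neg_nouns = get_nouns('서비스는 좋은데 주차장이 불편해요', 1, ['서비스'])
print(nouns, pos_nouns)  # nouns of 2+ characters, mirrored into the positive list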
Example #14
def input_to_morphemes(sentence) :
	kkma = Kkma()
	# stopwords
	except_word_list = []
	# allowed morpheme types >> VA adjective / VV verb / OL foreign word / OH hanja / presumed-noun categories / AUTOPLUS
	types = ['NNG', 'NNP', 'NNB', 'NNM', 'NP', 'VA', 'UN', 'AP']
	# remove special characters
	sentence = remove_sc(str(sentence))
	# split into words (eojeol) > list; | and + are literal inside a character class, so they were dropped
	origin_word_list = list(dict.fromkeys(regex.findall(r'[\p{Hangul}\p{Latin}\p{Han}\d]+', sentence)))
	print(origin_word_list)
	results = []
	morphemes = []
	# run morphological analysis per word
	for origin_word in origin_word_list :
		if origin_word not in except_word_list :
			for morpheme in kkma.pos(origin_word) :
				results.append([origin_word, morpheme])
				# keep only allowed types & words of 2+ characters
				if (morpheme[1] in types) and (len(morpheme[0]) > 1):
					morphemes.append(morpheme)
	print(morphemes)
	return morphemes
Example #15
def get_morphemes(news_title_list):
    kkma = Kkma()
    except_word_list = []
    except_keyword_list = []
    in_result_data = []

    print(f'Starting morphological analysis of {len(news_title_list)} news titles!')
    for idx, news_title in enumerate(news_title_list):
        # | and + are literal inside a character class, so they were dropped from the pattern
        news_title = regex.findall(r'[\p{Hangul}\p{Latin}\p{Han}\d]+',
                                   f'{news_title}')
        for word in news_title:
            in_result_word = []
            group = []
            if word not in except_word_list:
                group.append([word])
                for keyword in kkma.pos(word):
                    if keyword not in except_keyword_list:
                        in_result_word.append(keyword)
                group.append(in_result_word)
                in_result_data.append(group)
        print(f'[{idx} // {len(news_title_list)}] title processed')
    return in_result_data
Example #16
def kkmaFreqToDataFrame(text):
    kkma = Kkma()

    # collect nouns from the document
    nouns = list()
    # kkma.nouns(text) exists, but its results are poor here, so filter pos() output instead
    pos = kkma.pos(text)
    for keyword, tag in pos:
        if tag == "NNG" or tag == "NNP":
            nouns.append(keyword)

    # count each noun
    count = Counter(nouns)
    tag_count = []
    tags = []

    # append the most frequent nouns, in order, to tags and tag_count
    for n, c in count.most_common(100):
        dics = {'tag': n, 'count': c}
        # keep tags of 2+ characters, up to 50 tags
        if len(dics['tag']) >= 2 and len(tags) <= 49:
            tag_count.append(dics['count'])
            tags.append(dics['tag'])

    # debug which text was split into morphemes
    joined_text = " ".join(tags)
    print("Morphemes: ", joined_text)
    print("Detected languages: ", detect_langs(joined_text))

    # convert the tokenized text into a DataFrame
    return pd.DataFrame({"Word": list(tags), "Frequency": list(tag_count)})
Example #17
def build_vocab(example_dict_pkl, word_threshold=5):
    tokenizer = Kkma()
    word_counter, verb_counter = Counter(), Counter()
    example_dict = dict()
    with open(example_dict_pkl, 'rb') as f:
        example_dict = pickle.load(f)

    for word, example in example_dict.items():
        sentence = example[0] + word + example[1]
        verb_counter[word] += 1
        for x in tokenizer.pos(sentence):
            if x[0] in example_dict.keys():
                verb_counter[x[0]] += 1
            else:
                word_counter[x[0]] += 1

    # build word vocabulary
    word_vocab = [PAD] + [word for word, count in word_counter.most_common() if count >= word_threshold] + [UNK]
    verb_vocab = [word for word, count in verb_counter.most_common()]

    # save to file
    with open("vocabulary/word_vocab.txt", "w", encoding="utf-8") as f:
        f.write("\n".join(word_vocab))
    with open("vocabulary/verb_vocab.txt", "w", encoding="utf-8") as f:
        f.write("\n".join(verb_vocab))
    with open("vocabulary/verb_count.txt", "w", encoding="utf-8") as f:
        f.write("\n".join(["{}\t{}".format(word, count) for word, count in verb_counter.most_common()]))
Example #18
def tag_pos(sentences, tagger='kkma'):
    """
    Predict Part-of-Speech tags of input sentences
    :param sentences: list of input sentences
    :param tagger: PoS tagger to use, 'kkma' or 'mecab'
    :return: tagged sentences
    """
    if tagger == 'kkma':
        kkma = Kkma()
    elif tagger == 'mecab':
        mecab = Mecab()

    morph_lists = []
    for sent in sentences:
        morph_list = []
        if tagger == 'kkma':
            pos_tagged_sentences = kkma.pos(sent)
        elif tagger == 'mecab':
            pos_tagged_sentences = mecab.pos(sent)

        for (key, value) in pos_tagged_sentences:
            value = transform_pos(value, tagger)
            morph_list.append([key, value])
        morph_lists.append(morph_list)

    return morph_lists
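
A sketch of a call (transform_pos is assumed to be defined elsewhere in the same module; the sentence is illustrative):

morphs = tag_pos(['아버지가 방에 들어가신다.'], tagger='kkma')
print(morphs[0])  # [[morpheme, mapped_tag], ...] for the first sentence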
Example #19
    def showmorp(self):
        okja = self.text

        print("Sentence to analyze: ", okja)

        # Kkma.pos was called on the class itself; an instance is required
        morph = Kkma().pos(okja)
        print(morph)

        return morph
Example #20
    def parsing(self):
        kkma = Kkma()
        tree = kkma.pos(self.text)
        '''
        grammar = """
        NP: {<N.*>*<Suffix>?}   # Noun phrase
        VP: {<V.*>*}            # Verb phrase
        AP: {<A.*>*}            # Adjective phrase
        """
        parser = nltk.RegexpParser(grammar)
        chunks = parser.parse(tree)
        chunks.draw()
        '''
        print(tree)

        # print every particle (J), ending (E) and modifier (M) morpheme
        for k in range(len(tree)):
            if tree[k][1][0] in ('J', 'E', 'M'):
                print(tree[k], end=" ")
Example #21
def parse(file):
    result_list = []
    kkma = Kkma()                                       # use KoNLPy's Kkma morphological analyzer

    text_file = codecs.open(                            # load the text file as UTF-8
        folder_name + '/' + file,
        'r',
        encoding='utf8'
    )
    file_sentence_list = text_file.readlines()          # read the file row by row

    for sentence in file_sentence_list:                 # process each row
        sentence_word_list = []                         # per-row word list

        morphemes = kkma.pos(sentence)                  # extract (word, POS) pairs from the row
        for word_set in morphemes:
            word = word_set[0]                          # split into word and POS tag
            tag = word_set[1]

            if tag in tag_list:                         # keep only words with whitelisted POS tags
                if word in word_list:                   # keep only whitelisted words
                    if tag == 'VV' or tag == 'VA':      # for verbs/adjectives, append '다' to form the citation form
                        word += '다'
                    word = replace_list.get(word, word) # substitute certain words via the mapping
                    sentence_word_list.append(word)     # store the word in the per-row list

        result_list.append(sentence_word_list)          # append the per-row list to the result

    return result_list                                  # return the result
Example #22
def kkma_analysis_at_once_for_itdaily(corpus_path, raw_and_tagged_directory, tagged_only_directory):
    kkma = Kkma()
    list_of_files = os.listdir(corpus_path)
    for entry in sorted(list_of_files):
        number = entry.split("-")[0]
        if 800 < int(number) < 1901:
            if fnmatch.fnmatch(entry, "*.txt"):
                list_of_lines = get_lines_utf8(corpus_path + "/" + entry)
                raw_and_tagged_path = raw_and_tagged_directory + "/" + number + ".txt"
                tagged_only_path = tagged_only_directory + "/" + number + ".txt"
                raw_and_tagged_file = open(raw_and_tagged_path, "a", encoding="utf-8")
                tagged_only_file = open(tagged_only_path, "a", encoding="utf-8")
                for line in list_of_lines:
                    if len(line) > 0:
                        raw_and_tagged_file.write(line + "\t")
                        try:
                            analyzed_morphemes = kkma.pos(line)
                            for idx, pair in enumerate(analyzed_morphemes):
                                if idx == len(analyzed_morphemes) - 1:
                                    raw_and_tagged_file.write(pair[0] + "/" + pair[1])
                                    tagged_only_file.write(pair[0] + "/" + pair[1])
                                else:
                                    raw_and_tagged_file.write(pair[0] + "/" + pair[1] + " ")
                                    tagged_only_file.write(pair[0] + "/" + pair[1] + " ")
                            raw_and_tagged_file.write("\n")
                            tagged_only_file.write("\n")
                        except Exception:
                            continue
                raw_and_tagged_file.close()
                tagged_only_file.close()
Example #23
    def kkma_pos(self):
        kkma = Kkma()
        try:
            self.tree = kkma.pos(self.text)
        except Exception:
            if self.tree is None:
                self.tree = []
Example #24
def konlpy():

    # value = request.form['konlpy_tag']
    value = request.args.get('search')

    kkma = Kkma()
    
    a = kkma.pos(value)
    noun = kkma.nouns(value)
    
    word = []
    pos = []

    for i in a:
        # print(i[0] + ',' + i[1])
        word.append(i[0])
        pos.append(i[1])

    print(noun)
    print(word)
    print(pos)
   
    result =  {'word': word , 'pos': pos , 'noun': noun }
    print(result)

    return result
Example #25
def analyseText(text_data, results_file_name="results.txt"):
    print("\n---------------------------------------------")
    print("Step 2: analyzing morphemes and word frequencies... please wait")
    kkma = Kkma()
    data_pos = kkma.pos(text_data)
    data_arr = []
    print("Filtering nouns only...")
    for word_pos in data_pos:
        word = word_pos[0]
        pos = word_pos[1]
        if pos == "NNG":  # nouns only; to include adjectives as well, add: or pos == "VA"
            data_arr.append(word)

    print("Sorting word frequencies and saving to file...")
    counter = Counter(data_arr).most_common()
    keywords_and_frequency = {}
    results_file = open(results_file_name, "w", encoding="utf-8")

    print("Filtering words of 2+ characters with frequency above 2...")
    for keyword in counter:
        word = keyword[0]
        freq = keyword[1]
        if len(word) > 1 and freq > 2:  # keep words of 2+ characters with frequency above 2
            keywords_and_frequency[word] = freq
            this_text = word + " : " + str(freq) + "건\n"
            results_file.write(this_text)

    results_file.close()
    print("Morpheme and frequency analysis done!")
    return keywords_and_frequency
Example #26
def parse(file):
    result_list = []
    kkma = Kkma()  # use KoNLPy's Kkma morphological analyzer
    with open(folder_name + '/' + file, encoding='utf-8') as f:  # open the file passed as argument
        file_text = f.read()  # read the file's text
    sentences = kkma.sentences(file_text)  # extract the sentences in the text as a list

    for sentence in sentences:  # process each sentence
        sentence_list = []  # per-sentence word list

        morphemes = kkma.pos(sentence)  # extract (word, POS) pairs from the sentence
        for word_set in morphemes:
            word = word_set[0]  # split into word and POS tag
            tag = word_set[1]
            if tag in tag_list:  # keep only words with whitelisted POS tags
                if word in word_list:  # keep only whitelisted words
                    if tag == 'VV' or tag == 'VA':  # for verbs/adjectives, append '다' to form the citation form
                        word += '다'
                    word = replace_list.get(word,
                                            word)  # substitute certain words via the mapping
                    sentence_list.append(word)  # store the word in the per-sentence list

        result_list.append(sentence_list)  # append the per-sentence list to the result

    return result_list  # return the result
Example #27
def making_morpho(inputstr='', tag_filter=[u'NNG', u'NNP', u'NNB']):
    """How this function works
    0. Params: inputstr (string), tag_filter (morpheme tags to keep)
    1. Purpose: morphologically analyze the input string and return a tuple of strings:
            (1) the full analysis result, (2) only the words whose tag is in the filter
    2. Source: textanalysis.py
    3. Modified: 2016-11-30
    4. Known issues: -
    5. Example:

    txt = u"둘째, 나노기술을 체계적으로 발전시켜 나가겠습니다. \
            세계의 기술전쟁은 원자와 분자 수준으로 진입했습니다. \
            사람의 눈으로 볼 수 없는 극미세 물질의 세계로 들어섰습니다. \
            올해 내에 나노기술 종합 발전계획을 발표하겠습니다."

    a, b = making_morpho(txt)
    print(a)
    print("*" * 30)
    print(b)

    """
    kkma = Kkma()
    morpho = kkma.pos(inputstr)
    morpho_str = ''
    morpho2 = ''
    for each in morpho:
        morpho_str = morpho_str + '|' + each[0] + '-' + each[1]
        if each[1] in tag_filter:
            morpho2 = morpho2 + '|' + each[0]

    return morpho_str, morpho2
Example #29
def get_noun(msg_txt):
    kkma = Kkma()
    nouns = list()
    try:
        # keep Hangul only
        hanguleng = re.compile('[^ ㄱ-ㅣ가-힣]+')
        msg_txt = hanguleng.sub('', msg_txt)
        # filter out ㅋㅋ, ㅠㅠ, ㅎㅎ and the like
        pattern = re.compile("[ㄱ-ㅎㅏ-ㅣ]+")
        msg_txt = re.sub(pattern, '', msg_txt).strip()
    except Exception:
        print('msg_txt strip error')
    try:
        if len(msg_txt) > 0:
            pos = kkma.pos(msg_txt)
            for keyword, tag in pos:
                # common or proper nouns
                if tag == "NNG" or tag == "NNP":
                    nouns.append(keyword)
    except Exception:
        print("get_noun Error!")
        print(msg_txt)

    return nouns
Example #30
def get_keywords(chat_list):
    kkma = Kkma()
    except_word_list = []
    except_keyword_list = []
    in_result_data = []

    print('Starting morphological analysis!')
    for idx in range(len(chat_list)):
        # escape commas and normalize the various quote characters
        replace_chat_message = re.sub(
            '\,', '&#44;',
            re.sub('[\"\'‘“”″′]', '&#8220;',
                   str(chat_list[idx].get('message'))))
        result_message = regex.findall(r'[\p{Hangul}\p{Latin}\p{Han}\d]+',
                                       replace_chat_message)

        for word in result_message:
            in_result_word = []
            group = []
            if word not in except_word_list:
                group.append([word])
                for keyword in kkma.pos(word):
                    if keyword not in except_keyword_list:
                        in_result_word.append(keyword)
                group.append(in_result_word)
                in_result_data.append(group)
    return in_result_data
Example #31
def get_near_keyword(q_list, cursor):
    kkma = Kkma()
    # list of words to exclude
    global except_word_list
    fin_result = []

    for idx, q in enumerate(q_list):
        question = [q[0], remove_html(q[1])]
        sentence = re.sub('[-=.#/?:$}\"\']', '',
                          str(question[1])).replace('[', '').replace(']', '')
        origin_word_list = list(
            dict.fromkeys(
                regex.findall(r'[\p{Hangul}\p{Latin}\p{Han}\d]+', sentence)))
        # morphological analysis: keep nouns (NNG/NNP/NNB/NNM) of 2+ characters
        out_result = [question[0]]
        result = []
        for origin_word in origin_word_list:
            if origin_word not in except_word_list:
                for morpheme in kkma.pos(origin_word):
                    if len(morpheme[0]) > 1 and morpheme[1] in ('NNG', 'NNP', 'NNB', 'NNM'):
                        result.append([morpheme[0]])
        out_result.append(result)
        fin_result.append(out_result)

    for result in fin_result:
        qna_no = result[0]
        print(result)
        for morpheme in result[1]:
            morpheme_keyword = morpheme[0]

            try:
                # parameterized query instead of f-string interpolation (assumes a DB-API
                # driver using %s placeholders, e.g. MySQL) to avoid SQL injection
                cursor.execute("""
					SELECT 
						TARGET_MORPHEME_WORD 
					FROM 
						TBL_CCQ_KEYWORD_MAP 
					WHERE 
						SOURCE_WORD = %s
					ORDER BY 
						WORD_DISTANCE DESC 
					LIMIT 1
				""", (morpheme_keyword,))
                keyword = cursor.fetchall()
                if len(keyword) > 0:
                    print(f'related word for "{morpheme_keyword}" > "{keyword[0][0]}"')
            except Exception as e:
                print(f'****** error! >> {e} >>>>> SELECT failed!')
                continue
Example #32
    def _kkma_parse(self, str_arr, tag_combine=True):
        """
        Run Kkma POS tagging over each string and flatten the results.
        :param str_arr: iterable of strings to tag
        :param tag_combine: passed through to self._flat
        :return: flat list of tagged morphemes
        """
        kkma = Kkma()
        return_arr = []
        for data in str_arr:
            return_arr += self._flat(kkma.pos(str(data)), tag_combine=tag_combine)
        return return_arr
Example #33
class AnalysisDiction:
    """
    This class analyzes Korean texts using the kkma and twitter dictionaries
    """
    def __init__(self, on_kkma=False, on_twitter=False):    # maybe move to init of analysis_app
        """
        Allocate kkma and/or twitter dictionary instances
        :param on_kkma: if True, create a Kkma instance
        :param on_twitter: if True, create a Twitter instance
        """
        if on_kkma is True:
            self.kkma = Kkma()
        if on_twitter is True:
            self.twitter = Twitter()

    def analyzer_kkma(self, string_data, mode):
        """
        This method wraps kkma. It acts differently depending on its mode.
        :param string_data: string data for analysis
        :param mode: analysis mode ('morphs', 'nouns' or 'pos')
        :return: the analysis result, or False for an unknown mode
        ref: http://konlpy.org/ko/v0.4.4/api/konlpy.tag/#module-konlpy.tag._kkma
        """
        # `mode is 'pos'` compared identity, not equality; use == for strings
        if mode == 'morphs':
            return self.kkma.morphs(string_data)
        elif mode == 'nouns':
            return self.kkma.nouns(string_data)
        elif mode == 'pos':
            return self.kkma.pos(string_data)
        else:
            return False

    def analyzer_twitter(self, string_data, mode):
        """
        This method wraps twitter. It acts differently depending on its mode.
        :param string_data: string data for analysis
        :param mode: analysis mode ('morphs', 'nouns', 'pos' or 'posmore')
        :return: the analysis result, or False for an unknown mode
        ref: http://konlpy.org/ko/v0.4.4/api/konlpy.tag/#module-konlpy.tag._twitter
        """
        if mode == 'morphs':
            return self.twitter.morphs(string_data)
        elif mode == 'nouns':
            return self.twitter.nouns(string_data)
        elif mode == 'pos':
            return self.twitter.pos(string_data)
        elif mode == 'posmore':
            return self.twitter.pos(string_data, True, True)
        else:
            return False
Example #34
def get_vs(line):
    korean_analyzer = Kkma()
    return [word for word, tag in korean_analyzer.pos(line) if tag in ['VA', 'VX', 'VC']]
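
An illustrative call (output depends on Kkma's tagging; note that a fresh Kkma instance is built per line, which is slow):

print(get_vs('하늘이 높고 푸르다'))  # e.g. ['높', '푸르'] when tagged as VA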
Example #35
from konlpy.tag import Kkma
from konlpy.utils import pprint
from openpyxl import load_workbook, Workbook

dic = {}
kkma = Kkma()
wb = load_workbook(filename='menulist.xlsx', read_only=True)
ws = wb['Sheet1']
for i in range(1, 5897):
    for l, k in kkma.pos(ws['A' + str(i)].value):
        # dict.get avoids the original off-by-one where the first occurrence counted as 0
        dic[l] = dic.get(l, 0) + 1

wb = Workbook()

dest_filename = "determine.xlsx"

ws1 = wb.active
ws1.title = "result"
num = 1
for l, k in dic.items():
    ws1['A' + str(num)] = l
    ws1['B' + str(num)] = k
    num += 1

wb.save(filename=dest_filename)
Example #36
from konlpy.tag import Kkma
from konlpy.utils import pprint

kkma = Kkma()

pprint(kkma.sentences(u'네, 안녕하세요. 의류매장 입니다'))
pprint(kkma.nouns(u'구입하실 물건 있으시면 말씀해주세요.'))
pprint(kkma.pos(u'하하하 즐거운 쇼핑입니다.'))
Example #37
#!/usr/bin/python
# vim: set fileencoding=utf8 :
from konlpy.tag import Kkma
from konlpy.utils import pprint
from convert import convert
import fileinput

kkma = Kkma()
pprint(kkma.sentences(u'네, 안녕하세요. 반갑습니다.'))
#poss = kkma.pos(u'(오류보고는) "실행환경", 에러메세지와함께 설명을 최대한상세히!^^')


for line in fileinput.input():
    poss = kkma.pos(line)  # Python 3: fileinput already yields str, no unicode() needed
    for tup in poss:
        print(tup[0], convert(tup[1]))
Example #38
kkma = Kkma()

stopWord_Ingre = {"재료" , "계량법" , "안내" , "조금"}


# In[113]:


mystr = getString("/home/gwangjik/문서/hanyang corps/데이터/만개의레시피/Text/text_recipe10000_6879000_6880000")
mystr += getString("/home/gwangjik/문서/hanyang corps/데이터/만개의레시피/Text/text_recipe10000_6870000_6871000")


# In[ ]:


tokenized = kkma.pos(mystr)


# In[ ]:


# the original compared the whole (word, tag) tuple to "NNG"/"NNB" and mixed and/or precedence
token_filtered = list(filter(lambda mytoken: mytoken[1] in ("NNG", "NNB") and mytoken[0] not in stopWord_Ingre, tokenized))


# In[ ]:


# note: Word2Vec expects a list of token lists; token_filtered still holds (word, tag) tuples
embedding_model = Word2Vec(token_filtered, size=10, window=3, min_count=0, workers=3, iter=10, sg=1)


# In[ ]:
Example #39
from threading import Thread

from konlpy.corpus import kolaw
from konlpy.tag import Kkma

def do_concurrent_tagging(start, end, lines, result):
    import jpype
    jpype.attachThreadToJVM()  # attach this worker thread to the JVM before tagging
    l = [k.pos(lines[i]) for i in range(start, end)]
    result.append(l)
    return

if __name__=="__main__":
    import time

    print('Number of lines in document:')
    k = Kkma()
    lines = kolaw.open('constitution.txt').read().splitlines()
    nlines = len(lines)
    print(nlines)

    print('Batch tagging:')
    s = time.perf_counter()  # time.clock() was removed in Python 3.8
    result = []
    l = [k.pos(line) for line in lines]
    result.append(l)
    t = time.perf_counter()
    print(t - s)

    print('Concurrent tagging:')
    result = []
    t1 = Thread(target=do_concurrent_tagging, args=(0, int(nlines/2), lines, result))
    t2 = Thread(target=do_concurrent_tagging, args=(int(nlines/2), nlines, lines, result))
    t1.start(); t2.start()
    t1.join(); t2.join()

    m = sum(result, []) # Merge results
    print(time.perf_counter() - t)
Example #40
__author__ = 'woojin'
# -*- coding: utf-8 -*-

from konlpy.tag import Kkma
from konlpy.utils import pprint

kkma = Kkma()
pprint(kkma.sentences('네, 안녕하세요. 반갑습니다.'))
pprint(kkma.nouns('질문이나 건의사항은 깃허브 이슈 트래커에 남겨주세요.'))
pprint(kkma.pos('오류보고는 실행환경, 에러메시지와 함께 설명을 최대한 상세히!!^^'))
Example #41
def Training():
    for article in article_list:
        # title = article[0]
        # link = article[1]
        # newspaper = article[2]
        kkma = Kkma()  # note: constructing Kkma once, outside the loop, would be much faster

        try:
            content, issueDateTime = NateNews.get_content(article['link'])
            issueDateTime = pd.to_datetime(issueDateTime)
            # issueDate = time.strftime('%Y-%m-%d', issueDateTime)
            # issueTime = time.strftime('%H:%M:%S', issueDateTime)
            issueDate = issueDateTime.date()
            issueTime = issueDateTime.time()

            # morphological analysis
            # wordList = kkma.pos(content)

            # filter for [common nouns, verbs, adjectives, auxiliary verbs, presumed nouns]


            # print(title)
            # print('wordList : ', wordList)
            # print(issueDateTime)
            # print(link)
            # print(newspaper)
            # print(issueDate)
            # print('wordList : ', wordList)
            wordList = list(getWords(kkma.pos(content)))

            ws = set(wordList)
            print('ws : ', ws)
            dic = {}
            for word in ws:
                print('word : ', word)
                dic.update({word: wordList.count(word)})

            print('dic : ', dic)
            n = 10
            listdic = sorted(dic.items(), key=operator.itemgetter(1), reverse=True)[:n]
            print('listdic : ', listdic)

            for l in listdic:
                print('l : ', l)
                wordList.append(l[0])

            baseDate = ''
            if issueTime > pd.to_datetime('15:30:00').time():
                # after market close
                baseDate = stockDF[stockDF['datetime'] > issueDate].head(1)['datetime']
            else:
                # before market close
                baseDate = stockDF[stockDF['datetime'] >= issueDate].head(1)['datetime']
            print('issueTime : ', issueTime)
            print('baseDate : ', baseDate)
            # print(type(baseDate))
            if issueDate > pd.to_datetime(testSetFromDate).date() or len(baseDate) == 0:
                # test set
                testEntry.append({'issueDateTime': issueDateTime, 'wordList': wordList})
            else:
                # training set
                baseDate = pd.Series(baseDate).values[0]
                # print('stock row for that date: ', baseDate)
                trainingSet.append({'issueDateTime': issueDateTime, 'wordList': wordList})
                print(trainingSet)
                # print(int(stockDF[stockDF['날짜'] == baseDate]['종가']))
                # print(int(stockDF[stockDF['날짜'] < baseDate].tail(1)['종가']))

                todayPrice = int(stockDF[stockDF['datetime'] == baseDate]['close'])
                prevPrice = int(stockDF[stockDF['datetime'] < baseDate].tail(1)['close'])
                if todayPrice > prevPrice:
                    # up
                    classList.append(1)
                else:
                    # down or hold
                    classList.append(0)
        except Exception:
            pass