from konlpy.tag import Kkma
from gensim.models import FastText  # assumed: the .wv API used below matches gensim 3.x's FastText


class TagParser:
    def __init__(self):
        self.kkma = Kkma()
        # public gensim loader instead of the private fasttext._load_fasttext_format
        self.model = FastText.load_fasttext_format("../fasttext.bin")

    def parse_tag(self, sentence):
        '''Parse tags out of a sentence.'''
        posed_sentence = self.kkma.pos(sentence)  # split the sentence into morphemes
        xr_list = []
        nng_va_list = []    # VA + NNG
        nng_va_concat = []  # VA and NNG pairs joined together

        # Collect only XR (root) morphemes
        for index, morpheme in enumerate(posed_sentence):
            if morpheme[1] == 'XR':
                xr_list.append(morpheme[0])

        # Keep only VA (adjective) and NNG (common noun) morphemes
        for index, morpheme in enumerate(posed_sentence):
            if morpheme[1] == 'NNG' or morpheme[1] == 'VA':
                nng_va_list.append(morpheme)

        # If the item right before a VA is an NNG, store the joined pair in a
        # separate list (e.g. 넓음); otherwise skip
        for index, morpheme in enumerate(nng_va_list):
            if nng_va_list[index][1] == 'VA':
                if index == 0:
                    continue
                elif nng_va_list[index - 1][1] == 'NNG':
                    nng_va_concat.append(nng_va_list[index - 1][0] + " " + nng_va_list[index][0] + "음")

        # Merge XR[] and VA[]
        return xr_list + nng_va_concat

    def pos(self, sentence):
        '''Tokenize a sentence into morphemes.'''
        return self.kkma.pos(sentence)

    def most_sim(self, word):
        '''Return the words most similar to the given word.'''
        return self.model.wv.most_similar(word)

    def wmd(self, tag1, tag2):
        '''Return the distance between tag1 and tag2.'''
        return self.model.wv.wmdistance(tag1, tag2)
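A minimal usage sketch for TagParser above, assuming ../fasttext.bin exists and konlpy/gensim are installed; the sample sentence and printed values are illustrative only:

parser = TagParser()
print(parser.parse_tag(u'방이 넓고 깨끗해요'))   # XR roots plus "NNG VA음" pairs, e.g. ['깨끗', '방 넓음']
print(parser.wmd([u'깨끗'], [u'청결']))          # gensim's wmdistance expects token lists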
class UsecaseFinder:
    def __init__(self):
        self.keyword = keyword  # the keyword module is assumed to be imported at module level
        self.kkma = Kkma()
        self.kkma.pos(u'성능을 위한 더미 데이터')  # dummy call to warm up the tagger
        self.usecase = []
        self.parsingNVLIST = []
        self.parsingLIST = []
        self.dic = {}

    def getNV(self, sentence):
        self.parsingNVLIST = []
        self.parsingLIST = []
        data = self.kkma.pos(sentence)
        # print(data)
        for token in data:
            if 'N' in token[1] or token[1].count('V') > 1 or 'C' in token[1] or 'JKM' in token[1]:
                self.parsingNVLIST.append(token[0])
                self.parsingLIST.append(token)
        print("\nkkma_LOG %s" % self.parsingNVLIST)

    def setUserSetting(self):
        self.setUsecase("feed", ["먹이", "사료", "먹을 것", "밥", "배식"], ["주", "줘", "급여", "배식", "먹"], util.GRAVITY_N)
        self.setUsecase("open", ["문", "입구"], ["열", "오픈", "개방"], util.GRAVITY_N)
        self.setUsecase("camera", ["사진", "상황", "모습", "얼굴", "현황"], ["보여주", "찍", "알리"], util.GRAVITY_ALL)
        self.setUsecase("regist", ["등록"], ["등록"], util.GRAVITY_ANY)
        self.setUsecase("howToUse", ["사용법", "도우미", "도움말"], ["사용법", "도우미", "도움말"], util.GRAVITY_ANY)

    def printList(self):
        for item in self.usecase:
            item._print()

    def analyzeSentence(self, sentence):
        request = []
        self.getNV(sentence)
        for item in self.usecase:
            if item.isMe(self.parsingNVLIST, self.parsingLIST):
                request.append(item.keyWord)
        return request

    def setUsecase(self, name, nList, vList, gravity):
        self.usecase.append(self.keyword.KeyWord(name))
        keyword = self.usecase[len(self.usecase) - 1]
        keyword.setNouns(nList)
        keyword.setGravity(gravity)
        keyword.setVerbs(vList)
def run_kkma():
    kkma = Kkma()
    start_time = time.time()
    print('kkma start')
    kkma_morphs = kkma.morphs(news1)
    kkma_nouns = kkma.nouns(news1)
    kkma_pos = kkma.pos(news1)
    end_time = time.time()
    print('kkma done - %s s' % str(end_time - start_time))
    kkma_sentences = kkma.sentences(news1)
    with open('kkma.txt', 'w', encoding='utf-8') as fstream:
        fstream.write('kkma time : %s s\n' % str(end_time - start_time))
        fstream.write('kkma_morphs\n')
        write_list(kkma_morphs, fstream)
        fstream.write('\n\n')
        fstream.write('kkma_nouns\n')
        write_list(kkma_nouns, fstream)
        fstream.write('\n\n')
        fstream.write('kkma_pos\n')
        write_pos(kkma_pos, fstream)
        fstream.write('\n\n')
        fstream.write('kkma_sentences\n')
        write_list(kkma_sentences, fstream)
        fstream.write('\n')
def tagPOS(filename):
    try:
        # Read file as UTF-8
        with open(filename, 'r', encoding='utf-8') as f:
            text = f.read()

        # Tagging
        from konlpy.tag import Kkma
        kkma = Kkma()
        print('now tagging...')
        tagged = kkma.pos(text)

        # Write tagged file
        (path, fnameExt) = os.path.split(filename)
        (fname, fext) = os.path.splitext(fnameExt)
        tagged_file = fname + '_' + 'tagged' + fext
        with open(tagged_file, 'w', encoding='utf-8') as fw:
            for line in tagged:
                fw.write("\t".join(x for x in line) + "\n")
        print('%s is created' % tagged_file)
    except Exception:
        print('\nERROR:')
        print('"%s" is not a valid text\nCheck your text file\nor file path\n' % filename)
        sys.exit(1)
def get_kkma_token(self):
    kkma = Kkma()
    self.kkmaTag = kkma.pos(self.doc)
    for i in self.kkmaTag:
        if i[1] == 'NNG':
            self.kkma_tokens.append(i[0])
def get_morphemes(post_detail_cont):
    kkma = Kkma()
    results = []
    except_word_list = []

    # Strip special characters
    detail = remove_sc(str(post_detail_cont))

    # Split into words (eojeol) > list
    origin_word_list = list(
        dict.fromkeys(
            regex.findall(r'[\p{Hangul}|\p{Latin}|\p{Han}|\d+]+', f'{detail}')))
    if len(origin_word_list) > 100:
        del origin_word_list[100:len(origin_word_list)]
    # origin_word_list = ['안녕하세요', '문의사항이', '있습니다']

    for origin_word in origin_word_list:
        if origin_word not in except_word_list:
            for morpheme in kkma.pos(origin_word):
                in_result = []
                in_result.append(origin_word)
                in_result.append(morpheme)
                results.append(in_result)
    # results = [['자동차', ('자동차', 'NNG')], ['고장', ('고장', 'NNG')], ['진단', ('진단', 'NNG')],
    #            ['APP', ('APP', 'OL')], ['개발', ('개발', 'NNG')], ['데이터', ('데이터', 'NNG')], ['수집', ('수집', 'NNG')]]
    return results
class konlpy_test:
    def __init__(self):
        self.kkma = Kkma()

    def split_sen(self, text):
        words = [x[0] for x in self.kkma.pos(text) if x[1] == 'NNG']
        return words
def run(self):
    print('KonLpy Start')
    fileName = os.path.basename(self.path)
    compFilePath = os.path.join(self.FILE_PATH, fileName)
    kkma = Kkma()
    with open(self.path, 'r', encoding='utf-8') as readFile:
        fileTextList = readFile.readlines()
    with open(compFilePath, 'w', encoding='utf-8') as comFile:
        for fileText in fileTextList:
            if fileText is None or fileText.strip() == '':
                continue
            result = None
            try:
                result = kkma.pos(fileText)
            except Exception as e:
                print(e)
            if result is None:  # skip lines the tagger failed on instead of reusing a stale result
                continue
            for resultTuple in result:
                if resultTuple[1] in self.KKMA_TAG_DICT:
                    comFile.write('%s : %s [%s]\n' % (resultTuple[0], resultTuple[1], self.KKMA_TAG_DICT[resultTuple[1]]))
                else:
                    comFile.write('%s : %s [%s]\n' % (resultTuple[0], resultTuple[1], 'Unknown'))
def fileToMat(filename, w2vec, maxLen, label_set, train=True):
    kkma = Kkma()
    train_f = open(filename, 'r', encoding='utf-8')
    mat = []
    line_num = 0
    for line in train_f.read().splitlines():
        sen = {}
        line_splitted = line.split('\t')
        sbj = line_splitted[0].strip()
        obj = line_splitted[1].strip()
        relation = line_splitted[2].strip()
        sentence = line_splitted[3].strip()
        sentence_complete = re.sub('<< _obj_ >>', obj, re.sub('<< _sbj_ >>', sbj, sentence))
        sentence_complete = utils.clean_str(sentence_complete)
        tokens = [
            p[0] + '/' + p[1] for p in kkma.pos(sentence_complete)
            if p[0] + '/' + p[1] in w2vec.vocab
        ]
        if maxLen < len(tokens):
            if train:
                maxLen = len(tokens)
            else:
                tokens = tokens[:maxLen]
        label_set.add(relation)
        sen['sbj'] = sbj
        sen['obj'] = obj
        sen['relation'] = relation
        sen['sentence'] = sentence
        sen['tokens'] = tokens
        mat.append(sen)
        line_num += 1
    train_f.close()
    return mat, label_set, maxLen, line_num
def extract_entity_relation_sent_ko(text):
    from konlpy.tag import Kkma
    kkma = Kkma()
    try:
        postags = kkma.pos(text)
    except Exception as e:
        print(e)
        return None
    head_entity = ''
    tail_entity = ''
    relation = ''
    head_type = 'KO'  # temporary to represent korean entity
    tail_type = 'KO'
    for tag in postags:
        if tag[1][0] == 'N' and head_entity == '':
            head_entity = tag[0]
        elif tag[1][0] == 'V' and relation == '':
            relation = tag[0]
        elif tag[1][0] == 'N' and tail_entity == '' and relation != '':
            tail_entity = tag[0]
    if head_entity != '' and tail_entity != '' and relation != '':
        return (head_entity, relation, tail_entity, head_type, tail_type)
    else:
        return None
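A hedged example of calling extract_entity_relation_sent_ko; the sample sentence is illustrative and the exact tuple depends on Kkma's tagging:

triple = extract_entity_relation_sent_ko(u'학교에 가는 학생')
# expected shape: ('학교', '가', '학생', 'KO', 'KO'); returns None when no noun
# follows the first verb, which is common in SOV Korean sentences
print(triple)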
def word_end_check(txt):
    kkma = Kkma()
    # Extract morphemes and tags
    pos = kkma.pos(txt)
    # Count frequencies (dict)
    count = Counter(pos)
    word_a = 0
    word_b = 0
    for i in count.keys():
        # Count interrogative / propositive endings
        if i[1] in ('EFQ', 'EFA'):
            word_a += count[i]
        # Count declarative / honorific endings
        elif i[1] in ('EFN', 'EFR'):
            word_b += count[i]
    total = word_a + word_b
    if total == 0:  # no sentence-final endings found; avoid division by zero
        return {'formal_speak': 0.0, 'question_speak': 0.0}
    # Share of engagement-inviting speech
    rate1 = word_a / total * 100
    # Share of formal speech
    rate2 = word_b / total * 100
    return {'formal_speak': round(rate2, 2), 'question_speak': round(rate1, 2)}
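A quick illustrative call to word_end_check; the exact percentages depend on how Kkma tags the sentence endings:

ratios = word_end_check(u'오늘 날씨가 좋습니다. 같이 산책하실래요?')
print(ratios)  # e.g. {'formal_speak': 50.0, 'question_speak': 50.0}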
def count_seen_headwords(lines, predicate="VV", headword='NNG'):
    """
    Given a corpus, find the predicate (verb, VV) and headword (noun, NNG)
    in each sentence and build the Seen_R(w) mapping.
    """
    tagger = Kkma()
    seen_dict = {}
    for line in lines:
        pos_result = tagger.pos(line)
        word_h, word_p = None, None
        for word, pos in pos_result:
            if pos == predicate or pos[:3] == predicate + '+':
                word_p = word
                break
            if pos == headword:
                word_h = word
        if word_h is not None and word_p is not None:
            seen_dict[word_p] = [word_h] + (
                [] if seen_dict.get(word_p) is None else seen_dict[word_p])
    return seen_dict
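An illustrative call to count_seen_headwords on a tiny two-line corpus; the exact dictionary depends on Kkma's segmentation:

corpus = [u'학생이 책을 읽는다', u'아이가 책을 본다']
print(count_seen_headwords(corpus))
# maps each predicate to the noun seen just before it, e.g. {'읽': ['책'], '보': ['책']}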
def get_nouns(text, isPositive, keyword):
    positive_nouns = []
    negative_nouns = []
    nouns = []
    spliter = Kkma()
    isnouns = ['NNG', 'NNP']
    tags = spliter.pos(text)
    # Positive sentiment
    if isPositive == 1:
        for i in tags:
            if i[1] in isnouns and len(i[0]) > 1 and i[0] not in keyword:
                nouns.append(i[0])
                positive_nouns.append(i[0])
    # Negative sentiment
    elif isPositive == -1:
        for i in tags:
            if i[1] in isnouns and len(i[0]) > 1 and i[0] not in keyword:
                nouns.append(i[0])
                negative_nouns.append(i[0])
    else:
        for i in tags:
            if i[1] in isnouns and len(i[0]) > 1 and i[0] not in keyword:
                nouns.append(i[0])
    return nouns, positive_nouns, negative_nouns
def input_to_morphemes(sentence):
    kkma = Kkma()
    # Stopwords
    except_word_list = []
    # Allowed morpheme types >> VA adjective / VV verb / OL foreign word / OH hanja / noun-like categories / AUTOPLUS
    types = ['NNG', 'NNP', 'NNB', 'NNM', 'NP', 'VA', 'UN', 'AP']

    # Strip special characters
    sentence = remove_sc(str(sentence))
    # Split into words (eojeol) > list
    origin_word_list = list(dict.fromkeys(regex.findall(r'[\p{Hangul}|\p{Latin}|\p{Han}|\d+]+', f'{sentence}')))
    print(origin_word_list)

    results = []
    morphemes = []
    # Extract words via morpheme analysis
    for origin_word in origin_word_list:
        if origin_word not in except_word_list:
            for morpheme in kkma.pos(origin_word):
                in_result = []
                in_result.append(origin_word)
                in_result.append(morpheme)
                results.append(in_result)
                # Keep only allowed types with at least two characters
                if (morpheme[1] in types) and (len(morpheme[0]) > 1):
                    morphemes.append(morpheme)
    print(morphemes)
    return morphemes
def get_morphemes(news_title_list):
    kkma = Kkma()
    # print(len(chat_list))
    except_word_list = []
    except_keyword_list = []
    in_result_data = []
    print(f'Starting morpheme analysis of {len(news_title_list)} news article titles!')
    for idx, news_title in enumerate(news_title_list):
        news_title = regex.findall(r'[\p{Hangul}|\p{Latin}|\p{Han}|\d+]+', f'{news_title}')
        for word in news_title:
            in_result_word = []
            group = []
            if word not in except_word_list:
                word_g = []
                word_g.append(word)
                group.append(word_g)
                for keyword in kkma.pos(word):
                    if keyword not in except_keyword_list:
                        in_result_word.append(keyword)
                group.append(in_result_word)
                in_result_data.append(group)
        print(f'[{idx} // {len(news_title_list)}] data processed')
    return in_result_data
def kkmaFreqToDataFrame(text):
    kkma = Kkma()
    # Collect nouns from the document
    nouns = list()
    # kkma.nouns(text) exists, but its results are poor, so filter manually
    pos = kkma.pos(text)
    for keyword, tag in pos:
        if tag == "NNG" or tag == "NNP":
            nouns.append(keyword)

    # Count each noun
    count = Counter(nouns)
    tag_count = []
    tags = []
    # Append the most frequent nouns to tags and tag_count in order
    for n, c in count.most_common(100):
        dics = {'tag': n, 'count': c}
        # Keep tags of two or more characters, up to 50 tags
        if len(dics['tag']) >= 2 and len(tags) <= 49:
            tag_count.append(dics['count'])
            tags.append(dics['tag'])

    # Debug which text was split into morphemes
    joined_text = " ".join(tags)
    print("Morphemes: ", joined_text)
    print("Detected languages: ", detect_langs(joined_text))

    # Convert the tokenized body into a DataFrame
    return pd.DataFrame({"Word": list(tags), "Frequency": list(tag_count)})
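An illustrative call to kkmaFreqToDataFrame; the actual rows depend on the tagger output and the langdetect result:

df = kkmaFreqToDataFrame(u'자연어 처리는 재미있다. 자연어 처리를 공부하자.')
print(df)  # e.g. rows like (자연어, 2) and (처리, 2)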
def build_vocab(example_dict_pkl, word_threshold=5):
    tokenizer = Kkma()
    word_counter, verb_counter = Counter(), Counter()
    example_dict = dict()
    with open(example_dict_pkl, 'rb') as f:
        example_dict = pickle.load(f)
    for word, example in example_dict.items():
        sentence = example[0] + word + example[1]
        verb_counter[word] += 1
        for x in tokenizer.pos(sentence):
            if x[0] in example_dict.keys():
                verb_counter[x[0]] += 1
            else:
                word_counter[x[0]] += 1
    # build word vocabulary
    word_vocab = [PAD] + [word for word, count in word_counter.most_common()
                          if count >= word_threshold] + [UNK]
    verb_vocab = [word for word, count in verb_counter.most_common()]
    # save to file
    with open("vocabulary/word_vocab.txt", "w", encoding="utf-8") as f:
        f.write("\n".join(word_vocab))
    with open("vocabulary/verb_vocab.txt", "w", encoding="utf-8") as f:
        f.write("\n".join(verb_vocab))
    with open("vocabulary/verb_count.txt", "w", encoding="utf-8") as f:
        f.write("\n".join(["{}\t{}".format(word, count) for word, count in verb_counter.most_common()]))
def tag_pos(sentences, tagger='kkma'):
    """
    Predict Part-of-Speech tags of input sentences
    :param sentences: list of input sentences
    :param tagger: 'kkma' (default) or 'mecab'
    :return: tagged sentences
    """
    if tagger == 'kkma':
        kkma = Kkma()
    elif tagger == 'mecab':
        mecab = Mecab()
    morph_lists = []
    for sent in sentences:
        morph_list = []
        if tagger == 'kkma':
            pos_tagged_sentences = kkma.pos(sent)
        elif tagger == 'mecab':
            pos_tagged_sentences = mecab.pos(sent)
        for (key, value) in pos_tagged_sentences:
            value = transform_pos(value, tagger)
            morph_list.append([key, value])
        morph_lists.append(morph_list)
    return morph_lists
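A short usage sketch for tag_pos; the printed tags pass through the external transform_pos, so the exact values depend on that mapping:

tagged = tag_pos([u'반갑습니다', u'좋은 하루 되세요'])
for morphs in tagged:
    print(morphs)  # e.g. [['반갑', 'VA'], ['습니다', 'EFN']] before transform_pos remapping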
def showmorp(self):
    okja = self.text
    print("Sentence to analyze: ", okja)
    kkma = Kkma()  # pos() is an instance method, so an instance is required
    morph = kkma.pos(okja)
    print(morph)
    return morph
def parsing(self):
    kkma = Kkma()
    tree = kkma.pos(self.text)
    '''
    grammar = """
        NP: {<N.*>*<Suffix>?}   # Noun phrase
        VP: {<V.*>*}            # Verb phrase
        AP: {<A.*>*}            # Adjective phrase
    """
    parser = nltk.RegexpParser(grammar)
    chunks = parser.parse(tree)
    chunks.draw()
    '''
    print(tree)
    # translated_seq, check_seq = self.Translate(tree)
    # composed_seq = self.Compose_unit(translated_seq)
    '''
    print(translated_seq)
    print(check_seq)
    print(composed_seq)
    '''
    # Print particles (J), endings (E), and modifiers (M)
    for k in range(len(tree)):
        if tree[k][1][0] == 'J' or tree[k][1][0] == 'E' or tree[k][1][0] == 'M':
            print(tree[k], end=" ")
    # post_processed_seq = self.Comp_OP_post_processing(composed_seq)
    '''
    for j in range(len(translated_seq)):
        print(translated_seq[j], end=" ")
    print("\n")
    '''
    # template_number = self.WhatTemplate(post_processed_seq)
    # print(template_number)
def parse(file):
    result_list = []
    kkma = Kkma()  # use Konlpy's Kkma morpheme analyzer
    text_file = codecs.open(  # load the text file as UTF-8
        folder_name + '/' + file, 'r', encoding='utf8'
    )
    file_sentence_list = text_file.readlines()  # read the file's text row by row
    for sentence in file_sentence_list:  # process each row
        sentence_word_list = []  # create a per-row list
        morphemes = kkma.pos(sentence)  # extract words and POS tags from the row
        for word_set in morphemes:
            word = word_set[0]  # split into word and POS tag
            type = word_set[1]
            if type in tag_list:  # keep only words with the designated POS tags
                if word in word_list:  # keep only designated words
                    if type == 'VV' or type == 'VA':  # for verbs, append '다'
                        word += '다'
                    word = replace_list.get(word, word)  # substitute words matching specific conditions
                    sentence_word_list.append(word)  # store the word in the per-row list
        result_list.append(sentence_word_list)  # add the per-row lists to the result
    return result_list  # return the result
def kkma_analysis_at_once_for_itdaily(corpus_path, raw_and_tagged_directory, tagged_only_directory):
    kkma = Kkma()
    list_of_files = os.listdir(corpus_path)
    for entry in sorted(list_of_files):
        number = entry.split("-")[0]
        if 800 < int(number) < 1901:
            if fnmatch.fnmatch(entry, "*.txt"):
                list_of_lines = get_lines_utf8(corpus_path + "/" + entry)
                raw_and_tagged_path = raw_and_tagged_directory + "/" + number + ".txt"
                tagged_only_path = tagged_only_directory + "/" + number + ".txt"
                raw_and_tagged_file = open(raw_and_tagged_path, "a", encoding="utf-8")
                tagged_only_file = open(tagged_only_path, "a", encoding="utf-8")
                for line in list_of_lines:
                    if len(line) > 0:
                        raw_and_tagged_file.write(line + "\t")
                        try:
                            analyzed_morphemes = kkma.pos(line)
                            for idx, pair in enumerate(analyzed_morphemes):
                                if idx == len(analyzed_morphemes) - 1:
                                    raw_and_tagged_file.write(pair[0] + "/" + pair[1])
                                    tagged_only_file.write(pair[0] + "/" + pair[1])
                                else:
                                    raw_and_tagged_file.write(pair[0] + "/" + pair[1] + " ")
                                    tagged_only_file.write(pair[0] + "/" + pair[1] + " ")
                            raw_and_tagged_file.write("\n")
                            tagged_only_file.write("\n")
                        except:
                            continue
                raw_and_tagged_file.close()
                tagged_only_file.close()
def kkma_pos(self):
    kkma = Kkma()
    try:
        self.tree = kkma.pos(self.text)
    except:
        # fall back to an empty list when tagging fails and no tree exists yet
        if getattr(self, 'tree', None) is None:
            self.tree = []
def konlpy():
    # value = request.form['konlpy_tag']
    value = request.args.get('search')
    kkma = Kkma()
    a = kkma.pos(value)
    noun = kkma.nouns(value)
    word = []
    pos = []
    for i in a:
        # print(i[0] + ',' + i[1])
        word.append(i[0])
        pos.append(i[1])
    print(noun)
    print(word)
    print(pos)
    result = {'word': word, 'pos': pos, 'noun': noun}
    print(result)
    return result
def analyseText(text_data, results_file_name="results.txt"):
    print("\n---------------------------------------------")
    print("Step2: analyzing morphemes and word frequencies... please wait")
    kkma = Kkma()
    data_pos = kkma.pos(text_data)
    data_arr = []
    print("Filtering nouns only...")
    for word_pos in data_pos:
        word = word_pos[0]
        pos = word_pos[1]
        if pos == "NNG":  # nouns only; to include adjectives as well, add `or pos == "VA"`
            data_arr.append(word)
    print("Sorting word frequencies and saving to file...")
    counter = Counter(data_arr).most_common()
    keywords_and_frequency = {}
    results_file = open(results_file_name, "w", encoding="utf-8")
    print("Filtering multi-character words with frequency above 2...")
    for keyword in counter:
        word = keyword[0]
        freq = keyword[1]
        if len(word) > 1 and freq > 2:  # keep only words longer than one character with frequency above 2
            keywords_and_frequency[word] = freq
            this_text = word + " : " + str(freq) + "건\n"
            results_file.write(this_text)
    results_file.close()
    print("Morpheme and frequency analysis done!")
    return keywords_and_frequency
def parse(file):
    result_list = []
    kkma = Kkma()  # use Konlpy's Kkma morpheme analyzer
    text_file = open(folder_name + '/' + file)  # open the file passed as argument
    file_text = text_file.read()  # read the file's text
    sentences = kkma.sentences(file_text)  # extract the sentences in the text as a list
    for sentence in sentences:  # process each sentence
        sentence_list = []  # create a list for the sentence's words
        morphemes = kkma.pos(sentence)  # extract (word, POS) pairs from the sentence
        for word_set in morphemes:
            word = word_set[0]  # split into word and POS tag
            type = word_set[1]
            if type in tag_list:  # keep only words with the designated POS tags
                if word in word_list:  # keep only designated words
                    if type == 'VV' or type == 'VA':  # for verbs, append '다'
                        word += '다'
                    word = replace_list.get(word, word)  # substitute words matching specific conditions
                    sentence_list.append(word)  # store the word in the per-sentence list
        result_list.append(sentence_list)  # add the per-sentence lists to the result
    return result_list  # return the result
def making_morpho(inputstr='', filter=[u'NNG', u'NNP', u'NNB']):
    """
    0. Parameters: inputstr string, filter list of morpheme tags
    1. Purpose: takes a string, runs morpheme analysis, and returns a tuple of
       strings: (1) the full analysis result, (2) only the words matching the filter.
    2. Source: textanalysis.py
    3. Modified: 2016.11.30
    4. Known issues:
    5. Example:
        txt = u"둘째, 나노기술을 체계적으로 발전시켜 나가겠습니다. \
        세계의 기술전쟁은 원자와 분자 수준으로 진입했습니다. \
        사람의 눈으로 볼 수 없는 극미세 물질의 세계로 들어섰습니다. \
        올해 내에 나노기술 종합 발전계획을 발표하겠습니다."
        a, b = making_morpho(txt)
        print(a)
        print("*" * 30)
        print(b)
    """
    kkma = Kkma()
    morpho = kkma.pos(inputstr)
    morpho_str = ''
    morpho2 = ''
    for each in morpho:
        morpho_str = morpho_str + '|' + each[0] + '-' + each[1]
        if each[1] in filter:
            morpho2 = morpho2 + '|' + each[0]
    return morpho_str, morpho2
def get_noun(msg_txt):
    kkma = Kkma()
    nouns = list()
    try:
        # Keep Hangul only
        hanguleng = re.compile('[^ ㄱ-ㅣ가-힣]+')
        msg_txt = hanguleng.sub('', msg_txt)
        # Filter out ㅋㅋ, ㅠㅠ, ㅎㅎ, etc.
        pattern = re.compile("[ㄱ-ㅎㅏ-ㅣ]+")
        msg_txt = re.sub(pattern, '', msg_txt).strip()
    except:
        print('msg_txt strip error')
    try:
        # print(msg_txt)
        if len(msg_txt) > 0:
            pos = kkma.pos(msg_txt)
            for keyword, tag in pos:
                # Proper or common nouns
                if tag == "NNG" or tag == "NNP":
                    nouns.append(keyword)
            # print(msg_txt, "->", nouns)
    except:
        print("get_noun Error!")
        print(msg_txt)
    return nouns
def get_keywords(chat_list):
    kkma = Kkma()
    # print(len(chat_list))
    except_word_list = []
    except_keyword_list = []
    in_result_data = []
    print('Starting morpheme analysis!')
    for idx in range(len(chat_list)):
        # if idx < 10:
        #     print(chat_list[idx].get('message'))
        replace_chat_message = re.sub(
            '\,', ',', re.sub('[\"\'‘“”″′]', '“', str(chat_list[idx].get('message'))))
        result_message = regex.findall(r'[\p{Hangul}|\p{Latin}|\p{Han}|\d+]+',
                                       f'{replace_chat_message}')
        for word in result_message:
            in_result_word = []
            group = []
            if word not in except_word_list:
                word_g = []
                word_g.append(word)
                group.append(word_g)
                for keyword in kkma.pos(word):
                    if keyword not in except_keyword_list:
                        in_result_word.append(keyword)
                group.append(in_result_word)
                in_result_data.append(group)
        # print(f'[{idx} // {len(chat_list)}] data processed')
    return in_result_data
def get_near_keyword(q_list, cursor):
    kkma = Kkma()
    # List of words to exclude
    global except_word_list
    fin_result = []
    for idx, q in enumerate(q_list):
        question = [q[0], remove_html(q[1])]
        sentence = re.sub('[-=.#/?:$}\"\']', '', str(question[1])).replace('[', '').replace(']', '')
        origin_word_list = list(
            dict.fromkeys(
                regex.findall(r'[\p{Hangul}|\p{Latin}|\p{Han}|\d+]+', f'{sentence}')))
        # Morpheme analysis
        out_result = []
        out_result.append(question[0])
        result = []
        for origin_word in origin_word_list:
            if origin_word not in except_word_list:
                for morpheme in kkma.pos(origin_word):
                    if (len(morpheme[0]) > 1) and (morpheme[1] == 'NNG' or morpheme[1] == 'NNP'
                                                   or morpheme[1] == 'NNB' or morpheme[1] == 'NNM'):
                        in_result = []
                        in_result.append(morpheme[0])
                        result.append(in_result)
        out_result.append(result)
        fin_result.append(out_result)

    for result in fin_result:
        qna_no = result[0]
        print(result)
        for morpheme in result[1]:
            morpheme_keyword = morpheme[0]
            # print(type(morpheme_keyword), morpheme_keyword)
            try:
                # a parameterized query would be safer than string interpolation here
                cursor.execute(f"""
                    SELECT TARGET_MORPHEME_WORD
                    FROM TBL_CCQ_KEYWORD_MAP
                    WHERE SOURCE_WORD = "{morpheme_keyword}"
                    ORDER BY WORD_DISTANCE DESC
                    LIMIT 1
                """)
                keyword = cursor.fetchall()
                if len(keyword) > 0:
                    print(f'related word for "{morpheme_keyword}" > "{keyword[0][0]}"')
            except Exception as e:
                print(f'****** + error! >> {e} >>>>> SELECT failed!')
                continue
            finally:
                pass
def _kkma_parse(self, str_arr, tag_combine=True):
    """
    :param str_arr: list of strings to tag
    :param tag_combine: whether to combine tags when flattening
    :return: flat list of tagged morphemes
    """
    kkma = Kkma()
    return_arr = []
    for data in str_arr:
        return_arr = return_arr + self._flat(kkma.pos(str(data)), tag_combine=tag_combine)
    return return_arr
class AnalysisDiction:
    """
    This class is for analysis of korean texts using kkma and twitter dictionaries
    """
    def __init__(self, on_kkma=False, on_twitter=False):  # maybe move to init of analysis_app
        """
        Allocate kkma or twitter diction instance
        :param on_kkma: kkma instance
        :param on_twitter: twitter instance
        """
        if on_kkma is True:
            self.kkma = Kkma()
        if on_twitter is True:
            self.twitter = Twitter()

    def analyzer_kkma(self, string_data, mode):
        """
        This method is for kkma. It acts differently depending on its mode.
        :param string_data: String data for analysis
        :param mode: Analyze string data depending on its mode
        :return: Return its results. If there is no matching mode, return False
        ref: http://konlpy.org/ko/v0.4.4/api/konlpy.tag/#module-konlpy.tag._kkma
        """
        if mode == 'morphs':
            return self.kkma.morphs(string_data)
        elif mode == 'nouns':
            return self.kkma.nouns(string_data)
        elif mode == 'pos':
            return self.kkma.pos(string_data)
        else:
            return False

    def analyzer_twitter(self, string_data, mode):
        """
        This method is for twitter. It acts differently depending on its mode.
        :param string_data: String data for analysis
        :param mode: Analyze string data depending on its mode
        :return: Return its results. If there is no matching mode, return False
        ref: http://konlpy.org/ko/v0.4.4/api/konlpy.tag/#module-konlpy.tag._twitter
        """
        if mode == 'morphs':
            return self.twitter.morphs(string_data)
        elif mode == 'nouns':
            return self.twitter.nouns(string_data)
        elif mode == 'pos':
            return self.twitter.pos(string_data)
        elif mode == 'posmore':
            return self.twitter.pos(string_data, True, True)
        else:
            return False
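A brief usage sketch for AnalysisDiction above; the printed output is illustrative:

analyzer = AnalysisDiction(on_kkma=True)
print(analyzer.analyzer_kkma(u'즐거운 쇼핑 되세요', 'nouns'))    # e.g. ['쇼핑']
print(analyzer.analyzer_kkma(u'즐거운 쇼핑 되세요', 'unknown'))  # False for unsupported modes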
def get_vs(line):
    korean_analyzer = Kkma()
    # adjectives (VA), auxiliary predicates (VX), copulas (VC; Kkma's tagset lists VCP/VCN)
    return [word for word, tag in korean_analyzer.pos(line)
            if tag in ['VA', 'VX', 'VC']]
from konlpy.tag import Kkma
from konlpy.utils import pprint
from openpyxl import load_workbook, Workbook

dic = {}
kkma = Kkma()
wb = load_workbook(filename='menulist.xlsx', read_only=True)
ws = wb['Sheet1']
for i in range(1, 5897):
    for l, k in kkma.pos(ws['A' + str(i)].value):
        dic[l] = dic.get(l, 0) + 1  # count every occurrence, including the first

wb = Workbook()
dest_filename = "determine.xlsx"
ws1 = wb.active
ws1.title = "result"
num = 1
for l, k in dic.items():
    ws1['A' + str(num)] = l
    ws1['B' + str(num)] = k
    num += 1
wb.save(filename=dest_filename)
from konlpy.tag import Kkma
from konlpy.utils import pprint

kkma = Kkma()
pprint(kkma.sentences(u'네, 안녕하세요. 의류매장 입니다'))
pprint(kkma.nouns(u'구입하실 물건 있으시면 말씀해주세요.'))
pprint(kkma.pos(u'하하하 즐거운 쇼핑입니다.'))
#!/usr/bin/python
# vim: set fileencoding=utf8 :
from konlpy.tag import Kkma
from konlpy.utils import pprint
from convert import convert
import fileinput

kkma = Kkma()
pprint(kkma.sentences(u'네, 안녕하세요. 반갑습니다.'))

# poss = kkma.pos(u'(오류보고는) "실행환경", 에러메세지와함께 설명을 최대한상세히!^^')
for line in fileinput.input():
    poss = kkma.pos(line)  # Python 3 reads input as str, so no unicode() call is needed
    for tup in poss:
        print(tup[0], convert(tup[1]))
kkma = Kkma()
stopWord_Ingre = {"재료", "계량법", "안내", "조금"}

mystr = getString("/home/gwangjik/문서/hanyang corps/데이터/만개의레시피/Text/text_recipe10000_6879000_6880000")
mystr += getString("/home/gwangjik/문서/hanyang corps/데이터/만개의레시피/Text/text_recipe10000_6870000_6871000")

tokenized = kkma.pos(mystr)

# Keep NNG/NNB tokens whose word is not an ingredient stopword
token_filtered = list(filter(
    lambda mytoken: mytoken[1] in ("NNG", "NNB") and mytoken[0] not in stopWord_Ingre,
    tokenized))

# Note: Word2Vec expects an iterable of token lists, so here each (word, tag)
# tuple is treated as a two-token sentence; parameter names follow gensim 3.x
embedding_model = Word2Vec(token_filtered, size=10, window=3, min_count=0,
                           workers=3, iter=10, sg=1)
from threading import Thread

from konlpy.corpus import kolaw
from konlpy.tag import Kkma


def do_concurrent_tagging(start, end, lines, result):
    l = [k.pos(lines[i]) for i in range(start, end)]
    result.append(l)
    return


if __name__ == "__main__":
    import time

    print('Number of lines in document:')
    k = Kkma()
    lines = kolaw.open('constitution.txt').read().splitlines()
    nlines = len(lines)
    print(nlines)

    print('Batch tagging:')
    s = time.perf_counter()  # time.clock() was removed in Python 3.8
    result = []
    l = [k.pos(line) for line in lines]
    result.append(l)
    t = time.perf_counter()
    print(t - s)

    print('Concurrent tagging:')
    result = []
    t1 = Thread(target=do_concurrent_tagging, args=(0, int(nlines / 2), lines, result))
    t2 = Thread(target=do_concurrent_tagging, args=(int(nlines / 2), nlines, lines, result))
    t1.start(); t2.start()
    t1.join(); t2.join()
    m = sum(result, [])  # Merge results
    print(time.perf_counter() - t)
__author__ = 'woojin'
# -*- coding: utf-8 -*-
from konlpy.tag import Kkma
from konlpy.utils import pprint

kkma = Kkma()
pprint(kkma.sentences('네, 안녕하세요. 반갑습니다.'))
pprint(kkma.nouns('질문이나 건의사항은 깃허브 이슈 트래커에 남겨주세요.'))
pprint(kkma.pos('오류보고는 실행환경, 에러메시지와 함께 설명을 최대한 상세히!!^^'))
def Training():
    for article in article_list:
        # print(article)
        # title = article[0]
        # link = article[1]
        # newspaper = article[2]
        kkma = Kkma()
        try:
            content, issueDateTime = NateNews.get_content(article['link'])
            issueDateTime = pd.to_datetime(issueDateTime)
            issueDate = issueDateTime.date()
            issueTime = issueDateTime.time()

            # Morpheme analysis: getWords filters
            # [common nouns, verbs, adjectives, auxiliary verbs, guessed-noun category]
            wordList = list(getWords(kkma.pos(content)))
            ws = set(wordList)
            print('ws : ', ws)
            dic = {}
            for word in ws:
                print('word : ', word)
                dic.update({word: wordList.count(word)})
            print('dic : ', dic)
            n = 10
            listdic = sorted(dic.items(), key=operator.itemgetter(1), reverse=True)[:n]
            print('listdic : ', listdic)
            for l in listdic:
                print('l : ', l)
                wordList.append(l[0])

            baseDate = ''
            if issueTime > pd.to_datetime('15:30:00').time():
                # After market close
                baseDate = stockDF[stockDF['datetime'] > issueDate].head(1)['datetime']
            else:
                # Before market close
                baseDate = stockDF[stockDF['datetime'] >= issueDate].head(1)['datetime']
            print('issueTime : ', issueTime)
            print('baseDate : ', baseDate)

            if issueDate > pd.to_datetime(testSetFromDate).date() or len(baseDate) == 0:
                # Test set
                testEntry.append({'issueDateTime': issueDateTime, 'wordList': wordList})
            else:
                # Training set
                baseDate = pd.Series(baseDate).values[0]
                trainingSet.append({'issueDateTime': issueDateTime, 'wordList': wordList})
                print(trainingSet)
                todayPrice = int(stockDF[stockDF['datetime'] == baseDate]['close'])
                prevPrice = int(stockDF[stockDF['datetime'] < baseDate].tail(1)['close'])
                if todayPrice > prevPrice:
                    # up
                    classList.append(1)
                else:
                    if todayPrice < prevPrice:
                        # down
                        classList.append(0)
                    else:
                        # hold
                        classList.append(0)
        except:
            pass