def __init__(self, split, dictionary, dataroot='data', tokenizer='sp'):
    super(KvqaFeatureDataset, self).__init__()
    assert split in ['train', 'val', 'test']
    self.dataroot = dataroot
    ans2label_path = os.path.join(dataroot, 'cache', 'trainval_ans2label.kvqa.pkl')
    label2ans_path = os.path.join(dataroot, 'cache', 'trainval_label2ans.kvqa.pkl')
    self.ans2label = cPickle.load(open(ans2label_path, 'rb'))
    self.label2ans = cPickle.load(open(label2ans_path, 'rb'))
    self.num_ans_candidates = len(self.ans2label)
    self.dictionary = dictionary
    self.img_id2idx = cPickle.load(
        open(os.path.join(dataroot, '%s_imgid2idx.kvqa.pkl' % split), 'rb'))
    h5_path = os.path.join(dataroot, '%s_kvqa.hdf5' % split)
    print('loading features from h5 file')
    with h5py.File(h5_path, 'r') as hf:
        self.features = np.array(hf.get('image_features'))
        self.spatials = np.array(hf.get('spatial_features'))
        self.pos_boxes = np.array(hf.get('pos_boxes'))
    self.entries, self.type2idx, self.idx2type = _load_kvqa(
        dataroot, split, self.img_id2idx)
    if tokenizer == 'sp':
        self.tokenizer = BertTokenizer.from_pretrained(
            'bert-base-multilingual-cased', do_lower_case=False)
        self.dictionary = self.tokenizer.vocab
    elif tokenizer == 'mecab':
        self.tokenizer = Mecab()
    elif tokenizer == 'kkma':
        self.tokenizer = Kkma()
    self.tokenize()
    self.tensorize()
    self.v_dim = self.features.size(1)
    self.s_dim = self.spatials.size(1)
def get_json():
    import json
    articles = get_articles('http://m.news.naver.com/')
    kkma = Kkma()
    article_results = []
    for article_dict in articles:
        article_result = dict()
        article = Article(article_dict['url'])
        article.download()
        article.parse()
        title = article_dict['title']
        text = article.text
        top_img_url = article.top_image
        print(title)
        print(top_img_url)
        sentences = trim_sentences(kkma.sentences(text))
        target_index = forasterisk_algorithm(sentences)
        sentence = sentences[target_index].strip()
        print(sentence)
        article_result['title'] = title
        if article_dict['img_url'] != 'http://mimgnews2.naver.net/image/navernews_200x200_new.jpg':
            article_result['img_url'] = article_dict['img_url']
        article_result['sentence'] = sentence
        article_result['press'] = article_dict['press']
        article_result['url'] = article_dict['url']
        article_results.append(article_result)
    result = dict()
    result['articles'] = article_results
    return json.dumps(result)
def tokenizer(sentence: str) -> str:
    """Tokenize text with Mecab.

    Return:
        s (string): "안녕/NNG 부산/NNG ..."
    """
    # Alternative taggers, kept here commented out; Mecab is the one used.
    # tag = Okt()
    # tag = Komoran()
    # tag = Hannanum()
    # tag = Kkma()
    tag = Mecab()
    pos = tag.pos(sentence)
    temp = []
    for p in pos:
        temp.append(p[0] + "/" + p[1])
    s = ' '.join(temp)
    return s
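# Usage sketch for tokenizer(), assuming konlpy and the Mecab backend are
# installed; the sample sentence and the exact tags are illustrative only.
print(tokenizer("부산에 갑니다"))
# -> something like "부산/NNP 에/JKB 갑니다/VV+EF"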
def _mk_word_cloud_korean(self):
    target = ' '.join(self.text)
    kkma = Kkma()
    n = kkma.nouns(target)
    n = [temp for temp in n if len(temp) != 1 if not temp.isdecimal()]
    text = nltk.Text(n)
    data = text.vocab()
    data500 = data.most_common(500)
    dic = dict(data500)
    # Make word cloud object
    wc = WordCloud(font_path='/Library/Fonts/Arial Unicode.ttf',
                   max_font_size=80,
                   min_font_size=10,
                   background_color=self.color,
                   mask=self.mask)
    self.wordcloud = wc.generate_from_frequencies(dic)
def get_wordcloud(dbconn, cursor):
    kkma = Kkma()
    except_word_list = []
    except_keyword_list = []
    in_result_data = []
    today_date = datetime.today().strftime("%Y-%m-%d")
    newsList = TblTotalCarNewsList.objects.all().filter(
        write_date__icontains=today_date)
    news_title_group = []
    for idx in range(len(newsList)):
        news_title_group.append(
            remove_sc(
                remove_html(newsList.values()[idx].get('news_title'))).replace(
                    '&44;', '').replace('&8220;', '').replace('\r', '').replace('\n', ''))
    get_morpheme_words(get_morphemes(news_title_group), today_date)
def preprocessing(data):
    try:
        from konlpy.tag import Okt, Kkma
        import khaiii
        khaiii_api = khaiii.KhaiiiApi(opt.khaiii_so_path)
        khaiii_api.open(opt.khaiii_path)
        kkma = Kkma()
        kkma_tokenizer = kkma.nouns
        twitter = Okt()
        okt_tokenizer = twitter.nouns
        cls, data_path_list, div, out_path, begin_offset, end_offset = data
        data = cls()
        data.load_y_vocab()
        data.preprocessing(data_path_list, div, begin_offset, end_offset,
                           out_path, okt_tokenizer, khaiii_api, kkma_tokenizer)
    except Exception:
        raise Exception("".join(traceback.format_exception(*sys.exc_info())))
def predict(text, model, label_list):
    assert isinstance(text, str)
    model.eval()
    kkma = Kkma()
    text = utils.clean_str(text)
    w2vec = KeyedVectors.load_word2vec_format('../kor-word2vec-kkma-200.bin',
                                              binary=True)
    tokens = [
        p[0] + '/' + p[1] for p in kkma.pos(text)
        if p[0] + '/' + p[1] in w2vec.vocab
    ]
    embed = parser.sentenceToEmbedding(tokens, 112, w2vec)
    x = torch.from_numpy(embed).float()
    x = x.view(1, x.size()[0], x.size()[1])
    with torch.no_grad():
        logit = model(x)
        logit = F.softmax(logit, dim=1)
        _, predicted = torch.max(logit, 1)
    return parser.idxToLabel(predicted[0], label_list)
def insert_qna_content(dbconn, cursor):
    file_path = 'qna_set.xlsx'
    load_wb = load_workbook(file_path, data_only=True)
    load_ws = load_wb['chat4']
    all_values = []
    for row in load_ws.rows:
        row_value = []
        for cell in row:
            row_value.append(cell.value)
        all_values.append(row_value)
    kkma = Kkma()
    for idx, values in enumerate(all_values):
        q_text = values[0]
        a_text = values[1]
        q_type = values[2]
        q_nouns = kkma.nouns(q_text)
        if len(q_nouns) > 0:
            q_nouns = str(q_nouns)
        else:
            q_nouns = '[]'
        print(q_nouns)
        try:
            # Parameterized query: avoids breaking on quotes in the text.
            cursor.execute(
                """
                INSERT IGNORE INTO TBL_QNA_CHAT_SET_LIST
                    (Q_TEXT, A_TEXT, Q_MORPHEMES, Q_TYPE, UPDATE_DATE)
                VALUES (%s, %s, %s, %s, NOW())
                """, (q_text, a_text, q_nouns, q_type))
        except Exception as e:
            print(f'error! >> insert_qna_content >> {e}')
        finally:
            print(
                f'[{idx}/{len(all_values)}({round((idx / len(all_values) * 100), 2)}%)] complete!!'
            )
            dbconn.commit()
            time.sleep(0.1)
def __init__(self, nlpEngine = "Mecab"): '''e 원하는 형태소 분석기 엔진으로 형태소 분석기 생성 :param nlpEngine: 형태소 분석기 이름(첫글자 대문자) str ''' self.nlpEngine = nlpEngine if nlpEngine == "Okt": self.nlp = Okt() elif nlpEngine == "Komoran": self.nlp = Komoran() elif nlpEngine == "Kkma": self.nlp = Kkma() elif nlpEngine == "Hannanum": self.nlp = Hannanum() elif nlpEngine == "Mecab": self.nlp = Mecab() elif nlpEngine == "Twitter": self.nlp = Twitter() else: raise NameError("unknown nlp name")
def __init__(self, name, line_analyze=None):
    self.name = name
    self.talkdays = []
    self.people = People()
    self._words = Words()
    self.tot_msg = 0
    self.tot_person = {}
    self.line_analyze = None
    if line_analyze == 'Kkma':
        try:
            from konlpy.tag import Kkma
            self.kkma = Kkma()
            self.line_analyze = self.kkma_analyzer
        except ImportError:
            print("Please install the konlpy package.")
            line_analyze = None
    if not line_analyze:
        self.line_analyze = self.line_spliter
def makingpos(katoc):
    pos_data = {
        'N': '', 'V': '', 'M': '', 'I': '', 'J': '',
        'E': '', 'X': '', 'S': '', 'U': '', 'O': ''
    }
    kkma = Kkma()
    temp = kkma.pos(katoc)
    for i in temp:
        pum = i[1]  # POS tag; its first letter keys the dict
        pos_data[pum[0]] = i[0]  # store the morpheme under that letter
    return pos_data
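# Usage sketch for makingpos(). Note that within one POS class (e.g. 'N'),
# a later morpheme overwrites an earlier one, so each slot keeps only the
# last match. The sample output is illustrative, not guaranteed.
print(makingpos("학교에 간다"))
# -> e.g. {'N': '학교', 'V': '가', 'M': '', 'I': '', 'J': '에',
#          'E': 'ㄴ다', 'X': '', 'S': '', 'U': '', 'O': ''}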
def AC_mapping(ac_list):
    new_ac_list = []
    family_list = ['엄마', '아빠', '할머니', '할아버지', '아이', '아들', '딸', '조부',
                   '가족', '친척', '조카', '부모님', '유아', '어머니', '아버지', '아기', '어른']
    couple_list = ['연인', '여자친구', '여친', '남자친구', '남친', '애인', '신랑',
                   '부인', '여자', '남자']
    friend_list = ['여동생', '남동생', '오빠', '형님', '형', '친구', '누나', '언니', '동생']
    kkma = Kkma()  # build the tagger once instead of once per item
    for ac in ac_list:
        words = kkma.pos(ac)
        for word in words:
            if (word[1] not in ['NNM', 'NR', 'JC']) and (word[0] != '명'):
                if word[0] in family_list:
                    new_ac_list.append('가족')
                if word[0] in couple_list:
                    new_ac_list.append('연인')
                if word[0] in friend_list:
                    new_ac_list.append('친구')
                if '혼자' in word:
                    new_ac_list.append('혼자')
    return list(set(new_ac_list))
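# Usage sketch for AC_mapping(): free-text companion descriptions are
# normalized to the categories 가족/연인/친구/혼자. The input and output here
# are illustrative; exact results depend on how Kkma segments each phrase.
print(AC_mapping(['엄마랑 아빠', '애인', '혼자']))
# -> e.g. ['가족', '연인', '혼자'] (set order may vary)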
def get_nouns(self, text, isPositive, keyword):
    spliter = Kkma()
    isnouns = ['NNG', 'NNP']
    tags = spliter.pos(text)
    # positive sentiment
    if isPositive == 1:
        for i in tags:
            if i[1] in isnouns and len(i[0]) > 1 and i[0] not in keyword:
                self.nouns.append(i[0])
                self.positive_nouns.append(i[0])
    # negative sentiment
    elif isPositive == -1:
        for i in tags:
            if i[1] in isnouns and len(i[0]) > 1 and i[0] not in keyword:
                self.nouns.append(i[0])
                self.negative_nouns.append(i[0])
    else:
        for i in tags:
            if i[1] in isnouns and len(i[0]) > 1 and i[0] not in keyword:
                self.nouns.append(i[0])
def get_noun(msg_txt):
    kkma = Kkma()
    nouns = list()
    # filter out jamo runs like ㅋㅋ, ㅠㅠ, ㅎㅎ, plus emoji
    pattern = re.compile("[ㄱ-ㅎㅏ-ㅣ"
                         "\U0001F600-\U0001F64F"  # emoticons
                         "\U0001F300-\U0001F5FF"  # symbols & pictographs
                         "\U0001F680-\U0001F6FF"  # transport & map symbols
                         "\U0001F1E0-\U0001F1FF"  # flags (iOS)
                         "]+")
    # TODO: fix the crash seen when processing 1000 posts
    msg_txt = re.sub(pattern, "", msg_txt).strip()
    if len(msg_txt) > 0:
        pos = kkma.pos(msg_txt)
        for keyword, type in pos:
            # common noun (NNG) or proper noun (NNP)
            if type == "NNG" or type == "NNP":
                nouns.append(keyword)
        # print(msg_txt, "->", nouns)
    return nouns
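# Usage sketch for get_noun(): jamo runs and emoji are stripped before
# tagging, and only NNG/NNP morphemes come back. Sample text is illustrative.
print(get_noun("부산 여행 너무 좋다 ㅋㅋㅋ"))
# -> e.g. ['부산', '여행']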
def dataset_iterator(example_dict, word_dict, verb_dict, batch_size):
    tokenizer = Kkma()
    left_context, verbs, right_context = [], [], []
    for word, example in example_dict.items():
        # split data
        l_c, vb, r_c = example[0], word, example[1]
        # convert to indices
        l_c = [word_dict[word[0]] if word[0] in word_dict else word_dict[UNK]
               for word in tokenizer.pos(l_c)]
        vb = verb_dict[vb]
        r_c = [word_dict[word[0]] if word[0] in word_dict else word_dict[UNK]
               for word in tokenizer.pos(r_c)]
        # add to list
        left_context.append(l_c)
        verbs.append(vb)
        right_context.append(r_c)
        # yield batched dataset
        if len(left_context) == batch_size:
            yield build_batch_dataset(left_context, verbs, right_context)
            left_context, verbs, right_context = [], [], []
    if len(left_context) > 0:
        yield build_batch_dataset(left_context, verbs, right_context)
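# Tiny driver sketch for dataset_iterator(), assuming UNK and
# build_batch_dataset come from elsewhere in this module. Keys of word_dict
# are morphemes, since tokenizer.pos() yields (morpheme, tag) pairs.
word_dict = {'집': 0, '학교': 1, UNK: 2}
verb_dict = {'가다': 0}
example_dict = {'가다': ('집에서', '학교까지')}
for batch in dataset_iterator(example_dict, word_dict, verb_dict, batch_size=1):
    print(batch)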
def __init__(self, srl='framenet', language='ko', only_lu=True):
    self.srl = srl
    self.language = language
    self.only_lu = only_lu
    if self.language == 'ko':
        from konlpy.tag import Kkma
        self.kkma = Kkma()
        with open(target_dir + '/data/targetdic-1.1.json', 'r') as f:
            targetdic = json.load(f)
        self.targetdic = targetdic
    else:
        import nltk
        self.lemmatizer = nltk.WordNetLemmatizer()
        self.pos_tagger = nltk.pos_tag
        with open(target_dir + '/data/targetdic-FN1.7.json', 'r') as f:
            targetdic = json.load(f)
        self.targetdic = targetdic
def parse_konlpy(self, text):
    from konlpy.tag import Kkma
    from konlpy.tag import Twitter
    kkma = Kkma()
    twitter = Twitter()
    sentence_list = kkma.sentences(text)
    parsing = []
    for sentence in sentence_list:
        parsed_sentence = {}
        parsed_sentence['text'] = sentence
        parsed_sentence['morp'] = kkma.pos(sentence)
        parsed_sentence['phrase'] = twitter.phrases(sentence)
        parsing.append(parsed_sentence)
    return parsing
def remove_particle(training_args):
    """Remove a trailing particle (josa) from each prediction.

    Args:
        training_args
    """
    # load tokenizers
    mecab = Mecab()
    kkma = Kkma()
    hannanum = Hannanum()
    # load prediction file
    with open(os.path.join(training_args.output_dir, "predictions.json"), "r") as f:
        prediction_json = json.load(f)
    prediction_dict = dict()
    for mrc_id in prediction_json.keys():
        final_predictions = prediction_json[mrc_id]
        pos_tag = mecab.pos(final_predictions)
        # if a particle is present, delete it
        if final_predictions[-1] == "의":
            min_len = min(len(kkma.pos(final_predictions)[-1][0]),
                          len(mecab.pos(final_predictions)[-1][0]),
                          len(hannanum.pos(final_predictions)[-1][0]))
            if min_len == 1:
                final_predictions = final_predictions[:-1]
        elif pos_tag[-1][-1] in {"JX", "JKB", "JKO", "JKS", "ETM", "VCP", "JC"}:
            final_predictions = final_predictions[:-len(pos_tag[-1][0])]
        prediction_dict[str(mrc_id)] = final_predictions
    # save final results
    with open(os.path.join(training_args.output_dir, "final_predictions.json"),
              'w', encoding='utf-8') as make_file:
        json.dump(prediction_dict, make_file, indent="\t", ensure_ascii=False)
    print(prediction_dict)
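# A minimal driver sketch for remove_particle(), assuming Mecab, Kkma, and
# Hannanum are all installed and that training_args only needs .output_dir.
# The file contents below are illustrative.
import json, os, types

args = types.SimpleNamespace(output_dir='./out')
os.makedirs(args.output_dir, exist_ok=True)
with open(os.path.join(args.output_dir, 'predictions.json'), 'w',
          encoding='utf-8') as f:
    json.dump({'mrc-0': '서울에서'}, f, ensure_ascii=False)  # '에서' is a particle
remove_particle(args)  # writes final_predictions.json with particles stripped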
def __init__(self, region):
    # Region can only be 'gs' or 'jl'
    assert region == 'gs' or region == 'jl', "region should be 'gs' or 'jl'"
    self.kkma = Kkma()
    self.region = region
    self.sent_dict = dict()
    self.word_dict = dict()
    self.sent_dict_subword = dict()
    self.word_dict_subword = dict()
    self.additional_rule_dict = dict()
    self.sentence_data_filename = os.path.join(
        parent_path, 'data/sent_' + region + '_train.json')
    self.word_data_filename = os.path.join(
        parent_path, 'data/word_' + region + '_train.json')
    self.sentence_data_ex_filename = os.path.join(
        parent_path, 'data/ex/sent_' + region + '_train.json')
    self.word_data_ex_filename = os.path.join(
        parent_path, 'data/ex/word_' + region + '_train.json')
    self.sentence_dict_filename = os.path.join(
        current_path, 'save/statistical_sent_dict_' + region + '.json')
    self.word_dict_filename = os.path.join(
        current_path, 'save/statistical_word_dict_' + region + '.json')
    self.sentence_dict_ex_filename = os.path.join(
        current_path, 'save/ex/statistical_sent_dict_' + region + '.json')
    self.word_dict_ex_filename = os.path.join(
        current_path, 'save/ex/statistical_word_dict_' + region + '.json')
    self.additional_rule_filename = os.path.join(
        current_path, 'save/additional_rule_' + region + '.json')
    # If the dictionaries have already been created, load them
    if os.path.isfile(self.sentence_dict_filename) and os.path.isfile(self.word_dict_filename) and \
            os.path.isfile(self.sentence_dict_ex_filename) and os.path.isfile(self.word_dict_ex_filename):
        self.load_dict()
        print('Load dictionary for %s' % self.region)
    else:
        self.create_dict()
        print('Create and load dictionary for %s' % self.region)
def tag_all_reviews(norm, stem):
    kkma = Kkma()
    recommend_categories = set()
    nouns = dict()
    for filename in glob.glob('reviews/*.json'):
        with open(filename, 'r') as raw_file:
            print('parsing %s...' % filename)
            raw_data = json.load(raw_file)
            for review in raw_data:
                raw_tags = kkma.pos(review['text'])
                review['tagged'] = list()
                for tag in raw_tags:
                    if tag[1][0] in ['N', 'V']:
                        review['tagged'].append(tag)
                    if tag[1][0] == 'N':
                        if tag[0] in nouns:
                            nouns[tag[0]] += 1
                        else:
                            nouns[tag[0]] = 0
                recommend_categories.update(list(review['recommend'].keys()))
        new_filename = 'tagged_reviews/%s' % filename.split('/')[1]
        with open(new_filename, 'w') as tagged_file:
            json.dump(raw_data, tagged_file, ensure_ascii=False, sort_keys=True,
                      indent=2, separators=(',', ': '))
    c = 0
    with open('nouns.csv', 'w') as nouns_file:
        nf = csv.writer(nouns_file)
        for key in nouns.keys():
            if nouns[key] >= 100:
                c += 1
                nf.writerow([key, nouns[key]])
    print(c)
    return recommend_categories
def pos_tag(sentences, labels):
    kkma = Kkma()
    # general noun, proper noun, bound noun, verb, adjective, auxiliary
    # predicate, general adverb, conjunctive adverb, verb-derivational
    # suffix, adjective-derivational suffix
    significant_tags = [
        'NNG', 'NNP', 'NNB', 'VV', 'VA', 'VX', 'MAG', 'MAJ', 'XSV', 'XSA'
    ]
    s, l = [], []
    for sent, label in zip(sentences, labels):
        tmp = []
        for word, tag in kkma.pos(sent):
            print(word + tag + ' ')
            if tag in significant_tags:
                tmp.append(word + '/' + tag)
        s.append(stemming_text(tmp))
        l.append(label)
    result = pd.DataFrame([x for x in zip(s, l)], columns=['sentences', 'labels'])
    return result
def post_action():
    if request.method == 'POST':
        # read the JSON payload
        json_data = request.get_json()
        print(f'json_data : {json_data}')
        # create a Kkma instance
        kkma = Kkma()
        # extract nouns with the API
        nouns_list = kkma.nouns(json_data['sentence'])
        print(nouns_list)
        # convert the list to JSON
        nouns_json = json.dumps(nouns_list)
        nouns_json = '{"result":' + nouns_json + "}"
        print(nouns_json)
        # return the nouns
        return nouns_json
    return 'none POST!'
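# Client-side sketch for the endpoint above, assuming it is registered on a
# Flask app at /nouns (the route path here is hypothetical).
import requests

resp = requests.post('http://localhost:5000/nouns',
                     json={'sentence': '오늘 날씨가 좋습니다'})
print(resp.json())  # -> e.g. {'result': ['오늘', '날씨']}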
def ttr_check(txt):
    # create a Kkma instance
    kkma = Kkma()
    # extract morphemes and tags
    pos = kkma.pos(txt)
    # count frequencies (dict)
    count = Counter(pos)
    # pprint(count)
    # tokens: total morpheme count
    ttr_token = sum(count.values())
    # types: distinct morphemes
    ttr_type = len(count.keys())
    # type-token ratio
    ttr = (ttr_type / ttr_token) * 100
    # print(ttr_token, ttr_type)
    # print('TTR: {}'.format(ttr))
    return round(ttr, 2)
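# Usage sketch for ttr_check(): TTR = (types / tokens) * 100 over Kkma
# morphemes, so text with no repeated morphemes scores 100.0 and repetition
# pulls the score down. Sample sentence is illustrative.
print(ttr_check("산은 산이고 물은 물이다"))  # repeated 산/은/물 lower the score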
def set_tagger(self, tagger):
    tag = None
    if tagger == "mecab":
        tag = Mecab()
    elif tagger == 'komoran':
        from konlpy.tag import Komoran
        tag = Komoran()
    elif tagger == 'kkma':
        from konlpy.tag import Kkma
        tag = Kkma()
    elif tagger == 'hannanum':
        from konlpy.tag import Hannanum
        tag = Hannanum()
    elif tagger == 'okt':
        from konlpy.tag import Okt
        tag = Okt()
    elif tagger == 'twitter':
        from konlpy.tag import Twitter
        tag = Twitter()
    return tag
def make_content(url_list, news_content_list, content_summarize_list, title_list):
    kkma = Kkma()  # build the sentence splitter once, not once per URL
    for url in url_list:
        try:
            news = Article(url, language='ko')
            news.download()
            news.parse()
            title_list.append(news.title)
            news.text = kkma.sentences(news.text)
            news.text = " ".join(news.text)
            news_content_list.append(news.text)
            summary_content = summarize(news.text, word_count=100, ratio=0.5)
            if summary_content:
                content_summarize_list.append(summary_content)
            else:
                content_summarize_list.append("요약 할 기사의 내용이 없습니다.")
        except Exception as e:
            print("exception is", e)
def remove_josa(phrase):
    from konlpy.tag import Kkma
    kkma = Kkma()
    import jpype
    jpype.attachThreadToJVM()
    tokens = phrase.split(' ')
    result = []
    for i in range(len(tokens)):
        token = tokens[i]
        if i < len(tokens) - 1:
            result.append(token)
        else:
            # POS-tag only the last token and drop a trailing particle
            # (josa, tag starting with 'J')
            m = kkma.pos(tokens[i])
            if m[-1][-1].startswith('J'):
                m.pop(-1)
            token = ''.join([t for t, p in m])
            result.append(token)
    result = ' '.join(result)
    return result
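# Usage sketch for remove_josa(): only the final token is tagged, and its
# trailing particle is stripped. Requires a running JVM via konlpy; the
# sample output is illustrative.
print(remove_josa("서울 구경은"))  # -> e.g. '서울 구경'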
def get_morphemes(post_detail_cont):
    kkma = Kkma()
    results = []
    except_word_list = []
    # strip special characters
    detail = remove_sc(str(post_detail_cont))
    # split into unique words (list)
    origin_word_list = list(dict.fromkeys(
        regex.findall(r'[\p{Hangul}|\p{Latin}|\p{Han}|\d+]+', detail)))
    if len(origin_word_list) > 100:
        del origin_word_list[100:len(origin_word_list)]
    # origin_word_list = ['안녕하세요', '문의사항이', '있습니다']
    for origin_word in origin_word_list:
        if origin_word not in except_word_list:
            for morpheme in kkma.pos(origin_word):
                in_result = []
                in_result.append(origin_word)
                in_result.append(morpheme)
                results.append(in_result)
    # results = [['자동차', ('자동차', 'NNG')], ['고장', ('고장', 'NNG')],
    #            ['진단', ('진단', 'NNG')], ['APP', ('APP', 'OL')], ...]
    return results
def analyseText(text_data, results_file_name="results.txt"):
    print("\n---------------------------------------------")
    print("Step2 : 단어별 형태소 및 빈도를 분석합니다... 기다려 주세요")
    kkma = Kkma()
    data_pos = kkma.pos(text_data)
    data_arr = []
    with open("stop_words.txt", "r", encoding="utf-8") as stop_words_file:
        stop_words = [x.replace("\n", "").strip()
                      for x in stop_words_file.readlines()]
    print("명사만 필터링하는 중...")
    for word_pos in data_pos:
        word = word_pos[0]
        pos = word_pos[1]
        # keep common nouns (NNG), verbs (VV), and adjectives (VA)
        if pos == "NNG" or pos == "VV" or pos == "VA":
            data_arr.append(word)
    print("단어별 발생빈도를 정렬하고 파일에 저장하는 중...")
    counter = Counter(data_arr).most_common()
    keywords_and_frequency_for_wc = {}
    keywords_and_frequency = []
    print("한 글자 이상 단어, 빈도수 2 이상인 것만 필터링하는 중...")
    for keyword in counter:
        word = keyword[0]
        freq = keyword[1]
        # keep words of two or more characters, with frequency greater
        # than 2, that are not stop words
        if len(word) > 1 and freq > 2 and word not in stop_words:
            keywords_and_frequency_for_wc[word] = freq
            keywords_and_frequency.append({"단어": word, "빈도": freq})
    df = pd.DataFrame(keywords_and_frequency)
    df.to_excel(excel_writer=results_file_name)
    print("형태소 및 빈도 분석 완료!")
    return keywords_and_frequency_for_wc
def crawler(base, root):
    # return dictionary of {sentence: tag}
    import requests
    from bs4 import BeautifulSoup
    # get into each link in the index page
    page = requests.get(root)
    soup = BeautifulSoup(page.text, 'html.parser')
    links = soup.select('body .mw-category-group a')
    linkurls = [link for link in links]
    # get information of the book & get text
    ret = {}
    txt = ''
    sentencer = Kkma()  # build the sentence splitter once, not per paragraph
    for linkurl in linkurls:
        try:
            print('getting into ' + linkurl.text)
            linkurl = linkurl['href']
            page = requests.get(base + linkurl)
            soup = BeautifulSoup(page.text, 'html.parser')
            title = soup.find('span', {'id': 'header_title_text'}).text
            author = soup.find('span', {'class': 'fn'}).text
            tag = title.replace(" ", '') + author.replace(" ", '')
            txt = soup.select('.mw-parser-output p')[:-1]
        except Exception as e:
            print(e, title, author)
            continue
        for string in txt:
            string = string.text.replace("\n", '')
            sentence = sentencer.sentences(string)
            # refine string
            for s in sentence:
                s = sentenceModifierSTR(s)
                ret[s] = tag
                # print(s + ' appended')
        print('Store Done for ' + tag)
    return ret
def makeCloudTag(file_name):
    df = pd.read_excel(file_name)
    contents = list(df.content_main)
    result = ""
    for content in contents:
        result = result + str(content) + " "
    kkma = Kkma()
    m = result.split("\r\n")
    nouns_list = []
    for a in m:
        nouns_list.append(kkma.nouns(a))
    nouns_list_s = []
    for i in nouns_list:
        for j in i:
            nouns_list_s.append(j)
    c = collections.Counter(nouns_list_s)
    x = c.most_common(100)
    d = make_tags(x, maxsize=100)
    create_tag_image(d, "star.jpg", size=(1000, 500), fontname='hangle')