def morphing(content):
    mecab = Mecab()
    morphList = []
    for word in mecab.nouns(content):
        if word not in stop_word:
            morphList.append(word)
    return morphList

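# A minimal usage sketch for morphing() above (not from the original source):
# it assumes konlpy is installed with a MeCab backend and that the module-level
# stop_word collection exists; the values below are illustrative placeholders.
from konlpy.tag import Mecab

stop_word = {'것', '수', '등'}  # hypothetical stopword set
nouns = morphing('자연어 처리는 재미있는 분야입니다.')
print(nouns)  # nouns from the sample sentence, minus the stopwords
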
def callback(ch, method, properties, body, id):
    mecab = Mecab()
    print(" [%d] Received %s" % (ch, body.decode('utf-8')))
    noun_list = mecab.nouns(body.decode('utf-8'))
    send_message_to_database(id, noun_list)

def count(request):
    full_text = request.GET['fulltext']
    tagger = Mecab()  # morphological analyzer
    word_list = tagger.nouns(full_text)  # extract nouns only (nouns() returns a list)
    # word_list = full_text.split()
    word_dictionary = {}
    for word in word_list:
        if Words.objects.filter(text=word):
            word = Words.objects.get(text=word)
            word.frequency_total += 1
            word.save()
        else:
            word = Words(text=word, frequency_total=1)
            word.save()
        if word in word_dictionary:
            word_dictionary[word] += 1
        else:
            word_dictionary[word] = 1
    return render(
        request, 'wordcount/count.html', {
            'fulltext': full_text,
            'total': len(word_list),
            'word_dictionary': word_dictionary.items()
        })

def clean_str(s):
    """Clean sentence"""
    global counter_konlpy
    global total_dataset
    # global stopwords

    s = re.sub('[0-9]', '', s)
    s = preprocess(s)
    mecab = Mecab()
    # print(' '.join(kkma.nouns(s)))
    result = mecab.nouns(s)
    # temp = mecab.nouns(s)
    # for noun in temp:
    #     flag = 0
    #     for sword in stopwords:
    #         if noun == sword:
    #             flag = 1
    #             break
    #     if flag == 0:
    #         result.append(noun)
    if len(result) > 1000:
        result = result[0:1000]
    counter_konlpy += 1
    # sys.stdout.write("\rParsed: %d / %d" % (counter_konlpy, total_dataset))
    # sys.stdout.flush()
    return ' '.join(result)

def preprocess(datapath):
    mecab = Mecab()
    cnt = 0
    sentences = []
    f = open('./data/korean_corpus.txt', 'r', encoding="utf8")
    while True:
        line = f.readline()
        if not line:
            break
        cnt += 1
        if not (cnt % 1000):
            print("tokenize {}kth line...".format(cnt // 1000), end='\r')
        tokens = mecab.nouns(line)
        if tokens:
            sentences.append(tokens)
    print("")
    cnt = 0
    with open(datapath, 'w') as f:
        for sentence in sentences:
            cnt += 1
            for idx, word in enumerate(sentence):
                if idx == len(sentence) - 1:
                    f.write("%s.\n" % word)
                else:
                    f.write("%s " % word)
            if not (cnt % 1000):
                print("write {}kth line to the file...".format(cnt // 1000), end='\r')
    print("")

def news_makefile():
    global links
    global num
    global newstext
    article = newspaper.Article(links[num], language='ko')
    article.download()
    article.parse()
    newstext = article.text
    engine = Mecab()
    nouns = engine.nouns(newstext)
    nouns = [n for n in nouns if len(n) > 1]
    count = Counter(nouns)
    tags = count.most_common(20)
    text = (" 제목은 " + article.title + " 입니다. " + "키워드는 " +
            str(tags[0][0]) + " " + str(tags[1][0]) + " " + str(tags[2][0]) +
            " " + str(tags[3][0]) + " " + str(tags[4][0]))
    tts = gTTS(text + "입니다. 이 기사를 읽으려면 2번, 다음 기사의 키워드는 3번, 분야선택(홈)은 1번 입니다",
               lang='ko')
    if os.path.isfile('keyword.mp3'):
        os.remove('keyword.mp3')
    tts.save('keyword.mp3')
    wc = WordCloud(font_path='c:\\windows\\fonts\\NanumSquareR.ttf',
                   background_color='white', width=500, height=400)
    cloud = wc.generate_from_frequencies(dict(tags))
    fig = plt.figure(figsize=(10, 8))
    plt.axis('off')
    plt.imshow(cloud)
    fig.savefig('keyword.jpg')
    Image.open('keyword.jpg').resize((700, 650)).save('keyword.jpg')

def news_makefile():
    global links
    global num
    global news_text
    article = newspaper.Article(links[num], language='ko')
    article.download()
    article.parse()
    news_text = article.text
    headline = article.title
    engine = Mecab()
    nouns = engine.nouns(news_text)
    nouns = [n for n in nouns if len(n) > 1]
    count = Counter(nouns)
    tags = count.most_common(15)
    print(headline, tags)
    text = (" 제목은 " + headline + " 입니다. " + "키워드는 " +
            str(tags[0][0]) + " " + str(tags[1][0]) + " " + str(tags[2][0]) +
            " " + str(tags[3][0]) + " " + str(tags[4][0]))
    tts = gTTS(text + "입니다. 이 기사를 읽으려면 5번, 다음 기사의 키워드는 6번, 분야선택(홈)은 0번 입니다",
               lang='ko')
    tts.save('keyword.mp3')
    font_path = 'c:\\windows\\fonts\\NanumGothic.ttf'
    wc = WordCloud(font_path=font_path, background_color='white',
                   width=500, height=400)
    cloud = wc.generate_from_frequencies(dict(tags))
    fig = plt.figure(figsize=(10, 8))
    plt.axis('off')
    plt.imshow(cloud)
    fig.savefig('keyword.jpg')
    Image.open('keyword.jpg').resize((500, 400)).save('keyword.jpg')
    tts2 = gTTS("선택하신 기사 내용은 " + news_text, lang='ko')
    tts2.save("news_all.mp3")

def find_most_mentioned(self):
    """
    Returns how many times each person has been mentioned in the conversation
    :return: dictionary containing each person's name and number of mentions
    """
    self._rewind()
    # get all conversations
    all_conversations = self.get_all_conversations()
    # word parser objects
    mecab = Mecab()
    kkma = Kkma()
    # parse all conversation words, and keep only nouns
    all_nouns = list()
    for conversation in all_conversations:
        all_nouns += mecab.nouns(conversation)
    # exclude the family name (성) from each name
    names_list = list()
    for name in self.get_all_names():
        preprocessed_name = kkma.nouns(name)
        for data in preprocessed_name:
            if len(data) != 1:
                names_list.append(data)
    # compare the two lists
    mentioned_people = [
        person for person in all_nouns if person in names_list
    ]
    # count using Counter and return
    cnt = Counter(mentioned_people)
    return cnt.most_common(len(cnt))

def get_corpus(data):
    """Make corpus with string or list data

    :param str,list data: String Data (One post per line) | List Data (One post per element)
    :return: corpus (numpy.ndarray)
    """
    # Make a Mecab (NLP) instance with an explicit dictionary path
    nlp = Mecab('/usr/local/lib/mecab/dic/mecab-ko-dic')
    if type(data) == list:
        post_list = data  # Case: data is a list and each element is a string
    else:
        post_list = data.split('\n')  # Split string data by line (\n)
    cleaned_post_list = []
    for post in post_list:
        # Cleansing process (keep only nouns)
        # Replace special characters and blanks (non-word characters) with a single space
        post = re.sub(r'\W', ' ', post)
        # nlp.nouns returns a list of nouns, so join it back into a string
        post = ' '.join(nlp.nouns(post))
        if post:  # skip empty posts
            cleaned_post_list.append(post)
    corpus = np.array(cleaned_post_list)
    return corpus

class SentenceTokenizer(object):
    def __init__(self):
        try:
            self.mecab = Mecab()
        except:
            self.mecab = Mecab(dicpath="C:/mecab/mecab-ko-dic")
        self.stopwords = ['뉴스', '연합', '자료사진', '서울연합', '중인', '만큼', '마찬가지', '꼬집었',
                          "연합뉴스", "데일리", "동아일보", "중앙일보", "조선일보", "기자",
                          "아", "휴", "아이구", "아이쿠", "아이고", "어", "나", "우리", "저희",
                          "따라", "의해", "을", "를", "에", "의", "가"]

    def text2sentences(self, text):
        sentences = sent_tokenize(text)
        res = list()
        for idx in range(0, len(sentences)):
            # merge very short sentences (10 characters or fewer) into the previous one
            if len(sentences[idx]) <= 10:
                tmp = sentences[idx - 1] + (' ' + sentences[idx])
                sentences[idx] = ''
                if '.' in tmp:
                    dot_idx = tmp.index('.')
                    if dot_idx < len(tmp) - 1 and (not tmp[dot_idx + 1].isnumeric() or tmp[dot_idx + 1] != ' '):
                        res += tmp.split('.')
                    else:
                        res.append(tmp)
                else:
                    res.append(tmp)
        pre_sentences = [elem for elem in res if len(elem) >= 1]
        return pre_sentences

    def get_nouns(self, sentences):
        nouns = []
        for sentence in sentences:
            if sentence != '':
                nouns.append(' '.join([noun for noun in self.mecab.nouns(str(sentence))
                                       if noun not in self.stopwords and len(noun) > 1]))
        return nouns

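# A brief usage sketch for SentenceTokenizer above (illustrative, not from the
# original project): it assumes nltk's sent_tokenize is imported and a MeCab
# dictionary is installed; sample_text is a placeholder for a raw news body.
sample_text = "첫 번째 문장입니다. 두 번째 문장입니다. 끝."
tokenizer = SentenceTokenizer()
sentences = tokenizer.text2sentences(sample_text)
noun_strings = tokenizer.get_nouns(sentences)  # one space-joined noun string per sentence
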
def draw_wordcloud_from_url(url_link):
    # Fetch the web page and build a BeautifulSoup object from it
    response = requests.get(url_link)
    soup = BeautifulSoup(response.content, 'html.parser')
    # Find <table class="gall_list">
    table = soup.find('table', {'class': 'gall_list'})
    links = []  # list to store the links
    news_text = ''
    # Iterate over every <tr class="ub-content"> tag (one row per post)
    for tr in table.find_all('tr', class_="ub-content"):
        title = tr.find('td', class_="gall_tit")
        link = title.find('a').text
        links.append(link)
        article = newspaper.Article(link, language='ko')
        article.download()
        article.parse()
        news_text += article.text
    # konlpy Mecab: extract nouns from the body via morphological analysis, drop one-character words
    engine = Mecab()
    nouns = engine.nouns(news_text)
    nouns = [n for n in nouns if len(n) > 1]
    # Counter: count the words and keep the 40 most frequent nouns
    count = Counter(nouns)
    tags = count.most_common(40)
    # WordCloud, matplotlib: draw the word cloud
    font_path = '/usr/share/fonts/truetype/nanum/NanumMyeongjoBold.ttf'
    wc = WordCloud(font_path=font_path, background_color='white', width=800, height=600)
    cloud = wc.generate_from_frequencies(dict(tags))
    plt.figure(figsize=(10, 8))
    plt.axis('off')
    plt.imshow(cloud)

def get_clean_word(words, stopwords):
    # Take sentences, drop the stopwords, and return the remaining nouns as a list.
    nouns = []
    tagger = Mecab(dicpath="C:\\mecab\\mecab-ko-dic")
    for post in words:
        for noun in tagger.nouns(post):
            if noun not in stopwords:
                nouns.append(noun)
    return nouns

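# Illustrative call to get_clean_word() above; the posts and stopwords are
# placeholder values, not data from the original project.
posts = ['오늘 서울 날씨가 좋다', '내일은 부산에 비가 온다']
stopwords = ['오늘', '내일']
print(get_clean_word(posts, stopwords))  # remaining nouns from both posts, in order
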
def ko_lemmatize_nouns(inputString):
    '''
    Input: string (Korean)
    Output: list of strings (Korean)
    ----------------------------------------------------------------------------
    Returns list of nouns from the input.
    '''
    mecab = Mecab()
    return mecab.nouns(inputString)

def lematization(self, texts):
    print(' ...Make lematization...')
    mecab = Mecab()
    texts_out = []
    for sent in tqdm(texts):
        doc = " ".join(sent)
        texts_out.append(mecab.nouns(doc))
    # print(texts_out[0])
    return texts_out

class KoreaHelper(object):
    def __init__(self):
        from konlpy.tag import Mecab
        self.mecab = Mecab()

    def pos(self, phrase: Text):
        """
        $ python -m sagas.ko.ko_helper pos '계획이'

        :param phrase:
        :return:
        """
        return self.mecab.pos(phrase)

    def nouns(self, phrase: Text):
        """
        $ python -m sagas.ko.ko_helper nouns '피자와 스파게티가'
        $ python -m sagas.ko.ko_helper nouns '계획이'

        :param phrase:
        :return:
        """
        from sagas.nlu.transliterations import translits
        from sagas.ko.kwn_procs import kwn
        ns = self.mecab.nouns(phrase)
        rs = []
        for w in ns:
            # ws = get_word_sets(w, 'ko')
            ws = kwn.get_synsets(w, first=True)
            if ws:
                rs.append({
                    'spec': ws[0].name(),
                    'text': w,
                    'translit': translits.translit(w, 'ko'),
                    'definition': ws[0].definition()
                })
            else:
                rs.append({
                    'text': w,
                    'translit': translits.translit(w, 'ko'),
                })
        return rs

    def translit(self, word):
        """
        $ python -m sagas.ko.ko_helper translit '피자와 스파게티가'
        See also: procs-ko-konlpy.ipynb

        :param word:
        :return:
        """
        from sagas.nlu.transliterations import translits
        for w, p in self.mecab.pos(word):
            expl = '_'
            if p in ('NNG', 'VV'):
                ws = get_word_sets(w, 'ko')
                if ws:
                    expl = f"{ws['name']}({ws['definition']})"
            print(w, translits.translit(w, 'ko'), p, expl)

def get_tokens(x):
    mecab = Mecab()
    try:
        return [i for i in mecab.nouns(x) if len(i) > 1] if x else []
    except Exception as e:
        if str(x) == 'nan':
            return []
        print(e)
        print(str(x))
        raise e

def text_tokenizing(self, doc):
    mecab = Mecab()
    SW = self.define_stopwords("./data/stopwords-ko.txt")

    if self.ui.rb_noun.isChecked():
        return [word for word in mecab.nouns(doc) if word not in SW and len(word) > 1]
    elif self.ui.rb_morphs.isChecked():
        return [word for word in mecab.morphs(doc) if word not in SW and len(word) > 1]
    elif self.ui.rb_words.isChecked():
        # konlpy's Mecab has no words() method; fall back to whitespace tokens
        return [word for word in doc.split() if word not in SW and len(word) > 1]

def noun_extraction(data_list):
    print("noun_extraction start")
    nouns_list = []
    mecab = Mecab()
    for data in data_list:
        noun = mecab.nouns(data[0])
        nouns_list.append([noun, data[1], data[2]])
    # print(json.dumps(nouns_list, ensure_ascii=False, indent=3))
    return nouns_list

def get_nouns(text):
    tagger = Mecab()
    noun = tagger.nouns(text)
    noun = [i for i in noun if len(i) > 1]
    noun = str(noun).replace('[', '').replace(']', '').replace(',', ' ').replace("'", '')
    return noun

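# Example call for get_nouns() above (the sentence is an illustrative sample);
# the return value is a single string of the nouns longer than one character.
print(get_nouns('서울에서 열린 국제 회의가 끝났다'))
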
def clean_str(s):
    """Clean sentence"""
    global counter_konlpy
    global total_dataset
    s = re.sub('[0-9]', '', s)
    mecab = Mecab()
    result = mecab.nouns(s)
    counter_konlpy += 1
    sys.stdout.write("\rParsed: %d / %d" % (counter_konlpy, total_dataset))
    sys.stdout.flush()
    return ' '.join(result)

def read_text(fin):
    # Read in the preprocessed Wikipedia file.
    corpus_li = []
    mecab = Mecab(dicpath='/opt/local/lib/mecab/dic/mecab-ko-dic')
    for line in open(fin):
        # Normalize to NFKC with unicodedata.normalize to handle broken characters.
        line = unicodedata.normalize('NFKC', line)
        try:
            # Add lines whose first character is a digit to the corpus.
            _ = int(line[0])
            corpus_li.append(' '.join(mecab.nouns(line)) + '\n')
        except ValueError:
            # Add lines whose first character is Hangul to the corpus.
            if ord(line[0]) >= ord('가') and ord(line[0]) <= ord('힇'):
                corpus_li.append(' '.join(mecab.nouns(line)) + '\n')
            else:
                pass
    print('# of lines in corpus', len(corpus_li))
    return corpus_li

def to_nouns(docs, version):
    if version == 'mecab':
        parser = Mecab()
    else:
        parser = konlpy_import(version)
    nounslist = []
    for doc in docs:
        try:
            nouns = ' '.join(parser.nouns(doc))
        except:
            nouns = ''  # keep the output aligned with the input documents
        nounslist.append(nouns)
    return nounslist

def count(body):
    mecab = Mecab()
    if len(body) != 0:
        text = body[0].get_text()
        f = open("../keyword.txt", 'r')
        while True:
            line = f.readline()
            line = line.replace('\n', '')
            if not line:
                break
            text = text.replace(line, '부실')
        f.close()
        nouns = mecab.nouns(text)
        print(nouns.count('부실'))

def clean_str(s):
    """Clean sentence"""
    global counter_konlpy
    global total_dataset
    s = re.sub('[0-9]', '', s)
    # kkma = Kkma()
    # komoran = Komoran()
    # twitter = Twitter()
    mecab = Mecab()
    # print(' '.join(kkma.nouns(s)))
    result = []
    for aLine in s.split(';'):
        result.append(' '.join(mecab.nouns(aLine)))
    counter_konlpy += 1
    sys.stdout.write("\r Parsed: %d / %d" % (counter_konlpy, total_dataset))
    sys.stdout.flush()
    return ' '.join(result)

def is_contained_bad_word(text, type):
    # Look up every keyword of this type in the BadWord table
    # and check the text against it with the konlpy library.
    mecab = Mecab()
    mecab_list = mecab.nouns(text)
    # Fetch the exclusion keywords from the DB
    badword_list = BadWord.objects.filter(type=type).values_list('keyword', flat=True)
    for badword in badword_list:
        for mecabword in mecab_list:
            badword_en = badword.encode('utf-8')
            mecabword_en = mecabword.encode('utf-8')
            if badword_en == mecabword_en:
                return True
    # if any(word in BadWord.objects.all().__str__() for word in mecab.nouns(text)):
    #     return True
    return False

def result(request):
    text = request.GET['fulltext']
    nlpy = Mecab()
    nouns = nlpy.nouns(text)
    word_dictionary = {}
    for word in nouns:
        if word in word_dictionary:
            word_dictionary[word] += 1
        else:
            word_dictionary[word] = 1
    return render(
        request, 'result.html', {
            'noun': nouns,
            'full': text,
            'total': len(nouns),
            'dictionary': word_dictionary.items()
        })

def text_tokenize(corpus):
    mecab = Mecab()
    token_corpus = []
    if w.ui.rb_noun.isChecked():
        for n in range(len(corpus)):
            token_text = mecab.nouns(corpus[n])
            token_text = [word for word in token_text if word not in SW]
            token_corpus.append(token_text)
    if w.ui.rb_morphs.isChecked():
        for n in range(len(corpus)):
            token_text = mecab.morphs(corpus[n])
            token_text = [word for word in token_text if word not in SW]
            token_corpus.append(token_text)
    if w.ui.rb_words.isChecked():
        for n in range(len(corpus)):
            token_text = corpus[n].split()
            token_text = [word for word in token_text if word not in SW]
            token_corpus.append(token_text)
    return token_corpus

def emotion():
    title = request.form['title']
    comment = request.form['content']
    uid = request.form['uid']

    from konlpy.tag import Mecab
    mecab = Mecab("C:\\mecab\\mecab-ko-dic")
    token_data = []
    token = mecab.nouns(str(comment))
    token_data.append(token)
    series_token_data = pd.Series(token_data)

    # Load the embedding model
    fastText_model = fasttext.FastText.load("./embedding/tweet_fastText_0717.model")
    docs_vectors_ft = pd.DataFrame()
    for doc in series_token_data:
        temp = pd.DataFrame()
        for word in doc:
            ft = fastText_model[word]
            temp = temp.append(pd.Series(ft), ignore_index=True)
        # take the average of each column (w0, w1, w2, ..., w300)
        doc_vector_ft = temp.mean()
        # append each document vector to the final dataframe
        docs_vectors_ft = docs_vectors_ft.append(doc_vector_ft, ignore_index=True)

    from sklearn.externals import joblib
    # Load the classifier stored as a pickled binary file
    file_name = './embedding/tweet_bagg_SVM.pkl'
    model = joblib.load(file_name)
    pred = model.predict(docs_vectors_ft)

    # print("DB 연결중")
    conn = getConnection()
    curs = conn.cursor()
    sql = "UPDATE emo_board SET EBEMO = '" + pred[0] + "' WHERE EBTITLE = '" \
          + title + "' AND EBCONTENT = '" + comment + "' AND UID = '" + uid + "'"
    curs.execute(sql)
    curs.close()
    conn.close()
    return "성공"

def preprocess(
    data_path: str,
    word_index: dict = None,
    num_words: int = 10000,
):
    tokenizer = Mecab()

    # 0. data load
    with open(data_path, "rb") as f:
        data = pickle.load(f)

    # 1. bag-of-words
    vocab, docs = [], []
    for doc in tqdm(data):
        if doc:  # skip nan values in the nsmc data
            try:
                nouns = tokenizer.nouns(doc)
                vocab.extend(nouns)
                docs.append(nouns)
            except:
                continue

    # 2. build vocab
    if not word_index:
        vocab = Counter(vocab)
        vocab = vocab.most_common(num_words)

        # 3. add unknown token
        word_index = {"<UNK>": 0}
        for idx, (word, _) in enumerate(vocab, 1):
            word_index[word] = idx

    index_word = {idx: word for word, idx in word_index.items()}

    # 4. create corpus
    corpus = []
    for doc in docs:
        if doc:
            corpus.append([word_index.get(word, 0) for word in doc])

    return corpus, word_index, index_word

def Get_relevant_articles(search_string,
                          num_of_articles=1000,
                          date=time.strftime("%Y-%m-%d"),
                          host='localhost',
                          port=9200,
                          index='ko_news_articles'):
    '''
    Input:  String (Required)
            max number of related articles to return
            date origin point parameter (default=now)
            ES host address
            ES port number
            index name
    Output: list of dictionaries containing the article infos, index id,
            similarity score, etc.
    ----------------------------------------------------------------------------
    '''
    mecab = Mecab()
    search_string_lem = ' '.join(mecab.nouns(search_string))
    es = Elasticsearch([{'host': host, 'port': port}])
    Output = es.search(index=index, size=num_of_articles, body={
        'query': {
            'function_score': {
                'query': {
                    'dis_max': {
                        'queries': [
                            {'match': {'articleContents': {
                                'query': search_string,
                                'fuzziness': 'AUTO',
                                'max_expansions': 5,
                                'cutoff_frequency': 0.001}}},
                            {'match': {'Lemmatized': {
                                'query': search_string_lem,
                                'fuzziness': 0,
                                'max_expansions': 2,
                                'cutoff_frequency': 0.001}}}
                        ],
                        'tie_breaker': 0.3
                    }
                },
                'functions': [
                    {'gauss': {
                        'articleDate': {
                            'origin': date,
                            'scale': '30d',
                            'offset': '2d',
                            'decay': .5
                        }
                    }}
                ],
                'score_mode': 'multiply'
            }
        }
    })
    return Output['hits']['hits']