from collections import Counter
from konlpy.corpus import kolaw
from konlpy.tag import Hannanum
from konlpy.utils import concordance, pprint
from matplotlib import pyplot


def draw_zipf(count_list, filename, color='blue', marker='o'):
    sorted_list = sorted(count_list, reverse=True)
    pyplot.plot(sorted_list, color=color, marker=marker)
    pyplot.xscale('log')
    pyplot.yscale('log')
    pyplot.savefig(filename)


doc = kolaw.open('constitution.txt').read()
pos = Hannanum().pos(doc)
cnt = Counter(pos)

print 'nchars  :', len(doc)
print 'ntokens :', len(doc.split())
print 'nmorphs :', len(set(pos))
print '\nTop 20 frequent morphemes:'
pprint(cnt.most_common(20))
print '\nLocations of "대한민국" in the document:'
concordance(u'대한민국', doc, show=True)

draw_zipf([b for a, b in cnt.items()], 'zipf.png')
def Pumsa(words):
    pprint(words)
    first = words[0][1]
    last = words[len(words) - 1][1]  # POS tag of the last morpheme after splitting the word
    ans = 0
    # S
    if last in ['jcs', 'jxc']:
        ans = 1
    # V
    elif last == 'ef':
        ans = 2
    elif first in ['pvd', 'pvg']:
        ans = 2
    # O
    elif last == 'jco':
        ans = 3
    # C
    elif last == 'jcc':
        ans = 4
    # A
    elif last == 'jcm':
        ans = 5
    return ans
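# A minimal usage sketch for Pumsa (assumptions: `words` is a list of (morpheme, tag)
# pairs from Hannanum's 22-tag tagset, e.g. Hannanum().pos(..., ntags=22), since the
# tags checked above ('jcs', 'jco', 'ef', 'pvg', ...) belong to that tagset; the
# sentence below is only an illustration).
from konlpy.tag import Hannanum
from konlpy.utils import pprint

hannanum = Hannanum()
for eojeol in u'대한민국은 민주공화국이다'.split():
    tagged = hannanum.pos(eojeol, ntags=22)  # (morpheme, tag) pairs for one word
    print(eojeol, Pumsa(tagged))             # role code 0-5, following the #S/#V/#O/#C/#A labels above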
def get_text(url):
    source_code_from_URL = urllib.request.urlopen(url)
    # BeautifulSoup object: parses the response with the parser named in the second
    # argument. 'lxml' is an external XML/HTML parser; the built-in 'html.parser' also works.
    soup = BeautifulSoup(source_code_from_URL, 'lxml', from_encoding='utf-8')
    text = ''
    for title in soup.find_all('h3', {'id': 'articleTitle'}):
        children = title.children
        for a in children:
            if type(a) == bs4.element.NavigableString:
                if len(a) > 1:
                    text = text + a + '\n\n\n'
                    print(a)
                    pprint(twt.pos(a))
        text = text.replace(u'\xa0', u'')
    for item in soup.find_all('div', {'id': 'articleBodyContents'}):
        # print(item)  # dumps the HTML matched by this selector
        children = item.children
        for a in children:
            if type(a) == bs4.element.NavigableString:
                if len(a) > 1:
                    text = text + a + '\n'
                    print(a)
                    pprint(twt.pos(a))
        for pic in item.find_all('span', {'class': 'end_photo_org'}):
            print(pic.img)
        text = text.replace(u'\xa0', u'')
    return text
def write_tweets_to_files(tweet):
    if self.options.remove_links:
        tweet = delete_links(tweet)
    if self.options.remove_mentions:
        tweet = delete_mentions(tweet)

    word_count = 0
    if not self.options.output_as_onefile:
        # Count how many of the target words appear in this tweet.
        for word in self.words:
            word_count += tweet.count(word)

    filename = "{}{}{}.{}".format(
        self.dirname,
        self.options.output_prefix,
        word_count,
        self.options.output_extension
    )
    n_word_file = io.open(filename, 'a', encoding='utf-8')
    n_word_file.write(tweet)
    n_word_file.write("\n")

    if self.options.verbose:
        for word in self.words:
            tweet = (colorama.Fore.CYAN + word).join(tweet.split(word))
            tweet = (word + colorama.Fore.RESET).join(tweet.split(word))
        pprint(word_count, tweet)
def test_utils_pprint(capsys):
    # Fixture `capsys` allows stdout/stderr captures
    from konlpy.utils import pprint

    pprint([u"저는 소프트웨어 관련학과 입니다."])
    out, err = capsys.readouterr()
    if sys.version_info[0] < 3:
        assert out == u"[저는 소프트웨어 관련학과 입니다.]\n"
    else:
        assert out == u"['저는 소프트웨어 관련학과 입니다.']\n"
def collocations_words(tagged_set):
    print('\nCollocations among words:')
    words = [w for w, t in tagged_set]
    ignored_words = [u'안녕']
    finder = collocations.BigramCollocationFinder.from_words(words)
    finder.apply_word_filter(lambda w: len(w) < 2 or w in ignored_words)
    finder.apply_freq_filter(3)  # only bigrams that appear 3+ times
    pprint(finder.nbest(measures.pmi, 10))
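# A usage sketch for collocations_words, assuming it lives in a module where
# `collocations` (from nltk), `measures`, and konlpy's `pprint` are module-level
# globals, as in the collocation scripts elsewhere in this collection. The input
# sentence is purely illustrative.
from nltk import collocations
from konlpy.tag import Kkma
from konlpy.utils import pprint

measures = collocations.BigramAssocMeasures()
tagged = Kkma().pos(u'네, 안녕하세요. 반갑습니다.')  # (word, tag) pairs
collocations_words(tagged)  # prints the top bigram collocations among the words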
def measure_accuracy(taggers, text):
    print '\n%s' % text
    result = []
    for tagger in taggers:
        print tagger,
        r = tagging(tagger, text)
        pprint(r)
        result.append([tagger] + map(lambda s: ' / '.join(s), r))
    return result
def measure_accuracy(taggers, text):
    print('\n%s' % text)
    result = []
    for tagger in taggers:
        print(tagger)
        r = tagging(tagger, text)
        pprint(r)
        result.append([tagger] + map(lambda s: ' / '.join(s), r))
    return result
def replace_word(sentence):
    # split each sentence into chunks
    pprint(kkma.sentences(sentence))
    # randomly select a word to replace
    # insert a new word generated by joint probability to a replaced position
    # return newly created sentence
    pass
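# A minimal sketch of the steps described in the comments above (assumptions:
# `kkma` is a konlpy Kkma instance, as in replace_word, and `sample_word` is a
# hypothetical callable that draws a replacement word from some joint-probability
# model, which the source does not define).
import random
from konlpy.tag import Kkma

kkma = Kkma()

def replace_word_sketch(sentence, sample_word):
    chunks = kkma.sentences(sentence)[0].split()  # crude chunking by whitespace
    idx = random.randrange(len(chunks))           # randomly select a word to replace
    chunks[idx] = sample_word(chunks[:idx], chunks[idx + 1:])
    return ' '.join(chunks)                       # return the newly created sentence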
def measure_accuracy(taggers, text):
    print('\n%s' % text)
    result = []
    for tagger in taggers:
        print(tagger.__name__)
        r = tagger().pos(text)
        pprint(r)
        result.append([tagger.__name__] + list(map(lambda s: ' / '.join(s), r)))
    return result
def listdir():
    """List konlpy's default data directory.

    .. code-block:: python

        >>> import konlpy
        >>> konlpy.listdir()
    """
    utils.pprint(os.listdir(DATA_DIR))
def analyze(self):
    count = 0
    for item in self.con['bill'].find(
            {"$or": [{"proposer": "이동섭"}, {"proposer": "유승민"}]}):  # , {"analysisCheck": "0"}):
        count += 1
        billid = item.get('bill_id')
        billname = item.get('bill_name')
        summary = item.get('summary')
        # Komoran raises an error when the input contains empty lines
        summary = summary.replace("\r", "").replace("\n", " ").replace("\t", " ")
        summary = summary.replace("?", "ㆍ").replace(",", "")
        print(count, "번째")
        print(billname)
        print(summary)
        # print(item.get('summary').replace("\n", " ").replace("?", "ㆍ"))

        # Extract nouns
        nouns = self.komoran.nouns(summary)
        print("len(nouns) :", len(nouns))
        cnt = Counter(nouns)
        result_list = cnt.most_common(len(nouns))
        print("len(result_list) :", len(result_list))

        # Convert the list result into a dict
        result_dict = {}  # create a dict object
        for i in range(len(result_list)):
            key = result_list[i][0]    # word
            value = result_list[i][1]  # count
            result_dict[key] = value
        # pprint(result_dict)

        row = {}  # create a dict object
        row['bill_no'] = item.get('bill_no')
        row['politician_no'] = item.get('politician_no')
        row['words'] = result_dict
        pprint(row)
        self.con['billAnalysis'].insert_one(row)
        print("==========================================================")
        print("요시! 입력 완료!")
def display_books(self, recommend_list):
    # Print the recommended books
    for row in recommend_list[:10]:
        bookinfo = self.collection.find_one(
            {"ISBN": row[0]},
            {"_id": 0, "RFM": 0, "ISBN": 0, "tag_percent": 0}
        )
        print('=' * 50)
        pprint(bookinfo)
        print('Total TRFM: ' + str(row[1]))
        print('=' * 50)
def display_books(self, recommend_list, rel_list):
    # Print the recommended books
    rfm_total = 0
    for i in xrange(10):
        isbn = recommend_list[i][0]
        rfm_total += self.rfm_dict[isbn]
        bookinfo = self.collection.find_one({"ISBN": recommend_list[i][0]},
                                            {"_id": 0, "title": 1})
        print('=' * 50)
        pprint(bookinfo)
        # print('ISBN:' + str(isbn))
        print('Tag Point:' + str(rel_list[isbn]))
        print('Total TRFM: ' + str(recommend_list[i][1]))
        print('=' * 50)
    print('*' * 5 + "Total RFM: " + str(rfm_total) + "*" * 5)
def measure_time(taggers, mult=6):
    doc = kolaw.open('constitution.txt').read() * 6
    data = [[''] + taggers]
    for i in range(mult):
        doclen = 10**i
        times = [time()]
        diffs = [doclen]
        for tagger in taggers:
            r = tagging(tagger, doc[:doclen])
            times.append(time())
            diffs.append(times[-1] - times[-2])
            print '%s\t%s\t%s' % (tagger[:5], doclen, diffs[-1])
            pprint(r[:5])
        data.append(diffs)
        print
    return data
def save_and_print(self):
    """Collect current trending words and save or print them."""
    counts, keywords = get_current_trend()

    if self.options.display_rank:
        for count, keyword in zip(counts, keywords):
            pair = "{}.{}".format(count, keyword)
            self.writer.write(pair)
            if self.options.verbose:
                pprint(pair)
    else:
        for keyword in keywords:
            self.writer.write(keyword)
            if self.options.verbose:
                pprint(keyword)
def measure_time(taggers, mult=6):
    doc = kolaw.open('constitution.txt').read() * 6
    data = [['n'] + taggers]
    for i in range(mult):
        doclen = 10**i
        times = [time()]
        diffs = [doclen]
        for tagger in taggers:
            r = tagging(tagger, doc[:doclen])
            times.append(time())
            diffs.append(times[-1] - times[-2])
            print('%s\t%s\t%s' % (tagger[:5], doclen, diffs[-1]))
            pprint(r[:5])
        data.append(diffs)
        print()
    return data
def Analysis():
    pos_data = np.load('data/PosData.npy').tolist()
    temp = open('tmp.txt', 'w')
    nes = ne_chunk(pos_data, binary=False)
    pprint(nes)
    for p in pos_data:
        temp.write(p[0] + ' ' + p[1] + '\n')
        if p[0] == '김종대':
            pprint(p)
    # pprint(pos_data)
    url = open('data/url.txt').read()
    '''
    values = {'s': text.encode('utf8')}
    article = requests.get(url, params=values)
    print article
    '''
    temp.close()
def crawler(query):
    currnet_searching_page = 1
    have_more_page_to_search = True
    # today_yy_mm_dd = datetime.today().strftime("%Y.%m.%d")
    today_yy_mm_dd = '2020.01.22'
    analizeWords = AnalizeWords()

    while have_more_page_to_search:
        url = ("https://search.naver.com/search.naver?&where=news&query=" + query
               + "&sm=tab_pge&sort=1&photo=0&field=0&reporter_article=&pd=3&ds=" + today_yy_mm_dd
               + "&de=" + today_yy_mm_dd + "&mynews=0&start=" + str(currnet_searching_page)
               + "&refresh_start=0")
        req = requests.get(url)
        cont = req.content
        soup = BeautifulSoup(cont, 'html.parser')

        # Extract the title and link URL from each <a> tag
        atags = soup.select('._sp_each_title')
        for atag in atags:
            title_text.append(atag.text)    # title
            link_text.append(atag['href'])  # link URL
            print('title: ', atag.text)
            print('link: ', atag['href'])
            # analizeWords.test(atag.text)
            kkma = Kkma()
            pprint(kkma.sentences(atag.text))

        # Article summaries
        contents_lists = soup.select('ul.type01 dl')
        for contents_list in contents_lists:
            # print('===' * 40)
            print(contents_list.text)
            contents_cleansing(contents_list)  # clean up the summary text

        for page in soup.select(".paging"):
            print('page: ', page.text)
            if "다음페이지" in page.text:
                print('currnet page: ', page)
                currnet_searching_page = currnet_searching_page + 10
            else:
                have_more_page_to_search = False

        noresult = soup.select('.noresult_tab')
        if noresult:
            print('no result')
            break
def main():
    kkma = Kkma()
    pprint(kkma.sentences(u'네, 안녕하세요. 반갑습니다.'))
    pprint(kkma.nouns(u'질문이나 건의사항은 깃헙 이슈 트래커에 남겨주세요.'))
    pprint(kkma.pos(u'오류보고는 실행환경, 에러메세지와 함께 설명을 최대한 상세히!^^'))

    engines = [Kkma(), Hannanum(), MeCab()]
    for e in engines:
        print e
        pprint(e.pos(s))
def get_trend(self, date):
    try:
        # The site's anti-bot policy may block crawling, so consider gentle crawling.
        time.sleep(self.options.interval)
        response = self.request_trend(date)
        pprint(response.status_code)
        response.raise_for_status()
        trend = self.parse_trend(response.content)
        return trend
    except (requests.exceptions.ReadTimeout,
            requests.exceptions.ConnectTimeout):
        # if a timeout occurs, retry
        return self.get_trend(date)
    except requests.exceptions.HTTPError:
        return None
def summary(result):
    if not self.options.metadata_to_dict:
        if self.options.verbose:
            pprint(Fore.CYAN + result['title'] + Fore.RESET)
            pprint(Fore.CYAN + Style.DIM + result['written_at'] + Style.RESET_ALL + Fore.RESET)
            pprint(result['body'])
        writer.write("@title:" + result['title'])
        writer.write("@written_at:" + result['written_at'])
        writer.write("@body:" + result['body'])
    else:
        if self.options.verbose:
            pprint(result)
        writer.write(result)
def stream(self):
    try:
        if self.is_async:
            self._thread = PropagatingThread(target=self.job)
            self._thread.start()
            self._thread.join()
        else:
            self.job()
    except RecursionError:
        return False
    except KeyboardInterrupt:
        pprint("User has interrupted.")
        return False
    except:
        if self.recursion < self.retry:
            pprint("Error has raised but continue to stream.")
            self.recursion += 1
            self.stream()
        else:
            raise
def measure_time(taggers, mult=6):
    doc = kolaw.open('constitution.txt').read() * 6
    doc = doc.replace('\n', ' ')
    data = [['n', 'load'] + [10**i for i in range(mult)]]
    times = [time()]
    for tagger in taggers:
        diffs = [tagger.__name__]
        inst = tagger()
        inst.pos("가")
        times.append(time())
        diffs.append(times[-1] - times[-2])
        print('%s\t로딩\t%gs' % (tagger.__name__, diffs[-1]))
        for i in range(mult):
            doclen = 10**i
            r = inst.pos(doc[:doclen])
            times.append(time())
            diffs.append(times[-1] - times[-2])
            print('%s\t%d\t%gs\t(Result Len: %d)' % (tagger.__name__, doclen, diffs[-1], len(r)))
            pprint(r[:5])
        data.append(diffs)
        print()
    return data
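# A hypothetical invocation of the benchmark above (assumption: `taggers` is a list of
# konlpy tagger classes, since `tagger()` is instantiated and `tagger.__name__` printed
# inside the loop; the chosen classes and `mult` value are illustrative).
from konlpy.tag import Hannanum, Kkma, Komoran, Okt

if __name__ == '__main__':
    results = measure_time([Hannanum, Kkma, Komoran, Okt], mult=4)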
def summary(result):
    for content in result:
        if not self.options.metadata_to_dict:
            if self.options.verbose:
                pprint(Fore.CYAN + content['date'] + Fore.RESET)
                pprint(Fore.CYAN + Style.DIM + content['title'] + Style.RESET_ALL + Fore.RESET)
                pprint(Fore.CYAN + Style.DIM * 2 + content['traffic'] + Style.RESET_ALL + Fore.RESET)
            writer.write("@date:" + content['date'])
            writer.write("@title:" + content['title'])
            writer.write("@traffic:" + content['traffic'])
        else:
            output = '\t'.join([content['date'], content['title'], content['traffic']])
            if self.options.verbose:
                pprint(output)
            writer.write(output)
print np.mean(prices), np.median(prices)

print u'주행거리'
mileage = [int(get_attr(g, 'mileage').strip('km')) for g in DATAFILES]
print np.mean(mileage), np.median(mileage)

print u'색상'
colors = [(get_attr(g, 'color').split('(')[0], get_attr(g, 'price')) for g in DATAFILES]
colorcnt = Counter(idx(colors, 0))
colormap = sorted([(
        color,
        np.mean([i[1] for i in colors if i[0] == color]),
        dict(colorcnt)[color]
    ) for color in set([c[0] for c in colors])],
    key=lambda x: x[1])
pprint(colorcnt.most_common(5))
pprint(colormap)

print u'연료'
fuel = [get_attr(g, 'fuel') for g in DATAFILES]
pprint(dict(Counter(fuel)))

print u'기어'
t = [get_attr(g, 'transmission') for g in DATAFILES]
pprint(dict(Counter(t)))

print u'연식'
birthday = [parse_birthday(get_attr(g, 'birthday')) for g in DATAFILES]
pprint(Counter(birthday).most_common(10))
pprint(Counter(b.year for b in birthday))
pprint(Counter(b.month for b in birthday).most_common(12))
# -*- coding: utf-8 -*-
__author__ = 'KC'

from konlpy.tag import Kkma
from konlpy.utils import pprint

kkma = Kkma()
pprint(kkma.sentences(u'네, 안녕하세요. 반갑습니다.'))
keywords_ext.document_frequency = (2, 0.9)
c_dict = _read_compound_words_dictionary()

f = open("../data/news_data/news0.txt", "r")
text = f.readlines()
# text = unicode(text)
f.close()

r_text = []
for t in text:
    temp_t = t.replace("\n", "")
    if len(temp_t) > 0:
        r_text.append(temp_t)

keywords = keywords_ext.extraction(_sents=r_text, _compound_words_dictionary=c_dict)

print('\n--------- Tokens after tagging ------------')
pprint(keywords_ext.read_tokens(0))
print('\n--------- End of tokens --------------------')

print('\n--------- Keywords ----------------------')
pprint(keywords)
print('\n--------- End of keywords ------------------')

col = collocation_ext.extraction(keywords_ext.read_tokens(), 2)
print('\n--------- Collocation ----------------------')
for k, v in col.iteritems():
    print "", k, "=> ", ", ".join(v)
print('\n--------- End of collocation ---------------\n')
# pprint(col)
from konlpy.tag import Twitter
from konlpy.corpus import kolaw
from konlpy.utils import pprint
from nltk import collocations

measures = collocations.BigramAssocMeasures()
doc = "턴 에는 청소년 문제뿐만이 아닌 여러가지 복합적인 사회문제들이 간접적으로 제시되어있었다. 프로 작가들이 풀어내는 이야기들이라 첫 장을 펴면서 저도 모르게 이상한 나라의 앨리스처럼 끝장까지 스르륵 빠져든다는 게 이 책의 매력이다. 내가 가장 걱정하고 신경썼던 부분은 소율과 지아의 신경전, 그로인해 행해지는 사이버폭력이다. '혀 밑에 칼이 있다.' 라는 속담이있다. 하지만 이것도 옛 말이다. 클릭 한 번에 칼날이 비수로 꽂힌다 라는 말이 새로 생겨야한다. 누군가를 향한 정확하지 않은 사실에 대한 마구잡이 유포, 질투를 가장한 언어폭력은 현재 내가 이 글을 쓰고 있는 현실에서도 수없이 행해지는 사이버폭력들이다. 마음 연약한, 연한 녹색 새싹같은 아이들의 마음에 생채기를 내는 이런 사이버폭력들을 막으려면 어떻게 해야할까...? 다른 부분들도 의미깊고 재미있었지만 나는 이 부분이 가장 걱정되고, 개선하고 싶은 부분이라 생각한다. 강력한 인터넷 실명제를 도입하여 아예 사진까지도입하여 제 얼굴을 걸고 댓글을 쓰도록 만들어야하지 않을까 생각한다.생각없는 클릭질에 가슴에 피가 철철 흐르지 않도록 그렇게 되어야 한다....! 세용의 이야기도 의미깊고 기대가 되었다. 타인에 의해 이루어지는 변화는 어디까지일까. 한 사람으로 인해 변할 수 있는 세용의 넓이와 깊이가 궁금하다. 청소년연작소설 [뜨인돌] 턴 정말 매력있고 생각할 거리를 주는 좋은 청소년 소설이었다. 내가 한 행동은 미래에 누군가가 나에게 할 수 있는 행동이다. 모든 것은 연결되어있다. 좋은 것은 좋은 것과, 혹은 나쁜 것과..그렇지만 나쁜것과 연결된 것은 전에 내가 했던 좋았던 행동으로 상쇄시킬 수 있다. 뫼비우스의 띠처럼 우리는 같은, 이어진 고리를 끊임없이 순환하는 것은 아닐까. 간만에 독서 열심히 한 것 같다. 지루하지 않게, 생감있게, 현실을 바로 볼 수 있을 수 있었던 좋은 책이었다."

print('\nCollocations among tagged words:')
tagged_words = Twitter().pos(doc)
finder = collocations.BigramCollocationFinder.from_words(tagged_words)
print(finder.nbest(measures.pmi, 10))  # top 10 n-grams with the highest PMI

print('\nCollocations among words:')
words = [w for w, t in tagged_words]
ignored_words = [u'안녕']
finder = collocations.BigramCollocationFinder.from_words(words)
finder.apply_word_filter(lambda w: len(w) < 2 or w in ignored_words)
finder.apply_freq_filter(3)  # only bigrams that appear 3+ times
pprint(finder.nbest(measures.pmi, 10))

print('\nCollocations among tags:')
tags = [t for w, t in tagged_words]
finder = collocations.BigramCollocationFinder.from_words(tags)
pprint(finder.nbest(measures.pmi, 5))
__author__ = 'woojin'
# -*- coding: utf-8 -*-

from konlpy.tag import Kkma
from konlpy.utils import pprint

kkma = Kkma()
pprint(kkma.sentences('네, 안녕하세요. 반갑습니다.'))
pprint(kkma.nouns('질문이나 건의사항은 깃허브 이슈 트래커에 남겨주세요.'))
pprint(kkma.pos('오류보고는 실행환경, 에러메시지와 함께 설명을 최대한 상세히!!^^'))
import os

from konlpy.tag import Kkma
from konlpy.utils import pprint
from konlpy.tag import Okt

nlpy = Okt()

for filename in os.listdir("test"):
    with open(os.path.join("test", filename), 'r', encoding="utf-8") as f:  # open in read-only mode
        print("file :" + filename)
        sentences = f.read()
        nouns = nlpy.nouns(sentences)

        counter = {}
        for noun in nouns:
            if noun not in counter:
                counter[noun] = 1
            else:
                counter[noun] += 1

        sorter = [(value, key) for key, value in counter.items()]
        sorter.sort()
        sorter.reverse()
        pprint(sorter)
def __init__(self, dirname=DATA_DIR, word_list=ALPHABET, is_async=True):
    super(TwitterStreamer, self).__init__(is_async=is_async)
    self.is_async = is_async
    parser = self.get_parser()
    parser.add_argument(
        '--consumer_key',
        help='consumer key',
    )
    parser.add_argument(
        '--consumer_secret',
        help='consumer secret',
    )
    parser.add_argument(
        '--access_token',
        help='access token',
    )
    parser.add_argument(
        '--access_token_secret',
        help='access token secret',
    )
    parser.add_argument(
        '--filter_retweets',
        help='do not save potentially repetitive retweets',
        action="store_true",
    )
    parser.add_argument(
        '--remove_links',
        help='remove links included in each tweet',
        action="store_true",
    )
    parser.add_argument(
        '--remove_mentions',
        help='remove mentions included in each tweet',
        action="store_true",
    )
    parser.add_argument(
        '--output_prefix',
        help='prefix of the output file',
        default='tweet',
        type=str
    )
    parser.add_argument(
        '--output_as_onefile',
        help='save output as one file',
        action="store_true",
    )
    parser.add_argument(
        '--output_extension',
        help='extension of the output file',
        default='txt',
        type=str
    )
    parser.add_argument(
        '--tweet_limits',
        help='stop when this amount of tweets are collected',
        default=1000000,
        type=int
    )
    parser.add_argument(
        '--time_limits',
        help='stop when n secs elapsed',
        default=1000000,
        type=int
    )
    parser.add_argument(
        '--keyword_file',
        help='file that defines keywords line by line',
        type=str
    )
    self.options, _ = parser.parse_known_args()

    # Lazy requirement checking, since argparse's required option blocks initialization.
    requirements = [self.options.consumer_key, self.options.consumer_secret,
                    self.options.access_token, self.options.access_token_secret]
    flag = None
    for requirement in requirements:
        if not requirement:
            flag = 1
    if flag is not None:
        pprint("You have to provide valid consumer key, consumer_secret, access_token, access_token_secret.")

    # Parse the word list from the custom argument
    self.dirname = dirname
    if self.options.keyword_file is not None:
        try:
            reader = open(self.options.keyword_file, mode='r+', encoding='utf-8')
        except UnicodeDecodeError:
            reader = open(self.options.keyword_file, mode='r+', encoding='cp949')
        self.word_list = reader.readlines()
    else:
        self.word_list = word_list

    self.is_async = is_async
    self.streamer = None
def dedup(l):
    return list(unique_everseen(l))


def concate_tuple(t):
    return '%s_%s' % t


if __name__ == '__main__':
    f = open('./dummy_article_03.txt')
    lines = convert_textfile_to_lines(f)
    # sentences = do_sentencing_by_threading(lines)
    # sentences = do_sentencing_without_threading(lines)
    sentences = '[ 천지 일보= 임 문식 기자] 4.13 총선을 나흘 남겨 둔 9일 여야 정치권이 20대 총선 성패를 좌우할 수도권 등 전국 곳곳에서 표 심 잡기에 집중, 사활을 건 유세전을 펼쳤다.'

    print "sentences"
    pprint(sentences)

    # morphs = do_parsing_by_threading(sentences)
    morphs = do_parsing_without_threading(sentences)
    print "morphs"
    pprint(morphs)

    # morphs = [('a','1'), ('b','2')]
    morphs = map(concate_tuple, morphs)
    pprint(morphs)
from konlpy.tag import Kkma
from konlpy.utils import pprint

kkma = Kkma()
pprint(kkma.sentences(u'네, 안녕하세요. 의류매장 입니다'))
pprint(kkma.nouns(u'구입하실 물건 있으시면 말씀해주세요.'))
pprint(kkma.pos(u'하하하 즐거운 쇼핑입니다.'))
# -*- coding: utf-8 -*-
from konlpy.tag import Kkma
from konlpy.utils import pprint

kkma = Kkma()
pprint(kkma.sentences(u'저는 대학생이구요. 소프트웨어 관련학과 입니다.'))
# Expected output:
# [저는 대학생이구요., 소프트웨어 관련학과 입니다.]
# -*- coding: utf-8 -*-
from collections import Counter
from konlpy.corpus import kolaw
from konlpy.tag import Hannanum
from konlpy.utils import concordance, pprint
from matplotlib import pyplot

tmppos = []
stopwords = [u"위원님", u"위원", u"것", u"말씀", u"우리", u"그것", u"이것", u"존경",
             u"이", u"저", u"저희", u"때문", u"문제", u"생각", u"미래부", u"장관님",
             u"거", u"때", u"다음", u"일", u"수", u"정부", u"정책"]

doc = unicode(open(u'회의록.txt').read(), 'utf-8')
pos = Hannanum().pos(doc)

for word in pos:
    if word[1] == 'N':
        tmppos.append(word)

for word in tmppos:
    for stopword in stopwords:
        if word[0] == stopword:
            tmppos.remove(word)

cnt = Counter(tmppos)

f = open("freq.txt", "wb")
for freq_word in cnt.most_common(50):
    f.write(freq_word[0][0].encode('utf-8') + "\n")
f.close()

print('\nTop 50 frequent morphemes:')
pprint(cnt.most_common(50))
def get_trend(self):
    _, self.trend = get_current_trend()
    if self.options.verbose:
        pprint(self.trend)
print '------------------------------------------------------------------------------------'
print '-------------------- Training dataset ----------------------------------------------'
documents = [
    u"배송지연으로 인한 환불정책을 알고 싶습니다. 되기는 되는 것인가요",
    u"배송이 완료되었습니다. 고객님",
    u"해당 상품 두가지는 일괄 배송이 됩니다",
    u"고객님 제가 알아보고 다시 연락드리겠습니다",
    u"상담원 홍길동이었습니다",
    u"내일까지 배송될 예정입니다. 조금만 기다려주십시오",
    u"상품 파손시 환불이 불가능합니다",
    u"상품 수거 후 환불완료가 될 것입니다. 내일 방문하는 택배원에게 상품을 보내주세요",
    u"지금 집에 없는데 경비실에 맡겨주세요",
    u"아직도 배송이 안되었는데 언제 배송되나요 연락도 없구요",
    u"고객님이 주문하신 상품이 품절이 되었습니다. 결제를 취소처리하려 합니다",
    u"고객님 품절된 상품 대신 다른 상품으로 대체 발송하려 하는데 동의하시나요",
    u"배송기사가 상자를 던져서 상품이 파손되었습니다. 환불을 해주시던지 다른 상품을 보내주세요",
    u"배송 중 파손이 된 것 같은데요. 파손 보상책이 준비되어 있나요"
]
pprint(documents)

# stoplist = [u"",]
vectorizer = TfidfVectorizer(min_df=2, analyzer='word', tokenizer=tokenizer)
tfidf = vectorizer.fit_transform(documents)

print '\n'
print '-------------------- Keywords -------------------------------------------------------'
pprint(vectorizer.vocabulary_)
print '-------------------- End of keywords ------------------------------------------------'
print '\n'

new_document = []
# new_document.append(u"배송 중 상자가 파손되었어요")
new_document.append(sys.argv[1])
from konlpy.tag import Kkma
from konlpy.utils import pprint

kkma = Kkma()
string = u'안녕하세요. 건국대학교에 오신걸 환영합니다. 도서관은 우측에 있습니다.'
pprint(kkma.nouns(string))
def test_utils_pprint(capsys):
    # Fixture `capsys` allows stdout/stderr captures
    from konlpy.utils import pprint

    pprint([u"저는 소프트웨어 관련학과 입니다."])
    out, err = capsys.readouterr()
    assert out == u"[저는 소프트웨어 관련학과 입니다.]\n"
'''
Created on 2016. 11. 2.

Morphemes (형태소)
'''
from konlpy.tag import Kkma
from konlpy.utils import pprint

kkma = Kkma()
pprint(kkma.sentences('여러분, 안녕하세요. 반갑습니다.'))
print()
pprint(kkma.nouns(u'오늘 폭염이 주춤했지만 일부지방은 폭염 특보 속에 35도 안팎의 찜통더위가 기승을 부렸는데요.자세한 날씨, YTN 중계차 연결해 알아보겠습니다.'))
print()
pprint(kkma.pos(u'오류보고는 실행환경, 에러메세지와 함께'))
def get_nouns_duplicate(text):
    mecab = Mecab()
    arr = mecab.nouns(text)
    return arr


def get_nouns(text, chunk=500, mfv=20):
    mecab = Mecab()
    arr = mecab.nouns(text)
    return list(set(arr))
    # for i in range(len(arr)/chunk):
    #     nngs = []
    #     for j in range(chunk):
    #         if arr[i*chunk + j][1] in ['NNG', 'NNP']:
    #             nngs.append(arr[i*chunk + j][0])
    #     count = Counter(nngs)
    #     brr = count.most_common(mfv)
    #     pprint(brr)
    # return brr


if __name__ == "__main__":
    data_name = '소나기'
    text = read_text(data_name)
    nouns = get_nouns(text)
    pprint(nouns)
#! /usr/bin/python2.7
# -*- coding: utf-8 -*-

from konlpy.tag import Kkma
from konlpy.corpus import kolaw
from konlpy.utils import pprint
from nltk import collocations

measures = collocations.BigramAssocMeasures()
doc = kolaw.open('constitution.txt').read()

print('\nCollocations among tagged words:')
tagged_words = Kkma().pos(doc)
finder = collocations.BigramCollocationFinder.from_words(tagged_words)
pprint(finder.nbest(measures.pmi, 10))  # top 10 n-grams with the highest PMI

print('\nCollocations among words:')
words = [w for w, t in tagged_words]
ignored_words = [u'안녕']
finder = collocations.BigramCollocationFinder.from_words(words)
finder.apply_word_filter(lambda w: len(w) < 2 or w in ignored_words)
finder.apply_freq_filter(3)  # only bigrams that appear 3+ times
pprint(finder.nbest(measures.pmi, 10))

print('\nCollocations among tags:')
tags = [t for w, t in tagged_words]
finder = collocations.BigramCollocationFinder.from_words(tags)
pprint(finder.nbest(measures.pmi, 5))
context = zmq.Context()
socket = context.socket(zmq.REP)
socket.bind('tcp://127.0.0.1:%s' % port)

while True:
    print 'in the loop'
    # Wait for the next request from a client
    message = socket.recv()
    result = kkma.nouns(message)
    result = ', '.join(result)
    print '------'
    print result
    socket.send_string(result)  # for socket.send, unicode is not allowed; use send_string instead

"""
string = u'안녕하세요. 건국대학교에 오신걸 환영합니다. 도서관은 우측에 있습니다.'
string2 = u'5학년되니까 학교근처엔 도저히 먹을게없다'
string3 = u'카이리스님께 사과문올립니다'

start = time.time()
pprint(kkma.nouns(string))
pprint(kkma.nouns(string2))
pprint(kkma.nouns(string3))
end = time.time()
print (end - start)
"""