Example #1
from collections import Counter

from konlpy.corpus import kolaw
from konlpy.tag import Hannanum
from konlpy.utils import concordance, pprint
from matplotlib import pyplot


def draw_zipf(count_list, filename, color='blue', marker='o'):
    sorted_list = sorted(count_list, reverse=True)
    pyplot.plot(sorted_list, color=color, marker=marker)
    pyplot.xscale('log')
    pyplot.yscale('log')
    pyplot.savefig(filename)


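# kolaw ships with konlpy and contains the Korean constitution text;
# POS-tag it with Hannanum and count each (morpheme, tag) pair.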
doc = kolaw.open('constitution.txt').read()
pos = Hannanum().pos(doc)
cnt = Counter(pos)

print('nchars  :', len(doc))
print('ntokens :', len(doc.split()))
print('nmorphs :', len(set(pos)))
print('\nTop 20 frequent morphemes:'); pprint(cnt.most_common(20))
print('\nLocations of "대한민국" in the document:')
concordance(u'대한민국', doc, show=True)

draw_zipf([b for a, b in cnt.items()], 'zipf.png')
Example #2
def Pumsa(words):
    pprint(words)
    first = words[0][1]
    last = words[len(words) - 1][1]  # POS tag of the last morpheme after splitting the word
    ans = 0

    #S
    if (last in ['jcs', 'jxc']):
        ans = 1

    #V
    elif (last == 'ef'):
        ans = 2
        
    elif (first in ['pvd','pvg']):
        ans = 2

    #O
    elif (last == 'jco'):
        ans = 3

    #C
    elif (last == 'jcc'):
        ans = 4
	
    # A
    elif (last == 'jcm'):
        ans = 5

    return ans
def get_text(url):
    source_code_from_URL = urllib.request.urlopen(url)

    # BeautifulSoup object: parse the fetched page; the second argument picks the parser.
    # 'lxml' uses the lxml library; the built-in 'html.parser' also works.
    soup = BeautifulSoup(source_code_from_URL, 'lxml', from_encoding='utf-8')
    text = ''

    for title in soup.find_all('h3',{'id':'articleTitle'}):
        children = title.children
        for a in children:
            if type(a) == bs4.element.NavigableString:
                if len(a) > 1:
                    text = text + a + '\n\n\n'
                    print(a)
                    pprint(twt.pos(a))

    text = text.replace(u'\xa0', u'')

    for item in soup.find_all('div', {'id': 'articleBodyContents'}):
        # print(item)  # dumps the HTML matched by this selector
        children = item.children
        for a in children:
            if type(a) == bs4.element.NavigableString:
                if len(a) > 1:
                    text = text + a + '\n'
                    print(a)
                    pprint(twt.pos(a))
        for pic in item.find_all('span', {'class': 'end_photo_org'}):
            print(pic.img)
    text = text.replace(u'\xa0', u'')
    return text
Example #4
        def write_tweets_to_files(tweet):
            if self.options.remove_links:
                tweet = delete_links(tweet)
            if self.options.remove_mentions:
                tweet = delete_mentions(tweet)

            word_count = 0

            if not self.options.output_as_onefile:
                # counts how many targeting words included in one tweet.
                for word in self.words:
                    word_count += tweet.count(word)

            filename = "{}{}{}.{}".format(
                self.dirname,
                self.options.output_prefix,
                word_count,
                self.options.output_extension
            )

            n_word_file = io.open(filename, 'a', encoding='utf-8')
            n_word_file.write(tweet)
            n_word_file.write("\n")

            if self.options.verbose:
                for word in self.words:
                    tweet = (colorama.Fore.CYAN + word).join(tweet.split(word))
                    tweet = (word + colorama.Fore.RESET).join(tweet.split(word))
                pprint(word_count, tweet)
Example #5
        def write_tweets_to_files(tweet):
            if self.options.remove_links:
                tweet = delete_links(tweet)
            if self.options.remove_mentions:
                tweet = delete_mentions(tweet)

            word_count = 0

            if not self.options.output_as_onefile:
                # counts how many targeting words included in one tweet.
                for word in self.words:
                    word_count += tweet.count(word)

            filename = "{}{}{}.{}".format(self.dirname,
                                          self.options.output_prefix,
                                          word_count,
                                          self.options.output_extension)

            n_word_file = io.open(filename, 'a', encoding='utf-8')
            n_word_file.write(tweet)
            n_word_file.write("\n")

            if self.options.verbose:
                for word in self.words:
                    tweet = (colorama.Fore.CYAN + word).join(tweet.split(word))
                    tweet = (word + colorama.Fore.RESET).join(
                        tweet.split(word))
                pprint(word_count, tweet)
Example #6
def test_utils_pprint(capsys): # Fixture `capsys` allows stdout/stderr captures
    from konlpy.utils import pprint
    pprint([u"저는 소프트웨어 관련학과 입니다."])
    out, err = capsys.readouterr()
    if sys.version_info[0] < 3:
        assert out == u"[저는 소프트웨어 관련학과 입니다.]\n"
    else:
        assert out == u"['저는 소프트웨어 관련학과 입니다.']\n"
def collocations_words(tagged_set):
    print('\nCollocations among words:')
    words = [w for w, t in tagged_set]
    ignored_words = [u'안녕']
    finder = collocations.BigramCollocationFinder.from_words(words)
    finder.apply_word_filter(lambda w: len(w) < 2 or w in ignored_words)
    finder.apply_freq_filter(3)  # only bigrams that appear 3+ times
    pprint(finder.nbest(measures.pmi, 10))
Example #8
def test_utils_pprint(
        capsys):  # Fixture `capsys` allows stdout/stderr captures
    from konlpy.utils import pprint
    pprint([u"저는 소프트웨어 관련학과 입니다."])
    out, err = capsys.readouterr()
    if sys.version_info[0] < 3:
        assert out == u"[저는 소프트웨어 관련학과 입니다.]\n"
    else:
        assert out == u"['저는 소프트웨어 관련학과 입니다.']\n"
Example #9
def measure_accuracy(taggers, text):
    print '\n%s' % text
    result = []
    for tagger in taggers:
        print tagger,
        r = tagging(tagger, text)
        pprint(r)
        result.append([tagger] + map(lambda s: ' / '.join(s), r))
    return result
def measure_accuracy(taggers, text):
    print ('\n%s' % text)
    result = []
    for tagger in taggers:
        print (tagger) 
        r = tagging(tagger, text)
        pprint(r)
        result.append([tagger] + list(map(lambda s: ' / '.join(s), r)))
    return result
Example #11
def replace_word(sentence):
    # split each sentence into chunks
    pprint(kkma.sentences(sentence))

    # randomly select a word to replace

    # insert a new word generated by joint probability to a replaced position

    # return newly created sentence
    pass
Example #12
def measure_accuracy(taggers, text):
    print('\n%s' % text)
    result = []
    for tagger in taggers:
        print(tagger.__name__, )
        r = tagger().pos(text)
        pprint(r)
        result.append([tagger.__name__] +
                      list(map(lambda s: ' / '.join(s), r)))
    return result
Example #13
def listdir():
    """list konlpy default data directory.

    .. code-block:: python

        >>> import konlpy
        >>> konlpy.listdir()

    """

    utils.pprint(os.listdir(DATA_DIR))
Example #15
    def analyze(self):
        count = 0
        for item in self.con['bill'].find(
            {"$or": [{
                "proposer": "이동섭"
            }, {
                "proposer": "유승민"
            }]}):  #, {"analysisCheck":"0"}):
            count += 1
            billid = item.get('bill_id')
            billname = item.get('bill_name')
            summary = item.get('summary')

            # komoran raises an error when the text contains blank lines
            summary = summary.replace("\r",
                                      "").replace("\n",
                                                  " ").replace("\t", " ")
            summary = summary.replace("?", "ㆍ").replace(",", "")

            print(count, "번째")
            print(billname)
            print(summary)
            # print(item.get('summary').replace("\n", " ").replace("?", "ㆍ"))

            # Extract nouns
            nouns = self.komoran.nouns(summary)
            print("len(nouns) :", len(nouns))

            cnt = Counter(nouns)
            result_list = cnt.most_common(len(nouns))
            print("len(result_list) :", len(result_list))

            # Convert the list result into a dict
            result_dict = {}  # create a dict
            for i in range(len(result_list)):
                key = result_list[i][0]  # word
                value = result_list[i][1]  # count
                result_dict[key] = value


            # pprint(result_dict)

            row = {}  # create a dict
            row['bill_no'] = item.get('bill_no')
            row['politician_no'] = item.get('politician_no')
            row['words'] = result_dict

            pprint(row)

            self.con['billAnalysis'].insert_one(row)
            print("==========================================================")

        print("요시! 입력 완료!")
Example #16
    def display_books(self, recommend_list):  # print the recommended books
        for row in recommend_list[:10]:
            bookinfo = self.collection.find_one({"ISBN": row[0]}, {
                "_id": 0,
                "RFM": 0,
                "ISBN": 0,
                "tag_percent": 0
            })
            print('=' * 50)
            pprint(bookinfo)
            print('Total TRFM: ' + str(row[1]))

        print('=' * 50)
Example #17
    def display_books(self, recommend_list, rel_list):  # print the recommended books
        rfm_total = 0

        for i in range(10):
            isbn = recommend_list[i][0]
            rfm_total += self.rfm_dict[isbn]
            bookinfo = self.collection.find_one({"ISBN": recommend_list[i][0]}, {"_id": 0, "title": 1})
            print('=' * 50)
            pprint(bookinfo)
            # print('ISBN:' + str(isbn))
            print('Tag Point:' + str(rel_list[isbn]))
            print('Total TRFM: ' + str(recommend_list[i][1]))

        print('=' * 50)
        print('*'*5 + "Total RFM: " + str(rfm_total) + "*"*5)
Example #18
def measure_time(taggers, mult=6):
    doc = kolaw.open('constitution.txt').read()*6
    data = [[''] + taggers]
    for i in range(mult):
        doclen = 10**i
        times = [time()]
        diffs = [doclen]
        for tagger in taggers:
            r = tagging(tagger, doc[:doclen])
            times.append(time())
            diffs.append(times[-1] - times[-2])
            print '%s\t%s\t%s' % (tagger[:5], doclen, diffs[-1])
            pprint(r[:5])
        data.append(diffs)
        print
    return data
Example #19
    def save_and_print(self):
        """collect current trending words and save or print"""

        counts, keywords = get_current_trend()
        if self.options.display_rank:
            for count, keyword in zip(counts, keywords):
                pair = "{}.{}".format(count, keyword)
                self.writer.write(pair)
                if self.options.verbose:
                    pprint(pair)

        else:
            for keyword in keywords:
                self.writer.write(keyword)
                if self.options.verbose:
                    pprint(keyword)
Example #20
    def save_and_print(self):
        """collect current trending words and save or print"""

        counts, keywords = get_current_trend()
        if self.options.display_rank:
            for count, keyword in zip(counts, keywords):
                pair = "{}.{}".format(count, keyword)
                self.writer.write(pair)
                if self.options.verbose:
                    pprint(pair)

        else:
            for keyword in keywords:
                self.writer.write(keyword)
                if self.options.verbose:
                    pprint(keyword)
def measure_time(taggers, mult=6):
    doc = kolaw.open('constitution.txt').read()*6
    data = [['n'] + taggers]
    for i in range(mult):
        doclen = 10**i
        times = [time()]
        diffs = [doclen]
        for tagger in taggers:
            r = tagging(tagger, doc[:doclen])
            times.append(time())
            diffs.append(times[-1] - times[-2])
            print ('%s\t%s\t%s' % (tagger[:5], doclen, diffs[-1]))
            pprint(r[:5])
        data.append(diffs)
        print()
    return data
def Analysis():
    pos_data = np.load('data/PosData.npy').tolist()
    temp = open('tmp.txt', 'w')
    nes = ne_chunk(pos_data, binary=False)
    pprint(nes)
    for p in pos_data:
        temp.write(p[0] + ' ' + p[1] + '\n')
        if p[0] == '김종대': pprint(p)
    #pprint(pos_data)
    url = open('data/url.txt').read()
    '''
	values={'s':text.encode('utf8')}
	article = requests.get(url, params=values)
	print article
	'''
    temp.close()
Example #23
def crawler(query):
    current_searching_page = 1
    have_more_page_to_search = True
    # today_yy_mm_dd = datetime.today().strftime("%Y.%m.%d")
    today_yy_mm_dd = '2020.01.22'

    analizeWords = AnalizeWords()

    while have_more_page_to_search:
        url = "https://search.naver.com/search.naver?&where=news&query=" + query + "&sm=tab_pge&sort=1&photo=0&field=0&reporter_article=&pd=3&ds=" + today_yy_mm_dd + "&de=" + today_yy_mm_dd + "&mynews=0&start=" + str(current_searching_page) + "&refresh_start=0"

        req = requests.get(url)
        cont = req.content
        soup = BeautifulSoup(cont, 'html.parser')

        # Extract the title and link URL from each <a> tag
        atags = soup.select('._sp_each_title')
        for atag in atags:
            title_text.append(atag.text)  # title
            link_text.append(atag['href'])  # link URL
            print('title: ', atag.text)
            print('link: ', atag['href'])
            # analizeWords.test(atag.text)

            kkma = Kkma()
            pprint(kkma.sentences(atag.text))

        # Article summaries
        contents_lists = soup.select('ul.type01 dl')
        for contents_list in contents_lists:
            # print('==='*40)
            print(contents_list.text)
            contents_cleansing(contents_list)  # clean up the article summary

        for page in soup.select(".paging"):
            print('page: ', page.text)
            if "다음페이지" in page.text:
                print('current page: ', page)
                current_searching_page = current_searching_page + 10
            else:
                have_more_page_to_search = False

        noresult = soup.select('.noresult_tab')

        if noresult:
            print('no result')
            break
def main():
    kkma = Kkma()
    pprint(kkma.sentences(u'네, 안녕하세요. 반갑습니다.'))
    pprint(kkma.nouns(u'질문이나 건의사항은 깃헙 이슈 트래커에 남겨주세요.'))
    pprint(kkma.pos(u'오류보고는 실행환경, 에러메세지와 함께 설명을 최대한 상세히!^^'))

    engines = [Kkma(), Hannanum(), MeCab()]
    for e in engines:
        print(e)
        pprint(e.pos(s))
Example #25
    def get_trend(self, date):
        try:
            # Site's anti-bot policy may block crawling & you can consider gentle crawling
            time.sleep(self.options.interval)
            response = self.request_trend(date)
            pprint(response.status_code)
            response.raise_for_status()
            trend = self.parse_trend(response.content)

            return trend

        except (requests.exceptions.ReadTimeout,
                requests.exceptions.ConnectTimeout):
            # if timeout occurs, retry
            return self.get_trend(date)

        except requests.exceptions.HTTPError:
            return None
Example #26
 def summary(result):
     if not self.options.metadata_to_dict:
         if self.options.verbose:
             pprint(Fore.CYAN + result['title'] + Fore.RESET)
             pprint(Fore.CYAN + Style.DIM + result['written_at'] + Style.RESET_ALL + Fore.RESET)
             pprint(result['body'])
         writer.write("@title:" + result['title'])
         writer.write("@written_at:" + result['written_at'])
         writer.write("@body:" + result['body'])
     else:
         if self.options.verbose:
             pprint(result)
         writer.write(result)
 def stream(self):
     try:
         if self.is_async:
             self._thread = PropagatingThread(target=self.job)
             self._thread.start()
             self._thread.join()
         else:
             self.job()
     except RecursionError:
         return False
     except KeyboardInterrupt:
         pprint("User has interrupted.")
         return False
     except:
         if self.recursion < self.retry:
             pprint("Error has raised but continue to stream.")
             self.recursion += 1
             self.stream()
         else:
             raise
Example #28
 def summary(result):
     if not self.options.metadata_to_dict:
         if self.options.verbose:
             pprint(Fore.CYAN + result['title'] + Fore.RESET)
             pprint(Fore.CYAN + Style.DIM + result['written_at'] +
                    Style.RESET_ALL + Fore.RESET)
             pprint(result['body'])
         writer.write("@title:" + result['title'])
         writer.write("@written_at:" + result['written_at'])
         writer.write("@body:" + result['body'])
     else:
         if self.options.verbose:
             pprint(result)
         writer.write(result)
Example #29
def measure_time(taggers, mult=6):
    doc = kolaw.open('constitution.txt').read() * 6
    doc = doc.replace('\n', ' ')
    data = [['n', 'load'] + [10**i for i in range(mult)]]
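    # data[0] is a header row: 'n', 'load', then each document length;
    # each later row holds a tagger's name, its load time, and the tagging time per length.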
    times = [time()]
    for tagger in taggers:
        diffs = [tagger.__name__]
        inst = tagger()
        inst.pos("가")
        times.append(time())
        diffs.append(times[-1] - times[-2])
        print('%s\t로딩\t%gs' % (tagger.__name__, diffs[-1]))
        for i in range(mult):
            doclen = 10**i
            r = inst.pos(doc[:doclen])
            times.append(time())
            diffs.append(times[-1] - times[-2])
            print('%s\t%d\t%gs\t(Result Len: %d)' %
                  (tagger.__name__, doclen, diffs[-1], len(r)))
            pprint(r[:5])
        data.append(diffs)
        print()
    return data
Example #30
 def summary(result):
     for content in result:
         if not self.options.metadata_to_dict:
             if self.options.verbose:
                 pprint(Fore.CYAN + content['date'] + Fore.RESET)
                 pprint(Fore.CYAN + Style.DIM + content['title'] +
                        Style.RESET_ALL + Fore.RESET)
                 pprint(Fore.CYAN + Style.DIM * 2 + content['traffic'] +
                        Style.RESET_ALL + Fore.RESET)
             writer.write("@date:" + content['date'])
             writer.write("@title:" + content['title'])
             writer.write("@traffic:" + content['traffic'])
         else:
             output = '\t'.join([
                 content['date'], content['title'], content['traffic']
             ])
             if self.options.verbose:
                 pprint(output)
             writer.write(output)
Example #31
print np.mean(prices), np.median(prices)

print u'주행거리'
mileage = [int(get_attr(g, 'mileage').strip('km')) for g in DATAFILES]
print np.mean(mileage), np.median(mileage)


print u'색상'
colors = [(get_attr(g, 'color').split('(')[0], get_attr(g, 'price')) for g in DATAFILES]
colorcnt = Counter(idx(colors, 0))
colormap = sorted([(
    color,
    np.mean([i[1] for i in colors if i[0]==color]),
    dict(colorcnt)[color]
    ) for color in set([c[0] for c in colors])], key=lambda x: x[1])
pprint(colorcnt.most_common(5))
pprint(colormap)

print u'연료'
fuel = [get_attr(g, 'fuel') for g in DATAFILES]
pprint(dict(Counter(fuel)))

print u'기어'
t = [get_attr(g, 'transmission') for g in DATAFILES]
pprint(dict(Counter(t)))

print u'연식'
birthday = [parse_birthday(get_attr(g, 'birthday')) for g in DATAFILES]
pprint(Counter(birthday).most_common(10))
pprint(Counter(b.year for b in birthday))
pprint(Counter(b.month for b in birthday).most_common(12))
Example #32
#-*- coding: utf-8 -*-
__author__ = 'KC'

from konlpy.tag import Kkma
from konlpy.utils import pprint

kkma = Kkma()

pprint(kkma.sentences(u'네, 안녕하세요. 반갑습니다.'))
Example #33
keywords_ext.document_frequency = (2, 0.9)

c_dict = _read_compound_words_dictionary()

f = open("../data/news_data/news0.txt", "r")
text = f.readlines()
#text = unicode(text)
f.close()
r_text = []
for t in text:
    temp_t = t.replace("\n", "")
    if len(temp_t) > 0:
        r_text.append(temp_t)

keywords = keywords_ext.extraction(_sents=r_text,
                                   _compound_words_dictionary=c_dict)
print('\n--------- Tokens after tagging  ------------')
pprint(keywords_ext.read_tokens(0))
print('\n--------- End of tokens ------- ------------')
print('\n--------- Keywords    ----------------------')
pprint(keywords)
print('\n--------- End of keywords ------------------')

col = collocation_ext.extraction(keywords_ext.read_tokens(), 2)
print('\n--------- Collocation ----------------------')
for k, v in col.iteritems():
    print "", k, "=> ", ", ".join(v)
print('\n--------- End of collocation ---------------\n')

#pprint(col)
Example #34
from konlpy.tag import Twitter
from konlpy.corpus import kolaw
from konlpy.utils import pprint
from nltk import collocations


measures = collocations.BigramAssocMeasures()
doc = "턴 에는 청소년 문제뿐만이 아닌 여러가지 복합적인 사회문제들이 간접적으로 제시되어있었다.  프로 작가들이 풀어내는 이야기들이라 첫 장을 펴면서 저도 모르게 이상한 나라의 앨리스처럼 끝장까지 스르륵 빠져든다는 게 이 책의 매력이다. 내가 가장 걱정하고 신경썼던 부분은 소율과 지아의 신경전, 그로인해 행해지는 사이버폭력이다. '혀 밑에 칼이 있다.' 라는 속담이있다. 하지만 이것도 옛 말이다. 클릭 한 번에 칼날이 비수로 꽂힌다 라는 말이 새로 생겨야한다. 누군가를 향한 정확하지 않은 사실에 대한 마구잡이 유포, 질투를 가장한 언어폭력은 현재 내가 이 글을 쓰고 있는 현실에서도 수없이 행해지는 사이버폭력들이다. 마음 연약한, 연한 녹색 새싹같은 아이들의 마음에 생채기를 내는 이런 사이버폭력들을 막으려면 어떻게 해야할까...? 다른 부분들도 의미깊고 재미있었지만 나는 이 부분이 가장 걱정되고, 개선하고 싶은 부분이라 생각한다. 강력한 인터넷 실명제를 도입하여 아예 사진까지도입하여 제 얼굴을 걸고 댓글을 쓰도록 만들어야하지 않을까 생각한다.생각없는 클릭질에 가슴에 피가 철철 흐르지 않도록 그렇게 되어야 한다....! 세용의 이야기도 의미깊고 기대가 되었다. 타인에 의해 이루어지는 변화는 어디까지일까. 한 사람으로 인해 변할 수 있는 세용의 넓이와 깊이가 궁금하다. 청소년연작소설 [뜨인돌] 턴 정말 매력있고 생각할 거리를 주는 좋은 청소년 소설이었다. 내가 한 행동은 미래에 누군가가 나에게 할 수 있는 행동이다. 모든 것은 연결되어있다. 좋은 것은 좋은 것과, 혹은 나쁜 것과..그렇지만 나쁜것과 연결된 것은 전에 내가 했던 좋았던 행동으로 상쇄시킬 수 있다. 뫼비우스의 띠처럼 우리는 같은, 이어진 고리를 끊임없이 순환하는 것은 아닐까. 간만에 독서 열심히 한 것 같다. 지루하지 않게, 생감있게, 현실을 바로 볼 수 있을 수 있었던 좋은 책이었다."

print('\nCollocations among tagged words:')
tagged_words = Twitter().pos(doc)
finder = collocations.BigramCollocationFinder.from_words(tagged_words)
print(finder.nbest(measures.pmi, 10))  # top 10 bigrams with highest PMI



print('\nCollocations among words:')
words = [w for w, t in tagged_words]
ignored_words = [u'안녕']
finder = collocations.BigramCollocationFinder.from_words(words)
finder.apply_word_filter(lambda w: len(w) < 2 or w in ignored_words)
finder.apply_freq_filter(3) # only bigrams that appear 3+ times
pprint(finder.nbest(measures.pmi, 10))

print('\nCollocations among tags:')
tags = [t for w, t in tagged_words]
finder = collocations.BigramCollocationFinder.from_words(tags)
pprint(finder.nbest(measures.pmi, 5))
Example #35
__author__ = 'woojin'
# -*- coding: utf-8 -*-

from konlpy.tag import Kkma
from konlpy.utils import pprint

kkma = Kkma()
pprint(kkma.sentences('네, 안녕하세요. 반갑습니다.'))
pprint(kkma.nouns('질문이나 건의사항은 깃허브 이슈 트래커에 남겨주세요.'))
pprint(kkma.pos('오류보고는 실행환경, 에러메시지와 함께 설명을 최대한 상세히!!^^'))
Example #36
import os
from konlpy.tag import Kkma
from konlpy.utils import pprint
from konlpy.tag import Okt

nlpy = Okt()

for filename in os.listdir("test"):
    with open(os.path.join("test", filename), 'r',
              encoding="utf-8") as f:  # open in readonly mode
        print("file :" + filename)
        sentences = f.read()
        nouns = nlpy.nouns(sentences)
        counter = {}
        for noun in nouns:
            if noun not in counter:
                counter[noun] = 1
            else:
                counter[noun] += 1
        sorter = [(value, key) for key, value in counter.items()]
        sorter.sort()
        sorter.reverse()
        pprint(sorter)
Example #37
    def __init__(self, dirname=DATA_DIR, word_list=ALPHABET, is_async=True):
        super(TwitterStreamer, self).__init__(is_async=is_async)
        self.is_async = is_async

        parser = self.get_parser()
        parser.add_argument(
            '--consumer_key',
            help='consumer key',
        )
        parser.add_argument(
            '--consumer_secret',
            help='consumer secret',
        )
        parser.add_argument(
            '--access_token',
            help='access token',
        )
        parser.add_argument(
            '--access_token_secret',
            help='access token secret',
        )
        parser.add_argument(
            '--filter_retweets',
            help='do not save potentially repetitive retweets',
            action="store_true",
        )
        parser.add_argument(
            '--remove_links',
            help='remove links included into each tweet',
            action="store_true",
        )
        parser.add_argument(
            '--remove_mentions',
            help='remove mentions included into each tweet',
            action="store_true",
        )
        parser.add_argument(
            '--output_prefix',
            help='prefix of the output file',
            default='tweet',
            type=str
        )
        parser.add_argument(
            '--output_as_onefile',
            help='save output as onefile',
            action="store_true",
        )
        parser.add_argument(
            '--output_extension',
            help='extension of the output file',
            default='txt',
            type=str
        )
        parser.add_argument(
            '--tweet_limits',
            help='stop when this amount of tweets are collected',
            default=1000000,
            type=int
        )
        parser.add_argument(
            '--time_limits',
            help='stop when n secs elapsed',
            default=1000000,
            type=int
        )
        parser.add_argument(
            '--keyword_file',
            help='file that defines keywords line by line',
            type=str
        )

        self.options, _ = parser.parse_known_args()

        # lazy requirement checking since argparse's required option blocks initialization.
        requirements = [self.options.consumer_key, self.options.consumer_secret,
                        self.options.access_token, self.options.access_token_secret]

        flag = None
        for requirement in requirements:
            if not requirement:
                flag = 1

        if flag is not None:
            pprint("You have to provide valid consumer key, consumer_secret, access_token, access_token_secret.")

        # Parse wordlist from custom argument
        self.dirname = dirname
        if self.options.keyword_file is not None:
            try:
                reader = open(self.options.keyword_file, mode='r+', encoding='utf-8')
            except UnicodeDecodeError:
                reader = open(self.options.keyword_file, mode='r+', encoding='cp949')
            self.word_list = reader.readlines()

        else:
            self.word_list = word_list

        self.is_async = is_async
        self.streamer = None

def dedup(l):
    return list(unique_everseen(l))


def concate_tuple(t):
    return '%s_%s' % t


if __name__=='__main__':

    f = open('./dummy_article_03.txt')

    lines = convert_textfile_to_lines(f)

    # sentences = do_sentencing_by_threading(lines)
    # sentences = do_sentencing_without_threading(lines)
    sentences = '[ 천지 일보= 임 문식 기자] 4.13 총선을 나흘 남겨 둔 9일 여야 정치권이 20대 총선 성패를 좌우할 수도권 등 전국 곳곳에서 표 심 잡기에 집중, 사활을 건 유세전을 펼쳤다.'
    print "sentences"
    pprint(sentences)

    # morphs = do_parsing_by_threading(sentences)
    morphs = do_parsing_without_threading(sentences)
    print "morphs"
    pprint(morphs)

    # morphs = [('a','1'), ('b','2')]
    morphs = map(concate_tuple, morphs)
    pprint(morphs)
Example #39
from konlpy.tag import Kkma
from konlpy.utils import pprint

kkma = Kkma()

pprint(kkma.sentences(u'네, 안녕하세요. 의류매장 입니다'));
pprint(kkma.nouns(u'구입하실 물건 있으시면 말씀해주세요.'));
pprint(kkma.pos(u'하하하 즐거운 쇼핑입니다.'));
Example #40
# -*- coding: utf-8 -*-

from konlpy.tag import Kkma
from konlpy.utils import pprint
kkma = Kkma()
pprint(kkma.sentences(u'저는 대학생이구요. 소프트웨어 관련학과 입니다.'))
# => [저는 대학생이구요., 소프트웨어 관련학과 입니다.]
Example #41
# -*- coding: utf-8 -*-
from collections import Counter

from konlpy.corpus import kolaw
from konlpy.tag import Hannanum
from konlpy.utils import concordance, pprint
from matplotlib import pyplot

tmppos=[]
stopwords = [u"위원님", u"위원", u"것", u"말씀", u"우리", u"그것", u"이것", u"존경", u"이", u"저", u"저희", u"때문", u"문제", u"생각", u"미래부", u"장관님", u"거", u"때",
             u"다음", u"일", u"수", u"정부", u"정책"]
doc = unicode(open(u'회의록.txt').read(), 'utf-8')
pos = Hannanum().pos(doc)
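# keep only nouns (Hannanum's 'N' tag), then drop the stopwords listed above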
for word in pos:
    if word[1] == 'N':
        tmppos.append(word)
tmppos = [word for word in tmppos if word[0] not in stopwords]

cnt = Counter(tmppos)

f = open("freq.txt", "wb")
for freq_word in cnt.most_common(50):
    f.write(freq_word[0][0].encode('utf-8')+"\n")
f.close()
print('\nTop 50 frequent morphemes:'); pprint(cnt.most_common(50))
Example #42
 def get_trend(self):
     _, self.trend = get_current_trend()
     if self.options.verbose:
         pprint(self.trend)
Example #43
print '------------------------------------------------------------------------------------'

print '-------------------- Training dataset ----------------------------------------------'
documents = [
    u"배송지연으로 인한 환불정책을 알고 싶습니다. 되기는 되는 것인가요", u"배송이 완료되었습니다. 고객님",
    u"해당 상품 두가지는 일괄 배송이 됩니다", u"고객님 제가 알아보고 다시 연락드리겠습니다", u"상담원 홍길동이었습니다",
    u"내일까지 배송될 예정입니다. 조금만 기다려주십시오", u"상품 파손시 환불이 불가능합니다",
    u"상품 수거 후 환불완료가 될 것입니다. 내일 방문하는 택배원에게 상품을 보내주세요", u"지금 집에 없는데 경비실에 맡겨주세요",
    u"아직도 배송이 안되었는데 언제 배송되나요 연락도 없구요",
    u"고객님이 주문하신 상품이 품절이 되었습니다. 결제를 취소처리하려 합니다",
    u"고객님 품절된 상품 대신 다른 상품으로 대체 발송하려 하는데 동의하시나요",
    u"배송기사가 상자를 던져서 상품이 파손되었습니다. 환불을 해주시던지 다른 상품을 보내주세요",
    u"배송 중 파손이 된 것 같은데요. 파손 보상책이 준비되어 있나요"
]

pprint(documents)

#stoplist = [u"",]

vectorizer = TfidfVectorizer(min_df=2, analyzer='word', tokenizer=tokenizer)
tfidf = vectorizer.fit_transform(documents)

print '\n'
print '-------------------- Keywords -------------------------------------------------------'
pprint(vectorizer.vocabulary_)
print '-------------------- End of keywords ------------------------------------------------'
print '\n'

new_document = []
#new_document.append(u"배송 중 상자가 파손되었어요")
new_document.append(sys.argv[1])
Example #44
    def __init__(self, dirname=DATA_DIR, word_list=ALPHABET, is_async=True):
        super(TwitterStreamer, self).__init__(is_async=is_async)
        self.is_async = is_async

        parser = self.get_parser()
        parser.add_argument(
            '--consumer_key',
            help='consumer key',
        )
        parser.add_argument(
            '--consumer_secret',
            help='consumer secret',
        )
        parser.add_argument(
            '--access_token',
            help='access token',
        )
        parser.add_argument(
            '--access_token_secret',
            help='access token secret',
        )
        parser.add_argument(
            '--filter_retweets',
            help='do not save potentially repetitive retweets',
            action="store_true",
        )
        parser.add_argument(
            '--remove_links',
            help='remove links included into each tweet',
            action="store_true",
        )
        parser.add_argument(
            '--remove_mentions',
            help='remove mentions included into each tweet',
            action="store_true",
        )
        parser.add_argument('--output_prefix',
                            help='prefix of the output file',
                            default='tweet',
                            type=str)
        parser.add_argument(
            '--output_as_onefile',
            help='save output as onefile',
            action="store_true",
        )
        parser.add_argument('--output_extension',
                            help='extension of the output file',
                            default='txt',
                            type=str)
        parser.add_argument(
            '--tweet_limits',
            help='stop when this amount of tweets are collected',
            default=1000000,
            type=int)
        parser.add_argument('--time_limits',
                            help='stop when n secs elapsed',
                            default=1000000,
                            type=int)
        parser.add_argument('--keyword_file',
                            help='file that defines keywords line by line',
                            type=str)

        self.options, _ = parser.parse_known_args()

        # lazy requirement checking since argparse's required option blocks initialization.
        requirements = [
            self.options.consumer_key, self.options.consumer_secret,
            self.options.access_token, self.options.access_token_secret
        ]

        flag = None
        for requirement in requirements:
            if not requirement:
                flag = 1

        if flag is not None:
            pprint(
                "You have to provide valid consumer key, consumer_secret, access_token, access_token_secret."
            )

        # Parse wordlist from custom argument
        self.dirname = dirname
        if self.options.keyword_file is not None:
            try:
                reader = open(self.options.keyword_file,
                              mode='r+',
                              encoding='utf-8')
            except UnicodeDecodeError:
                reader = open(self.options.keyword_file,
                              mode='r+',
                              encoding='cp949')
            self.word_list = reader.readlines()

        else:
            self.word_list = word_list

        self.is_async = is_async
        self.streamer = None
from konlpy.tag import Kkma
from konlpy.utils import pprint
kkma = Kkma()

string = u'안녕하세요. 건국대학교에 오신걸 환영합니다. 도서관은 우측에 있습니다.'

pprint(kkma.nouns(string))
Example #46
def test_utils_pprint(capsys): # Fixture `capsys` allows stdout/stderr captures
    from konlpy.utils import pprint
    pprint([u"저는 소프트웨어 관련학과 입니다."])
    out, err = capsys.readouterr()
    assert out == u"[저는 소프트웨어 관련학과 입니다.]\n"
Example #47
'''
Created on 2016. 11. 2.

Morphological analysis
'''
from konlpy.tag import Kkma
from konlpy.utils import pprint
kkma = Kkma()
pprint(kkma.sentences('여러분, 안녕하세요. 반갑습니다.'))
print()
pprint(
    kkma.nouns(
        u'오늘 폭염이 주춤했지만 일부지방은 폭염 특보 속에 35도 안팎의 찜통더위가 기승을 부렸는데요.자세한 날씨, YTN 중계차 연결해 알아보겠습니다.'
    ))
print()
pprint(kkma.pos(u'오류보고는 실행환경, 에러메세지와 함께'))
Example #48
def get_nouns_duplicate(text):
    mecab = Mecab()
    arr = mecab.nouns(text)
    return arr


def get_nouns(text, chunk=500, mfv=20):
    mecab = Mecab()
    arr = mecab.nouns(text)
    return list(set(arr))


#	for i in range(len(arr)/chunk):
#		nngs = []
#		for j in range(chunk):

#			if arr[i*chunk + j][1] in ['NNG', 'NNP']:
#				nngs.append(arr[i*chunk + j][0])

#		count = Counter(nngs)
#		brr = count.most_common(mfv)
#		pprint(brr)
#	return brr

if __name__ == "__main__":
    data_name = '소나기'
    text = read_text(data_name)
    nouns = get_nouns(text)
    pprint(nouns)
Example #49
#! /usr/bin/python2.7
# -*- coding: utf-8 -*-

from konlpy.tag import Kkma
from konlpy.corpus import kolaw
from konlpy.utils import pprint
from nltk import collocations


measures = collocations.BigramAssocMeasures()
doc = kolaw.open('constitution.txt').read()

print('\nCollocations among tagged words:')
tagged_words = Kkma().pos(doc)
finder = collocations.BigramCollocationFinder.from_words(tagged_words)
pprint(finder.nbest(measures.pmi, 10))  # top 10 bigrams with highest PMI

print('\nCollocations among words:')
words = [w for w, t in tagged_words]
ignored_words = [u'안녕']
finder = collocations.BigramCollocationFinder.from_words(words)
finder.apply_word_filter(lambda w: len(w) < 2 or w in ignored_words)
finder.apply_freq_filter(3) # only bigrams that appear 3+ times
pprint(finder.nbest(measures.pmi, 10))

print('\nCollocations among tags:')
tags = [t for w, t in tagged_words]
finder = collocations.BigramCollocationFinder.from_words(tags)
pprint(finder.nbest(measures.pmi, 5))
context = zmq.Context()
socket = context.socket(zmq.REP)
socket.bind('tcp://127.0.0.1:%s' % port)

while True:
    print('in the loop')
    # Wait for the next request from the client
    message = socket.recv()
    result = kkma.nouns(message.decode('utf-8'))
    result = ', '.join(result)
    print('------')
    print(result)
    socket.send_string(result)  # socket.send() rejects unicode; use send_string()

"""
string = u'안녕하세요. 건국대학교에 오신걸 환영합니다. 도서관은 우측에 있습니다.'
string2 = u'5학년되니까 학교근처엔 도저히 먹을게없다'
string3 = u'카이리스님께 사과문올립니다'

start = time.time()

pprint(kkma.nouns(string))
pprint(kkma.nouns(string2))
pprint(kkma.nouns(string3))


end = time.time()

print (end - start)
"""