Example #1
def keyword_extractor(title, highlight):
    konl = Kkma()
    eng_title = TextBlob(re.sub("[^A-Za-z]", " ", title.strip())).noun_phrases
    eng_highlight = TextBlob(re.sub("[^A-Za-z]", " ",
                                    highlight.strip())).noun_phrases
    title_nouns = konl.nouns(title)
    highlight_nouns = konl.nouns(highlight)

    #line = re.sub("[^A-Za-z]", "", title.strip())

    keyword_list = {i: 2 for i in title_nouns}
    for i in highlight_nouns:
        try:
            keyword_list[i] += 1
        except KeyError:
            keyword_list[i] = 1

    for i in eng_title:
        keyword_list[i] = 2

    for i in eng_highlight:
        try:
            keyword_list[i] += 1
        except KeyError:
            keyword_list[i] = 1

    keyword_list = sorted(keyword_list.items(),
                          key=operator.itemgetter(1),
                          reverse=True)[:10]

    keywords = {}
    for i, k in enumerate(keyword_list):
        keywords["k" + str(i)] = k[0]

    return keywords
Example #2
def get_nouns(file_path):
    from konlpy.tag import Kkma
    km = Kkma()
    noun_list = []
    with open(file_path, 'r') as f:
        lines = f.readlines()  # read line by line; otherwise km.nouns() raises an error
        for line in lines:
            nouns = km.nouns(line)
            if not nouns:
                continue
            stopwords = get_stopwords()
            for noun in nouns:
                if noun not in stopwords:
                    noun_list.append(noun)

    return noun_list
Example #3
    def tokenize(self, text):
        try:
            from konlpy.tag import Kkma
        except ImportError:
            raise ValueError("Korean tokenizer requires konlpy. Please install it with 'pip install konlpy'.")
        kkma = Kkma()
        return kkma.nouns(text)
def get_tags(text, ntags=50, multiplier=10):
    #  h = Hannanum()
    r = lambda: random.randint(0, 255)  # random RGB color helper (as in the later get_tags examples)
    color = lambda: (r(), r(), r())
    h = Kkma()
    nouns = h.nouns(text)
    count = Counter(nouns)
    return [{'color': color(), 'tag': n, 'size': c * multiplier * 2}
            for n, c in count.most_common(ntags)]
Example #5
def atemedic(df701, ingredients):
    df801 = df701.iloc[:, [0, 5, 6]]
    #('item_seq','patient','prohibit_ingredient_id')

    df801 = df801.drop_duplicates()
    list5 = df801.values.tolist()
    doc3_list = np.array(list5).flatten().tolist()
    doc4_list = []

    for i in range(1, len(doc3_list), 2):
        doc4_list.append(doc3_list[i])

    kkma = Kkma()
    list6 = []
    list7 = []
    for i in range(1, len(doc4_list)):
        list6 = kkma.nouns(doc4_list[i])
        list7.append(' '.join(list6))

    tfidf_vect_simple = TfidfVectorizer()
    list7.insert(0, ingredients)

    feature_vect_simple = tfidf_vect_simple.fit_transform(list7)
    feature_vect_dense = feature_vect_simple.todense()

    similarity_simple_pair = cosine_similarity(feature_vect_simple[0],
                                               feature_vect_simple)

    list8 = []
    for i in range(len(list5)):
        if (similarity_simple_pair[0, i] != 0.0):
            list8.append(i)
    return (doc3_list, list8)
Example #6
    def set_knlpy(self):
        """
        Kkma's nouns() lists each word only once; it is slower, but the extraction results are clean.
        Hannanum's nouns() lists a word every time it appears (so frequencies can be counted); it is faster, but the results are less polished.
        """
        k = Kkma()
        # k = Hannanum()
        if not k:
            logger.error("must have knlpy!")
            exit()

        startTime = time.time()
        k.nouns("initial ")  # warm-up call; the first invocation is slow because it loads the dictionaries
        self.k = k
        checkTime = time.time() - startTime
        logger.debug("initial time : %f", checkTime)
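The docstring above contrasts the two konlpy taggers. A minimal side-by-side sketch (assuming konlpy and its Java backend are installed; the sample sentence is made up) shows the difference it describes: Kkma returns a deduplicated noun list, while Hannanum returns one entry per occurrence, so its output can be fed straight into a Counter.

from collections import Counter
from konlpy.tag import Hannanum, Kkma

sample = "도서관 도서관 사서는 도서관 책을 정리한다"  # made-up sample sentence

kkma_nouns = Kkma().nouns(sample)          # per the docstring: each noun listed once, slower, cleaner
hannanum_nouns = Hannanum().nouns(sample)  # per the docstring: repeats per occurrence, faster

print(kkma_nouns)
print(hannanum_nouns)
print(Counter(hannanum_nouns))             # repeated output makes frequency counting trivial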
Example #7
def search_keyword():
    url = 'https://openapi.naver.com/v1/search/book.json?'
    client_id = "QpvvkiISGC1mn16KVb3d"
    client_secret = "ukKNKa8DVk"
    keyword = request.form.get('keyword')
    query_string = "query=" + keyword + "&display=10&start=1&sort=count"
    header = {
        "X-Naver-Client-ID": client_id,
        "X-Naver-Client-secret": client_secret
    }
    r = requests.get(url + query_string, headers=header)
    books = json.loads(r.text)['items']
    all_text = ''
    for book in books:
        all_text += book['description']
    kkma = Kkma()
    ex_sent = kkma.sentences(all_text)
    nouns = []
    for sent in ex_sent :
        for noun in kkma.nouns(sent):
            if len(str(noun)) > 2 and not(match('^[0-9]', noun)):
                nouns.append(noun)
    nouns_count = Counter(nouns)
    sorted_counts = nouns_count.most_common()
    chart_index = [noun for noun, count in sorted_counts]
    chart_values = [count for noun, count in sorted_counts]

    return jsonify({'result': 'success', 'books': books, 'chart_index': chart_index, 'chart_values' : chart_values})
Example #8
class bot(object):
    def __init__(self):
        self.kkma = Kkma()
        self.Prime_Word = pd.read_csv('1. Word Table.csv', encoding='CP949')
        self.Answer_DB = pd.read_csv('2. Info DB.csv', encoding='CP949')
        self.Sentence_DB = pd.read_csv('3. Info DB Sentence.csv',
                                       encoding='CP949')

    def Conversation(self, chat_text):
        Analized_Nouns = self.kkma.nouns(chat_text)
        return Analized_Nouns

    def Translating_Word(self, Words):
        for i in range(len(Words)):
            for j in range(len(self.Prime_Word)):
                if Words[i] == self.Prime_Word.loc[j, 'Word']:
                    Words[i] = self.Prime_Word.loc[j, 'Mapping']
        return Words

    def Answering(self, Words):
        for i in range(len(Words)):
            for j in range(len(self.Answer_DB)):
                if Words[i] == self.Answer_DB.loc[j, '항목']:
                    Answer_Sentence = self.Sentence_DB.loc[j, '문장1']
                    Answer_Sentence = Answer_Sentence.replace(
                        '[' + self.Answer_DB.loc[j, '항목'] + ']',
                        self.Answer_DB.loc[j, '#1'])
                    return Answer_Sentence
        return '없음'
Example #9
def get_q_type(dbconn, cursor, u_text) : 
	try : 
		cursor.execute(f"""
			SELECT 
				Q_TEXT, A_TEXT, Q_MORPHEMES, Q_TYPE
			FROM
				TBL_QNA_CHAT_SET_LIST
		""")
		rows = cursor.fetchall()
	except Exception as e : 
		print(f'error! >> get_q_type >> {e}')
	finally : 
		q_type = 0
		all_values = []
		for row in rows : 
			all_values.append(row)
		
		match_q_type_list = []
		for all_value in all_values :
			matchPer = SequenceMatcher(None, u_text, all_value[0]).ratio() * 100
			# print(matchPer >= 70)
			# print(f'[{matchPer}% 일치][{all_value[3]}] {all_value[0]}')
			if matchPer >= 55 : 
				# print(70)
				match_q_type_list.append(all_value[3])
				print(f'[{matchPer}% 일치][Q_type : {all_value[3]}] {all_value[0]}')
		
		if len(match_q_type_list) > 0 :
			match_q_type_list = sorted(match_q_type_list, reverse=True)		
			q_type = match_q_type_list[0]

		# If there is no predefined conversation set, prompt the user to register an answer directly
		# Check the similarity between the new question and the existing ones and surface highly similar questions (at least 70%)
		if q_type == 0 :
			kkma = Kkma()
			# nouns from the user's question
			u_text_nouns = kkma.nouns(u_text)
			q_text_nouns_group = []
			for all_value in all_values :
				# nouns from the stored question text
				if all_value[2] != '[]' : 
					q_text_nouns_group.append([ast.literal_eval(all_value[2]), all_value[3]])

			point_list = []
			for q_text_nouns in q_text_nouns_group :
				match_point = 0
				for q_noun in q_text_nouns[0] : 
					for u_noun in u_text_nouns : 
						if q_noun == u_noun : 
							match_point += 1
				if match_point > 0 : 
					point_list.append([match_point, q_text_nouns[1]])

			if len(point_list) > 0 : 
				point_list = sorted(point_list, reverse=True)
				print(point_list[0][0])
				q_type = point_list[0][1]

		print(q_type)	
		return [q_type, all_values]
def tokenizer(text_data, stopword_list):
    kkma = Kkma()
    texts = []
    for t in text_data:
        tmp = [n for n in kkma.nouns(t) if n not in stopword_list]
        texts.append(tmp)
    return texts
Example #11
class Doc2Vec:
    def __init__(self):
        self.doc2vec = TfidfVectorizer()
        self.kkma = Kkma()

    def fit_doc2vec(self, doc_nouns):
        self.doc2vec.fit(doc_nouns)

    def get_nouns(self, doc):
        return self.kkma.nouns(doc)

    def get_split(self, doc):
        return doc.split(' ')

    def get_vec(self, doc):
        return self.doc2vec.transform([doc]).todense()

    def cos_similarity(self, vect1, vect2):
        dot_product = np.dot(vect1, vect2.reshape(-1, 1))
        l2_norm = np.sqrt(np.sum(np.square(vect1), axis=-1)) * np.sqrt(
            np.sum(np.square(vect2), axis=-1))

        return dot_product / l2_norm

    def get_score(self, doc1, doc2):
        vect1 = self.get_vec(doc1)
        vect2 = self.get_vec(doc2)

        return self.cos_similarity(vect1, vect2)

    def get_similarity(self, doc1, doc2):
        vec1 = self.get_vec(doc1)
        vec2 = self.get_vec(doc2)

        return np.dot(vec1, vec2.T)
Example #12
    def find_most_mentioned(self):
        """
        Return how many times each person has been mentioned in the conversation
        :return: list of (name, mention count) pairs, most mentioned first
        """
        self._rewind()

        # get all conversations
        all_conversations = self.get_all_conversations()

        # word parser objects
        mecab = Mecab()
        kkma = Kkma()

        # parse all conversation words, and get only nouns
        all_nouns = list()
        for conversation in all_conversations:
            all_nouns += mecab.nouns(conversation)

        # exclude family name(성) from name
        names_list = list()
        for name in self.get_all_names():
            preprocessed_name = kkma.nouns(name)
            for data in preprocessed_name:
                if len(data) != 1:
                    names_list.append(data)

        # compare two list
        mentioned_people = [
            person for person in all_nouns if person in names_list
        ]

        # count using Counter and return
        cnt = Counter(mentioned_people)
        return cnt.most_common(len(cnt))
Example #13
def konlpy():

    # value = request.form['konlpy_tag']
    value = request.args.get('search')

    kkma = Kkma()
    
    a = kkma.pos(value)
    noun = kkma.nouns(value)
    
    word = []
    pos = []

    for i in a:
        # print(i[0] + ',' + i[1])
        word.append(i[0])
        pos.append(i[1])

    print(noun)
    print(word)
    print(pos)
   
    result =  {'word': word , 'pos': pos , 'noun': noun }
    print(result)

    return result
Example #14
def run_kkma():
    kkma = Kkma()
    start_time = time.time()
    print('kkma 시작')
    kkma_morphs = kkma.morphs(news1)
    kkma_nouns = kkma.nouns(news1)
    kkma_pos = kkma.pos(news1)
    end_time = time.time()
    print('kkma 끝 - %s 초' % str(end_time - start_time))
    kkma_sentences = kkma.sentences(news1)

    with open('kkma.txt', 'w', encoding='utf-8') as fstream:
        fstream.write('kkma time : %s s\n' % str(end_time - start_time))
        fstream.write('kkma_morphs\n')
        write_list(kkma_morphs, fstream)
        fstream.write('\n\n')

        fstream.write('kkma_nouns\n')
        write_list(kkma_nouns, fstream)
        fstream.write('\n\n')

        fstream.write('kkma_pos\n')
        write_pos(kkma_pos, fstream)
        fstream.write('\n\n')

        fstream.write('kkma_sentences\n')
        write_list(kkma_sentences, fstream)
        fstream.write('\n')
Example #15
    def parse_data(self, label, h, i):
        Y = self.y_vocab.get(label)
        if Y is None and self.div in ['dev', 'test']:
            Y = 0
        if Y is None and self.div != 'test':
            return [None] * 2
        product = h['product'][i]
        image = h['img_feat'][i]
        bcate = h['bcateid'][i]
        mcate = h['mcateid'][i]
        scate = h['scateid'][i]
        dcate = h['dcateid'][i]
        maker = h['maker'][i]
        brand = h['brand'][i]
        price = h['price'][i]
        model = h['model'][i]

        if not (('참조' in brand) or ('기타' in brand) or ('없음' in brand) or
                ('미분류' in brand)):
            product += ' ' + brand
        if not (('참조' in maker) or ('기타' in maker) or ('없음' in maker) or
                ('미분류' in maker)):
            product += ' ' + maker

        print(product)

        kkma = Kkma()
        p_model = kkma.nouns(product)

        for pStr in p_model:
            product += ' ' + pStr
        product = re_sc.sub(' ', product).strip().split()

        words = [w.strip() for w in product]
        words = [
            w for w in words
            if len(w) >= opt.min_word_length and len(w) < opt.max_word_length
        ]

        if not words:
            return [None] * 2

        if cate_type == 'bm':
            x = [hash(w) % opt.bm_unigram_hash_size + 1 for w in words]
        elif cate_type == 's':
            x = [hash(w) % opt.s_unigram_hash_size + 1 for w in words]
        elif cate_type == 'd':
            x = [hash(w) % opt.d_unigram_hash_size + 1 for w in words]

        xv = Counter(x).most_common(opt.max_len)

        x = np.zeros(opt.max_len, dtype=np.float32)
        v = np.zeros(opt.max_len, dtype=np.int32)
        for i in range(len(xv)):
            x[i] = xv[i][0]
            v[i] = xv[i][1]

        return Y, (x, v, image, bcate, mcate, scate, dcate)
Example #16
def wordcloud(news_content_list, page_info_list, img_url):
    for i in range(len(news_content_list)):
        try:
            kkma = Kkma()
            tokens_ko = kkma.nouns(news_content_list[i])
            ko = nltk.Text(tokens_ko, name=page_info_list[i][0])
            data = ko.vocab().most_common(100)

            tmp_data = dict(data)

            korea_coloring = np.array(Image.open("C:\Study\project_idol\web_crawl\/Korea.png"))
            image_colors = ImageColorGenerator(korea_coloring)
            wordcloud= WordCloud(font_path = 'c:\\windows\\fonts\\NanumGothic.ttf',
                                 relative_scaling=0.1,
                                 mask=korea_coloring,
                                 background_color='black',
                                 min_font_size=4,
                                 max_font_size=40,
                                 ).generate_from_frequencies(tmp_data)
            plt.figure(figsize=(12,12))
            plt.imshow(wordcloud.recolor(color_func=image_colors), interpolation="bilinear")
            title = re.sub('[-=+,#/\?:^$.@*\"※~&%ㆍ!』\\‘|\(\)\[\]\<\>`\'…》]', '', page_info_list[i][1])
            wordcloud.to_file("C:\Study\project_idol\static\wordcloud\/" + title +".png")
            plt.axis("off")
            plt.cla()
            url = "wordcloud\/" + title +".png"
            url = re.sub('/','',url)
            img_url.append(url)
            # plt.show()
        except Exception as e:
            no_img = 'wordcloud\/noimage.jpg'
            no_img = re.sub('/', '', no_img)
            img_url.append(no_img)
            print("exceptions is ", e)
            pass
# input = input('검색할 단어 > ')
# page_info_list = []
# url_list = []
# title_list = []
# news_content_list = []
# img_url = []
# value = []
# content_summarize_list = []
# call_and_print(input, 1, page_info_list)
# for i in range(len(page_info_list)):
#     url_list.append(page_info_list[i][1])
# print(url_list)
#
# make_content(url_list, news_content_list, content_summarize_list, title_list)
# wordcloud(news_content_list, page_info_list, img_url)
# for i in range(len(title_list)):
#     values = (title_list[i], url_list[i], img_url[i], content_summarize_list[i])
#     value.append(values)
# print(value)
# print(page_info_list)
# print(title_list)
Example #17
class Analyze:
    def __init__(self, string):
        self.string = u"%s" %string
        self.kkma = Kkma()

    def parse_phrase_to_morphemes(self):
        return self.kkma.morphs(self.string)
    
    def noun_extractor(self):
        return self.kkma.nouns(self.string)
Example #18
def get_tags(text, ntags=50, multiplier=30):  # adjust the multiplier value to control the font size
    # h = Hannanum()
    r = lambda: random.randint(0, 255)
    color = lambda: (r(), r(), r())
    h = Kkma()
    text = unicode(text, 'utf-8')
    nouns = h.nouns(text)
    count = Counter(nouns)
    return [{ 'color': color(), 'tag': n, 'size': c*multiplier }\
                for n, c in count.most_common(ntags)]
Example #19
def get_tags(text, ntags=50):  # frequency counting function
    spliter = Kkma()
    nouns = spliter.nouns(text)  # text에서 명사 추출
    count = Counter(nouns)  # 명사의 빈도 수 저장
    return_list = []
    for n, c in count.most_common(ntags):
        temp = {'tag': n, 'count': c}
        return_list.append(temp)

    return return_list
def get_tags(text, ntags=50, multiplier=30):  # adjust the multiplier value to control the font size
    # h = Hannanum()
    r = lambda: random.randint(0,255)
    color = lambda: (r(), r(), r())
    h = Kkma()
    text = unicode(text, 'utf-8')
    nouns = h.nouns(text)
    count = Counter(nouns)
    return [{ 'color': color(), 'tag': n, 'size': c*multiplier }\
                for n, c in count.most_common(ntags)]
def _chat_with_mybot(text):
    # dictionary of keywords for matching
    words = {
        'jobs': [],
        'names': ['이름', '성함'],
        'age': ['나이', '살', '쌀'],
        'question': ['취업정보', '질문'],
        'greetings': ['안녕', '반가워', '하이', '방가']
    }
    jobs = _read_file_job_category('job_category.txt')
    anw = []
    check = True
    if text == '<@UL9K54M32>':
        anw.append(u'안녕나는 챗봇이얌~~!! 취업정보를 알려주는 봇이얌 ^_^ 난 한국말만 알아들어~!!')
        return u'\n'.join(anw)
    kkma = Kkma()
    keywords = kkma.nouns(text)
    words['jobs'] = list(jobs.keys())

    # print(keywords)  # extracted keywords from the user input (for checking)

    for i in range(len(words['greetings'])):
        if words['greetings'][i] in keywords:
            anw.append('안녕~ 나도' + words['greetings'][i] + '^_^\n')
            check = False

    for i in range(len(words['names'])):
        if words['names'][i] in keywords:
            anw.append('내 이름은 봇이얌 봇봇봇~!!\n')
            check = False

    for i in range(len(words['question'])):
        if words['question'][i] in keywords:
            anw.append(words['question'][i] + '??' + '알았어~~\n')
            check = False

    for i in range(len(words['age'])):
        if words['age'][i] in keywords:
            anw.append('내 나이는 20살이야~ 아주 젊지 ^_*\n')
            check = False

    for i in range(len(words['jobs'])):
        if words['jobs'][i] in keywords:
            # get back the crawled job information
            #print(jobs[words['jobs'][i]])
            _jobs = crawl._crawl_newbie_info(jobs[words['jobs'][i]])

            return _jobs

    if check:
        anw.append('뭐라는 거야 ~~ -3- 그건 몰라~\n')

    return u'\n'.join(anw)
class input_preprocessing:
    def __init__(self):
        self.textfile = 'url_text.txt'
        self.kkma = Kkma()

    def get_text(self):
        # The text couldn't be pasted in directly, so it is loaded from a text file.
        # Later this can simply take a string and start from there.

        with open('.\\Dataset\\' + self.textfile, 'rb') as f:
            tmp = f.read().decode('utf-8')
        return tmp

    def extractkor(self, _s):
        try:
            hangul = re.compile('[^ ㄱ-ㅣ가-힣]+')
            result = hangul.sub('', _s)
            # print(result)
            result2 = result.split(' ')
            result2 = [item for item in result2 if item != ' ']
            result2 = [item for item in result2 if item != '']
            return result2
        except Exception as error:
            # print("error!",error)
            return 1

    def splitkor_kornouns(self, _list):
        _temp_korbag = []
        try:
            # for item in tqdm(_list, ascii= True, desc='명사 추출'):
            for item in _list:
                tempstr = str(item)
                tplist = self.kkma.nouns(tempstr)
                # print(tplist)
                _temp_korbag.append(tplist)
        except Exception as error:
            print("for문 에러", error)
        _temp_korbag = list(itertools.chain(*_temp_korbag))
        return _temp_korbag

    def length_join(self, _bodytext):
        junk = []
        for word in _bodytext:
            if len(word) < 2:
                junk.append(word)
        for word2 in junk:
            try:
                _bodytext.remove(word2)
            except:
                pass
        _bodytext = [" ".join(_bodytext)]
        return _bodytext
def newpost(request):
    if request.method == "POST":
        form = PostForm(request.POST)

        qu = (request.POST.get('text'))
        ## the sentences arrive in qu
        texts = []

        sentList = []
        result = ""

        kkma = Kkma()
        texts = kkma.sentences(qu)
        for text in texts:
            text = text.strip("\n")
            print('start initializing : ', text)
            kkma = Kkma()
            kkma.nouns('initializing')

            # start morphological analysis
            answer, rowVerb, rightVerb, subj = start(text)

            temp = []
            temp.append(answer)
            temp.append(rowVerb)
            temp.append(rightVerb)
            temp.append(subj)
            sentList.append(temp)

            ## list like [['주문하신 커피가 ', '나오셨습니다.', '나왔습니다']]
            #print(sentList)

        if form.is_valid():
            return render(request, 'elections/output.html',
                          {'sentList': sentList})

    else:
        form = PostForm()
        return render(request, 'elections/index.html', {'form': form})
Example #24
class AnalysisDiction:
    """
    This class is for analysis of Korean texts using the kkma and twitter dictionaries
    """
    def __init__(self, on_kkma=False, on_twitter=False):    # maybe move to init of analysis_app
        """
        Allocate kkma or twitter diction instance
        :param on_kkma: kkma instance
        :param on_twitter: twitter instance
        """
        if on_kkma is True:
            self.kkma = Kkma()
        if on_twitter is True:
            self.twitter = Twitter()

    def analyzer_kkma(self, string_data, mode):
        """
        This method is for kkma. It behaves differently depending on its mode.
        :param string_data: string data to analyze
        :param mode: analysis mode to apply to the string data
        :return: the analysis result, or False if the mode is not recognized
        ref: http://konlpy.org/ko/v0.4.4/api/konlpy.tag/#module-konlpy.tag._kkma
        """
        if mode == 'morphs':
            return self.kkma.morphs(string_data)
        elif mode == 'nouns':
            return self.kkma.nouns(string_data)
        elif mode == 'pos':
            return self.kkma.pos(string_data)
        else:
            return False

    def analyzer_twitter(self, string_data, mode):
        """
        This method is for twitter. It behaves differently depending on its mode.
        :param string_data: string data to analyze
        :param mode: analysis mode to apply to the string data
        :return: the analysis result, or False if the mode is not recognized
        ref: http://konlpy.org/ko/v0.4.4/api/konlpy.tag/#module-konlpy.tag._twitter
        """
        if mode == 'morphs':
            return self.twitter.morphs(string_data)
        elif mode == 'nouns':
            return self.twitter.nouns(string_data)
        elif mode == 'pos':
            return self.twitter.pos(string_data)
        elif mode == 'posmore':
            return self.twitter.pos(string_data, True, True)
        else:
            return False
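A short usage sketch for the class above (hedged: it assumes konlpy is installed, that the Twitter tagger is still available in the installed konlpy version — newer releases use Okt — and that AnalysisDiction is importable from the surrounding module; the sentence is made up). The mode strings map directly onto the underlying konlpy calls, and an unknown mode falls through to False.

# Hypothetical usage of AnalysisDiction defined above.
analyzer = AnalysisDiction(on_kkma=True, on_twitter=True)

sample = "형태소 분석기 두 개를 비교해 봅니다"  # made-up sample sentence

print(analyzer.analyzer_kkma(sample, 'nouns'))    # Kkma noun extraction
print(analyzer.analyzer_twitter(sample, 'pos'))   # Twitter/Okt POS tagging
print(analyzer.analyzer_kkma(sample, 'stems'))    # unrecognized mode -> False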
Example #25
def summarize(contents, radius, k):
    kkma = Kkma()
    keywords = []
    nouns = np.array([hash(elem) for elem in kkma.nouns(contents)], np.int64)
    size = nouns.size
    points = np.zeros([1000, size], int)
    i = 0
    sentences = contents.replace('\n', '').split('.')
    for sentence in sentences:
        hash_words = np.array([hash(elem) for elem in kkma.nouns(sentence)])
        arr = []
        for j in range(size):
            points[i][j] = np.count_nonzero(hash_words == nouns[j])
        # print(points[i])

        i += 1
    group = np.random.randn(i)
    for i in range(i):
        arr = []
        count = 0
        for j in range(i):
            if i != j:
                distance = np.sum((points[i] - points[j])**2)**0.5
                if distance <= radius:
                    arr.append(j)
        if len(arr) >= k:
            for elem in arr:
                group[elem] = group[i]
    result = []
    for elem in np.unique(group):
        try:
            # result.append(sentences[tolist.index(elem)])
            result.append(sentences[np.where(group == elem)[0][0]])
        except IndexError:
            pass
    return result
Example #26
def insert_qna_content(dbconn, cursor):
    file_path = 'ap_qna_set2.xlsx'
    load_wb = load_workbook(file_path, data_only=True)
    load_ws = load_wb['chat4']

    all_values = []
    for row in load_ws.rows:
        row_value = []
        for cell in row:
            row_value.append(cell.value)
        all_values.append(row_value)

    kkma = Kkma()
    for idx, values in enumerate(all_values):
        if idx > 0:
            q_text = remove_sc(values[0])
            a_text = remove_sc(values[1])
            intent = values[2]
            img_src = values[3]
            entities = values[4]
            intent_tag = values[5]
            a_type = values[6]
            home_btn = values[7]
            q_nouns = kkma.nouns(q_text)
            if len(q_nouns) > 0:
                q_nouns = str(q_nouns)
            else:
                q_nouns = '[]'
            print(q_nouns)
            try:
                cursor.execute(f"""
					INSERT IGNORE INTO TEST_TBL_AP_QNA_CHAT_SET_LIST 
					(
						Q_TEXT, INTENT, A_TYPE, A_TEXT, PREV_BTN, HOME_BTN, Q_MORPHEMES, IMG_SRC, ENTITIES, INTENT_TAG, UPDATE_DATE
					)
					VALUES
					(
						"{q_text}", "{intent}", "{a_type}", "{a_text}", 3, {home_btn}, "{q_nouns}", "{img_src}", "{entities}", "{intent_tag}", NOW()
					)
				""")
            except Exception as e:
                print(f'error! >> insert_ap_qna_content >> {e}')
            finally:
                print(
                    f'[{idx}/{len(all_values)}({round((idx / len(all_values) * 100), 2)}%)] complete!!'
                )
                dbconn.commit()
                time.sleep(0.1)
Example #27
def make_tags(words):
    result_tag = ""

    try:
        kkma = Kkma()
        nouns = kkma.nouns(words)

        for index, n in enumerate(nouns):
            if (index > 0):
                result_tag = result_tag + ", "
            result_tag = result_tag + ("#" + n)

    except Exception as e:
        error_print(e, "make_tags", "", "M")

    finally:
        return result_tag
Example #28
def ceshi(i):
    conn = pymysql.connect(               # create the database connection
        host='gujiakai.softether.net',    # host of the database server
        user='******',                    # database login username
        password='******',                # login password
        database='library',               # database to use; can also be selected later with cursor.execute('use test_db')
        charset='utf8mb4'                 # charset; note this cannot be written as 'utf-8'
    )
    cursor = conn.cursor()
    t = Kkma()
    cursor.execute("select introduction from book_info where book_id=%s", i)
    res = cursor.fetchall()
    res = str(res)
    nouns = t.nouns(res)
    return nouns
Example #29
def chat_response():
    kkma = Kkma()
    query = request.args.get('q')
    response = None
    query_noun = kkma.nouns(str(query))
    print(query_noun)
    if "안녕" in query_noun:
        response = '안녕하세요!'
        return json.jsonify(response=response)

    elif "교열" in query_noun and "가격" in query_noun:
        response = "3장 기준으로 5천원입니다.\n3장 이상의 경우에는 8천원입니다."
        return json.jsonify(response=response)

    elif "교열" in query_noun and "언어" in query_noun:
        response = "현재는 중국학생들을 대상으로 한국어 문장을 교열하고 있습니다.\n영어 보고서 교열 서비스도 준비중에 있습니다."
        return json.jsonify(response=response)

    elif "교열" in query_noun and '얼마' in query_noun:
        response = "3장 기준으로 5천원입니다.\n3장 이상의 경우에는 8천원입니다."
        return json.jsonify(response=response)

    elif "교열" in query_noun and "견적" in query_noun:
        response = "3장 기준으로 5천원입니다.\n3장 이상의 경우에는 8천원입니다."
        return json.jsonify(response=response)

    elif "교열가" in query_noun:
        response = "교열가 정보는 실제 의뢰를 해주신 경우에 매칭되는 시스템입니다."
        return json.jsonify(response=response)

    elif "비밀" in query_noun:
        response = "물론입니다.\n교열이 완료된 문서에 대해 2주간 보관후 폐기를 원칙으로 하고 있습니다."
        return json.jsonify(response=response)

    elif "감사" in query_noun:
        response = "저희 바오가오 서비스를 이용해 주셔서 감사합니다.\n더욱 완성도 높은 서비스로 보답하겠습니다."
        return json.jsonify(response=response)

    elif "결제" in query_noun:
        response = "6월 1일까지는 무료로 이용이 가능합니다.\n유료화 이후에는 신용카드와 체크카드로 결제가 가능합니다."
        return json.jsonify(response=response)

    elif "교정" in query_noun and "번역" in query_noun and "차이" in query_noun:
        response = "번역은 원문만 제공해주시면 번역요청언어로 번역 서비스를 제공합니다.\n교정은 스스로 문장을"
        return json.jsonify(response=response)
Example #30
    def _mk_word_cloud_korean(self):
        target = ' '.join(self.text)

        kkma = Kkma()
        n = kkma.nouns(target)

        n = [temp for temp in n if len(temp) != 1 if not temp.isdecimal()]

        text = nltk.Text(n)
        data = text.vocab()
        data500 = data.most_common(500)

        dic = dict(data500)

        # Make word cloud object
        wc = WordCloud(font_path='/Library/Fonts/Arial Unicode.ttf', max_font_size=80, min_font_size=10,
                       background_color=self.color, mask=self.mask)

        self.wordcloud = wc.generate_from_frequencies(dic)
Example #31
def process(info, sim):
    input_sim = sim  # similarity sensitivity of the input data / the smaller the number, the more likely unrelated results appear / set to at least 50
    input_text = info
    input_text_list = input_text.split(' ')  # split the input data on spaces
    eng_text = re.sub('[^a-zA-Z]', ' ', input_text).strip()
    # print(word_tokenize(input_text))
    # print(input_text_list)

    kkma = Kkma()  # applying Kkma can produce duplicated sub-words, which have to be removed
    copy = []
    for txt in input_text_list:
        txt_ = kkma.nouns(txt)
        # print(txt_)

        if len(txt_) > 1:  # (the noun was split into pieces)
            max_string = max(txt_, key=len)  # remove the longest value (the duplicated compound)
            txt_.remove(max_string)

        copy += txt_
    # print(copy)

    if len(copy) > 3:
        del_list = []
        for i in range(math.ceil(len(copy) - 2)):
            overlap_txt = ''.join(
                (itemgetter(i, i + 2)(copy))
            )  # if kkma splits 'abc' => a, ab, abc, b, c => 'abc' was removed above => this step removes 'ab'
            if overlap_txt in copy:
                del_list.append(overlap_txt)
                # print(del_list)
        [i for i in del_list
         if i not in copy or copy.remove(i)]  # ordered set difference: drop del_list items from copy without reordering
    text = ' '.join(copy)

    if input_sim > 45:
        text += ','  # appending ',' forces the text to be treated as a company description, which yields more restricted slogans

    # if an English slogan is included, fall back to the original input
    if eng_text:
        if eng_text in input_text:
            text = input_text

    return text
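The two pruning steps above can be illustrated without konlpy. A tiny standalone sketch (the token list is made up, mimicking how kkma.nouns() can return nested sub-words for a compound): first the longest compound is dropped, then any leftover token that equals the concatenation of tokens two positions apart.

# Hypothetical illustration of the de-duplication in process() above.
tokens = ['a', 'ab', 'abc', 'b', 'c']        # pretend kkma.nouns() split one word like this

tokens.remove(max(tokens, key=len))          # step 1: drop the longest compound -> ['a', 'ab', 'b', 'c']

overlaps = [tokens[i] + tokens[i + 2] for i in range(len(tokens) - 2)]
tokens = [t for t in tokens if t not in overlaps]  # step 2: drop two-token overlaps -> ['a', 'b', 'c']

print(tokens)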
Example #32
def insert_qna_content(dbconn, cursor):
    file_path = 'qna_set.xlsx'
    load_wb = load_workbook(file_path, data_only=True)
    load_ws = load_wb['chat4']

    all_values = []
    for row in load_ws.rows:
        row_value = []
        for cell in row:
            row_value.append(cell.value)
        all_values.append(row_value)

    kkma = Kkma()
    for idx, values in enumerate(all_values):
        q_text = values[0]
        a_text = values[1]
        q_type = values[2]
        q_nouns = kkma.nouns(q_text)
        if len(q_nouns) > 0:
            q_nouns = str(q_nouns)
        else:
            q_nouns = '[]'
        print(q_nouns)
        try:
            cursor.execute(f"""
				INSERT IGNORE INTO TBL_QNA_CHAT_SET_LIST 
				(
					Q_TEXT, A_TEXT, Q_MORPHEMES, Q_TYPE, UPDATE_DATE
				)
				VALUES
				(
					"{q_text}", "{a_text}", "{q_nouns}", {q_type}, NOW()
				)
			""")
        except Exception as e:
            print(f'error! >> insert_qna_content >> {e}')
        finally:
            print(
                f'[{idx}/{len(all_values)}({round((idx / len(all_values) * 100), 2)}%)] complete!!'
            )
            dbconn.commit()
            time.sleep(0.1)
def getKeywords(src):
    kkma = Kkma()

    words = kkma.nouns(src)
    words = list(set(words))
    words_calc = []

    words_num = len(words)
    for word in words:
        if not word.isdigit() and not u'서울' in word and re.match('(.*)?\d+(.*)?', word) is None:
            word_count = src.count(word)
            word_idf = word_count * math.log(len(word))
            if word_idf > 1:
                words_calc.append((word, word_idf))

    words_sort = sorted(words_calc, key = lambda w: w[1], reverse = True)
    words_real = []

    for word in words_sort:
        words_real.append(word[0])

    print (" / ".join(words_real[:5])).encode('utf-8')
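The word_idf score above is just length-weighted frequency: count * ln(len(word)). Single-character nouns are always filtered out (ln 1 = 0), and longer nouns clear the > 1 cutoff with fewer occurrences. A quick check of that arithmetic with made-up counts:

import math

# Made-up counts for two candidate keywords.
for word, count in [(u'말', 10), (u'도서관', 2)]:
    score = count * math.log(len(word))  # same formula as word_idf above
    print("%s len=%d count=%d score=%.3f keep=%s" % (word, len(word), count, score, score > 1))

# u'말'    : 10 * ln(1) = 0.0    -> filtered out even though it is frequent
# u'도서관' : 2 * ln(3) ≈ 2.197  -> kept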
Example #34
def SortNoun(filename):
    # Read file
    f = open(filename, 'r')
    text = f.read().decode('utf-8') # read file as utf8 decoded
    f.close()
        
    # tagging
    from konlpy.tag import Kkma
    #from konlpy.utils import pprint
    kkma = Kkma()
    print ('now extracting nouns...')
    tagged = kkma.nouns(text)
    
    # Write tagged file
    (path,fnameExt) = os.path.split(filename)
    (fname,fext) = os.path.splitext(fnameExt)
    tagged_file = fname+'_'+'noun'+fext
    fw = open(tagged_file,'w')
    for line in tagged:
        strs = line.encode('utf-8')
        fw.write(strs+"\n")
    fw.close()
    print '%s is created' % (tagged_file)    
Example #35
class Crawler:
    def __init__(self):
        self.kkma = Kkma()
        self.conn = sqlite3.connect('yebi.db')
        self.cursor = self.conn.cursor()
        self.count = 20

        reload(sys)
        sys.setdefaultencoding('utf-8')

    def do(self):
        print '트위터 타임라인 탐색 중.'

        for x in TwitterFetcher().get_time_line(self.count):
            user_id = x['user']['id']
            print ''
            print '=' * 80
            print '... @%s: %s' % (x['user']['name'],  x['text'])

            t = (user_id, )
            self.cursor.execute('select count(*) from users where id=?', t)
            count_user = self.cursor.fetchone()[0]

            if count_user == 0: # if the User is not yet in the DB (count 0)
                name = x['user']['name']
                screen_name = x['user']['screen_name']
                profile_image = x['user']['profile_image_url_https']
                t = (user_id, name, screen_name, profile_image)
                self.cursor.execute('insert into users values(?, ?, ?, ?)', t)
                self.conn.commit()
                print "... 유저 %s를 User 디비에 추가중" % x['user']['name']

            i = 1

            tweet_id = x['id']
            t = (tweet_id, )
            self.cursor.execute('select count(*) from tweets where id=?', t)
            count_tweets = self.cursor.fetchone()[0]

            print "... 트윗 디비를 검색중"

            if count_tweets == 0:
                print "... 아직 디비에 없어요."
                text = x['text']
                created_at = x['created_at']
                t = (tweet_id, text, created_at, user_id)
                self.cursor.execute('insert into tweets values(?, ?, ?, ?)', t)
                self.conn.commit()
                print '... %s 추가 중' % x['text']

                for n in self.kkma.nouns(x['text']):
                    t = (user_id, n)
                    self.cursor.execute('select count from user_nouns where user_id=? and noun=?', t)
                    count_noun = self.cursor.fetchone()

                    screen_name = x['user']['screen_name']
                    if count_noun is not None:
                        print "... %s가 명사 \"%s\"의 갯수는 %d회 사용하였습니다." % \
                              (screen_name, n, count_noun[0])

                    if count_noun is None:
                        print "... %s가 명사 \"%s\"를 처음 사용하였습니다." % (screen_name, n)
                        #t = (user_id, n)
                        self.cursor.execute('insert into user_nouns values(?, ?, 1)', t)
                    else:
                        self.cursor.execute('update user_nouns set count=count+1 where user_id=? and noun=?',
                                            t)
            else:
                print "... 이미 디비에 있어요. (그래도 명사를 분석하겠습니다.)"
                for n in self.kkma.nouns(x['text']):
                #     print "...... %s" % n
                    t = (user_id, n)
                    self.cursor.execute('select count from user_nouns where user_id=? and noun=?', t)
                    count_noun = self.cursor.fetchone()

                    screen_name = x['user']['screen_name']
                    if count_noun is not None:
                        print "... %s가 명사 \"%s\"의 갯수는 %d회 사용하였습니다." \
                              % (screen_name, n, count_noun[0])

            i += 1
Example #36
__author__ = 'woojin'
# -*- coding: utf-8 -*-

from konlpy.tag import Kkma
from konlpy.utils import pprint

kkma = Kkma()
pprint(kkma.sentences('네, 안녕하세요. 반갑습니다.'))
pprint(kkma.nouns('질문이나 건의사항은 깃허브 이슈 트래커에 남겨주세요.'))
pprint(kkma.pos('오류보고는 실행환경, 에러메시지와 함께 설명을 최대한 상세히!!^^'))
Example #37
for line in data:
    i += 1

    if(line[8].strip().isdigit()):
        obj = {
            'name': line[7].strip(),
            'sum': int(line[8].strip()) * 1000,
            'categories': [
                line[2].strip(),
                line[3].strip(),
                line[4].strip(),
                line[5].strip()
            ]
        }

        words = kkma.nouns(line[7].strip().decode('utf-8'))
        for j, word in enumerate(words):
            words[j] = word.encode('utf-8')

        obj['words'] = words

        print str(i) + ' / 4014'

        output.append(obj)

jsonfile = open('./data/services.json', 'w')

json.dump(output, jsonfile)

print '--- finished ---'
Example #38
    poets = get_reviews()

    for poet in poets:
        sentences = poet.split('\n')

        for sentence in sentences:
            try:
                c += Counter(kkma.nouns(sentence))
            except NameError:
                c = Counter(kkma.nouns(sentence))
            except:
                pass

#poets = get_poets()
poets = get_reviews()
kkma = Kkma()

for idx, poet in enumerate(poets):
    tags = []
    for noun in kkma.nouns(poet):
        if noun in TAGS:
            tags.append(noun)

    hash_object = hashlib.sha1(poet.encode('utf-8', 'ignore'))
    hex_dig = hash_object.hexdigest()

    results = collection.find_one({'hex':hex_dig})
    if not results:
        document = {'text': poet, 'index': idx, 'tags': tags, 'hex': hex_dig, 'like': 0, 'date': datetime.datetime.utcnow()}
        collection.insert(document)
Example #39
def get_tags(text, ntags=40, multiplier=1):
    h = Kkma()
    nouns = h.nouns(text)
    count = Counter(nouns)
    return [{ 'tag': n, 'count': c }\
                for n, c in count.most_common(ntags)]
Example #40
def excel_noun():

	def excel_write(row_val, column_val, data):
		new_sheet.cell(row = row_val, column = column_val, value="%s" %data)

	wb=load_workbook('reference.xlsx')
	sheetList = wb.get_sheet_names()
	sheet = wb.get_sheet_by_name(sheetList[0])
	row_count = sheet.get_highest_row()
	
	new_sheet = wb.create_sheet(title='extraction')
	
	for i in range(2, row_count):
		if sheet.row_dimensions[i].visible :
			pass
		else :
			excel_write(i,1,'')
			new_sheet.row_dimensions[i].hidden = True
			#new_sheet.row_dimensions[i].outlineLevel = 1
			continue
	
		noun_val = ""
		full_qua = ""

		cellValue_name = sheet.cell(row=i, column=1).value
		cellValue = sheet.cell(row=i, column=2).value

		try :
			QUA = cellValue.count(u'\u201c')
		except :
			continue 

		if QUA != -1:
			if QUA == 1 :
				START_QUA = cellValue.find(u"\u201c") + 1 # position of first quotation mark
				CELL_VALUE_LEN = len(cellValue)

				cellValue_re = cellValue[START_QUA:CELL_VALUE_LEN]
				END_QUA = cellValue_re.find(u"\u201d") # position of last quotation mark

				cellValue_final = cellValue_re[0:END_QUA]
				print str(i) + "  "+ cellValue_name + "  "  + cellValue_final

				kkma = Kkma()
				#pprint (kkma.nouns(cellValue_final))
				s = (kkma.nouns(cellValue_final))

				for j in range(0,len(s)):
					noun_val = noun_val + s[j].encode('utf-8') + ','

				excel_write(i, 1, cellValue_name)
				excel_write(i, 2, cellValue_final)
				excel_write(i, 3, noun_val)

			elif QUA == 0 :
				#print str(i) + " " + cellValue
				ANOTHER_QUA = cellValue.find("\"") + 1 # position of first quotation mark
				ANOTHER_QUA_LEN = len(cellValue)

				another_cellValue = cellValue[ANOTHER_QUA:ANOTHER_QUA_LEN]
				ANOTHER_END_QUA = another_cellValue.find("\"")

				another_cellValue_final = another_cellValue[0:ANOTHER_END_QUA]
				#print str(i) + "  " + cellValue_name + "  " + another_cellValue_final
				kkma = Kkma()
				#pprint (kkma.nouns(cellValue_final))
				s = (kkma.nouns(another_cellValue_final))

				for j in range(0,len(s)):
					noun_val = noun_val + s[j].encode('utf-8') + ','

				excel_write(i, 1, cellValue_name)
				excel_write(i, 2, another_cellValue_final)
				excel_write(i, 3, noun_val)

			elif QUA > 1 :
				#print str(i) + " " + str(QUA)
				for q in range(0,QUA):
					arr = cellValue.split(u"\u201d")
					arr_start_qua = arr[q].find(u"\u201c") + 1
					arr_len = len(arr[q]) 

					arr_cellValue = arr[q][arr_start_qua:arr_len]

					full_qua = full_qua + arr_cellValue

					kkma = Kkma()
					#pprint (kkma.nouns(cellValue_final))
					s = (kkma.nouns(arr_cellValue))

					for j in range(0,len(s)):
						noun_val = noun_val + s[j].encode('utf-8') + ','
						#print str(i) + " " + arr_cellValue

					excel_write(i, 1, cellValue_name)
					excel_write(i, 2, full_qua)
					excel_write(i, 3, noun_val)

	wb.save('reference.xlsx')
import zmq
from konlpy.tag import Kkma
from konlpy.utils import pprint
kkma = Kkma()

from multiprocessing import Pool

port = 46000

context = zmq.Context()
socket = context.socket(zmq.REP)
socket.bind('tcp://127.0.0.1:%s' % port)

while True:
	print 'in the loop'
	# Wait for next request from client
	message = socket.recv()
	result = kkma.nouns(message);
	result = ', '.join(result)
	print '------'
	print result
	socket.send_string(result) # unicode is not allowed with socket.send; use send_string instead

"""
string = u'안녕하세요. 건국대학교에 오신걸 환영합니다. 도서관은 우측에 있습니다.'
string2 = u'5학년되니까 학교근처엔 도저히 먹을게없다'
string3 = u'카이리스님께 사과문올립니다'

start = time.time()

pprint(kkma.nouns(string))
pprint(kkma.nouns(string2))
pprint(kkma.nouns(string3))
Example #42
            # print(soup.prettify())
            # print(soup)
            newsbody = soup.find(id="articleBodyContents")
            # print(newsbody.contents)
            bodystr = ""
            try:
                for child in newsbody.children:
                    if (isinstance(child, NavigableString) and not isinstance(child, Comment)):
                        # print(child.string.strip())
                        bodystr += child.string.strip()

                # morphological analysis
                kkma = Kkma()
                # pprint(kkma.nouns(bodystr))
                # pprint(kkma.pos(bodystr))
                wordList = kkma.nouns(bodystr)
                print('k : ', k)
                if k == 0:
                    testEntry = wordList
                    testIssueDate = issueDate
                    testTitle = soup.title.string
                    k = k + 1
                else:
                    if (int(df[df['날짜'] >= issueDate].tail(1)['종가']) > int(df[df['날짜'] < issueDate].head(1)['종가'])):
                        print('up')
                        docList.append(wordList)
                        classList.append(1)
                    else:
                        if (int(df[df['날짜'] >= issueDate].tail(1)['종가']) < int(df[df['날짜'] < issueDate].head(1)['종가'])):
                            print('down')
                            docList.append(wordList)
Example #43
from konlpy.tag import Kkma
from konlpy.utils import pprint

kkma = Kkma()

pprint(kkma.sentences(u'네, 안녕하세요. 의류매장 입니다'))
pprint(kkma.nouns(u'구입하실 물건 있으시면 말씀해주세요.'))
pprint(kkma.pos(u'하하하 즐거운 쇼핑입니다.'))
from konlpy.tag import Kkma
from konlpy.utils import pprint
kkma = Kkma()

string = u'안녕하세요. 건국대학교에 오신걸 환영합니다. 도서관은 우측에 있습니다.'

pprint(kkma.nouns(string))
Example #45
def excel_noun():

	def excel_write(row_val, column_val, data):
		new_sheet.cell(row = row_val, column = column_val, value="%s" %data)

	wb=load_workbook(REFERENCE_EXCEL)

	sheetList = wb.get_sheet_names()
	sheet = wb.get_sheet_by_name(sheetList[0])
	row_count = sheet.get_highest_row()
	
	new_sheet = wb.create_sheet(title='extraction')
	
	news_info = {}
	
	for i in range(1, row_count):
		noun_val = ""
		full_qua = ""

		cellValue_name = sheet.cell(row=i, column=1).value
		cellValue = sheet.cell(row=i, column=2).value
		cellValue_id = sheet.cell(row=i, column=3).value

		# u201c 'LEFT DOUBLE QUOTATION MARK'
		# u201d 'RIGHT DOUBLE QUOTATION MARK'

		try :
			QUA = cellValue.count(u'\u201c')  # u201c 'LEFT DOUBLE QUOTATION MARK'
		except :
			continue 

		if QUA != -1:
			if QUA == 1 :
				START_QUA = cellValue.find(u"\u201c") + 1 # position of first quotation mark
				CELL_VALUE_LEN = len(cellValue)

				cellValue_re = cellValue[START_QUA:CELL_VALUE_LEN]
				END_QUA = cellValue_re.find(u"\u201d") # position of last quotation mark

				cellValue_final = cellValue_re[0:END_QUA]
				#print str(i) + "  "+ cellValue_name + "  "  + cellValue_final

				kkma = Kkma()
				#pprint (kkma.nouns(cellValue_final))
				s = (kkma.nouns(cellValue_final))

				for j in range(0,len(s)):
					noun_val = noun_val + s[j].encode('utf-8') + ','

				news_tuple=(cellValue_name, cellValue, noun_val, cellValue_id)
				news_info[i]={news_tuple}

				MyPrettyPrinter().pprint(news_info[i])

				excel_write(i, 1, cellValue_name)
				excel_write(i, 2, cellValue_final)
				excel_write(i, 3, noun_val)
				excel_write(i, 4, cellValue_id)

			elif QUA == 0 :
				#print str(i) + " " + cellValue
				ANOTHER_QUA = cellValue.find("\"") + 1 # position of first quotation mark
				ANOTHER_QUA_LEN = len(cellValue)

				another_cellValue = cellValue[ANOTHER_QUA:ANOTHER_QUA_LEN]
				ANOTHER_END_QUA = another_cellValue.find("\"")

				another_cellValue_final = another_cellValue[0:ANOTHER_END_QUA]
				#print str(i) + "  " + cellValue_name + "  " + another_cellValue_final
				kkma = Kkma()
				#pprint (kkma.nouns(cellValue_final))
				s = (kkma.nouns(another_cellValue_final))

				for j in range(0,len(s)):
					noun_val = noun_val + s[j].encode('utf-8') + ','

				news_tuple=(cellValue_name, cellValue, noun_val, cellValue_id)
				news_info[i]={news_tuple}

				MyPrettyPrinter().pprint(news_info[i])

				excel_write(i, 1, cellValue_name)
				excel_write(i, 2, another_cellValue_final)
				excel_write(i, 3, noun_val)
				excel_write(i, 4, cellValue_id)

			elif QUA > 1 :
				#print str(i) + " " + str(QUA)
				for q in range(0,QUA):
					arr = cellValue.split(u"\u201d")

					if arr is not None:
						try :
							arr_start_qua = arr[q].find(u"\u201c") + 1
						except :
							continue

						arr_len = len(arr[q]) 

						arr_cellValue = arr[q][arr_start_qua:arr_len]
						full_qua = full_qua + arr_cellValue

						kkma = Kkma()
						#pprint (kkma.nouns(cellValue_final))
						s = (kkma.nouns(arr_cellValue))

						for j in range(0,len(s)):
							noun_val = noun_val + s[j].encode('utf-8') + ','
							#print str(i) + " " + arr_cellValue

						news_tuple=(cellValue_name, cellValue, noun_val, cellValue_id)
						news_info[i]={news_tuple}

						MyPrettyPrinter().pprint(news_info[i])

						excel_write(i, 1, cellValue_name)
						excel_write(i, 2, full_qua)
						excel_write(i, 3, noun_val)
						excel_write(i, 4, cellValue_id)

	wb.save(REFERENCE_EXCEL)
	nt.saveObjectBinaryFast(news_info, DICT_NEWS_INFO) 
Example #46
# This is a script to test KoNLPy.
# Project started on 01/18/2016. Authored by Jaehyun Ahn ([email protected])
__author__ = 'Sogo'

from konlpy.tag import Kkma
from collections import Counter

print('Number of lines in document:')
k = Kkma()
f = open('test.txt', 'r')
lines = f.read().splitlines()
nlines = len(lines)
print(nlines)

nouns = [k.nouns(lines[i]) for i in range(0, nlines)]

cnt = Counter()
for i in range(len(nouns)):
    for j in range(len(nouns[i])):
        cnt[nouns[i][j]] += 1
print(cnt.most_common(15))
# let's get words! It's a steal!
print(cnt.most_common(15)[0][0])
print(cnt.most_common(15)[1])