Example #1
def generate_wordCloud(text, font_path, extractNum = 15):
    hannanum = Hannanum()
    setFont(font_path)

    ## mask image
    image_mask = np.array(Image.open("./utils/visualize/만세_보노.jpg"))

    cleanText = clean_text(text)
    words = hannanum.nouns(cleanText)
    word_list = flatten(words)
    word_list = pd.Series([x for x in word_list if len(x)>1]) #; print( word_list.value_counts().head(20) )
    stopwordList = ['’','”','‘','·','…','"',"'"]
    wordcloud = WordCloud(font_path=font_path
                        , stopwords=stopwordList
                        , width=800, height=800
                        , mask=image_mask
                        , background_color='white')

    count = Counter(word_list)
    wordcloud = wordcloud.generate_from_frequencies(count)
    array = wordcloud.to_array()

    fig = plt.figure(figsize=(10,10))
    plt.imshow(array, interpolation='bilinear')
    plt.axis("off")
    buf = io.BytesIO()
    plt.savefig(buf, format='png')
    buf.seek(0)
    string = b64encode(buf.read())
    wcURI = 'data:image/png;base64,' + urllib.parse.quote(string)
    count = count.most_common(extractNum)
    barURI = generate_barchart(count)
    return wcURI, barURI, count
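A hedged usage sketch for generate_wordCloud; the article text, the font path, and the helper functions it relies on (setFont, clean_text, flatten, generate_barchart) are assumed to exist in the surrounding module:

# Hypothetical call; wcURI and barURI are base64 PNG data URIs that can be
# embedded directly into an <img src="..."> tag, and count holds the top nouns.
wc_uri, bar_uri, top_words = generate_wordCloud(article_text,
                                                font_path='./fonts/NanumGothic.ttf',
                                                extractNum=10)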
Example #2
def test():
    setFont()
    hannanum = Hannanum()
    # DB Connection
    # conn = oci.connect("test/[email protected]:32764/xe", charset='utf8')
    conn = oci.connect('test','1234','192.168.0.52:32764/xe', encoding='utf-8')
    df = pd.read_sql('select * from article_sample', conn )
    sample1 = df['ARTICLE_CONTENT'][0].read()
    word = hannanum.nouns(sample1)
    word_list = flatten(word)
    word_list = pd.Series([x for x in word_list if len(x)>1])
    print( word_list.value_counts().head(20) )
    stopwordList = ''
    wordcloud = WordCloud(font_path=setFontPath()
                        , stopwords=stopwordList
                        , width=800, height=800
                        , background_color='white')

    count = Counter(word_list)
    wordcloud = wordcloud.generate_from_frequencies(count)
    array = wordcloud.to_array()

    fig = plt.figure(figsize=(10,10))
    plt.imshow(array, interpolation='bilinear')
    plt.axis("off")
    plt.show()
    # plt.savefig('C:/Users/admin/Documents/IMG04.png', bbox_inches='tight')
Example #3
def konlpyHannanum(inputSentence: str, sentenceList: list) -> dict:
    han = Hannanum()
    sentenceDict = dict()

    inputPos = han.pos(inputSentence)
    inputPosCount = Counter(inputPos)
    inputLen = len(inputPosCount)

    for line in sentenceList:
        if line == '':
            continue
        sentencePos = han.pos(line)
        sentencePosCount = Counter(sentencePos)
        sentenceLen = len(sentencePosCount)

        # Count overlapping (morpheme, tag) pairs, then normalize by the
        # smaller of the two POS-count sizes; this matches the original
        # branches, which divided by inputLen or sentenceLen respectively.
        common = 0
        for morpheme in inputPosCount:
            if morpheme in sentencePosCount:
                common += min(inputPosCount[morpheme],
                              sentencePosCount[morpheme])
        if common:
            similarity = 100 * common / min(inputLen, sentenceLen)
            sentenceDict[line] = similarity

    return sentenceDict
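A minimal usage sketch for konlpyHannanum, assuming konlpy is installed and a JVM is available; the sentences are only illustrations:

input_sentence = "오늘 날씨가 정말 좋다"
candidates = ["오늘 날씨가 좋다", "내일은 비가 온다", ""]
scores = konlpyHannanum(input_sentence, candidates)
# scores maps each candidate that shares at least one (morpheme, tag) pair
# with the input to a 0-100 overlap score; empty strings are skipped.
print(scores)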
Example #4
def text_preprocessing_after(lists):
    hannanum = Hannanum()
    getNum = 5
    stopword = ['등', '코', '만', '속보', '최초', '4억', '월요일']
    cleaning = lambda x: hannanum.nouns(wordcloud01.clean_text(x))
    nouns_list = list(map(cleaning, lists))

    # print(nouns_list)

    texts = [value for nouns in nouns_list for value in nouns]
    total_counter = Counter(texts)
    for word in stopword:
        del total_counter[word]
    result = total_counter.most_common(getNum)
    return result


## 명사 빈도 추출. ##################################################
# def nouns_frequency(text):
#     print('Kkma 객체 생성')
#     hannanum = Kkma()
#     print('텍스트 처리중')
#     clean_text = wordcloud01.clean_text(text)
#     print('텍스트 명사 처리중')
#     words = hannanum.nouns(clean_text)
#     print('평평하게 만들기')
#     word_list = wordcloud01.flatten(words)
#     print('판다스 변환중')
#     word_list = pd.Series([x for x in word_list if len(x)>1])
#     print('result Counter 중')
#     result = Counter(word_list)
#     return result
Example #5
    def crawl(self, food_detail_df):
        print('[Recipe Web Crawling Start]')
        for index in range(len(food_detail_df)):

            food = food_detail_df.foodName[index]
            recipe = self.recipe_finder(food, 2)
            food_detail_df.loc[index, 'foodRecipe'] = str(recipe)

            if (index + 1) % 5 == 0:
                print(round((index + 1) / len(food_detail_df) * 100, 2),
                      'percent Done')

        print('Complete!!')
        print('')
        print('[noun extract start]')

        food_detail_df['foodRecipeNoun'] = ''
        for i in range(len(food_detail_df)):

            doc = food_detail_df.foodRecipe[i]
            noun = Hannanum().nouns(doc)
            cnt = Counter(noun)
            only_word = []
            # keep only nouns that appear at least three times
            noun = [word for word in noun if cnt[word] >= 3]
            for word in noun:
                m = re.match(r'^\D*\D$', word)
                if m:
                    only_word.append(m.group())

            food_detail_df.loc[i, 'foodRecipeNoun'] = str(only_word)
            if (i % 5) == 0:
                print(round(i / len(food_detail_df) * 100, 2), ' percent done')
        print('Complete')
Example #6
def lineAnalyzer(sentence, analyzeType):
    hannanum = Hannanum()
    wordList = list()
    if (analyzeType == 1):
        # Nouns
        wordList = hannanum.nouns(str(sentence))
    elif (analyzeType == 2):
        # Morphs
        wordList = hannanum.morphs(str(sentence))
    elif (analyzeType == 3):
        # Bi-grams
        bigram_measures = collocations.BigramAssocMeasures()
        pos = hannanum.pos(str(sentence))
        words = [s for s, t in pos]
        finder = collocations.BigramCollocationFinder.from_words(words)
        finder.apply_word_filter(lambda w: len(w) < 2)
        finder.apply_freq_filter(3)
        wordList = finder.nbest(bigram_measures.pmi, 10)
    elif (analyzeType == 4):
        # Tri-grams
        trigram_measures = collocations.TrigramAssocMeasures()
        pos = hannanum.pos(str(sentence))
        words = [s for s, t in pos]
        finder = collocations.TrigramCollocationFinder.from_words(words)
        finder.apply_word_filter(lambda w: len(w) < 2)
        finder.apply_freq_filter(3)
        wordList = finder.nbest(trigram_measures.pmi, 10)
    else:
        print("error on top!")
    return wordList
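A short usage sketch for lineAnalyzer; it assumes konlpy's Hannanum and nltk's collocations module are imported as in the function above, and the text is only an illustration:

text = "자연어 처리는 재미있다. 자연어 처리는 어렵다."
nouns = lineAnalyzer(text, 1)    # noun list
morphs = lineAnalyzer(text, 2)   # morpheme list
bigrams = lineAnalyzer(text, 3)  # up to 10 PMI-ranked bigrams (words >= 2 chars, freq >= 3)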
Example #7
def getSentenceByWord():
    if (request.method == 'POST'):

        hannanum = Hannanum()
        word_data = hannanum.pos(request.form['wordData'])[0][0]

        # print(word_data)
        sentence_dict = {"sentenceId": 0, "sentenceData": "", "standard": ""}
        sentence_id_list = []
        sentence_list = []

        for wd in db_session.query(Word).order_by(
                Word.wordId).filter(Word.wordData == word_data):
            sentence_id_list.append(wd.sentenceId)

        # print(sentence_id_list)
        for sid in sentence_id_list:
            sentence = db_session.query(Sentence).filter(
                Sentence.sentenceId == sid).first()
            sentence_dict["sentenceId"] = sentence.sentenceId
            sentence_dict["sentenceData"] = sentence.sentenceData
            sentence_dict["standard"] = sentence.standard
            sentence_list.append(sentence_dict.copy())

        return json.dumps(sentence_list, ensure_ascii=False)
Example #8
def get_tags(text, ntags=50, multiplier=10):
    h = Hannanum()
    nouns = h.nouns(text)
    count = Counter(nouns)
    print(count)
    return [{'color': color(), 'tag': n, 'size': c * multiplier} \
            for n, c in count.most_common(ntags)]
Example #9
def test():
    rss_list = [
        # "https://www.reddit.com/",
        "http://www.chosun.com/site/data/rss/politics.xml",
        "http://rss.joins.com/joins_politics_list.xml",
    ]

    hannanum = Hannanum()
    # mecab = Mecab()

    for rss_link in rss_list:
        print("Start get_URLs and read files from : " + rss_link)
        start_time = time.time()
        links = get_URLs(rss_link)
        for link in links:
            parse_time = time.time()
            article = get_article(link)
            file = open("./test/%s.txt" % (article.title),
                        'w',
                        encoding="utf8")
            nouns = hannanum.nouns(article.text)
            # nouns = mecab.nouns(article.text)

            for noun in nouns:
                file.write("%s\n" % noun)
            file.close()
            parse_time = time.time() - parse_time
            print("parse files from %s: %f" % (link, parse_time))
        start_time = time.time() - start_time
        print("Process time : %f" % (start_time))
Example #10
    def generate_summary(self, file_name, index, top_n=5):
        stop_words = read_data(filename='korean_stopwords_list.txt')

        summarize_text = []

        # Step 1 - Read text and split it
        sentences = self.read_article(file_name, index)

        # added tokenization (noun extraction per sentence)
        hannanum = Hannanum()
        temp = []
        for sentence in sentences:
            temp.append(hannanum.nouns(' '.join(sentence)))
        # print("temp:",temp)

        # Step 2 - Generate similarity matrix across sentences
        sentence_similarity_martix = self.build_similarity_matrix(
            temp, stop_words)

        # Step 3 - Rank sentences in the similarity matrix
        sentence_similarity_graph = nx.from_numpy_array(
            sentence_similarity_martix)
        scores = nx.pagerank(sentence_similarity_graph)

        # Step 4 - Sort the rank and pick top sentences
        ranked_sentence = sorted(
            ((scores[i], s) for i, s in enumerate(sentences)), reverse=True)
        for i in range(top_n):
            summarize_text.append(" ".join(ranked_sentence[i][1]))

        # Step 5 - Finally, output the summarized text
        print("\nSummarize Text: \n", ". ".join(summarize_text))
Example #11
def get_string(path):
    f = open(path, "r", encoding="utf-8")
    sample = f.read()
    f.close()
    h = Hannanum()
    list_nouns = h.nouns(sample) #get list of nouns from sample
    return listToString(list_nouns) #get string of list_nouns
Example #12
    def make_corpus(self):
        processor = Hannanum()
        with open('outputs/sermon-{}-corpus.txt'.format(self.name),
                  'w') as fout:
            data_dir = 'data/{}'.format(self.name)
            for filename in os.listdir(data_dir):
                if not filename.endswith('txt'):
                    continue

                path = os.path.join(data_dir, filename)
                with open(path) as fin:
                    print(path)
                    for line in fin:
                        _line = self.clean_punctuation(line)
                        if not _line:
                            continue

                        _lines = _line.split('.')

                        for l in _lines:
                            _l = self.clean_punctuation(l)
                            if not _l:
                                continue
                            sentence = [
                                '{}/{}'.format(word, tag)
                                for word, tag in processor.pos(_l) if
                                self.filter_tag(tag) and self.filter_word(word)
                            ]
                            if len(sentence) > 2:
                                fout.write(' '.join(sentence) + '\n')
Example #13
def tag_article():
    """
    주기 : 하루
    작업 : 게시물의 명사를 기준으로 테그한다.
    """
    from konlpy.tag import Hannanum
    hannanum = Hannanum()

    for article in Article.objects.all():
        try:
            tags = [
                tag for tag in hannanum.nouns(article.title) if len(tag) > 1
            ]

            for tag in tags[:10]:
                splits = re.split(r',', tag)

                tags.remove(tag)

                if len(splits) > 1:
                    for split in splits:
                        tags.append(split.strip(SPLIT_CHAR))
                else:
                    tags.append(tag.strip(SPLIT_CHAR))

            article.do_tag(tags)

        except Exception as e:
            print(hannanum.nouns(article.title))
            print(e)
Example #14
def hannanum(sentence):
    h = Hannanum()

    if jpype.isJVMStarted():
        jpype.attachThreadToJVM()

    return h.analyze(sentence)
Example #15
    def NLP(self, food_detail_df):

        print('[noun extract start]')

        food_detail_df['foodRecipeNoun'] = ''
        for i in range(len(food_detail_df)):

            doc = food_detail_df.foodRecipe[i]
            noun = Hannanum().nouns(doc)

            # strip chat fillers and escaped newlines from each noun
            noun = [
                word.replace('ㅎ', '').replace('ㅋ', '').replace(
                    'ㅜㅜ', '').replace('ㅠㅠ', '').replace('\\n', '')
                for word in noun
            ]

            cnt = Counter(noun)
            only_word = []

            #if (len(key) < 2)|(len(key) > 6):
            #noun.remove(key)

            # keep only nouns that appear at least three times
            noun = [word for word in noun if cnt[word] >= 3]

            for word in noun:

                m = re.match(r'^\D*\D$', word)
                if m:
                    only_word.append(m.group())

            food_detail_df.loc[i, 'foodRecipeNoun'] = str(only_word)
            if (i % 5) == 0:
                print(round(i / len(food_detail_df) * 100, 2), ' percent done')
        print('Complete')
Example #16
 def __init__(self):
     self.komoran = Komoran()
     self.kkma = Kkma()
     self.hann = Hannanum()
     self.mecab = Mecab()
     self.twitter = Twitter()
     self.okt = Okt()
Example #17
def get_derived_query(keyword):

    # google translate API
    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "My Project-a8e42c74ea7e.json"
    translate_client = translate.Client()

    # Hannanum pos tagger
    hannanum = Hannanum()
    """ Retrieve derived queries from keyword by using WordNet Synset """
    nouns = [word for word, pos in hannanum.pos(keyword) if pos == "N"]
    syn_dict = {}
    query_list = []

    for noun in nouns:
        result = translate_client.translate(noun, target_language="en")
        if len(result["translatedText"].split(" ")) > 1:  # compound nouns are not handled
            continue
        else:
            translated_noun = result["translatedText"]
            # print(noun, translated_noun)
            synonyms = []
            for syn in wordnet.synsets(translated_noun):
                if syn.pos() == "n":
                    syn_word = syn.name().split(".")[0]
                    synonyms.append(syn_word)

        syn_dict[noun] = synonyms

    if len(syn_dict) > 0:
        for noun in syn_dict:
            for syn in syn_dict[noun]:
                syn_ko = translate_client.translate(syn, target_language="ko")["translatedText"]
                query_list.append(keyword.replace(noun, syn_ko))

    return list(np.unique(query_list))
Example #18
    def max_similarity(self):
        konlpy = Hannanum()
        l = konlpy.nouns(self.lyrics_input)
        song_list = self.song_list
        song_id = 0
        max_similarity = 0.0

        result = self.compare_lyrics()
        if result > 0:
            return result

        print("입력된 가사의 단어 배열: ", l)
        for song in song_list:
            if song['words'] is None:
                song['words'] = konlpy.nouns(song['lyrics'])

            print("song_id, title: ", song['song_id'], song['title'])
            temp = self.measure_similarity(l, song['words'])
            print("코사인 유사도: ", temp)
            print()
            if temp > max_similarity:
                song_id = song['song_id']
                # uncomment if you also want the title
                # title = song['title']
                max_similarity = temp

        # uncomment if you also want the title
        return song_id  # , title
Example #19
def preprocess_npm(document):
    '''Extract Korean substantives (N), predicates (P), and modifiers (M) and organize them into a word list per sentence.'''
    sentences = re.split(r'\.\s+', document.strip())
    results = []
    for sent in sentences:
        try:
            # remove characters that are neither Korean nor English
            letters_only = re.sub('[^ㄱ-힣a-zA-Z]', ' ', sent)
            # load the morphological analyzer and split into words
            hannanum = Hannanum()
            morp_words = hannanum.pos(letters_only)
            # keep only selected POS; predicates get '다' appended to form the base verb
            morph_words = []
            for w in morp_words:
                if w[1] in ['N', 'M']:  # N substantive, P predicate, M modifier
                    morph_words.append(w[0])
                elif w[1] == 'P':
                    morph_words.append(w[0] + "다")
                else:
                    pass
            # remove stopwords
            stopwords = [
                '특파원', '기자', '단독', '앵커', '취재', '특종', '신문', '방송', '보도', '외신',
                '뉴스'
            ]  # add more stopwords as needed
            meaningful_words = [w for w in morph_words if w not in stopwords]
            # keep only words of two or more characters
            meaningful_words2 = [w for w in meaningful_words if len(w) > 1]
            results.append(meaningful_words2)
        except:
            results.append([''])
    return results
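A brief usage sketch for preprocess_npm; the document string is only an illustration:

doc = "기자들이 경제 지표를 보도했다. 시장 반응은 긍정적이었다."
for word_list in preprocess_npm(doc):
    print(word_list)
# Each printed list holds one sentence's substantives, modifiers, and
# '다'-suffixed predicates of two or more characters, minus the stopwords.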
Example #20
def divide_with_morpheme(raw_data, total=1):
    data = []
    hannanum = Hannanum()
    if total == 1:
        for itr, article in enumerate(raw_data):
            for sentence in article:
                #print(sentence)
                if sentence != '':
                    pos_result = hannanum.morphs(sentence)
                    tmp = " ".join(pos_result)
                    data.append(tmp)
            print(str(itr)+ 'th article processed')
            print('last sentence : ' + tmp)
        return data
    elif total == 0:
        for itr, article in enumerate(raw_data):
            tmp_data = []
            for sentence in article:
                #print(sentence)
                if sentence != '':
                    pos_result = hannanum.morphs(sentence)
                    tmp = " ".join(pos_result)
                    tmp_data.append(tmp)
            print(str(itr)+ 'th article processed')
            print('last sentence : ' + tmp)
            data.append(tmp_data)
        return data
Example #21
def get_tags(text, ntags=50, multiplier=10):
	h = Hannanum()
	nouns = h.nouns(text)
	count = Counter(nouns)

	# for word,cnt in count.most_common(ntags):
	#	print(word,cnt)
	return count
Example #22
def review_preprocessing(data):
    # Hannanum package
    pos_tagger = Hannanum()

    # tokenize the news text and extract only the nouns
    pos_nouns = pos_tagger.nouns(data)

    return ' '.join(pos_nouns)
Example #23
def text_mining(title_list, ntags=50, multiplier=1):
    h = Hannanum()
    data_nouns = []
    for title in title_list:
        data_nouns.extend(h.nouns(title))
    
    count = Counter(data_nouns)
    
    return [{'color': color(),'tag':n,'size':int(c*multiplier*0.5)} for n,c in count.most_common(ntags)]
Example #24
def extract_pos(text):
    h = Hannanum()
    pos = h.pos(text, ntags=22, flatten=True)
    pos_list = [item for item in pos if item[1] == 'NC' or item[1] == 'NQ' or item[1] == 'NN' or item[1] == 'PV' or item[1] == 'PA']
    dct = dict(pos_list)
    for stopword in stopwords.itertuples():  # check stopwords
        if dct.get(stopword._1):
            del dct[stopword._1]
    split_pos = "|".join("%s,%s" % tup for tup in dct.items())
    return split_pos
Example #25
def comment_freq(youtube_data):
    # youtuber_csv_data = dm.GetData(url, con)
    # if youtuber_csv_data == None:
    #     print("데이터 없음")
    #     return None
    # video_num = int(input("몇 번 동영상을 분석할까요 ? "))
    # youtube_data = dm.GetData(youtuber_csv_data[video_num][0], password) >> implemented in main.py
    if youtube_data == None:
        return None
    comment = []
    for i in range(len(youtube_data)):
        comment.append(youtube_data[i][2])

    emoji_pattern = re.compile(
        "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        "]+",
        flags=re.UNICODE)
    han = re.compile(r'[ㄱ-ㅎㅏ-ㅣ!?~,".\n\r#\ufeff\u200d]')

    comment_noemot = []
    for i in comment:
        tokens = re.sub(emoji_pattern, "", i)
        tokens = re.sub(han, "", tokens)
        comment_noemot.append(tokens)

    nouns = []
    h = Hannanum()

    for i in comment_noemot:
        n = h.nouns(i)
        nouns.append(n)

    noun_list = []
    for i in range(len(nouns)):
        for j in range(len(nouns[i])):
            noun_list.append(nouns[i][j])

    counts = Counter(noun_list)
    tags = counts.most_common(30)

    wc = WordCloud(font_path='C:\\Windows\\Fonts\\gulim.ttc',
                   background_color='black',
                   width=800,
                   height=600)

    cloud = wc.generate_from_frequencies(dict(tags))
    cloud
    plt.figure(figsize=(10, 8))
    plt.axis('off')
    plt.imshow(cloud)
    plt.show()
Example #26
def sentiment_analysis(tweet, tweets):
    # 1) Split into morphemes.
    content = tweet[1]
    content_morphs = []
    hannanum = Hannanum()
    content_morphs = hannanum.morphs(content)
    #   print("형태소 분류: ", content_morphs)

    # 1)-2 Split on whitespace.
    space = content.split(" ")
    print(space)

    # 2) Remove stopwords
    # 2)-1 Load the stopword file; stopwords => stopword list
    stopwords_file = open("불용어.txt", 'r', encoding='utf-8')
    stopwords = []
    lines = stopwords_file.readlines()
    for line in lines:
        line = line.replace("\n", "")
        stopwords.append(line)

    # 2)-2 Remove the stopwords (filter into a new list; removing items
    #      while iterating would skip elements)
    content_morphs = [i for i in content_morphs if i not in stopwords]


#    print("불용어 제거: " ,content_morphs)

    # 3) Compute polarity per token
    # data: sentiment dictionary
    with open('data/SentiWord_info.json', encoding='utf-8-sig', mode='r') as f:
        data = json.load(f)

    score = 0
    for wordname in space:
        for i in range(0, len(data)):
            # compare against both the word root and the word itself
            if (data[i]['word_root'] == wordname) or (data[i]['word']
                                                      == wordname):
                if data[i]['polarity'] != "None":
                    score += int(data[i]['polarity'])
                    break
        if score > 0:
            polarity = "positive"
        elif score == 0:
            polarity = "neutral"
        else:
            polarity = "negative"

    tweet[4] = polarity
    #    hashtag_sentiment_analysis(tweet, tweets)
    tweets.append(tweet)
    print("content: ", content)
    print("polarity: ", polarity)
Example #27
 def __init__(self, tech_words, double_words, triple_words, syns_words):
     self.puctuation = re.compile('[!"$%&\'()*,-/:;<=>?@[\\]^_`{|}~]')
     self.hannanum = Hannanum()
     with open(tech_words, 'rb') as f:
         self.tech_words = pickle.load(f)
     with open(double_words, 'rb') as f:
         self.double_words = pickle.load(f)
     with open(triple_words, 'rb') as f:
         self.triple_words = pickle.load(f)
     with open(syns_words, 'rb') as f:
         self.syns_words = pickle.load(f)
Example #28
    def __init__(self):
        self.parser = reqparse.RequestParser()
        print("LoadSrcFile init")
        self.parser.add_argument("group_path", type=str, location="json")

        self.token_manager = TokenManager.instance()
        print("self.parser.parse_args() : ", self.parser.parse_args())
        self.group_path = self.parser.parse_args()["group_path"]
        self.t = Okt()
        self.ha = Hannanum()
        super(LoadSrcFile, self).__init__()
Example #29
def tokenization(cleaned_docs):
    han = Hannanum()
    tokenized_docs = []
    while ' ' in cleaned_docs:
        cleaned_docs.remove(' ')
    for doc in cleaned_docs:
        nouns_in_doc = []
        for noun in han.nouns(doc):
            if len(noun) > 1: nouns_in_doc.append(noun)
        tokenized_docs.append(nouns_in_doc)
    return tokenized_docs
Example #30
def update_words_all():
    hannanum = Hannanum()
    db = db_connector.DbConnector()
    song_list = db.select_all()
    for song in song_list:
        if song['lyrics'] is not None and song['words'] is None:
            words = hannanum.nouns(song['lyrics'])
            words = sorted(set(words))
            update_words(song['song_id'], ' '.join(words))

    print('Words extraction done!')
Example #31
    def wordAnalysis(text):
        myHannanum = Hannanum()

        print("text : " + text)

        replace_text = re.sub("[!@#$%^&*()_+]", " ", text)

        print("replace_text : " + replace_text)

        analysis_text = (" ".join(myHannanum.nouns(replace_text)))

        return analysis_text
Example #32
 def reduceToWords(self):
     hannanum = Hannanum()
     words = ''
     #for word in hannanum.nouns(unicode(texts, 'UTF-8')):
     if (self.result != ''):
         for word in hannanum.nouns(self.result):
             word = re.sub("[(*&]", "", word)
             if (len(word) > 1): words = word + '\n' + words
         #for end
         self.result = words
         print words
     # if end
     return self
Example #33
 def reduceToWords(self) :
   hannanum = Hannanum()
   words = ''
   #for word in hannanum.nouns(unicode(texts, 'UTF-8')):
   if(self.result != '') :
     for word in hannanum.nouns(self.result):
       word = re.sub("[(*&]", "", word)
       if(len(word) > 1): words = word + '\n' + words
     #for end
     self.result =  words
     print words
   # if end
   return self
Example #34
 def __init__(self):
     """Initialize unigram class
     this method initialize all attributes of this class
     """
     self.compare_set = []
     self.hannanum = Hannanum()
     self.ke_object = KE()
     self.set = ['pure', 'pure_number', 'pure_punctuation', 'pure_number_punctuation']
Example #35
def hannanum_analyze(content):
	dictionary={}
	h= Hannanum()
	words = h.pos(content)
	for tuple in words:
		value = tuple[1]
		if value == "N" or value == "P" or value == "M":
			key = tuple[0]
			# for predicates such as '먹', append '-다' and use the verb form '먹다' as the key
			if value == "P":
				key += u"다"

			if not key in dictionary.keys():
				dictionary[key] =1
			else :
				dictionary[key] +=1
	return OrderedDict(sorted(dictionary.items(), key=itemgetter(1), reverse=True))
Example #36
    def __init__(self, on_han=False, on_twitter=False, on_mecab=False):    # maybe move to init of analysis_app

        """
        Allocate kkma or twitter diction instance
        :param on_han: han instance
        :param on_twitter: twitter instance
        :param on_mecab: mecab instance
        """
        if on_han is True:
            self.han = Hannanum()
        if on_twitter is True:
            self.twitter = Twitter()
Example #37
def hannanum_analyze_22():
	
	h= Hannanum()
	tags={
		'NC':'보통명사',
		'NQ':'고유명사',
	}	
	"""
	pos_dics=
	{
		news.id : 분석 결과 dictionary,
		news.id : 분석 결과 dictionary,
		...
	}
	"""
	pos_dics = {}
	news = getTodayNews()
	
	for n in news :
		content = remove_puc_marks(n.content)  # remove punctuation marks
		words_dic = h.pos(content, 22)  # morphological analysis with the 22-tag set
		dictionary={}
		for t in words_dic:
			word = t[0]
			key = t[1]
			if key in tags.keys():
				if not word in dictionary.keys():
					dictionary[word] =1
				else :
					dictionary[word] +=1

		dictionary=remove_stopwords(dictionary)  # remove stopwords
		pos_dics[n]=dictionary

	print "tf-idf"
	analyzed_dics=tf_idf_map(pos_dics)  # compute TF-IDF

	return analyzed_dics
Example #38
def hannanum_analyze_22_key(content):
	dictionary={}
	h= Hannanum()
	tags={
		'NC':'보통명사',
		'NQ':'고유명사',
		# 'NB':'의존명사',	
		# 'NN':'수사' ,
		# 'NP':'대명사' , 
		# 'PV':'동사', 
		# 'PA':'형용사',
		# 'PX':'보조 용언',
		# 'MM':'관형사' , 
		# 'MA':'부사',
	}	
	words = h.pos(content,22)
	
	for t in words:
		key = t[0]
		value = t[1]
		
		if value in tags.keys():
			# for predicates such as '먹', append '-다' and use the verb form '먹다' as the key
			if value.startswith("P"):
				key += u"다"
			key= key+ "["+value+"]"

			if not key in dictionary.keys():
				dictionary[key] =1
			else :
				dictionary[key] +=1
			# print key + " " + value

	dictionary=remove_stopwords(dictionary)  # remove stopwords
	dictionary=OrderedDict(sorted(dictionary.items(), key=itemgetter(1), reverse=True))
	
	return dictionary
Example #39
 def get_tags(self, text, ntags=10, multiplier=10):
     h = Hannanum()
     nouns = h.nouns(text)
     count = Counter(nouns)
     return [{'tag': n, 'size': c*multiplier }\
                 for n, c in count.most_common(ntags)]
Example #40
    def analysis(self, blog_review_url):
        # self.logger.info(blog_review_url)

        analysis_checker = {}

        try:
            r = requests.get(blog_review_url)
        except requests.ConnectionError as e:
            return

        soup = BeautifulSoup(r.text)
        r.close()

        try:
            blog_review_url = soup.select('#screenFrame')[0]['src']
            # self.logger.info("regenerated:"+blog_review_url)
            r = requests.get(blog_review_url)
            soup = BeautifulSoup(r.text)
            r.close()
        except Exception as e:
            pass

        try:
            real_blog_review_url = "http://blog.naver.com" + soup.select('frame#mainFrame')[0]['src']
        except IndexError as e:
            self.skip_count += 1
            return

        r = requests.get(real_blog_review_url)
        soup = BeautifulSoup(r.text)
        r.close()

        post_view = soup.select('.post-view')[0]
        p_list = post_view.select('p')

        raw_str_list = []
        for item in p_list:
            p_str = str(item.text.encode('utf-8')).replace('\xc2\xa0', ' ').replace('\xe2\x80\x8b', ' ').strip()
            p_str = p_str.replace('ㅎ', '').replace('ㅋ', '')
            if len(p_str) != 0:
                raw_str_list.append(p_str.decode('utf-8'))

        kkma = Hannanum()

        for raw_str_item in raw_str_list:
            if len(raw_str_item) >= 100:
                self.skip_count += 1
                continue

            try:
                raw_str_item = raw_str_item.strip()
                pos_tuple = kkma.pos(raw_str_item)
                for pos_tuple_item in pos_tuple:
                    item = pos_tuple_item[0]
                    item_type = pos_tuple_item[1]

                    if not (analysis_checker.has_key(item)) and (
                                    item_type.startswith('N') or item_type.startswith('V') or item_type.startswith(
                                    'M') or item_type.startswith('XR') or item_type.startswith('U')):
                        if self.analysis_result.has_key(item):
                            analysis_item_count = self.analysis_result.get(item) + 1
                        else:
                            analysis_item_count = 1

                        self.analysis_result.update({
                            item: analysis_item_count
                        })

                        analysis_checker.update({
                            item: 1
                        })
            except jpype.JavaException as exception:
                pass
Example #41
def get_tags(text, ntags=int(sys.argv[2]), multiplier=10):
    h = Hannanum()
    nouns = h.nouns(text)
    count = Counter(nouns)
    return [{ 'color': color(), 'tag': n, 'size': c*multiplier }\
                for n, c in count.most_common(ntags)]
Example #42
def get_tags(text, ntags=50, multiplier=10):
    h = Hannanum()
    nouns = h.nouns(text)
    count = Counter(nouns)
    return [{"color": color(), "tag": n, "size": c * multiplier} for n, c in count.most_common(ntags)]
Example #43
def WordCount(corpus):
    h = Hannanum()
    nouns = h.nouns(corpus)
    frequency = Counter(nouns)
    return frequency
Example #44
class AnalysisDiction:
    """
    This class analyzes Korean text with the Hannanum and Twitter (and optionally Mecab) analyzers
    """
    def __init__(self, on_han=False, on_twitter=False, on_mecab=False):    # maybe move to init of analysis_app

        """
        Allocate kkma or twitter diction instance
        :param on_han: han instance
        :param on_twitter: twitter instance
        :param on_mecab: mecab instance
        """
        if on_han is True:
            self.han = Hannanum()
        if on_twitter is True:
            self.twitter = Twitter()
        # if on_mecab is True:
        #     self.mecab = Mecab()

    def analyzer_hannaum(self, string_data, mode):
        """
        This method is for Hannanum. It acts differently depending on its mode.
        :param string_data: string data for analysis
        :param mode: analyze the string data according to this mode
        :return: the analysis result; if the mode is not recognized, return False
        ref: http://konlpy.org/ko/v0.4.4/api/konlpy.tag/#module-konlpy.tag._hannanum
        """
        if mode == 'morphs':
            return self.han.morphs(string_data)
        elif mode == 'nouns':
            return self.han.nouns(string_data)
        elif mode == 'pos':
            return self.han.pos(string_data)
        else:
            return False

    def analyzer_mecab(self, string_data, mode):
        """
        This method is for Mecab. It acts differently depending on its mode.
        :param string_data: string data for analysis
        :param mode: analyze the string data according to this mode
        :return: the analysis result; if the mode is not recognized, return False
        ref: http://konlpy.org/ko/v0.4.4/api/konlpy.tag/#mecab-class
        """
        if mode == 'morphs':
            return self.mecab.morphs(string_data)
        elif mode == 'nouns':
            return self.mecab.nouns(string_data)
        elif mode == 'pos':
            return self.mecab.pos(string_data)
        else:
            return False

    def analyzer_twitter(self, string_data, mode):
        """
        This method is for Twitter. It acts differently depending on its mode.
        :param string_data: string data for analysis
        :param mode: analyze the string data according to this mode
        :return: the analysis result; if the mode is not recognized, return False
        ref: http://konlpy.org/ko/v0.4.4/api/konlpy.tag/#module-konlpy.tag._twitter
        """
        if mode == 'morphs':
            return self.twitter.morphs(string_data)
        elif mode == 'nouns':
            return self.twitter.nouns(string_data)
        elif mode == 'pos':
            return self.twitter.pos(string_data)
        elif mode == 'posmore':
            return self.twitter.pos(string_data, True, True)
        else:
            return False
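A hedged usage sketch for AnalysisDiction; it assumes konlpy's Hannanum and Twitter classes are importable and a JVM is available:

analyzer = AnalysisDiction(on_han=True, on_twitter=True)
print(analyzer.analyzer_hannaum("아버지가 방에 들어가신다", 'nouns'))
print(analyzer.analyzer_twitter("아버지가 방에 들어가신다", 'pos'))
print(analyzer.analyzer_hannaum("...", 'unknown'))  # unrecognized mode -> False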
Example #45
class Word(object):
    """Analyze pharse set by word
    Attributes:
        compare_set (dict): for correlation calculation
        hannanum (object): for morphere analyzing
        ke (object): for korean analying
        set (list): 4 parent sets
    """
    def __init__(self):
        """Initialize unigram class
        this method initialize all attributes of this class
        """
        self.compare_set = []
        self.hannanum = Hannanum()
        self.ke_object = KE()
        self.set = ['pure', 'pure_number', 'pure_punctuation', 'pure_number_punctuation']

    def analyze(self, analyze_path, output_path, filename):
        """Analyze pharse set from target filename by word
        Args:
            analyze_path (str): target input file's path
            output_path (str): output file's path
            filename (str): target filename
        """
        input_file = filename + '.txt'
        output_file = 'word_analyze_' + filename + '.txt'

        with open(analyze_path + input_file, 'r') as file_read:
            word_list = []
            for line in file_read:
                if len(line.strip()) == 0:
                    continue
                #self.hannanum.morphs(line.decode('utf-8'))
                #for word in line.decode('utf-8').split(' '):
                try:
                    morphs_list = self.hannanum.morphs(line.decode('utf-8'))
                except UnicodeDecodeError as e:
                    morphs_list = self.hannanum.morphs(line.encode('utf-8'))

                for word in morphs_list:
                    changed = self.ke_object.change_complete_korean(word, 3)
                    word_item = "".join(changed)
                    word_list.append(word_item)
                    if filename in self.set:
                        if not word_item in self.compare_set:
                            self.compare_set.append(word_item)

            #self.copy_set_file(filename, word_list)
            final = self.update_dict(word_list)

            if not filename in self.set:
                for key in self.compare_set:
                    if not key in final.keys():
                        final[key] = 0

            with open(output_path + output_file, 'w') as file_write:
                for key, value in final.iteritems():
                    file_write.write(str(key) + ' : ' + str(value) + '\n')

    def copy_set_file(self, filename, result):
        """Copy parent set file from filename for compare set
        Args:
            filename (str): target filename
            result (list): target result
        """
        if filename in self.set:
            self.compare_set = copy.deepcopy(result)

    @staticmethod
    def update_dict(result):
        """generate dict for correlation calc as analyze result
        Args:
            result (list): target result
        Return:
            final (dict): calculated dict
        """
        final = {}
        for item in result:
            if item in final.keys():
                updated = final[item]
                del final[item]
                final[item] = updated + 1
            else:
                final[item] = 1
        return final