Example #1
def morphing(content):
    mecab = Mecab()
    morphList = []
    for word in mecab.nouns(content):
        if word not in stop_word:
            morphList.append(word)
    return morphList
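A minimal usage sketch for the function above. The stop_word set here is hypothetical; the original module defines its own stopword list before morphing() is called.

from konlpy.tag import Mecab

stop_word = {"기자", "뉴스"}  # hypothetical stopword set, not from the original source

print(morphing("연합뉴스 기자가 새 기사를 작성했다"))
# prints the extracted nouns with the stopwords filtered out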
Example #2
def callback(ch, method, properties, body, id):
    mecab = Mecab()
    print(" [%d] Received %s" % (ch, body.decode('utf-8')))

    noun_list = mecab.nouns(body.decode('utf-8'))

    send_message_to_database(id, noun_list)
Example #3
def count(request):
    full_text = request.GET['fulltext']

    tagger = Mecab()  # morphological analyzer

    word_list = tagger.nouns(full_text)  # extract only the nouns (returns a list)

    #word_list = full_text.split()

    word_dictionary = {}

    for word in word_list:
        if Words.objects.filter(text=word):
            word = Words.objects.get(text=word)
            word.frequency_total += 1
            word.save()
        else:
            word = Words(text=word, frequency_total=1)
            word.save()
        if word in word_dictionary:
            word_dictionary[word] += 1
        else:
            word_dictionary[word] = 1

    return render(
        request, 'wordcount/count.html', {
            'fulltext': full_text,
            'total': len(word_list),
            'word_dictionary': word_dictionary.items()
        })
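For context, a hypothetical urls.py entry that would route requests to the count view above; the path and app layout are illustrative, not taken from the original project.

# hypothetical URL configuration for the wordcount app
from django.urls import path
from . import views

urlpatterns = [
    path('count/', views.count, name='count'),
]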
Example #4
def clean_str(s):
    """Clean sentence"""
    global counter_konlpy
    global total_dataset
    s = re.sub('[0-9]', '', s)
    s = preprocess(s)

    mecab = Mecab()
    result = mecab.nouns(s)

    if len(result) > 1000:
        result = result[0:1000]
    counter_konlpy += 1
    return ' '.join(result)
Example #5
def preprocess(datapath):
    mecab = Mecab()
    cnt = 0
    sentences = []
    with open('./data/korean_corpus.txt', 'r', encoding="utf8") as f:
        for line in f:
            cnt += 1
            if not (cnt % 1000):
                print("tokenize {}kth line...".format(cnt // 1000), end='\r')
            tokens = mecab.nouns(line)
            if tokens:
                sentences.append(tokens)
    print("")
    cnt = 0
    with open(datapath, 'w') as f:
        for sentence in sentences:
            cnt += 1
            for idx, word in enumerate(sentence):
                if idx == len(sentence) - 1:
                    f.write("%s.\n" % word)
                else:
                    f.write("%s " % word)
            if not (cnt % 1000):
                print("write {}kth line to the file...".format(cnt // 1000),
                      end='\r')
    print("")
Example #6
def news_makefile():
    global links
    global num
    global newstext
    article = newspaper.Article(links[num], language='ko')
    article.download()
    article.parse()
    newstext = article.text
    engine = Mecab()
    nouns = engine.nouns(newstext)
    nouns = [n for n in nouns if len(n) > 1]
    count = Counter(nouns)
    tags = count.most_common(20)
    text = " 제목은  " + article.title + " 입니다.  " + "키워드는  " + str(
        tags[0][0]) + "  " + str(tags[1][0]) + "  " + str(
            tags[2][0]) + "  " + str(tags[3][0]) + "  " + str(tags[4][0])
    tts = gTTS(text + "입니다.  이 기사를 읽으려면 2번, 다음 기사의 키워드는 3번, 분야선택(홈)은 1번 입니다",
               lang='ko')
    if os.path.isfile('keyword.mp3'):
        os.remove('keyword.mp3')
    tts.save('keyword.mp3')
    wc = WordCloud(font_path='c:\\windows\\fonts\\NanumSquareR.ttf',
                   background_color='white',
                   width=500,
                   height=400)
    cloud = wc.generate_from_frequencies(dict(tags))
    fig = plt.figure(figsize=(10, 8))
    plt.axis('off')
    plt.imshow(cloud)
    fig.savefig('keyword.jpg')
    Image.open('keyword.jpg').resize((700, 650)).save('keyword.jpg')
Example #7
def news_makefile():
    global links
    global num
    global news_text
    article = newspaper.Article(links[num], language='ko')
    article.download()
    article.parse()
    news_text = article.text
    headline = article.title
    engine = Mecab()
    nouns = engine.nouns(news_text)
    nouns = [n for n in nouns if len(n) > 1]
    count = Counter(nouns)
    tags = count.most_common(15)
    print(headline, tags)
    text = " 제목은  " + headline + " 입니다.  " + "키워드는 " "  " + str(
        tags[0][0]) + str(tags[1][0]) + str(tags[2][0]) + str(
            tags[3][0]) + str(tags[4][0])
    tts = gTTS(text + "입니다.  이 기사를 읽으려면 5번, 다음 기사의 키워드는 6번, 분야선택(홈)은 0번 입니다",
               lang='ko')
    tts.save('keyword.mp3')
    font_path = 'c:\\windows\\fonts\\NanumGothic.ttf'
    wc = WordCloud(font_path=font_path,
                   background_color='white',
                   width=500,
                   height=400)
    cloud = wc.generate_from_frequencies(dict(tags))
    fig = plt.figure(figsize=(10, 8))
    plt.axis('off')
    plt.imshow(cloud)
    fig.savefig('keyword.jpg')
    Image.open('keyword.jpg').resize((500, 400)).save('keyword.jpg')
    tts2 = gTTS("선택하신 기사 내용은   " + news_text, lang='ko')
    tts2.save("news_all.mp3")
Example #8
    def find_most_mentioned(self):
        """
        Returns how many times that people has been mentioned in conversation
        :return: dictionary containing person's name and number of mentions
        """
        self._rewind()

        # get all conversations
        all_conversations = self.get_all_conversations()

        # word parser objects
        mecab = Mecab()
        kkma = Kkma()

        # parse all conversation words, and get only nouns
        all_nouns = list()
        for conversation in all_conversations:
            all_nouns += mecab.nouns(conversation)

        # exclude the family name (surname) from each full name
        names_list = list()
        for name in self.get_all_names():
            preprocessed_name = kkma.nouns(name)
            for data in preprocessed_name:
                if len(data) != 1:
                    names_list.append(data)

        # compare two list
        mentioned_people = [
            person for person in all_nouns if person in names_list
        ]

        # count using Counter and return
        cnt = Counter(mentioned_people)
        return cnt.most_common(len(cnt))
Example #9
def get_corpus(data):
    """Make corpus with string or list data

    :param str,list data: String Data (One post per line) | List Data (One post per element)
    :return: corpus (numpy.ndarray)

    """
    nlp = Mecab(
        '/usr/local/lib/mecab/dic/mecab-ko-dic')  # Make Mecab (NLP) Instance

    if type(data) == list:
        post_list = data  # Case: DataType is list and Each element is string
    else:
        post_list = data.split('\n')  # Split string data by line (\n)

    cleaned_post_list = []
    for post in post_list:
        # Cleansing Process (Only get nouns)
        post = re.sub(
            r'\W', ' ', post
        )  # Change Special Characters and blanks (Not words) to ' '(one blank)
        post = ' '.join(nlp.nouns(
            post))  # nlp.nouns returns nouns list. so, unpacking list (to str)
        if post:  # if post is null, Not add
            cleaned_post_list.append(
                post)  # Append cleaned post(only nouns) to cleaned_post_list

    corpus = np.array(cleaned_post_list)
    return corpus
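A short, hedged usage sketch for get_corpus; the input posts are made up, and the MeCab dictionary path hard-coded in the function is assumed to exist on the machine.

posts = "오늘 날씨가 좋다\n내일은 비가 온다"  # hypothetical input, one post per line
corpus = get_corpus(posts)
print(corpus)  # numpy array of noun-only strings, one entry per post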
Example #10
class SentenceTokenizer(object):

    def __init__(self):
        try:
            self.mecab = Mecab()
        except Exception:
            self.mecab = Mecab(dicpath="C:/mecab/mecab-ko-dic")

        self.stopwords = ['뉴스','연합', '자료사진','서울연합','중인' ,'만큼', '마찬가지', '꼬집었', "연합뉴스", "데일리", "동아일보", "중앙일보", "조선일보", "기자"
            ,"아", "휴", "아이구", "아이쿠", "아이고", "어", "나", "우리", "저희", "따라", "의해", "을", "를", "에", "의", "가"]
    def text2sentences(self, text):
        sentences = sent_tokenize(text)
        res = list() 
        for idx in range(0, len(sentences)):
            if len(sentences[idx]) <= 10:
                tmp = sentences[idx-1] + (' ' + sentences[idx])
                sentences[idx] = ''
                if '.' in tmp: 
                    dot_idx = tmp.index('.')
                    if dot_idx < len(tmp)-1 and (not tmp[dot_idx+1].isnumeric() or tmp[dot_idx+1] != ' '): 
                        res += tmp.split('.') 
                    else: res.append(tmp) 
                else: res.append(tmp)

        pre_sentences = [elem for elem in res if len(elem) >= 1]
        return pre_sentences

    def get_nouns(self, sentences):
        nouns = []
        for sentence in sentences:
            if sentence != '':
                nouns.append(' '.join([noun for noun in self.mecab.nouns(str(sentence))
                                        if noun not in self.stopwords and len(noun) > 1]))

        return nouns
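A hypothetical usage sketch for the class above; it assumes nltk's sent_tokenize is importable (the original module relies on it) and that article_text holds any Korean document.

from nltk.tokenize import sent_tokenize  # required by text2sentences above

tokenizer = SentenceTokenizer()
article_text = "..."  # placeholder for a Korean news article
sentences = tokenizer.text2sentences(article_text)
noun_strings = tokenizer.get_nouns(sentences)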
Example #11
def draw_wordcloud_from_url(url_link):
    # Fetch the web page and build a BeautifulSoup object from it
    response = requests.get(url_link)
    soup = BeautifulSoup(response.content, 'html.parser')

    table = soup.find('table', { 'class': 'gall_list' })    # find the <table class="gall_list"> element
    links = []                            # list to collect the links
    news_text = ''
    for tr in table.find_all('tr', class_="ub-content"):      # iterate over every <tr class="ub-content"> row
        title = tr.find('td', class_="gall_tit")
        link = title.find('a')['href']    # take the anchor's href (a URL), not its text
        links.append(link)

        article = newspaper.Article(link, language='ko')
        article.download()
        article.parse()
        news_text += article.text

    # konlpy Mecab: extract nouns from the article text and drop single-character words
    engine = Mecab()
    nouns = engine.nouns(news_text)
    nouns = [n for n in nouns if len(n) > 1]

    # Counter: count the nouns and keep the 40 most frequent
    count = Counter(nouns)
    tags = count.most_common(40)

    # WordCloud, matplotlib: draw the word cloud
    font_path = '/usr/share/fonts/truetype/nanum/NanumMyeongjoBold.ttf'
    wc = WordCloud(font_path=font_path, background_color='white', width=800, height=600)
    cloud = wc.generate_from_frequencies(dict(tags))
    plt.figure(figsize=(10,8))
    plt.axis('off')
    plt.imshow(cloud)
Example #12
def get_clean_word(words, stopwords):
    nouns = []
    # Take the input texts, drop the stopwords, and return the remaining nouns as a list.
    tagger = Mecab(dicpath="C:\\mecab\\mecab-ko-dic")
    for post in words:
        for noun in tagger.nouns(post):
            if noun not in stopwords:
                nouns.append(noun)
    return nouns
Example #13
def ko_lemmatize_nouns(inputString):
    '''
        Input:  string (Korean)
        Output: list of strings (Korean)
    ----------------------------------------------------------------------------
    Returns list of nouns from the input.
    '''
    mecab = Mecab()
    return mecab.nouns(inputString)
Example #14
    def lematization(self, texts):
        print(' ...Make lematization...')
        mecab = Mecab()
        texts_out = []
        for sent in tqdm(texts):
            doc = " ".join(sent)
            texts_out.append(mecab.nouns(doc))
        return texts_out
Example #15
class KoreaHelper(object):
    def __init__(self):
        from konlpy.tag import Mecab
        self.mecab = Mecab()

    def pos(self, phrase: Text):
        """
        $ python -m sagas.ko.ko_helper pos '계획이'
        :param phrase:
        :return:
        """
        return self.mecab.pos(phrase)

    def nouns(self, phrase: Text):
        """
        $ python -m sagas.ko.ko_helper nouns '피자와 스파게티가'
        $ python -m sagas.ko.ko_helper nouns '계획이'
        :param phrase:
        :return:
        """
        from sagas.nlu.transliterations import translits
        from sagas.ko.kwn_procs import kwn
        ns = self.mecab.nouns(phrase)
        rs = []
        for w in ns:
            # ws = get_word_sets(w, 'ko')
            ws = kwn.get_synsets(w, first=True)
            if ws:
                rs.append({
                    'spec': ws[0].name(),
                    'text': w,
                    'translit': translits.translit(w, 'ko'),
                    'definition': ws[0].definition()
                })
            else:
                rs.append({
                    'text': w,
                    'translit': translits.translit(w, 'ko'),
                })
        return rs

    def translit(self, word):
        """
        $ python -m sagas.ko.ko_helper translit '피자와 스파게티가'

        See also: procs-ko-konlpy.ipynb
        :param word:
        :return:
        """
        from sagas.nlu.transliterations import translits
        for w, p in self.mecab.pos(word):
            expl = '_'
            if p in ('NNG', 'VV'):
                ws = get_word_sets(w, 'ko')
                if ws:
                    expl = f"{ws['name']}({ws['definition']})"
            print(w, translits.translit(w, 'ko'), p, expl)
Example #16
def get_tokens(x):
    mecab = Mecab()
    try:
        return [i for i in mecab.nouns(x) if len(i) > 1] if x else []
    except Exception as e:
        if str(x) == 'nan':
            return []
        print(e)
        print(str(x))
        raise e
Example #17
    def text_tokenizing(self, doc):
        mecab = Mecab()
        SW = self.define_stopwords("./data/stopwords-ko.txt")

        if self.ui.rb_noun.isChecked():
            return [word for word in mecab.nouns(doc) if word not in SW and len(word) > 1]
        elif self.ui.rb_morphs.isChecked():
            return [word for word in mecab.morphs(doc) if word not in SW and len(word) > 1]
        elif self.ui.rb_words.isChecked():
            # konlpy's Mecab has no words() method, so fall back to whitespace tokens
            return [word for word in doc.split() if word not in SW and len(word) > 1]
Example #18
def noun_extraction(data_list):
    print("noun_extraction start")
    nouns_list = []
    mecab = Mecab()

    for data in data_list:
        noun = mecab.nouns(data[0])
        nouns_list.append([noun, data[1], data[2]])

    # print(json.dumps(nouns_list, ensure_ascii=False, indent=3))
    return nouns_list
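A hedged call example for noun_extraction; the rows are invented, and the second and third fields stand in for whatever metadata the caller stores alongside the text.

data_list = [["오늘 주가가 크게 올랐다", "2021-01-01", "economy"]]  # hypothetical rows
print(noun_extraction(data_list))
# -> a list of [nouns, "2021-01-01", "economy"] triples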
Example #19
def get_nouns(text):
    tagger = Mecab()
    noun = tagger.nouns(text)
    noun = [i for i in noun if len(i) > 1]
    noun = ' '.join(noun)    # join the nouns into one space-separated string

    return noun
Example #20
def clean_str(s):
    """Clean sentence"""
    global counter_konlpy
    global total_dataset
    s = re.sub('[0-9]', '', s)

    mecab = Mecab()
    result = mecab.nouns(s)
    counter_konlpy += 1
    sys.stdout.write("\rParsed: %d / %d" % (counter_konlpy, total_dataset))
    sys.stdout.flush()
    return ' '.join(result)
Example #21
def read_text(fin):
    # Read in the preprocessed Wikipedia dump file.
    corpus_li = []
    mecab = Mecab(dicpath='/opt/local/lib/mecab/dic/mecab-ko-dic')
    for line in open(fin):
        # Use unicodedata.normalize to convert the line to NFKC and
        # repair characters that would otherwise come out garbled.
        line = unicodedata.normalize('NFKC', line)
        try:
            # Add lines whose first character is a digit to the corpus.
            _ = int(line[0])
            corpus_li.append(' '.join(mecab.nouns(line)) + '\n')

        except ValueError:
            # Add lines whose first character is Hangul to the corpus.
            if ord(line[0]) >= ord('가') and ord(line[0]) <= ord('힇'):
                corpus_li.append(' '.join(mecab.nouns(line)) + '\n')
            else:
                pass
    print('# of lines in corpus', len(corpus_li))
    return corpus_li
Example #22
def to_nouns(docs, version):
    if version == 'mecab':
        parser = Mecab()
    else:
        parser = konlpy_import(version)
    nounslist = []
    for doc in docs:
        try:
            nouns = ' '.join(parser.nouns(doc))
        except Exception:
            nouns = ''    # keep the output aligned with the input docs even when parsing fails
        nounslist.append(nouns)
    return nounslist
Example #23
def count(body):
    mecab = Mecab()
    if len(body) != 0:
        text = body[0].get_text()

        f = open("../keyword.txt", 'r')
        while True:
            line = f.readline()
            line = line.replace('\n', '')
            if not line: break
            text = text.replace(line, '부실')
        f.close()

        nouns = mecab.nouns(text)
        print(nouns.count('부실'))
Example #24
def clean_str(s):
    """Clean sentence"""
    global counter_konlpy
    global total_dataset
    s = re.sub('[0-9]', '', s)
    mecab = Mecab()
    result = []
    for aLine in s.split(';'):
        result.append(' '.join(mecab.nouns(aLine)))
    counter_konlpy += 1
    sys.stdout.write("\r Parsed: %d / %d" % (counter_konlpy, total_dataset))
    sys.stdout.flush()
    return ' '.join(result)
Example #25
def is_contained_bad_word(text, type):
    # Look up all the keywords of the given type in the BadWord table
    # and check the text against them with the konlpy library
    mecab = Mecab()
    mecab_list = mecab.nouns(text)
    # fetch the banned keywords from the DB
    badword_list = BadWord.objects.filter(type=type).values_list('keyword',
                                                                 flat=True)

    for badword in badword_list:
        if badword in mecab_list:
            return True
    # if any (word in BadWord.objects.all().__str__() for word in mecab.nouns(text)):
    #     return True
    return False
Example #26
def result(request):
    text = request.GET['fulltext']
    nlpy = Mecab()
    nouns = nlpy.nouns(text)
    word_dictionary = {}

    for word in nouns:
        if word in word_dictionary:
            word_dictionary[word] += 1
        else:
            word_dictionary[word] = 1

    return render(
        request, 'result.html', {
            'noun': nouns,
            'full': text,
            'total': len(nouns),
            'dictionary': word_dictionary.items()
        })
Example #27
def text_tokenize(corpus):
    mecab = Mecab()
    token_corpus = []
    if w.ui.rb_noun.isChecked():
        for n in range(len(corpus)):
            token_text = mecab.nouns(corpus[n])
            token_text = [word for word in token_text if word not in SW]
            token_corpus.append(token_text)
    if w.ui.rb_morphs.isChecked():
        for n in range(len(corpus)):
            token_text = mecab.morphs(corpus[n])
            token_text = [word for word in token_text if word not in SW]
            token_corpus.append(token_text)
    if w.ui.rb_words.isChecked():
        for n in range(len(corpus)):
            token_text = corpus[n].split()
            token_text = [word for word in token_text if word not in SW]
            token_corpus.append(token_text)
    return token_corpus
Example #28
def emotion():
    title = request.form['title']
    comment = request.form['content']
    uid = request.form['uid']

    from konlpy.tag import Mecab
    mecab = Mecab("C:\mecab\mecab-ko-dic")
    token_data = []
    token = mecab.nouns(str(comment))
    token_data.append(token)
    series_token_data = pd.Series(token_data)

    # load the pretrained model
    fastText_model = fasttext.FastText.load("./embedding/tweet_fastText_0717.model")
    docs_vectors_ft = pd.DataFrame()
    for doc in series_token_data:
        temp = pd.DataFrame()
        for word in doc:
            ft = fastText_model[word]
            temp = temp.append(pd.Series(ft), ignore_index=True)
        # take the average of each column(w0, w1, w2,........w300)
        doc_vector_ft = temp.mean()
        # append each document value to the final dataframe
        docs_vectors_ft = docs_vectors_ft.append(doc_vector_ft, ignore_index=True)

    from sklearn.externals import joblib
    # load the object saved as a pickled binary file
    file_name = './embedding/tweet_bagg_SVM.pkl'
    model = joblib.load(file_name)

    pred = model.predict(docs_vectors_ft)

    # print("DB 연결중")
    conn = getConnection()
    curs = conn.cursor()

    sql = "UPDATE emo_board SET EBEMO = '"+pred[0]+"' WHERE EBTITLE = '"\
          +title+"' AND EBCONTENT = '"+comment+"' AND UID = '"+uid+"'"
    curs.execute(sql)
    curs.close()
    conn.close()
    return "성공"
Example #29
def preprocess(
    data_path: str,
    word_index: dict = None,
    num_words: int = 10000,
):
    tokenizer = Mecab()

    # 0. data load
    with open(data_path, "rb") as f:
        data = pickle.load(f)

    # 1. bag-of-words
    vocab, docs = [], []
    for doc in tqdm(data):
        if doc:
            # skip NaN values present in the nsmc data
            try:
                nouns = tokenizer.nouns(doc)
                vocab.extend(nouns)
                docs.append(nouns)
            except:
                continue

    # 2. build vocab
    if not word_index:
        vocab = Counter(vocab)
        vocab = vocab.most_common(num_words)

        # 3. add unknown token
        word_index = {"<UNK>": 0}
        for idx, (word, _) in enumerate(vocab, 1):
            word_index[word] = idx

    index_word = {idx: word for word, idx in word_index.items()}

    # 4. create corpus
    corpus = []
    for doc in docs:
        if doc:
            corpus.append([word_index.get(word, 0) for word in doc])

    return corpus, word_index, index_word
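A hedged usage sketch; the pickle path is hypothetical and is assumed to contain a list of Korean document strings, which is what the function above iterates over.

corpus, word_index, index_word = preprocess("./data/nsmc_train.pkl", num_words=10000)
print(len(word_index))   # vocabulary size including the <UNK> token
print(corpus[0][:10])    # first ten token ids of the first document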
Example #30
def Get_relevant_articles(search_string, num_of_articles=1000, date=time.strftime("%Y-%m-%d"), host='localhost', port=9200, index='ko_news_articles'):
    '''
        Input:  String (Required)
                max number of related articles to return
                date origin point parameter (default=now)
                ES host address
                ES port number
                index name
        Output: list of dictionaries containing the article infos, index id, similarity score, etc.
    ----------------------------------------------------------------------------
    '''

    mecab = Mecab()
    search_string_lem = ' '.join(mecab.nouns(search_string))
    es = Elasticsearch([{'host': host, 'port': port}])

    Output = es.search(
        index=index,
        size=num_of_articles,
        body={
            'query': {
                'function_score': {
                    'query': {
                        'dis_max': {
                            'queries': [
                                {'match': {'articleContents': {
                                    'query': search_string,
                                    'fuzziness': 'AUTO',
                                    'max_expansions': 5,
                                    'cutoff_frequency': 0.001}}},
                                {'match': {'Lemmatized': {
                                    'query': search_string_lem,
                                    'fuzziness': 0,
                                    'max_expansions': 2,
                                    'cutoff_frequency': 0.001}}},
                            ],
                            'tie_breaker': 0.3,
                        }
                    },
                    'functions': [
                        {'gauss': {'articleDate': {
                            'origin': date,
                            'scale': '30d',
                            'offset': '2d',
                            'decay': 0.5}}},
                    ],
                    'score_mode': 'multiply',
                }
            }
        })

    return Output['hits']['hits']
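A minimal, hypothetical call against the function above; the search string is only an example and the Elasticsearch index named in the signature is assumed to exist.

hits = Get_relevant_articles('반도체 수출', num_of_articles=10)
for hit in hits:
    # _score and _id are standard fields of an Elasticsearch hit
    print(hit['_score'], hit['_id'], hit['_source'].get('articleDate'))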