Example #1
def make_tag(data):
    stopwords = {'기자'}

    # preprocessing
    text = preprocessing(data)
    # split into sentences
    texts = text.split('.')
    tag = ''
    # extract keywords and key sentences
    try:
        keywords, sents = summarize_with_sentences(texts,
                                                   stopwords=stopwords,
                                                   num_keywords=5,
                                                   num_keysents=3)
        for word, r in sorted(keywords.items(),
                              key=lambda x: x[1],
                              reverse=True)[:5]:
            # print('%8s:\t%.4f' % (word, r))
            # print('#%s' % word)
            tag += '#' + word + ' '
    except ValueError:
        # print('#')
        tag = '# '
    # (hashtags were appended to tag above)

    return tag
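
A quick usage sketch for Example #1 (hypothetical input file; preprocessing is a project-local helper that is not shown, and inputs with too few sentences hit the ValueError fallback and return '# '):

with open('article.txt', encoding='utf-8') as f:  # hypothetical input file
    article_text = f.read()
print(make_tag(article_text))  # e.g. '#키워드1 #키워드2 ... ' for a sufficiently long article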
Example #2
def summary(texts):
    try:
        penalty = lambda x: 0 if (5 <= len(x) <= 80) else 1  # sentence length
        # stopwords = {'이러한', '일단', '제가', '이거', '아니라', '때문에',
        #              '동영상', '도움말', '미지원으로', '드래그', '지원되지않습니다.도움말',
        #              'ㅎㅎ', '중입니다.5분', '퍼가기', 'Object', '마우스를', '인코딩', '음소',
        #              '음소거', 'Flash', '영상의', '소요'}

        keywords, sents = summarize_with_sentences(
            texts,
            penalty=penalty,
            # stopwords=stopwords,
            diversity=0.7,
            num_keywords=100,
            num_keysents=10,
            scaling=lambda x: 1,
            beta=0.85,  # decaying factor beta for PageRank
            max_iter=10,
            verbose=True
        )
        print(keywords)
        return sents
    except ValueError:
        print('key가 없습니다.')  # "no keywords found"
Example #3
def summarizer(self,
               text,
               option='krwordrank'):  # 'gensim', 'textrank', 'krwordrank'
    sent_lst = text.split(". ")
    if option == 'krwordrank':
        result = summarize_with_sentences(sent_lst, num_keysents=3)[1]
    elif option == 'gensim':
        result = summarize(text, ratio=3 / len(sent_lst)).split("\n")
    # else:
    #     ks_summarizer = KeysentenceSummarizer(tokenize = self.okt.morphs)
    #     result = list(zip(*ks_summarizer.summarize(sents=sent_lst,topk=3)))[2]
    return result
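
A hedged usage sketch for Example #3 (hypothetical: wrapper stands in for an instance of the enclosing class, which is not shown; the 'gensim' branch also relies on summarize from gensim.summarization, a module that was removed in gensim 4.x):

# Hypothetical call; wrapper is an instance of the unshown class holding this method.
# A realistically long Korean document is required; toy inputs make KR-WordRank raise ValueError.
for sent in wrapper.summarizer(article_text, option='krwordrank'):
    print(sent)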
Example #4
def test_keysentence(test_config):
    data_path = test_config['data_path']
    with open(data_path, encoding='utf-8') as f:
        texts = [line.rsplit('\t')[0].strip() for line in f]

    keywords, sents = summarize_with_sentences(texts,
                                               num_keywords=100,
                                               num_keysents=10)
    for word in ['영화', '너무', '정말', '음악', '마지막']:
        assert word in keywords
    assert len(sents) == 10
    print('\nKR-WordRank key-sentence extraction 라라랜드 영화 리뷰 10 개 핵심 문장')
    for sent in sents:
        print(' - {}'.format(sent))
Example #5
def summary(texts):
    penalty = lambda x: 0 if (25 <= len(x) <= 80) else 1  # sentence length
    stopwords = {'이러한', '일단', '제가', '이거', '아니라', '때문에'}

    keywords, sents = summarize_with_sentences(
        texts,
        penalty=penalty,
        stopwords=stopwords,
        diversity=0.7,
        num_keywords=100,
        num_keysents=10,
        scaling=lambda x: 1,
        verbose=False,
    )
    return sents
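
The penalty hook above steers sentence selection by length: a sentence scoring 0 (here, 25 to 80 characters) stays a preferred candidate, anything else is penalized. A standalone check of the lambda itself:

penalty = lambda x: 0 if (25 <= len(x) <= 80) else 1
for s in ['짧은 문장.', '이 문장은 스물다섯 글자를 넘기 때문에 요약 후보로 선호되는 길이 구간에 들어갑니다.']:
    print(len(s), penalty(s))  # 1 for the short sentence, 0 for the in-band one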
Example #6
def summary_text(texts):
    penalty = lambda x: 0 if (25 <= len(x) <= 50) else 1
    stopwords = {'은', '는', '이', '가', '오늘'}
    keywords, sents = summarize_with_sentences(
        texts,
        penalty=penalty,
        stopwords=stopwords,
        diversity=0.7,
        num_keywords=10,
        num_keysents=1,
        scaling=lambda x: 1,
        verbose=False,
    )
    for sent in sents:
        print(sent)
    keyword = list(keywords)  # iterating the keywords dict yields the keyword strings
    print(keyword)
    return keyword
Example #7
# Based on this, okt (twitter) seems to be closer to what we want


from collections import Counter


kkma_candidates = kkma.sentences(content)
nouns = okt.nouns(content)

from krwordrank.sentence import summarize_with_sentences


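# Note: summarize_with_sentences expects a list of sentence strings; a noun list
# is passed here, so each noun is scored as if it were a one-word sentence.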
keywords, sents = summarize_with_sentences(nouns,
                                           num_keywords=100,
                                           num_keysents=1)


import re


# Among the well-known Korean morphological analyzers, none implements sentence splitting that is fast, adequate, and preserves the original text, so
def xplit(*delimiters):
    return lambda value: re.split('|'.join([re.escape(delimiter) for delimiter in delimiters]), value)

xplit('. ', '? ', '! ', '\n', '.\n')("This is a sentence. Here is another sentence.\nHello, world!")
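# → ['This is a sentence', 'Here is another sentence', 'Hello, world!']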

class Sentence:
Example #8
def get_stopwords():  # header reconstructed from the no-argument call below
    stopwords = custom_stopwords + default_stopwords
    return set(stopwords)


with open('../data/test4_punct.txt', 'r', encoding='utf-8') as f:
    text = f.read().split('\n')
    text = ' '.join(text)
    text = text.split('. ')

stopwords = get_stopwords()
print(stopwords)
print('====================')
keywords, sents = summarize_with_sentences(text,
                                           stopwords=stopwords,
                                           num_keywords=100,
                                           diversity=0.7,
                                           num_keysents=5,
                                           scaling=lambda x: 1,
                                           verbose=True)
print(list(keywords.items())[:10])
print('====================')
for i, s in enumerate(sents):
    print(i, s)

print('====================')
wordrank_extractor = KRWordRank(
    min_count=3,  # minimum word frequency (when building the graph)
    max_length=20,  # maximum word length
    verbose=True)
beta = 0.85  # decaying factor beta for PageRank
max_iter = 10
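
The fragment configures the extractor plus beta and max_iter but stops before running it; per the KR-WordRank README, the usual continuation (using the sentence list text built above) is:

keywords, rank, graph = wordrank_extractor.extract(text, beta, max_iter)
for word, r in sorted(keywords.items(), key=lambda x: x[1], reverse=True)[:10]:
    print('%8s:\t%.4f' % (word, r))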
Example #9
from krwordrank.sentence import summarize_with_sentences

stop = 0

test = "국어 질문있습니다."
st = ''
for r in range(len(test)):  # note: test is a string, so this iterates character by character
    texts = test[r]
    texts = preprocessing(texts, b_idx)  # b_idx is defined in the surrounding code (not shown)
    st += texts
texts = st.split('. ')
try:
    stopwords = {b_idx.split(' ')[0], b_idx.split(' ')[1]}
    keywords, sents = summarize_with_sentences(texts,
                                               stopwords=stopwords,
                                               num_keywords=100,
                                               num_keysents=10)
except ValueError:
    print('key가 없습니다.')  # "no keywords found"
    print()
    continue  # belongs to the enclosing loop this fragment was extracted from

for word, r in sorted(keywords.items(), key=lambda x: x[1], reverse=True)[:7]:
    #print('%8s:\t%.4f' % (word, r))
    print('#%s' % word)
print()
if stop == 50:
    break
Example #10
                    diversity=0.3,
                    topk=10)

for sent in sents:
    print(sent)

import json

from krwordrank.sentence import summarize_with_sentences

penalty = lambda x: 0 if (10 <= len(x) <= 50) else 1
stopwords = {'잘 부탁드립니다', '부탁드립니다', '잘', '정말', '진짜'}

keywords, sents = summarize_with_sentences(
    texts,
    penalty=penalty,
    stopwords=stopwords,
    diversity=0.7,
    num_keywords=50,
    num_keysents=10,
    scaling=lambda x: 1,
    verbose=False,
)
for sent in sents:
    print(sent)

# convert to JSON
# the decode code doesn't work in Jupyter, but decoding would probably fix it
print(json.dumps(sents))
print('')
print(json.dumps(sents, indent=4))
print(type(json.dumps(sents)))
with open('words.json', 'w', encoding="utf-8") as make_file:
    json.dump(sents, make_file, ensure_ascii=False, indent="\t")
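
A note on the dumps above: json.dumps escapes non-ASCII by default, so the Korean sentences print as \uXXXX escape sequences; passing ensure_ascii=False (as the file write already does) keeps them human-readable:

print(json.dumps(sents, ensure_ascii=False, indent=4))  # Korean text stays readable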
Example #11
                    # prevents cases like '.”라고 전화고 있다' (the text has to be copied directly from the body of the court decision)
                    if content[i][j + 1] == '”':
                        continue
                final_content.append(content[i][sentence_start:j + 1])
                sentence_start = j + 1

    print(k, '번째 글')
    #    print(final_content)

    #    from krwordrank.sentence import summarize_with_sentences

    penalty = lambda x: 0 if (15 <= len(x) <= 90) else 1

    keywords, sents = summarize_with_sentences(final_content,
                                               penalty=penalty,
                                               diversity=0.5,
                                               num_keywords=100,
                                               num_keysents=2,
                                               verbose=False)

    print(keywords)
    print(sents)

    driver.close()

    driver.switch_to.window(old_tab)

    print('----------------------------------')

driver.quit()
Example #12
from krwordrank.word import KRWordRank
from krwordrank.sentence import summarize_with_sentences
    text = text.split('.')
    if ' ' in text:
        text = list(filter(lambda a: a != ' ', text))
    if '' in text:
        text = list(filter(lambda a: a != '', text))

    print(len(text), text)
    keysents = ''
    if len(text) <= num_keysents:
        keysents = '. '.join(text)  # too few sentences to rank; keep them all
    else:
        keywords, keysents = summarize_with_sentences(
            text,
            diversity=0.7,
            num_keysents=1,
            scaling=lambda x: 1,
            verbose=False,
        )

        keysents = '. '.join(keysents)
        # print(keysents)

    basic_loader._save_text(keysents, idx)

    # score = rouge1(keywords, keysents, mecab_tokenizer)

    # print(idx, score)

    idx += 1
#     lexrank.summarize(text)
Example #13
def summary():
    cur = mysql.connection.cursor()
    user_email = get_jwt_identity()['user_email']
    pre_data = request.get_json()['paragraph']
    emotion = request.get_json()['strength_of_feeling']
    created_data_time = datetime.datetime.utcnow()
    data = []
    data_list = []
    data.append(pre_data)
    for sentence in data:
        list_sentence1 = sentence.split('\n')
        for list_sentence2 in list_sentence1:
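            # mark sentence boundaries ('. ', '? ', '! ') with a '  ...' token, then split on it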
            list_sentence = list_sentence2.replace('. ', '.   ...').replace(
                '? ', '?   ...').replace('! ', '!   ...').split('  ...')
            for lines in list_sentence:
                line = lines.strip()
                data_list.append(line)
    data_list1 = [s for s in data_list if s != '']  # drop empty strings
    texts = data_list1
    penalty = lambda x: 0 if (10 <= len(x) <= 120) else 1
    stopwords = {'오늘', '오늘은'}
    keywords, sents = summarize_with_sentences(texts,
                                               penalty=penalty,
                                               stopwords=stopwords,
                                               diversity=0.5,
                                               num_keywords=7,
                                               num_keysents=3,
                                               scaling=lambda x: 1,
                                               verbose=False,
                                               min_count=1)
    before_sentiment = []
    sentiment = []
    keyword = []
    for sent in sents:
        before_sentiment.append(sent)
    print(before_sentiment)

    def text_input(a):
        global graph
        with graph.as_default():
            txt = []
            txt.append(a)
            text = []
            for sentence in txt:
                temp_X = okt.morphs(sentence, stem=True)  # tokenize
                temp_X = [word for word in temp_X
                          if word not in stopwords]  # remove stopwords (reuses the summarizer stopwords from the enclosing scope)
                text.append(temp_X)
            seq = tokenizer.texts_to_sequences(text)
            padded = pad_sequences(seq, maxlen=max_len)
            pred = model.predict(padded)
            labels = [0, 1, 2, 3, 4]
        return labels[np.argmax(pred)]

    for i in range(3):
        sentiment.append(text_input(a=before_sentiment[i]))
    print(sentiment)

    def find_nearest(array, value):
        n = [abs(i - value) for i in array]
        idx = n.index(min(n))
        return idx

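    # pick the candidate sentence whose predicted sentiment label is numerically
    # closest to the user's requested strength_of_feeling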
    a = find_nearest(sentiment, emotion)
    summary_text = before_sentiment[a]

    # parameterized query: avoids SQL injection from user-supplied text
    cur.execute(
        "INSERT INTO user_summary (user_email, summary_text, created_data_time)"
        " VALUES (%s, %s, %s)",
        (str(user_email), str(summary_text), str(created_data_time)))

    mysql.connection.commit()

    result = {
        'user_email': user_email,
        'summary_text': summary_text,
        'created_data_time': created_data_time
    }

    return jsonify({'result': result})