Example #1
0
def summarize_text(text):
    """Return the TextRank summary of ``text``.

    A ``TextRank`` object is built from ``text`` and the result of calling
    its ``summarize()`` method with default arguments is returned.
    """
    # Workaround for the crash seen when django and konlpy are combined
    # (https://github.com/konlpy/konlpy/issues/104): attach the calling
    # thread to the JVM, but only when a JVM has already been started.
    jvm_running = jpype.isJVMStarted()
    if jvm_running:
        jpype.attachThreadToJVM()

    summarizer = TextRank(text)
    return summarizer.summarize()
Example #2
0
def summerize(event, context):
    """Summarize the text carried in ``event['body']``.

    Args:
        event: Mapping describing the request; only its ``'body'`` entry is
            read.  (The signature looks like a serverless HTTP handler --
            confirm against the deployment.)
        context: Unused.

    Returns:
        A dict with ``statusCode`` 400 and an empty ``body`` when the event
        has no ``'body'`` key or the body is empty/``None``.  Otherwise a
        dict with ``statusCode`` 200, the summary as ``body`` and CORS
        headers.
    """
    # ``.get`` so that a missing key is answered with the same 400 response
    # as an empty body instead of escaping as a KeyError.
    body = event.get('body')
    if not body:
        return {"statusCode": 400, "body": ""}

    textrank = TextRank(body,
                        phraser=ApiPhraser(api_url=phraser_api_url).phrases)

    return {
        "statusCode": 200,
        "body": textrank.summarize(),
        "headers": {
            "Access-Control-Allow-Origin": "*",
            "Access-Control-Allow-Methods": "POST"
        }
    }
Example #3
0
def parse_link(url):
    """Download the page at ``url`` and summarize its article body.

    Returns:
        A list ``[title, url, img_url, summary_lines]`` where ``title`` is
        the ``alt`` text of the image inside ``h3.ending_tit_new``,
        ``img_url`` is the ``src`` of the image inside ``span.img_wrap2``
        (or ``None`` when that span is absent) and ``summary_lines`` is the
        TextRank summary split on newlines.
    """
    page = bs(requests.get(url).text, 'html.parser')

    heading = page.find('h3', {'class': 'ending_tit_new'})
    title = str(heading.find('img')['alt'])

    img_url = page.find('span', {'class': 'img_wrap2'})
    if img_url:
        img_url = str(img_url.find('img')['src'])

    body = page.find('div', {'class': 'na_doc'})

    # (tag name, attribute filter) pairs removed from the article body before
    # its text is extracted; processed in this order.
    unwanted = (
        ('caption', {}),
        ('p', {'class': 'cap'}),
        ('div', {'class': 'tmp_source2'}),
        ('div', {'id': 'na_author_top'}),
        ('div', {'class': 't_pdate'}),
        ('div', {'class': 'na_cmt_bx'}),
        ('', {'style': 'display:none'}),
    )
    for tag_name, attrs in unwanted:
        for tag in body.find_all(tag_name, attrs):
            tag.extract()

    # Collapse every run of whitespace into a single space.
    text_all = ' '.join(body.get_text().split())

    import jpype
    if jpype.isJVMStarted():
        jpype.attachThreadToJVM()
    summary_lines = TextRank(text_all).summarize().split('\n')

    return [title, url, img_url, summary_lines]
Example #4
0
class TestTextrankr(unittest.TestCase):
    def setUp(self):
        self.text = "ํŠธ์œ„ํ„ฐ, \"์ •๋ณด๋‹น๊ตญ์— ๋ฐ์ดํ„ฐ ๋ถ„์„์ž๋ฃŒ ํŒ”์ง€ ์•Š๊ฒ ๋‹ค\". ํŠธ์œ„ํ„ฐ๊ฐ€ ์ˆ˜๋งŽ์€ ํŠธ์œ—์„ ๋ถ„์„ํ•ด ์ •๋ณด๋ฅผ ํŒ๋งคํ•˜๋Š” ์„œ๋น„์Šค๋ฅผ ๋ฏธ๊ตญ ์ •๋ณด๋‹น๊ตญ์—๋Š” ์ œ๊ณตํ•˜์ง€ ์•Š๊ธฐ๋กœ ํ–ˆ๋‹ค. ์›”์ŠคํŠธ๋ฆฌํŠธ์ €๋„์€ ๋ฏธ๊ตญ ์ •๋ณด๋‹น๊ตญ ๊ด€๊ณ„์ž ๋“ฑ์„ ์ธ์šฉํ•ด ๋ฐ์ดํ„ฐ๋งˆ์ด๋„ˆ(Dataminer)๊ฐ€ ์ •๋ณด๋‹น๊ตญ์— ๋Œ€ํ•œ ์„œ๋น„์Šค๋Š” ์ค‘๋‹จํ•˜๊ธฐ๋กœ ํ–ˆ๋‹ค๊ณ  9์ผ(ํ˜„์ง€์‹œ๊ฐ„) ๋ณด๋„ํ–ˆ๋‹ค. ํŠธ์œ„ํ„ฐ๊ฐ€ 5% ์ง€๋ถ„์„ ๊ฐ€์ง„ ๋ฐ์ดํ„ฐ๋งˆ์ด๋„ˆ๋Š” ์†Œ์…œ๋ฏธ๋””์–ด์ƒ ์ž๋ฃŒ๋ฅผ ๋ถ„์„ํ•ด ๊ณ ๊ฐ์ด ์˜์‚ฌ๊ฒฐ์ •์„ ํ•˜๋„๋ก ์ •๋ณด๋ฅผ ์ œ๊ณตํ•˜๋Š” ๊ธฐ์—…์ด๋‹ค. ํŠธ์œ„ํ„ฐ์— ์˜ฌ๋ผ์˜ค๋Š” ํŠธ์œ—์— ์‹ค์‹œ๊ฐ„์œผ๋กœ ์ ‘๊ทผํ•ด ๋ถ„์„ํ•œ ์ž๋ฃŒ๋ฅผ ๊ณ ๊ฐ์—๊ฒŒ ํŒ” ์ˆ˜ ์žˆ๋Š” ๋…์ ๊ถŒ์„ ๊ฐ–๊ณ  ์žˆ๋‹ค. ์ •๋ณด๋‹น๊ตญ์€ ์ด ํšŒ์‚ฌ๋กœ๋ถ€ํ„ฐ ๊ตฌ๋งคํ•œ ์ž๋ฃŒ๋กœ ํ…Œ๋Ÿฌ๋‚˜ ์ •์น˜์  ๋ถˆ์•ˆ์ • ๋“ฑ๊ณผ ๊ด€๋ จ๋œ ์ •๋ณด๋ฅผ ํš๋“ํ–ˆ๋‹ค. ์ด ํšŒ์‚ฌ๊ฐ€ ์ •๋ณด๋‹น๊ตญ์— ์„œ๋น„์Šค๋ฅผ ํŒ๋งคํ•˜์ง€ ์•Š๊ธฐ๋กœ ํ•œ ๊ฒƒ์€ ํŠธ์œ„ํ„ฐ์˜ ๊ฒฐ์ •์ธ ๊ฒƒ์œผ๋กœ ์•Œ๋ ค์กŒ๋‹ค. ๋ฐ์ดํ„ฐ๋งˆ์ด๋„ˆ ๊ฒฝ์˜์ง„์€ ์ตœ๊ทผ โ€œํŠธ์œ„ํ„ฐ๊ฐ€ ์ •๋ณด๋‹น๊ตญ์— ์„œ๋น„์Šคํ•˜๋Š” ๊ฒƒ์„ ์›์น˜ ์•Š๋Š”๋‹คโ€๊ณ  ๋ฐํ˜”๋‹ค๊ณ  ์ด ์‹ ๋ฌธ์€ ์ „ํ–ˆ๋‹ค. ํŠธ์œ„ํ„ฐ๋„ ์„ฑ๋ช…์„ ๋‚ด๊ณ  โ€œ์ •๋ณด๋‹น๊ตญ ๊ฐ์‹œ์šฉ์œผ๋กœ ๋ฐ์ดํ„ฐ๋ฅผ ํŒ”์ง€ ์•Š๋Š” ๊ฒƒ์€ ํŠธ์œ„ํ„ฐ์˜ ์˜ค๋ž˜๋œ ์ •์ฑ…โ€์ด๋ผ๋ฉฐ โ€œํŠธ์œ„ํ„ฐ ์ž๋ฃŒ๋Š” ๋Œ€์ฒด๋กœ ๊ณต๊ฐœ์ ์ด๊ณ  ๋ฏธ๊ตญ ์ •๋ถ€๋„ ๋‹ค๋ฅธ ์‚ฌ์šฉ์ž์ฒ˜๋Ÿผ ๊ณต๊ฐœ๋œ ์–ด์นด์šดํŠธ๋ฅผ ์‚ดํŽด๋ณผ ์ˆ˜ ์žˆ๋‹คโ€๊ณ  ํ•ด๋ช…ํ–ˆ๋‹ค. ๊ทธ๋Ÿฌ๋‚˜ ์ด๋Š” ์ด ํšŒ์‚ฌ๊ฐ€ 2๋…„ ๋™์•ˆ ์ •๋ณด๋‹น๊ตญ์— ์„œ๋น„์Šค๋ฅผ ์ œ๊ณตํ•ด ์˜จ ๋ฐ ๋Œ€ํ•ด์„œ๋Š” ํƒ€๋‹นํ•œ ์„ค๋ช…์ด ๋˜์ง€ ์•Š๋Š”๋‹ค. ํŠธ์œ„ํ„ฐ์˜ ์ด๋ฒˆ ๊ฒฐ์ •์€ ๋ฏธ๊ตญ์˜ ์ •๋ณด๊ธฐ์ˆ (IT)๊ธฐ์—…๊ณผ ์ •๋ณด๋‹น๊ตญ ๊ฐ„ ๊ฐˆ๋“ฑ์˜ ์—ฐ์žฅ ์„ ์ƒ์—์„œ ์ด๋ค„์ง„ ๊ฒƒ์œผ๋กœ ์—ฌ๊ฒจ์ง€๊ณ  ์žˆ๋‹ค. IT๊ธฐ์—…์€ ์ด์šฉ์ž ํ”„๋ผ์ด๋ฒ„์‹œ์— ๋ฌด๊ฒŒ ์ค‘์‹ฌ์„ ๋‘๋Š” ๋ฐ ๋น„ํ•ด ์ •๋ณด๋‹น๊ตญ์€ ๊ณต๊ณต์•ˆ์ „์„ ์šฐ์„ ์‹œํ•ด ์ฐจ์ด๊ฐ€ ์žˆ์—ˆ๋‹ค. 
ํŠนํžˆ ์• ํ”Œ์€ ์บ˜๋ฆฌํฌ๋‹ˆ์•„ ์ฃผ ์ƒŒ๋ฒ„๋„ˆ๋””๋…ธ ์ด๊ฒฉ๋ฒ”์˜ ์•„์ดํฐ์— ์ €์žฅ๋œ ์ •๋ณด๋ฅผ ๋ณด๊ฒ ๋‹ค๋ฉฐ ๋ฐ์ดํ„ฐ ์ž ๊ธˆ์žฅ์น˜ ํ•ด์ œ๋ฅผ ์š”๊ตฌํ•˜๋Š” ๋ฏธ ์—ฐ๋ฐฉ์ˆ˜์‚ฌ๊ตญ(FBI)๊ณผ ์†Œ์†ก๊นŒ์ง€ ์ง„ํ–‰ํ–ˆ๋‹ค. ์ •๋ณด๋‹น๊ตญ ๊ณ ์œ„ ๊ด€๊ณ„์ž๋„ โ€œํŠธ์œ„ํ„ฐ๊ฐ€ ์ •๋ณด๋‹น๊ตญ๊ณผ ๋„ˆ๋ฌด ๊ฐ€๊นŒ์›Œ ๋ณด์ด๋Š” ๊ฒƒ์„ ์šฐ๋ คํ•˜๋Š” ๊ฒƒ ๊ฐ™๋‹คโ€๊ณ  ๋งํ–ˆ๋‹ค. ๋ฐ์ดํ„ฐ๋งˆ์ด๋„ˆ๋Š” ๊ธˆ์œต๊ธฐ๊ด€์ด๋‚˜, ์–ธ๋ก ์‚ฌ ๋“ฑ ์ •๋ณด๋‹น๊ตญ์„ ์ œ์™ธํ•œ ๊ณ ๊ฐ์— ๋Œ€ํ•œ ์„œ๋น„์Šค๋Š” ๊ณ„์†ํ•  ๊ณ„ํš์ด๋‹ค. ."
        self.textrank = TextRank(self.text)

    def test_ranked(self):
        results = self.textrank.summarize(3, verbose=False)
        self.assertEqual(len(results), 3)
        self.assertEqual(results[0], "ํŠธ์œ„ํ„ฐ, \"์ •๋ณด๋‹น๊ตญ์— ๋ฐ์ดํ„ฐ ๋ถ„์„์ž๋ฃŒ ํŒ”์ง€ ์•Š๊ฒ ๋‹ค\".")

    def test_verbose(self):
        result = self.textrank.summarize(1, verbose=True)
        self.assertEqual(result, "ํŠธ์œ„ํ„ฐ, \"์ •๋ณด๋‹น๊ตญ์— ๋ฐ์ดํ„ฐ ๋ถ„์„์ž๋ฃŒ ํŒ”์ง€ ์•Š๊ฒ ๋‹ค\".")

    def test_sentence(self):
        sent = self.textrank.sentences[0]
        self.assertEqual(str(sent), "ํŠธ์œ„ํ„ฐ, \"์ •๋ณด๋‹น๊ตญ์— ๋ฐ์ดํ„ฐ ๋ถ„์„์ž๋ฃŒ ํŒ”์ง€ ์•Š๊ฒ ๋‹ค\".")
Example #5
0
def summareader():
    """Summarize every article linked from the ranking lists on the front page.

    Fetches ``https://news.naver.com``, collects each ``<a>`` inside the
    ``ul.section_list_ranking`` lists, downloads every linked page, strips
    scripts and markup from its ``div._article_body_contents`` element and
    summarizes the remaining text into three sentences.  Each summary is
    printed, followed by a separator line.

    Returns:
        A tuple ``(newstitle, article)`` of parallel lists: the link texts
        and the corresponding summaries.
    """
    base_url = "https://news.naver.com"
    # NOTE(review): certificate verification is disabled for every request
    # made here -- confirm this is intentional.
    context = ssl._create_unverified_context()

    response = urlopen.urlopen(base_url, context=context)
    front_page = bs4.BeautifulSoup(response, "html.parser")

    naverurl = []
    newstitle = []
    for ranking in front_page.find_all("ul", {"class": "section_list_ranking"}):
        for anchor in ranking.find_all("a"):
            naverurl.append(base_url + anchor.get('href'))
            newstitle.append(anchor.text.strip())

    article = []
    for article_url in naverurl:
        # The single unverified context created above is reused for every
        # article; no per-iteration context is needed.
        article_response = urlopen.urlopen(article_url, context=context)
        page = bs4.BeautifulSoup(article_response, "html.parser")

        newscontent = str(
            page.find("div", {"class": "_article_body_contents"}))
        # Drop <script> elements first so their source does not leak into
        # the text, then strip the remaining tags.
        newscontent = re.sub('<script.*?>.*?</script>', '', newscontent,
                             flags=re.I | re.S)
        articlecontent = re.sub('<.+?>', '', newscontent, flags=re.I | re.S)

        suma = TextRank(articlecontent).summarize(3)
        article.append(suma)
        print(suma)
        print(
            "--------------------------------------------------------------------------------------------"
        )
    return newstitle, article
Example #6
0
def summarize_text(text):
    """Return the TextRank summary of ``text``, or ``""`` for suspect input.

    The empty string is returned as soon as a sentence from
    ``get_sentences()`` equals the sentence right before it, or is longer
    than ``SENTENCE_MAX_SIZE``.  Otherwise the result of ``summarize()``
    is returned.
    """
    # Workaround for the django + konlpy crash
    # (https://github.com/konlpy/konlpy/issues/104).
    if jpype.isJVMStarted():
        jpype.attachThreadToJVM()

    # The ignore-word list is passed explicitly but is currently empty.
    _textrank = TextRank(text, [])

    last_seen = ""
    for sentence in _textrank.get_sentences():
        is_repeat = len(last_seen) > 0 and last_seen == sentence
        if is_repeat or len(sentence) > SENTENCE_MAX_SIZE:
            return ""
        last_seen = sentence

    return _textrank.summarize()
def summareader():
    """Summarize every article linked from the front-page ranking lists.

    Collects each ``<a>`` inside the ``ul.section_list_ranking`` lists of
    ``https://news.naver.com``, downloads and parses every linked article
    with ``Article(..., language='ko')`` and summarizes its text into three
    sentences.

    Returns:
        A tuple ``(newstitle, article)`` of parallel lists: the link texts
        and the corresponding summaries.
    """
    context = ssl._create_unverified_context()
    front_page = bs4.BeautifulSoup(
        urlopen.urlopen("https://news.naver.com", context=context),
        "html.parser")

    naverurl = []
    newstitle = []
    for ranking in front_page.find_all("ul", {"class": "section_list_ranking"}):
        for anchor in ranking.find_all("a"):
            naverurl.append("https://news.naver.com" + anchor.get('href'))
            newstitle.append(anchor.text.strip())

    article = []
    for link in naverurl:
        news = Article(link, language='ko')
        news.download()
        news.parse()
        article.append(TextRank(news.text).summarize(3))

    return newstitle, article

# test = summareader()
# print(test[0][0])
# print(test[1][0])
Example #8
0
def total_work(total_train, max_attempts=10):
    """Build a Markov model from summarized rows and generate up to 8 lines.

    For every row the text column is summarized with TextRank.  Rows with
    at least 8 sentences contribute an 8-sentence summary.  Shorter rows
    contribute ``main + rest + main``, where ``main`` is an ``m``-sentence
    summary (``m = (7 - count) // 2 + 1``) and ``rest`` is a summary of all
    sentences, with the ``main`` sentences removed when ``m > 2``.

    Args:
        total_train: Table indexed with ``.loc[row, column]`` for ``row`` in
            ``range(len(total_train))`` -- presumably a pandas DataFrame
            with a default integer index; confirm against the caller.
        max_attempts: How many times sentence generation is retried when
            the model yields nothing.

    Returns:
        A list of generated lines; lines longer than 50 characters are
        split into a 50-character head and the remainder.  An empty list is
        returned when every attempt produced no sentence.
    """
    count_col = "๋ฌธ์žฅ ๊ฐœ์ˆ˜"
    text_col = "์‹œ"

    new_text = []
    mytokenizer: MyTokenizer = MyTokenizer()
    textrank: TextRank = TextRank(mytokenizer)
    for row in range(len(total_train)):
        sentence_count = total_train.loc[row, count_col]
        source_text = total_train.loc[row, text_col]

        if sentence_count >= 8:
            new_text.append(textrank.summarize(source_text, 8) + "\n")
            continue

        m = (7 - sentence_count) // 2 + 1
        main_summarized: str = textrank.summarize(source_text, m)
        summarized: str = textrank.summarize(source_text, sentence_count)

        if m > 2:
            # Remove each main sentence (not individual characters of the
            # joined string) from the full summary.  Iterating the split
            # result also copes with fewer than ``m`` sentences coming back.
            for main_sentence in main_summarized.split("\n"):
                if main_sentence:
                    summarized = summarized.replace(main_sentence, "")

        new_text.append(main_summarized + "\n" + summarized + "\n" +
                        main_summarized + "\n")

    text_model = markovify.NewlineText(new_text, state_size=2)

    # Only generation is random, so retry that step alone rather than
    # redoing the whole summarization, and bound the number of retries.
    for _attempt in range(max_attempts):
        candidates = (text_model.make_sentence() for _ in range(8))
        lyrics = [line for line in candidates if line is not None]

        final = []
        for line in lyrics:
            if len(line) > 50:
                final.append(line[:50])
                final.append(line[50:])
            else:
                final.append(line)

        if final:
            return final
    return []
Example #9
0
def textfile_similarity():
    """Return a 3-sentence summary of the posted JSON payload as JSON text.

    For ``POST`` requests the JSON body is passed through
    ``text_parse_sen``, summarized into three sentences with
    ``verbose=False`` and returned serialized by ``json.dumps`` with
    ``ensure_ascii=False``.  Any other method gets the placeholder string
    ``'3sentence'``.
    """
    # Decide on the method first so that non-POST requests neither parse a
    # body nor run the summarizer.
    if request.method != 'POST':
        return '3sentence'

    data = text_parse_sen(request.get_json())
    mytokenizer: MyTokenizer = MyTokenizer()
    textrank: TextRank = TextRank(mytokenizer)

    k: int = 3  # num sentences in the resulting summary

    # Summarize once; the string-form summary computed previously was never
    # used.
    summaries: List[str] = textrank.summarize(data, k, verbose=False)
    return json.dumps(summaries, ensure_ascii=False)
Example #10
0
def parse_link(url):
    """Download the page at ``url`` and summarize its article body.

    The article container ``div.na_doc`` is cleaned (captions removed,
    everything following the reference/author marker removed), converted to
    text with ``HTML2Text`` and summarized with TextRank.

    Returns:
        A list ``[title, url, img_url, summary_lines]`` where ``title`` is
        the ``alt`` text of the image inside ``h3.ending_tit_new``,
        ``img_url`` is the ``src`` of the image inside ``span.img_wrap2``
        (or ``None`` when that span is absent) and ``summary_lines`` is the
        summary split on newlines.
    """
    req = requests.get(url)
    soup = bs(req.text, 'html.parser')

    title = str(soup.find('h3', {'class': 'ending_tit_new'}).find('img')['alt'])
    img_url = soup.find('span', {'class': 'img_wrap2'})
    if img_url:
        img_url = str(img_url.find('img')['src'])

    soup = soup.find('div', {'class': 'na_doc'})
    for tag in soup.find_all('p', {'class': 'cap'}):
        tag.extract()
    for tag in soup.find_all('caption'):
        tag.extract()

    # Marker after which nothing belongs to the article: the reference list
    # if present, otherwise the author block.
    dl = soup.find('dl', {'class': 'na_reference'})
    if dl is None:
        dl = soup.find('div', {'id': 'na_author_top'})
    if dl is not None:
        # Strip only when a marker was found; the previous inverted test
        # dereferenced ``dl`` exactly when it was None.  Snapshot the
        # iterator first because extracting rewires the links it follows.
        for element in list(dl.next_elements):
            element.extract()
    text_all = str(soup)

    helper = HTML2Text()
    helper.ignore_links = True
    helper.ignore_images = True
    helper.ignore_tables = True
    text_all = helper.handle(text_all)

    # Replace leftover markup characters and entity fragments with spaces,
    # then collapse whitespace.
    for ch in ['#', '/', '*', '_', '>', '&gt', '&lt', ';', ':', '\\']:
        text_all = text_all.replace(ch, ' ')
    text_all = ' '.join(text_all.split())

    ret = [title, url, img_url]

    import jpype
    if jpype.isJVMStarted():
        jpype.attachThreadToJVM()
    ret.append(TextRank(text_all).summarize().split('\n'))

    return ret
Example #11
0
from __future__ import print_function
from textrankr import TextRank

textrank = TextRank("์‹œ๋Š” ์ธ๊ฐ„์˜ ์‚ถ์„ ๋ฐ˜์˜ํ•œ๋‹ค. ์‹œ์—์„œ ๋ฐ˜์˜์€ ํ˜„์‹ค๊ณผ ์ธ์ƒ์„ ๋ชจ๋ฐฉํ•œ๋‹ค๋Š” ์˜๋ฏธ์—์„œ ์™ธ๋ถ€ ํ˜„์‹ค์„ ์‹œ ์†์— ๋‹ด์•„๋‚ด๋Š”๊ฒƒ์œผ๋กœ, ์—ญ์‚ฌ์™€ ํ˜„์‹ค์˜ ์ƒํ™ฉ์„ ์‹œ๋ฅผ ํ†ตํ•ด ์–ด๋–ป๊ฒŒ ์žฌํ˜„ํ• ๊ฒƒ์ธ๊ฐ€์— ์ดˆ์ ์„ ๋‘”๋‹ค. ์—ฌ๊ธฐ์„œ ๋ฐ˜์˜์€ โ€˜์žˆ๋Š” ๊ทธ๋Œ€๋กœ์˜ ํ˜„์‹คโ€™๋กœ์„œ์˜ ๋ฐ˜์˜๊ณผ โ€˜์žˆ์–ด์•ผ ํ•˜๋Š” ํ˜„์‹คโ€™๋กœ์„œ์˜ ๋ฐ˜์˜์œผ๋กœ ๊ตฌ๋ถ„ํ• ์ˆ˜ ์žˆ๋‹ค. ์ „์ž๋Š” ์—ญ์‚ฌ์™€ ํ˜„์‹ค์˜ ๋ชจ์Šต์„ ์‚ฌ์‹ค ๊ทธ๋Œ€๋กœ ๋ณด์—ฌ์ฃผ๋Š” ์ผ์ƒ์  ์ง„์‹ค์„ ๋ฐ˜์˜ํ•˜๋Š” ๊ฒƒ์„ ๋งํ•˜๊ณ , ํ›„์ž๋Š” ์ผ์ƒ์ ํ˜„์‹ค์„ ๋„˜์–ด ํ™”์ž๊ฐ€ ์ง€ํ–ฅํ•˜๋Š” ๋‹น์œ„์  ์ง„์‹ค์„ ๋ฐ˜์˜ํ•˜๋Š” ๊ฒƒ์„๋งํ•œ๋‹ค.")
print(textrank.summarize())
Example #12
0
def summarize_text(text):
    """Return the TextRank summary of ``text``.

    A ``TextRank`` object is built from ``text`` and the result of its
    ``summarize()`` call with default arguments is returned.
    """
    # Attach the calling thread only when a JVM is already running, the same
    # guard the other summarize_text variants use; attaching unconditionally
    # is not possible before the JVM has been started.
    if jpype.isJVMStarted():
        jpype.attachThreadToJVM()
    textrank = TextRank(text)
    return textrank.summarize()
Example #13
0
 def setUp(self) -> None:
     self.text: str = "ํŠธ์œ„ํ„ฐ, \"์ •๋ณด๋‹น๊ตญ์— ๋ฐ์ดํ„ฐ ๋ถ„์„์ž๋ฃŒ ํŒ”์ง€ ์•Š๊ฒ ๋‹ค\". ํŠธ์œ„ํ„ฐ๊ฐ€ ์ˆ˜๋งŽ์€ ํŠธ์œ—์„ ๋ถ„์„ํ•ด ์ •๋ณด๋ฅผ ํŒ๋งคํ•˜๋Š” ์„œ๋น„์Šค๋ฅผ ๋ฏธ๊ตญ ์ •๋ณด๋‹น๊ตญ์—๋Š” ์ œ๊ณตํ•˜์ง€ ์•Š๊ธฐ๋กœ ํ–ˆ๋‹ค. ์›”์ŠคํŠธ๋ฆฌํŠธ์ €๋„์€ ๋ฏธ๊ตญ ์ •๋ณด๋‹น๊ตญ ๊ด€๊ณ„์ž ๋“ฑ์„ ์ธ์šฉํ•ด ๋ฐ์ดํ„ฐ๋งˆ์ด๋„ˆ(Dataminer)๊ฐ€ ์ •๋ณด๋‹น๊ตญ์— ๋Œ€ํ•œ ์„œ๋น„์Šค๋Š” ์ค‘๋‹จํ•˜๊ธฐ๋กœ ํ–ˆ๋‹ค๊ณ  9์ผ(ํ˜„์ง€์‹œ๊ฐ„) ๋ณด๋„ํ–ˆ๋‹ค. ํŠธ์œ„ํ„ฐ๊ฐ€ 5% ์ง€๋ถ„์„ ๊ฐ€์ง„ ๋ฐ์ดํ„ฐ๋งˆ์ด๋„ˆ๋Š” ์†Œ์…œ๋ฏธ๋””์–ด์ƒ ์ž๋ฃŒ๋ฅผ ๋ถ„์„ํ•ด ๊ณ ๊ฐ์ด ์˜์‚ฌ๊ฒฐ์ •์„ ํ•˜๋„๋ก ์ •๋ณด๋ฅผ ์ œ๊ณตํ•˜๋Š” ๊ธฐ์—…์ด๋‹ค. ํŠธ์œ„ํ„ฐ์— ์˜ฌ๋ผ์˜ค๋Š” ํŠธ์œ—์— ์‹ค์‹œ๊ฐ„์œผ๋กœ ์ ‘๊ทผํ•ด ๋ถ„์„ํ•œ ์ž๋ฃŒ๋ฅผ ๊ณ ๊ฐ์—๊ฒŒ ํŒ” ์ˆ˜ ์žˆ๋Š” ๋…์ ๊ถŒ์„ ๊ฐ–๊ณ  ์žˆ๋‹ค. ์ •๋ณด๋‹น๊ตญ์€ ์ด ํšŒ์‚ฌ๋กœ๋ถ€ํ„ฐ ๊ตฌ๋งคํ•œ ์ž๋ฃŒ๋กœ ํ…Œ๋Ÿฌ๋‚˜ ์ •์น˜์  ๋ถˆ์•ˆ์ • ๋“ฑ๊ณผ ๊ด€๋ จ๋œ ์ •๋ณด๋ฅผ ํš๋“ํ–ˆ๋‹ค. ์ด ํšŒ์‚ฌ๊ฐ€ ์ •๋ณด๋‹น๊ตญ์— ์„œ๋น„์Šค๋ฅผ ํŒ๋งคํ•˜์ง€ ์•Š๊ธฐ๋กœ ํ•œ ๊ฒƒ์€ ํŠธ์œ„ํ„ฐ์˜ ๊ฒฐ์ •์ธ ๊ฒƒ์œผ๋กœ ์•Œ๋ ค์กŒ๋‹ค. ๋ฐ์ดํ„ฐ๋งˆ์ด๋„ˆ ๊ฒฝ์˜์ง„์€ ์ตœ๊ทผ โ€œํŠธ์œ„ํ„ฐ๊ฐ€ ์ •๋ณด๋‹น๊ตญ์— ์„œ๋น„์Šคํ•˜๋Š” ๊ฒƒ์„ ์›์น˜ ์•Š๋Š”๋‹คโ€๊ณ  ๋ฐํ˜”๋‹ค๊ณ  ์ด ์‹ ๋ฌธ์€ ์ „ํ–ˆ๋‹ค. ํŠธ์œ„ํ„ฐ๋„ ์„ฑ๋ช…์„ ๋‚ด๊ณ  โ€œ์ •๋ณด๋‹น๊ตญ ๊ฐ์‹œ์šฉ์œผ๋กœ ๋ฐ์ดํ„ฐ๋ฅผ ํŒ”์ง€ ์•Š๋Š” ๊ฒƒ์€ ํŠธ์œ„ํ„ฐ์˜ ์˜ค๋ž˜๋œ ์ •์ฑ…โ€์ด๋ผ๋ฉฐ โ€œํŠธ์œ„ํ„ฐ ์ž๋ฃŒ๋Š” ๋Œ€์ฒด๋กœ ๊ณต๊ฐœ์ ์ด๊ณ  ๋ฏธ๊ตญ ์ •๋ถ€๋„ ๋‹ค๋ฅธ ์‚ฌ์šฉ์ž์ฒ˜๋Ÿผ ๊ณต๊ฐœ๋œ ์–ด์นด์šดํŠธ๋ฅผ ์‚ดํŽด๋ณผ ์ˆ˜ ์žˆ๋‹คโ€๊ณ  ํ•ด๋ช…ํ–ˆ๋‹ค. ๊ทธ๋Ÿฌ๋‚˜ ์ด๋Š” ์ด ํšŒ์‚ฌ๊ฐ€ 2๋…„ ๋™์•ˆ ์ •๋ณด๋‹น๊ตญ์— ์„œ๋น„์Šค๋ฅผ ์ œ๊ณตํ•ด ์˜จ ๋ฐ ๋Œ€ํ•ด์„œ๋Š” ํƒ€๋‹นํ•œ ์„ค๋ช…์ด ๋˜์ง€ ์•Š๋Š”๋‹ค. ํŠธ์œ„ํ„ฐ์˜ ์ด๋ฒˆ ๊ฒฐ์ •์€ ๋ฏธ๊ตญ์˜ ์ •๋ณด๊ธฐ์ˆ (IT)๊ธฐ์—…๊ณผ ์ •๋ณด๋‹น๊ตญ ๊ฐ„ ๊ฐˆ๋“ฑ์˜ ์—ฐ์žฅ ์„ ์ƒ์—์„œ ์ด๋ค„์ง„ ๊ฒƒ์œผ๋กœ ์—ฌ๊ฒจ์ง€๊ณ  ์žˆ๋‹ค. IT๊ธฐ์—…์€ ์ด์šฉ์ž ํ”„๋ผ์ด๋ฒ„์‹œ์— ๋ฌด๊ฒŒ ์ค‘์‹ฌ์„ ๋‘๋Š” ๋ฐ ๋น„ํ•ด ์ •๋ณด๋‹น๊ตญ์€ ๊ณต๊ณต์•ˆ์ „์„ ์šฐ์„ ์‹œํ•ด ์ฐจ์ด๊ฐ€ ์žˆ์—ˆ๋‹ค. 
ํŠนํžˆ ์• ํ”Œ์€ ์บ˜๋ฆฌํฌ๋‹ˆ์•„ ์ฃผ ์ƒŒ๋ฒ„๋„ˆ๋””๋…ธ ์ด๊ฒฉ๋ฒ”์˜ ์•„์ดํฐ์— ์ €์žฅ๋œ ์ •๋ณด๋ฅผ ๋ณด๊ฒ ๋‹ค๋ฉฐ ๋ฐ์ดํ„ฐ ์ž ๊ธˆ์žฅ์น˜ ํ•ด์ œ๋ฅผ ์š”๊ตฌํ•˜๋Š” ๋ฏธ ์—ฐ๋ฐฉ์ˆ˜์‚ฌ๊ตญ(FBI)๊ณผ ์†Œ์†ก๊นŒ์ง€ ์ง„ํ–‰ํ–ˆ๋‹ค. ์ •๋ณด๋‹น๊ตญ ๊ณ ์œ„ ๊ด€๊ณ„์ž๋„ โ€œํŠธ์œ„ํ„ฐ๊ฐ€ ์ •๋ณด๋‹น๊ตญ๊ณผ ๋„ˆ๋ฌด ๊ฐ€๊นŒ์›Œ ๋ณด์ด๋Š” ๊ฒƒ์„ ์šฐ๋ คํ•˜๋Š” ๊ฒƒ ๊ฐ™๋‹คโ€๊ณ  ๋งํ–ˆ๋‹ค. ๋ฐ์ดํ„ฐ๋งˆ์ด๋„ˆ๋Š” ๊ธˆ์œต๊ธฐ๊ด€์ด๋‚˜, ์–ธ๋ก ์‚ฌ ๋“ฑ ์ •๋ณด๋‹น๊ตญ์„ ์ œ์™ธํ•œ ๊ณ ๊ฐ์— ๋Œ€ํ•œ ์„œ๋น„์Šค๋Š” ๊ณ„์†ํ•  ๊ณ„ํš์ด๋‹ค. ."
     self.tokenizer: OktTokenizer = OktTokenizer()
     self.textrank: TextRank = TextRank(self.tokenizer)
Example #14
0
        for sent1, sent2 in combinations(self.sentences, 2):
            weight = self._jaccard(sent1, sent2)
            if weight:
                self.graph.add_edge(sent1, sent2, weight=weight)

    def _jaccard(self, sent1, sent2):
        """Return the overlap ratio of the two sentences' ``bow`` multisets.

        The ratio is the total count of ``sent1.bow & sent2.bow`` divided by
        the total count of ``sent1.bow | sent2.bow``; ``0`` is returned when
        the union is empty.
        """
        union_total = sum((sent1.bow | sent2.bow).values())
        if not union_total:
            return 0
        shared_total = sum((sent1.bow & sent2.bow).values())
        return shared_total / union_total

    def summarize(self, count=3, verbose=True):
        """Return the texts of the first ``count`` entries of ``self.reordered``.

        The selected sentences are ordered by their ``index`` attribute.
        With ``verbose`` true the texts are joined with newlines into one
        string; otherwise the list of texts is returned.
        """
        selected = list(self.reordered[:count])
        selected.sort(key=lambda sentence: sentence.index)
        texts = [sentence.text for sentence in selected]
        return '\n'.join(texts) if verbose else texts


from textrankr import TextRank
if __name__ == '__main__':
    # Read the whole input file in one go; the ``with`` block closes the
    # handle even if summarization fails afterwards.
    with open('file.txt', 'r') as f:
        text = f.read()

    textrank = TextRank(text)
    print(textrank.summarize())
Example #15
0
from typing import List
from textrankr import TextRank
from konlpy.tag import Okt


class OktTokenizer:
    """Callable tokenizer that delegates to ``Okt.phrases``."""

    # A single Okt instance, created when the class body runs and shared by
    # every OktTokenizer.
    okt: Okt = Okt()

    def __call__(self, text: str) -> List[str]:
        """Return the phrases ``self.okt`` extracts from ``text``."""
        return self.okt.phrases(text)


# Wire the tokenizer into the summarizer.
okt_tokenizer: OktTokenizer = OktTokenizer()
textrank: TextRank = TextRank(okt_tokenizer)

# Number of sentences requested from the summarizer.
k: int = 5

# Load the input and turn every line break into a single space.
with open('../data/test1_punct.txt', 'r') as f:
    text = ' '.join(f.read().split('\n'))

# summarize(text, k) without ``verbose=False`` is the string-returning form:
# summarized: str = textrank.summarize(text, k)
# print(summarized)

# With verbose=False the result is iterated as a list of sentences.
summaries: List[str] = textrank.summarize(text, k, verbose=False)
for i, summary in enumerate(summaries):
    print(i, summary)
Example #16
0
 def test_ranked(self):
     """Print the default summary of ``self.text``; nothing is asserted."""
     print(TextRank(self.text).summarize())
Example #17
0
 def decode(self, text, count=3):
     """Summarize ``text`` with TextRank, passing ``count`` to ``summarize``."""
     summarizer = TextRank(text)
     return summarizer.summarize(count)
Example #18
0
 def test_ranked(self):
     textrank = TextRank(self.text)
     self.assertEqual(textrank.summarize(1),
                      "ํŠธ์œ„ํ„ฐ, \"์ •๋ณด๋‹น๊ตญ์— ๋ฐ์ดํ„ฐ ๋ถ„์„์ž๋ฃŒ ํŒ”์ง€ ์•Š๊ฒ ๋‹ค\".")
Example #19
0
 def test_ranked(self):
     textrank = TextRank(self.text)
     self.assertEqual(textrank.summarize(1), "ํŠธ์œ„ํ„ฐ, \"์ •๋ณด๋‹น๊ตญ์— ๋ฐ์ดํ„ฐ ๋ถ„์„์ž๋ฃŒ ํŒ”์ง€ ์•Š๊ฒ ๋‹ค\".")
Example #20
0
from lexrankr import LexRank

import asyncio
from contextlib import suppress
import queue

#from html2text import html2text

# Workaround: preload kkma (konlpy) at import time.
# Loading the kkma morphological analyzer takes a very long time (about 10 seconds?).

#########################################
# Attach the importing thread to the JVM when one has already been started.
if jpype.isJVMStarted():
    jpype.attachThreadToJVM()

# Build a throwaway TextRank while the module is imported; per the note
# above this is meant to pay the analyzer start-up cost once, up front.
_textrank = TextRank("test")
#########################################


def summarize_text(text):
    """Return the TextRank summary of ``text``.

    A fresh ``TextRank`` object is built from ``text`` on every call and the
    result of its ``summarize()`` call with default arguments is returned.
    """
    # Workaround for the crash seen when django and konlpy are combined
    # (https://github.com/konlpy/konlpy/issues/104): attach the calling
    # thread to the JVM, but only when a JVM has already been started.
    jvm_running = jpype.isJVMStarted()
    if jvm_running:
        jpype.attachThreadToJVM()

    summarizer = TextRank(text)
    return summarizer.summarize()
Example #21
0
def summarizeTextRank(text, max=3):
    """Summarize ``text`` with TextRank, passing ``max`` to ``summarize``."""
    return TextRank(text).summarize(max)
Example #22
0
from __future__ import print_function
from textrankr import TextRank

# Not this one -- this version cannot split the text unless sentences are separated by line breaks!

textrank = TextRank('''
์œก์•„์˜ ํ•„์ˆ˜ํ’ˆ์ด ๋œ ๋ฌผํ‹ฐ์Šˆ๊ฐ€ ์•Œ๋ ˆ๋ฅด๊ธฐ์˜ ์›์ธ์ด ๋  ์ˆ˜ ์žˆ๋‹ค๋Š” ์—ฐ๊ตฌ ๊ฒฐ๊ณผ๊ฐ€ ๋‚˜์™”๋‹ค.

๋ฏธ๊ตญ ๋…ธ์Šค์›จ์Šคํ„ด๋Œ€ํ•™๊ต ์—ฐ๊ตฌ์ง„์€ ๋ฌผํ‹ฐ์Šˆ๋กœ ์•„๊ธฐ๋ฅผ ๋‹ฆ์œผ๋ฉด ํ”ผ๋ถ€์— ๋‚จ์€ ๋น„๋ˆ„๊ธฐ(๋ผ์šฐ๋ฆดํ™ฉ์‚ฐ๋‚˜ํŠธ๋ฅจ ๋“ฑ ๊ณ„๋ฉดํ™œ์„ฑ์ œ)๊ฐ€ ๋ณดํ˜ธ๋ง‰ ์—ญํ• ์„ ํ•˜๋Š” ๊ธฐ๋ฆ„๊ธฐ(์ง€์งˆ)๋ฅผ ์—†์•  ์•„๊ธฐ๋“ค์—๊ฒŒ ์•Œ๋ ˆ๋ฅด๊ธฐ๊ฐ€ ์ƒ๊ธฐ๊ธฐ ์‰ฝ๋‹ค๊ณ  ๋ฐํ˜”๋‹ค.

ํŠนํžˆ ์ด๋ฒˆ ์—ฐ๊ตฌ์—์„œ ์ฃผ๋ชฉํ•  ์ ์€ ์œ ์ „์ ์œผ๋กœ ํ”ผ๋ถ€๊ฐ€ ์•ฝํ•œ ์•„๊ธฐ๋“ค์—๊ฒŒ ๋ฌผํ‹ฐ์Šˆ์˜ ํ™”ํ•™๋ฌผ์งˆ์€ ์‹ํ’ˆ ์•Œ๋ ˆ๋ฅด๊ธฐ์˜ ์›์ธ์ด ๋  ์ˆ˜ ์žˆ๋‹ค๋Š” ๊ฒƒ์ด๋‹ค.

์กฐ์•ค ์ฟก ๋ฐ€์Šค ๊ต์ˆ˜๋Š” '์œ ์ „์ž, ๋จผ์ง€์™€ ์Œ์‹, ๊ทธ๋ฆฌ๊ณ  ๋ฌผํ‹ฐ์Šˆ๋Š” ์œ ์•„ ์•Œ๋ ˆ๋ฅด๊ธฐ์˜ '๋” ํ•  ์ˆ˜ ์—†์ด ๋‚˜์œ ์ƒํ™ฉ(perfect storm)'์ด๋‹ค'๋ผ๊ณ  ๋งํ–ˆ๋‹ค.

์—ฐ๊ตฌ์— ๋”ฐ๋ฅด๋ฉด ์•Œ๋ ˆ๋ฅด๊ธฐ๊ฐ€ ์žˆ๋Š” ์•„์ด๋“ค์€ ํ”ผ๋ถ€ ๋ณดํ˜ธ๋ง‰์„ ์•ฝํ•˜๊ฒŒ ํ•˜๋Š” ์„ธ ๊ฐ€์ง€ ๋Œ์—ฐ๋ณ€์ด ์œ ์ „์ž๊ฐ€ ์žˆ์—ˆ๋‹ค. ๊ทธ๋Ÿฌ๋‚˜ ํ•ด๋‹น ์œ ์ „์ž๋ฅผ ๊ฐ€์กŒ๋‹ค๊ณ  ํ•ด์„œ ๋ชจ๋‘ ์‹ํ’ˆ ์•Œ๋ ˆ๋ฅด๊ธฐ ๋ฐ˜์‘์„ ๋ณด์ธ ๊ฒƒ์€ ์•„๋‹ˆ์—ˆ๋‹ค.

์˜ˆ์ปจ๋Œ€ ํ•ด๋‹น ์œ ์ „์ž๋ฅผ ๊ฐ€์ง„ ์‹คํ—˜์šฉ ์ƒ์ฅ๋ฅผ ๋•…์ฝฉ์— ๋…ธ์ถœํ•ด๋„ ๋ณ„๋‹ค๋ฅธ ๋ฐ˜์‘์ด ๋‚˜ํƒ€๋‚˜์ง€ ์•Š์•˜๋˜ ๊ฒƒ. ์—ฐ๊ตฌ์ง„์€ ํ™˜๊ฒฝ์  ์š”์ธ์— ์ฃผ๋ชฉํ–ˆ๋‹ค. ์‹คํ—˜ ๋์— ์—ฐ๊ตฌ์ง„์ด ์‹ํ’ˆ ์•Œ๋ ˆ๋ฅด๊ธฐ๋ฅผ ์œ ๋ฐœํ•˜๋Š” ์›์ธ์œผ๋กœ ์ง€๋ชฉํ•œ ๊ฒƒ ์ค‘ ํ•˜๋‚˜๊ฐ€ ๋ฌผํ‹ฐ์Šˆ์˜€๋‹ค.

์—ฐ๊ตฌ์ง„์€ ๋ฌผํ‹ฐ์Šˆ์˜ ๋น„๋ˆ„ ์„ฑ๋ถ„์ด ํ”ผ๋ถ€์˜ ๋ณดํ˜ธ๋ง‰์„ ์•ฝํ•˜๊ฒŒ ๋งŒ๋“ค์–ด ํŠนํžˆ ์œ ์ „์ ์œผ๋กœ ์ทจ์•ฝํ•œ ์•„์ด๋“ค์—๊ฒŒ ๋‚˜์œ ์˜ํ–ฅ์„ ๋ฏธ์น˜๋Š” ๊ฒƒ์„ ๋ฐœ๊ฒฌํ–ˆ๋‹ค.

์—ฐ๊ตฌ์ง„์€ ์œ ์ „์  ๊ฒฐํ•จ์ด ์žˆ๋Š” ์•„๊ธฐ๋“ค์˜ ํ”ผ๋ถ€๊ฐ€ ๋ฌผํ‹ฐ์Šˆ์˜ ๋น„๋ˆ„ ์„ฑ๋ถ„์„ ์ ‘์ด‰ํ–ˆ์„ ๋•Œ ๋จผ์ง€๋‚˜ ๋•…์ฝฉ ๋“ฑ ์•Œ๋ ˆ๋ฅด๊ธฐ ๋ฌผ์งˆ์ด ์ฒด๋‚ด์— ์œ ์ž…๋˜๊ธฐ ์‰ฝ๋‹ค๊ณ  ์ง€์ ํ–ˆ๋‹ค. ์ด ๊ฒฝ์šฐ ์˜ˆ์ปจ๋Œ€ ๋•…์ฝฉ๋ฒ„ํ„ฐ ์ƒŒ๋“œ์œ„์น˜๋ฅผ ๋จน์€ ๋ˆ„๊ตฐ๊ฐ€๊ฐ€ ์•„๊ธฐ์—๊ฒŒ ์ ‘์ด‰ํ•˜๋Š” ๊ฒƒ๋งŒ์œผ๋กœ๋„ ์•Œ๋ ˆ๋ฅด๊ธฐ๊ฐ€ ์ƒ๊ธธ ์ˆ˜ ์žˆ๋‹ค๊ณ  ์—ฐ๊ตฌ์ง„์€ ๊ฒฝ๊ณ ํ–ˆ๋‹ค.

์—ฐ๊ตฌ์ง„์€ '์‹ ์„ธ๋Œ€ ๋ถ€๋ชจ๋“ค์ด ๋ฌผํ‹ฐ์Šˆ ์‚ฌ์šฉ์„ ์ค„์ด๋Š” ๊ฒƒ์ด ๋ฐ”๋žŒ์งํ•˜๋‹ค'๋ฉด์„œ '๊ฐ€์žฅ ์ข‹์€ ๋ฐฉ๋ฒ•์€ ์˜ˆ์ „์— ๋ถ€๋ชจ๋“ค์ด ๊ทธ๋žฌ๋“ฏ์ด ๋ฌผ๋กœ ์”ป๊ธฐ๋Š” ๊ฒƒ'์ด๋ผ๊ณ  ๋ง๋ถ™์˜€๋‹ค.
''')
print(textrank.summarize(1))
print(textrank.summarize(1))
Example #23
0
def tokenize(doc):
    """Return the phrases ``pos_tagger2`` finds in ``doc`` as a list.

    Each phrase is passed through ``''.join`` before being collected.
    """
    phrases = pos_tagger2.phrases(doc)
    return list(map(''.join, phrases))


# In[7]:


def term_exists(doc):
    """Map every entry of the module-level ``tokens`` to its presence in ``doc``.

    Args:
        doc: Iterable of hashable items.

    Returns:
        A dict keyed by ``'{}'.format(word)`` for each ``word`` in
        ``tokens``; the value is ``True`` when ``word`` occurs in ``doc``.
    """
    # Build the lookup set once instead of once per token.
    present = set(doc)
    return {'{}'.format(word): (word in present) for word in tokens}


# In[15]:

# Summarize the selected text into 5 sentences and split the result on
# newlines, one entry per sentence.
text = select_sentence()
textrank = TextRank(text)
main_sentences = textrank.summarize(5).split('\n')
# Shuffle in place, then take three sentences off the end of the list.
# Each pop() raises IndexError once the list is empty, i.e. when the
# summary had fewer than three lines.
random.shuffle(main_sentences)
main_sentence = main_sentences.pop()
main_sentence2 = main_sentences.pop()
main_sentence3 = main_sentences.pop()

# In[20]:

# Tagger used by tokenize(); it must be bound before tokenize() is called below.
pos_tagger2 = Okt()

# Each row is indexed as row[0] (text to tokenize) and row[1] (kept as-is,
# presumably the label -- confirm against select_sentence_by_keyword).
trainSet = select_sentence_by_keyword()
train_docs = [(tokenize(row[0]), row[1]) for row in trainSet]
# Flat list of every token of every document; term_exists() reads this global.
tokens = [t for d in train_docs for t in d[0]]
# Pair each document's term-presence dict with its second field.
train_xy = [(term_exists(d), c) for d, c in train_docs]
Example #24
0
    line_modify = []
    content = ""

    line_modify.append(line[0])
    #line_modify.append(line[1])
    line_modify.append(re.sub('[^0-9a-zA-Zใ„ฑ-ํž— .]', ' ', line[1]))

    # Remove the e-mail address at the very end of the article
    reversed_content = ''.join(reversed(line[2]))
    for i in range(0, len(line[2])):
        # In the reversed article text, the first reversed sentence ending (the two-character marker tested below) is where the article body ends, so everything after the body (ads, reporter info, etc.) is discarded
        if reversed_content[i:i + 2] == '.๋‹ค':
            content = ''.join(reversed(reversed_content[i:]))
            break

    content = content.replace('.', '. ')
    line_modify.append(re.sub('[^0-9a-zA-Zใ„ฑ-ํž— .]', ' ', content))

    text = TextRank(content)
    text = text.summarize()
    text = re.sub('[^0-9a-zA-Zใ„ฑ-ํž— .]', ' ', text)
    line_modify.append(text)
    print(text)

    line_modify.append(line[3])

    writer.writerow(line_modify)

f.close()
f2.close()
Example #25
0
from __future__ import print_function
from textrankr import TextRank
import sys

#textrank = TextRank("๋‚˜๋Š” ํ”„๋กœ๊ทธ๋ž˜๋จธ ์ž…๋‹ˆ๋‹ค. ์ž๋ฐ” ํ”„๋กœ๊ทธ๋ž˜๋จธ, ๊ทธ๋ƒฅ ํ”„๋กœ๊ทธ๋ž˜๋จธ");
#print(sys.argv[1]);
# Summarize the text given as the first command-line argument and print the
# result; sys.argv[1] raises IndexError when no argument is supplied.
textrank = TextRank(sys.argv[1])
print(textrank.summarize())