Example #1
def post():
    try:
        start = time()
        parser = reqparse.RequestParser()
        parser.add_argument('contents', type=str)
        args = parser.parse_args()

        contents = args['contents']
        print("==== contents ==== : ", contents)

        print(contents)

        # lexrank.summarize(contents)
        # originalText = "이 업체 물건 절대 사지마세요. 형편없습니다. 직원들이 고객은 안중에도 없습니다. 열 받게 만드는 데 선수입니다. 박지원 직원인지 대표인지 몰라도 열받게 하는 제주가 보통이 넘습니다. 다시 한 번 강조하지만 절대 네버네버 스위트피 사지 마세요. 절대 비추입니다."
        lex = LexRank()
        lex.summarize(contents)

        summaries = lex.probe(2)  # renamed from `sum` to avoid shadowing the builtin

        print(summaries)

        strd = summaries[1] + " " + summaries[0]

        resp = {"sum": strd}

        return jsonify(resp)
    except Exception as e:
        return {'error': str(e)}
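A minimal client sketch for exercising this handler, assuming the Flask app from Example #27 is running on Flask's default development address (http://localhost:5000 is an assumption, not part of the original examples):

import requests

# POST Korean text to the /summary endpoint (route shown in Example #27).
# The base URL is an assumption; adjust it to your deployment.
resp = requests.post(
    "http://localhost:5000/summary",
    data={"contents": "이 예제 문장은 요약 대상입니다. 두 번째 문장입니다. 세 번째 문장입니다."},
)
print(resp.json())  # e.g. {"sum": "..."}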
Example #2
def lexrank(rst):
    print("=" * 20)
    print(len(rst))
    print(rst[0])
    print("=" * 20)
    lexInputText = ""
    hangul = re.compile('[^ ㄱ-ㅣ가-힣]+')
    for news in rst:
        lexInputText += str(news['id'])
        parse = hangul.sub('', news['title'])
        parse = re.sub(' +', ' ', parse).strip()
        parse = re.sub(r'\.', '', parse)
        lexInputText += parse
        parse = hangul.sub('', news['content'])
        parse = re.sub(' +', ' ', parse).strip()
        parse = re.sub(r'\.', '', parse)
        lexInputText += parse
        lexInputText += ". "

    print("=" * 10 + "LEXRANK" + "=" * 10)
    lexrank = LexRank()
    lexrank.summarize(lexInputText)
    summaries = lexrank.probe(10)
    searchId = re.compile('[0-9]{5,6}')
    idList = []
    for summary in summaries:
        match = searchId.search(summary)
        if match:  # skip summaries that don't carry an embedded article id
            idList.append(match.group())
    return idList
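A minimal invocation sketch for the function above; the two records are made-up, but their 'id', 'title', and 'content' keys (and 5-6 digit ids, matching the regex) mirror what the function reads:

# Hypothetical input records; keys and id format mirror those accessed above.
rst = [
    {"id": 123456, "title": "첫 번째 기사 제목", "content": "첫 번째 기사 본문."},
    {"id": 234567, "title": "두 번째 기사 제목", "content": "두 번째 기사 본문."},
]
top_ids = lexrank(rst)  # ids of the articles whose text ranked highest
print(top_ids)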
Example #3
    def post(self):
        import jpype
        if jpype.isJVMStarted():
            jpype.attachThreadToJVM()
        result = []
        corpus = self.params['corpus']

        if corpus:
            result_count = self.params['result_count']
            data_save = self.params['data_save']

            if data_save.upper() == 'Y':
                self.getDataApi().WordPool.add_word_pool(corpus)

            config = {
                # 'useful_tags': ['Noun', 'Verb', 'Adjective', 'Determiner', 'Adverb', 'Conjunction', 'Josa', 'PreEomi',
                #                 'Eomi', 'Suffix', 'Alpha', 'Number'],
                'useful_tags': ['Noun', 'ProperNoun'],
                'min_token_length': 5
            }

            lexRank = LexRank(**config)
            lexRank.summarize(corpus)

            result_count = min(result_count, lexRank.num_sentences - 1)
            if result_count == 0:
                result_count = 1

            result = lexRank.probe(result_count)

        return result
Example #4
    def insertSumma(self):

        lexrank = LexRank()
        curs = self.conn.cursor()

        summa_total = ""
        sql = "SELECT id,raw,sentence_cnt FROM " + self.table + " ORDER BY id ASC limit 3;"
        curs.execute(sql)
        rows = curs.fetchall()
        for row in rows:
            print("insert summa to id=" + str(row[0]))
            text = "".join(row[1])
            if (lexrank.summarize(text) == 0):
                continue

            if (row[2] <= 20):
                summaries = lexrank.probe(5)
            else:
                summaries = lexrank.probe(10)

            for summa in summaries:
                summa_total += "".join(summa)
                summa_total += ". "

            print(" summa = " + summa_total + "\n")

            curs.execute("UPDATE " + self.table + " SET summa=%s WHERE id=%s;",
                         (summa_total, row[0]))
            self.conn.commit()
            summa_total = ""
Example #5
    def insertSumma(self):

        lexrank = LexRank()
        curs = self.conn.cursor()

        summa_total = ""
        sql = "SELECT id,raw,sentence_cnt FROM " + self.table + " WHERE sentence_cnt > 5  ORDER BY id ASC;"
        curs.execute(sql)
        rows = curs.fetchall()

        for row in rows:
            raw = row[1]
            sentence_cnt = row[2]
            text = "".join(raw)

            if (lexrank.summarize(text) == 0):
                continue

            if (sentence_cnt <= 20):
                summaries = lexrank.probe(5)
            else:
                summaries = lexrank.probe(10)

            for summa in summaries:
                summa_total += "".join(summa)
                summa_total += ". "

            curs.execute("UPDATE " + self.table + " SET summa=%s WHERE id=%s;",
                         (summa_total, row[0]))
            self.conn.commit()
            summa_total = ""
Example #6
def summarize_text_with_lexrank(text):
    lexrank = LexRank(n_clusters=1)
    lexrank.summarize(text)
    summaries = lexrank.probe(3)

    if len(summaries) == 0:
        return ""

    return ". ".join(summaries)
    def summarize(self, n_summary):
        print("summarize in %d sentences" % n_summary)
        lexrank = LexRank()
        lexrank.summarize(" ".join(self.preprocessed))
        summaries = lexrank.probe(n_summary)
        # `n_summary` can be `None` (the sentence count is then auto-detected)

        for summary in summaries:
            print(summary)
        self.summaries = summaries
        return summaries
Example #8
    def sum_talker(self, cluster):

        per_talker_dict = {}
        lexrank = LexRank(clustering=None)
        for sen in cluster.sentences:
            if sen.talker not in per_talker_dict:
                per_talker_dict[sen.talker] = ""
            per_talker_dict[sen.talker] += sen.text + "\n"

        for k, v in per_talker_dict.items():
            lexrank.summarize(v)
            self.sum_per_talker[k].append(lexrank.probe(1))
Example #9
    def sum_cluster(self):
        self.clusters = self.clustering()
        lexrank = LexRank(clustering=None)
        self.summaries = []
        for c in self.clusters:
            lexrank.summarize(c.sen2txt())
            for i, s in enumerate(c.sentences):
                print(i, s.text)
            if len(c.sentences) < 20:
                self.summaries.append(lexrank.probe(0.3))
            else:
                self.summaries.append(lexrank.probe(0.2))
            self.sum_talker(c)  # use the current cluster to summarize per talker
Example #10
def wordRank(text):
    text = text.replace('\\n', '.')
    new = re.sub(r'[^가-힣0-9a-zA-Z\s\.]', '', text)  # 가-힣 spans the full Hangul syllable block

    lexrank = LexRank()
    lexrank.summarize(new)
    summaries = lexrank.probe(3)
    word = Twitter()
    out = []
    for summary in summaries:
        out += word.nouns(summary)

    out = list(set(out))
    print(out)
Example #11
    def insertSentenceCount(self):

        lexrank = LexRank()
        curs = self.conn.cursor()

        sql = "SELECT id,raw FROM " + self.table + " ORDER BY id ASC;"
        curs.execute(sql)
        rows = curs.fetchall()

        for row in rows:
            text = "".join(row[1])
            sc = lexrank.factory.text2sentences(text)
            print("insert sentence_cnt to id=" + str(row[0]))
            print("sc=" + str(len(sc)) + "\n")
            curs.execute("UPDATE " + self.table + " SET sentence_cnt=%s WHERE id=%s;",
                         (str(len(sc)), str(row[0])))
            self.conn.commit()
Example #12
def videoSumalyze(pk):
    post = get_object_or_404(VideoPost, pk=pk)
    lang = 'en-US'
    if post.lang == '한국어':
        lang = 'ko-KR'
    from lexrankr import LexRank
    lexrank = LexRank()
    chunk = []
    chunk2 = []
    url = post.url
    path = youtubeToAudio(url)
    #post.title = re.sub('[^가-힣\\s]', '', str(path))
    chunk = splitandSTT(path, lang)
    text = " ".join(chunk)
    os.remove(path + '.mp3')
    os.remove(path)
    idxToDB = ''
    # apply summarization
    idx = 0
    while idx != (len(chunk)):
        try:
            lexrank.summarize(chunk[idx])
            summaries = lexrank.probe(3)
        except Exception:
            idx += 1
            continue
        summaries = [s + '. ' for s in summaries]  # probe(3) may return fewer than 3 sentences
        indexStr = ''.join(summaries)
        chunk[idx] = indexStr
        idxToDB += ibmIndex(indexStr, summaries)
        idxToDB += '#'
        chunk2.append(chunk[idx])
        idx += 1
    post.index = idxToDB

    chunk = []
    chunkToDB = ''
    for c in chunk2:
        chunkToDB += c + '\n'

    post.content = chunkToDB
    # run IBM Natural Language Understanding on the original text, not the summary
    post.keyword, post.relevance, post.category_ibm = ibmContent(text)
    post.save()
Example #13
def lexlank_function(result):
    # Reference: https://wikidocs.net/72820
    # LexRank : https://github.com/theeluwin/lexrankr
    try:
        lexrank = LexRank()
        lexrank.summarize(result)

        summarize_data = []
        print("요약 진행중!")
        summaries = lexrank.probe(10)
        for summary in summaries:
            summarize_data.append(summary)

        return summarize_data
    except Exception:
        print("요약 내용이 부족합니다.")
        return []
Example #14
    def summarize(self):
        lex = LexRank()
        lex.summarize(self.paragraph)

        if len(self.paragraph) < 100:
            self.probe_num = 1
        elif len(self.paragraph) < 200:
            self.probe_num = 2
        elif len(self.paragraph) < 300:
            self.probe_num = 3
        elif len(self.paragraph) < 400:
            self.probe_num = 4
        else:
            self.probe_num = 5

        summaries = lex.probe(self.probe_num)
        return summaries
Example #15
def imageSumalyze(pk):
    post = get_object_or_404(ImagePost, pk=pk)
    from lexrankr import LexRank
    lexrank = LexRank()
    path = os.path.dirname(os.path.dirname(
        os.path.abspath(__file__))) + '\\media\\' + str(post.pdf)
    text = img2Text(path)
    os.remove(path)
    chunk = []
    if len(text) < 650:
        chunk.append(text)
    else:
        chunk = list(map(''.join, zip(*[iter(text)] * 650)))
    chunk2 = []
    idxToDB = ''
    # apply summarization
    idx = 0
    while idx != (len(chunk)):
        try:
            lexrank.summarize(chunk[idx])
            summaries = lexrank.probe(3)
        except Exception:
            idx += 1
            continue
        summaries = [s + '. ' for s in summaries]  # probe(3) may return fewer than 3 sentences
        indexStr = ''.join(summaries)
        chunk[idx] = indexStr
        idxToDB += ibmIndex(indexStr, summaries)
        idxToDB += '#'
        chunk2.append(chunk[idx])
        idx += 1
    post.index = idxToDB

    chunk = []
    chunkToDB = ''
    for c in chunk2:
        chunkToDB += c + '\n'

    post.content = chunkToDB
    post.pdf = None
    post.index = idxToDB
    # run IBM Natural Language Understanding on the original text, not the summary
    post.keyword, post.relevance, post.category_ibm = ibmContent(text)
    post.save()
Example #16
def smry(q):
    while True:
        try:
            print("smry start")
            global count
            data = q.get()
            count += 1
            #print("get{}!".format(count))
            lexrank = LexRank()
            lexrank.summarize(data[4])  # summarize the body text stored in data[4]
            summaries = lexrank.probe(3)  # 3-sentence summary; `summaries` is a list
            data[4] = '. '.join(summaries) + '.'  # re-join the summary sentences with '. ' and store
            print(data)  # final record to be stored in the DB
            db_store(data)
            # for summary in summaries:
            #     print(summary)
        except (IndexError, ValueError, AttributeError):
            pass
Example #17
def audioSumalyze(pk):
    post = get_object_or_404(AudioPost, pk=pk)
    lang = 'en-US'
    if post.lang == '한국어':
        lang = 'ko-KR'
    from lexrankr import LexRank
    lexrank = LexRank()
    chunk2 = []
    chunk = []

    #speechtotext(str(post.pdf), lang, chunk)
    chunk = splitandSTT(path + str(post.pdf), lang)  # `path` is assumed to be defined at module scope
    text = " ".join(chunk)
    idxToDB = ''
    # apply summarization
    idx = 0
    while idx != (len(chunk)):
        try:
            lexrank.summarize(chunk[idx])
            summaries = lexrank.probe(3)
        except Exception:
            idx += 1
            continue
        summaries = [s + '. ' for s in summaries]  # probe(3) may return fewer than 3 sentences
        indexStr = ''.join(summaries)
        chunk[idx] = indexStr
        idxToDB += ibmIndex(indexStr, summaries)
        idxToDB += '#'
        chunk2.append(chunk[idx])
        idx += 1
    post.index = idxToDB

    chunk = []
    chunkToDB = ''
    for c in chunk2:
        chunkToDB += c + '\n'

    post.content = chunkToDB
    # run IBM Natural Language Understanding on the original text, not the summary
    post.keyword, post.relevance, post.category_ibm = ibmContent(text)
    post.save()
Example #18
def textSumalyze(pk):
    post = get_object_or_404(TextPost, pk=pk)
    from lexrankr import LexRank
    lexrank = LexRank()
    text = post.text
    chunk = []
    if len(text) < 650:
        chunk.append(text)
    else:
        chunk = list(map(''.join, zip(*[iter(text)] * 650)))
    chunk2 = []
    idxToDB = ''
    # apply summarization
    idx = 0
    while idx != (len(chunk)):
        try:
            lexrank.summarize(chunk[idx])
            summaries = lexrank.probe(3)
        except Exception:
            idx += 1
            continue
        summaries = [s + '. ' for s in summaries]  # probe(3) may return fewer than 3 sentences
        indexStr = ''.join(summaries)
        chunk[idx] = indexStr
        idxToDB += ibmIndex(indexStr, summaries)
        idxToDB += '#'
        chunk2.append(chunk[idx])
        idx += 1
    post.index = idxToDB

    chunk = []
    chunkToDB = ''
    for c in chunk2:
        chunkToDB += c + '\n'

    post.content = chunkToDB
    post.text = 'clear'
    # run IBM Natural Language Understanding on the original text, not the summary
    post.keyword, post.relevance, post.category_ibm = ibmContent(text)
    post.save()
Example #19
def lexranker(text,code,date):

    text = text.replace('\\n', '.')
    text2 = re.sub(r'[^가-힣0-9a-zA-Z\s\.]', '', text)  # 가-힣 spans the full Hangul syllable block

    lexrank = LexRank()
    #print(text2)
    lexrank.summarize(text2)
    summaries = lexrank.probe(3)
    word = Twitter()
    out = []
    print(summaries)
    for summary in summaries:
        out += word.nouns(summary)

    word = list(set(out))
    share = Share(code)
    startprice = share.get_open()
    endprice = share.get_price()
    for part in word:
        save_record(part, code, startprice, endprice, date)
Example #20
	def highlight(self, threshold=0.5):

		print("highlight : return list of chats and scores ")

		lexrank = LexRank()
		
		lexrank.summarize(" ".join(self.preprocessed))
		lexrank_sentences = [x.text for x in lexrank.sentences]

		scores = lexrank.sentence_score_pair()  
		
		preprocessed = self.preprocessed[:]
		preprocessed = [x.strip().strip(".").strip() for x in preprocessed]
		lex_idx = 0 
		skip_amount = 0
		jump = 0 
		for ts_sentence in self.ts.chat_to_sentence_mapping:
			ts_idx, chat_idxs, sentence = ts_sentence
			
			if lex_idx >= len(scores): break
			
			if len(sentence.strip()) == 0:
				jump += 1
			else:
				if preprocessed[lex_idx + skip_amount] != scores[lex_idx][1]:
					skip_amount += 1
				else:
					scores[lex_idx] = list(scores[lex_idx])
					scores[lex_idx][0] = lex_idx + jump + skip_amount
					scores[lex_idx].append(chat_idxs)
					lex_idx += 1
		self.highlight_lexrank = scores[:]
		print("highlight result")
		return_list = self._map_to_chat(self.highlight_lexrank)
		for chat in return_list :
			if chat[0] == 1 : 
				print(chat)
		return return_list
Example #21
def getsentence(table):
    
    conn = pymysql.connect(host='localhost',
                           user='******', 
                           password='******',
                           db='KETI',
                           use_unicode=True,
                           charset='utf8') 
    lexrank = LexRank()
    curs = conn.cursor()
        
    #sql = "SELECT id,raw,sentence_cnt FROM "+ table +" where id>179460;"
    sql = "select count(*)*0.1 from "+ table +";"
    curs.execute(sql)
    rows = curs.fetchone()
    percentile = int(rows[0])
    
    sql = "SELECT id,raw,sentence_cnt FROM "+ table +" order by sentence_cnt desc;"
    curs.execute(sql)
    rows = curs.fetchall()
    
    i = 0

    for row in rows:
        i = i + 1
        if i == percentile:
            percentile_cnt = row[2]
            break

    sql = "SELECT avg(sentence_cnt) FROM " + table + " where sentence_cnt <=" + str(percentile_cnt) + ";"
    curs.execute(sql)
    rows = curs.fetchone()
    percentile_avg = int(rows[0])

    print(i, percentile_cnt, percentile_avg)
Example #22
from __future__ import print_function
from lexrankr import LexRank
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
import sys

filePath = sys.argv[1]
probNumber = int(sys.argv[2])  # number of summary sentences to probe; argv values are strings

lexrank = LexRank()

all_txt = []
with open(filePath, "r", encoding="UTF-8") as f:
    all_txt = f.readlines()

all_string = ""
for i in all_txt:
    all_string += i
with open("sentence_set.txt", "r") as f:
    sent_analy = f.readlines()
    for i in range(len(sent_analy)):
        sent_analy[i] = sent_analy[i].split(' ')
        for j in range(len(sent_analy[i])):
            if '\n' in sent_analy[i][j]:
                sent_analy[i][j] = sent_analy[i][j].replace('\n', '')

token_all_string = pos_tag(word_tokenize(all_string))

process_str = ""
index = 0
while (len(token_all_string)):
Example #23
    def setUp(self):
        self.text = "사과 배 감 귤. 배 감 귤 수박. 감 귤 수박 딸기. 오이 참외 오징어. 참외 오징어 달팽이."
        self.lexrank = LexRank(min_keyword_length=0, no_below_word_count=0, min_cluster_size=1)
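The fixture above is only the setup half of a test case; a minimal sketch of a test method it could support, using the summarize/probe API seen in the other examples (the expected summary count is an assumption):

    def test_summarize(self):
        # Hypothetical test body: summarize the fixture text and take the top 3 sentences.
        self.lexrank.summarize(self.text)
        summaries = self.lexrank.probe(3)
        self.assertEqual(len(summaries), 3)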
Example #24
def simpleLexrank(arg):
    lexrank = LexRank()
    lexrank.summarize(arg)
    summaries = lexrank.probe(3)
    return summaries
Example #25
    def setUp(self) -> None:
        tokenizer: OktTokenizer = OktTokenizer()
        self.lexrank: LexRank = LexRank(tokenizer)
        self.text: str = "사과 배 감 귤. 배 감 귤 수박. 감 귤 수박 딸기. 오이 참외 오징어. 참외 오징어 달팽이. 빨강 파랑 초록. 파랑 초록 노랑. 노랑 노랑 빨강. 검정 파랑 빨강 초록. /"
Example #26
def summarizeLexRank(text, num=3):
    lr = LexRank()
    lr.summarize(text)
    summaries = lr.probe(num)
    return summaries
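A quick usage sketch for the helper above; the input reuses the sample text from Example #23 and relies on the default num=3:

text = "사과 배 감 귤. 배 감 귤 수박. 감 귤 수박 딸기. 오이 참외 오징어. 참외 오징어 달팽이."
for sentence in summarizeLexRank(text):
    print(sentence)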
Example #27
from flask import Flask, request, jsonify
from flask_restful import Resource, Api
from flask_restful import reqparse
from lexrankr import LexRank
from time import time

app = Flask(__name__)
api = Api(app)
app.config['JSON_AS_ASCII'] = False

lexrank = LexRank()  # can init with various settings


@app.route('/summary', methods=['POST'])
def post():
    try:
        start = time()
        parser = reqparse.RequestParser()
        parser.add_argument('contents', type=str)
        args = parser.parse_args()

        contents = args['contents']
        print("==== contents ==== : ", contents)

        print(contents)

        # lexrank.summarize(contents)
        # originalText = "이 업체 물건 절대 사지마세요. 형편없습니다. 직원들이 고객은 안중에도 없습니다. 열 받게 만드는 데 선수입니다. 박지원 직원인지 대표인지 몰라도 열받게 하는 제주가 보통이 넘습니다. 다시 한 번 강조하지만 절대 네버네버 스위트피 사지 마세요. 절대 비추입니다."
        lex = LexRank()
        lex.summarize(contents)
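The listing is cut off here; Example #1 shows the same handler in full, so it presumably continues along these lines (the __main__ block is an assumption for running the server, not part of the original):

        summaries = lex.probe(2)

        strd = summaries[1] + " " + summaries[0]

        resp = {"sum": strd}

        return jsonify(resp)
    except Exception as e:
        return {'error': str(e)}


if __name__ == '__main__':
    app.run()  # assumption: standard Flask dev-server entry point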
Example #28
    def makeEvents(self, summaries, rows, x_event):

        lexrank = LexRank()
        events = []
        events_size = 0
        for summa in summaries:

            find = False
            text = {}
            summa = str(summa)
            db_date = ""
            if (summa.find("기자]") != -1):
                summa = self.deleteNoise(summa)

            for row in rows:

                # When the summa column is NULL
                if (row[2] is None):
                    temp = "".join(row[1])
                    if (temp.find("".join(summa)) != -1):
                        find = True

                        db_raw = str(row[1])
                        db_summa = str(row[2])
                        db_title = str(row[3])
                        db_date = str(row[4])
                        db_sentenceCnt = row[6]  # keep as int; compared with summa_count below
                        db_link = str(row[7])

                        if (db_raw.find("기자]") != -1):
                            db_raw = self.deleteNoise(db_raw)
                        if (db_summa.find("기자]") != -1):
                            db_summa = self.deleteNoise(db_summa)
                        if (db_title.find("기자]") != -1):
                            db_title = self.deleteNoise(db_title)

                        text["headline"] = db_title
                        text["text"] = db_raw
                        text["text2"] = db_raw
                        if (db_sentenceCnt > self.summa_count):
                            raw = "".join(row[1])
                            lexrank.summarize(raw)
                            raw_summaries = lexrank.probe(self.summa_count)
                            text3_summa = ""

                            for raw_summa in raw_summaries:
                                text3_summa += str(raw_summa) + ". "

                            if (text3_summa.find("기자]") != -1):
                                text3_summa = self.deleteNoise(text3_summa)
                            text["text3"] = text3_summa
                        else:
                            text["text3"] = ""
                        break

                # When the summa column is not NULL
                else:
                    temp = "".join(row[2])
                    if (temp.find("".join(summa)) != -1):
                        find = True

                        db_raw = str(row[1])
                        db_summa = str(row[2])
                        db_title = str(row[3])
                        db_date = str(row[4])
                        db_sentenceCnt = row[6]  # keep as int; compared with summa_count below
                        db_link = str(row[7])

                        if (db_raw.find("기자]") != -1):
                            db_raw = self.deleteNoise(db_raw)
                        if (db_summa.find("기자]") != -1):
                            db_summa = self.deleteNoise(db_summa)
                        if (db_title.find("기자]") != -1):
                            db_title = self.deleteNoise(db_title)

                        text["headline"] = db_title
                        text["text"] = db_raw
                        text["text2"] = db_summa

                        if (db_sentenceCnt > self.summa_count):
                            raw = "".join(row[1])
                            lexrank.summarize(raw)
                            raw_summaries = lexrank.probe(self.summa_count)
                            text3_summa = ""

                            for raw_summa in raw_summaries:
                                text3_summa += str(raw_summa) + ". "

                            if (text3_summa.find("기자]") != -1):
                                text3_summa = self.deleteNoise(text3_summa)
                            text["text3"] = text3_summa
                        else:
                            text["text3"] = ""
                        break

            if find:
                event = {
                    "start_date": {
                        "year": db_date.split('-')[0],
                        "month": db_date.split('-')[1],
                        "day": db_date.split('-')[2],
                        "hour": "",
                        "minute": "",
                        "second": "",
                        "millisecond": "",
                        "format": ""
                    },
                    "group": db_date.split('-')[1],
                    "media": {
                        "caption": "",
                        "credit": "",
                        "url": db_link,
                        "thumb": ""
                    }
                }
                event["text"] = text
                events.append(event)
                events_size += 1

            if (events_size == x_event):
                break

        return events
Example #29
    def createJSON(self):

        start_time_createJSON = time.time()
        if (self.main_event > len(self.main_idx)):
            sys.exit("main_event must be smaller than main_idx")
        elif ((self.total_event - self.main_event) > len(self.outlier_idx)):
            sys.exit("total_event-main_event must be smaller than outlier_idx")

        lexrank = LexRank()  # can init with various settings
        curs = self.conn.cursor()

        ##############################################################################
        # MAKE TITLE #
        ##############################################################################
        main_outlier_text = ""

        main_outlier_idx = self.main_idx + self.outlier_idx
        main_outlier_idx_size = len(main_outlier_idx)

        tag = ""
        main_outlier_sentence_cnt = 0

        main_outlier_sql = "SELECT id,raw,summa,title,date,tag,sentence_cnt,link FROM " + self.table + " WHERE id in("

        for i in range(main_outlier_idx_size):
            if (i != (main_outlier_idx_size - 1)):
                main_outlier_sql += str(main_outlier_idx[i]) + ","
            else:
                main_outlier_sql += str(main_outlier_idx[i]) + ");"

        curs.execute(main_outlier_sql)
        main_outlier_rows = curs.fetchall()

        for row in main_outlier_rows:
            tag += "".join(row[5])
            main_outlier_sentence_cnt += row[6]
            if (row[2] is None):
                main_outlier_text += "".join(row[1])
            else:
                main_outlier_text += "".join(row[2])
        #main_outlier_tag=self.selectTag(tag)

        print(
            "===============================================================")
        print(
            "====================== START CREATE_JSON ======================")
        print(
            "===============================================================\n"
        )

        start_time = time.time()
        lexrank.summarize(main_outlier_text)
        print("first total : ")
        print("     %.02f sec" % (time.time() - start_time))

        summaries = lexrank.probe(self.total_event)

        find = 0
        db_link = ""
        main_outlier_summa = ""
        for summa in summaries:

            summa = str(summa)
            if (summa.find("기자]") != -1):
                summa = self.deleteNoise(summa)

            for row in main_outlier_rows:
                if (row[2] is None):
                    temp = "".join(row[1])
                    if (temp.find("".join(summa)) != -1):
                        db_link = str(row[7])
                        find += 1
                        main_outlier_summa += summa + ". "
                        break
                else:
                    temp = "".join(row[2])
                    if (temp.find("".join(summa)) != -1):

                        db_link = str(row[7])
                        find += 1
                        main_outlier_summa += summa + ". "
                        break

            if find == 1:
                break

        main_outlier_tag = self.selectTag(tag, main_outlier_summa)

        total_data = OrderedDict()

        total_data["scale"] = ""
        total_data["title"] = {
            "media": {
                "caption": "",
                "credit": "",
                "url": db_link,
                "thumb": ""
            },
            "text": {
                "headline": main_outlier_tag,
                "text": main_outlier_summa
            }
        }
        ##############################################################################
        # MAKE TITLE # END
        ##############################################################################

        ##############################################################################
        # MAKE EVENTS # START
        ##############################################################################
        events = []

        main_idx_size = len(self.main_idx)
        main_text = ""

        outlier_idx_size = len(self.outlier_idx)
        outlier_text = ""

        main_sql = "SELECT id,raw,summa,title,date,tag,sentence_cnt,link FROM " + self.table + " WHERE id in("
        for i in range(main_idx_size):
            if (i != (main_idx_size - 1)):
                main_sql += str(self.main_idx[i]) + ","
            else:
                main_sql += str(self.main_idx[i]) + ");"

        curs.execute(main_sql)
        main_rows = curs.fetchall()
        for row in main_rows:
            if (row[2] is None):
                main_text += "".join(row[1])
            else:
                main_text += "".join(row[2])

        start_time = time.time()
        lexrank.summarize(main_text)
        start_time = time.time()
        summaries = self.getDistinctArticleSummaries(lexrank, main_rows,
                                                     self.main_event)

        start_time = time.time()

        events += self.makeEvents(summaries, main_rows, self.main_event)

        outlier_sql = "SELECT id,raw,summa,title,date,tag,sentence_cnt,link FROM " + self.table + " WHERE id in("
        for i in range(outlier_idx_size):
            if (i != (outlier_idx_size - 1)):
                outlier_sql += str(self.outlier_idx[i]) + ","
            else:
                outlier_sql += str(self.outlier_idx[i]) + ");"

        curs.execute(outlier_sql)
        outlier_rows = curs.fetchall()
        for row in outlier_rows:
            if (row[2] is None):
                outlier_text += "".join(row[1])
            else:
                outlier_text += "".join(row[2])

        start_time = time.time()

        lexrank.summarize(outlier_text)

        summaries = self.getDistinctArticleSummaries(
            lexrank, outlier_rows, self.total_event - self.main_event)

        events += self.makeEvents(summaries, outlier_rows,
                                  self.total_event - self.main_event)

        total_data["events"] = events

        print("     <TOTAL TIME> ")
        print("     %.02f sec" % (time.time() - start_time_createJSON) + "\n")
        ##############################################################################
        # MAKE EVENTS # END
        ##############################################################################

        numTag = 3
        fileTag = ""
        for i in range(0, numTag):
            fileTag += "".join(main_outlier_tag.split(' ')[i])
            if (i != numTag - 1):
                fileTag += "-"

        file_path = "JSON/" + fileTag + "_" + str(self.total_event) + ".json"
        with codecs.open(file_path, 'w', encoding="utf-8") as make_file:
            json.dump(total_data,
                      make_file,
                      ensure_ascii=False,
                      indent=4,
                      sort_keys=False)
Example #30
    def insertSumma(self):

        lexrank = LexRank()
        curs = self.conn.cursor()

        #1. get the number of rows above the 90th percentile cutoff
        #you can change the percent value to 80, 70, etc.
        percent = 90
        sql = "select count(*)*" + str(1 - percent *
                                       0.01) + " from " + self.table + ";"
        curs.execute(sql)
        rows = curs.fetchone()
        percentile = int(rows[0])

        #2. get the sentence count located at that percentile
        sql = "SELECT id,raw,sentence_cnt FROM " + self.table + " order by sentence_cnt desc;"
        curs.execute(sql)
        rows = curs.fetchall()

        i = 0
        for row in rows:
            i = i + 1
            if i == percentile:
                percentile_cnt = row[2]
                break

        #3. get the average sentence count at or below that percentile cutoff
        sql = "SELECT avg(sentence_cnt) FROM " + self.table + " where sentence_cnt <=" + str(
            percentile_cnt) + ";"
        curs.execute(sql)
        rows = curs.fetchone()
        percentile_avg = int(rows[0])

        #the minimum average value is 6
        if percentile_avg < 6:
            percentile_avg = 6

        #4. get all table rows whose sentence count exceeds half the average
        summa_total = ""
        sql = "SELECT id,raw,sentence_cnt FROM KETI." + self.table + " WHERE sentence_cnt > " + str(
            int(percentile_avg / 2)) + " ORDER BY id ASC;"
        curs.execute(sql)
        rows = curs.fetchall()

        #5. read the raw text and sentence count of each row
        # and update the summa column using the thresholds derived above

        for row in rows:
            raw = row[1]
            sentence_cnt = row[2]
            text = "".join(raw)

            if (lexrank.summarize(text) == 0):
                continue

            if (sentence_cnt <= percentile_cnt):
                summaries = lexrank.probe(int(percentile_avg / 2))
            else:
                summaries = lexrank.probe(percentile_avg)

            for summa in summaries:
                summa_total += "".join(summa)
                summa_total += ". "

            # update once per article, after all summary sentences are joined
            curs.execute(
                "UPDATE " + self.table + " SET summa=%s WHERE id=%s;",
                (summa_total, row[0]))
            self.conn.commit()

            summa_total = ""