def post():
    try:
        start = time()
        parser = reqparse.RequestParser()
        parser.add_argument('contents', type=str)
        args = parser.parse_args()
        contents = args['contents']
        print("==== contents ==== : ", contents)
        # lexrank.summarize(contents)
        # originalText = "이 업체 물건 절대 사지마세요. 형편없습니다. 직원들이 고객은 안중에도 없습니다. 열 받게 만드는 데 선수입니다. 박지원 직원인지 대표인지 몰라도 열받게 하는 제주가 보통이 넘습니다. 다시 한 번 강조하지만 절대 네버네버 스위트피 사지 마세요. 절대 비추입니다."
        lex = LexRank()
        lex.summarize(contents)
        summaries = lex.probe(2)
        print(summaries)
        strd = summaries[1] + " " + summaries[0]
        resp = {"sum": strd}
        return jsonify(resp)
    except Exception as e:
        return {'error': str(e)}
def lexrank(rst):
    print("=" * 20)
    print(len(rst))
    print(rst[0])
    print("=" * 20)
    lexInputText = ""
    hangul = re.compile('[^ ㄱ-ㅣ가-힣]+')  # strip everything except Hangul and spaces
    for news in rst:
        lexInputText += str(news['id'])
        parse = hangul.sub('', news['title'])
        parse = re.sub(' +', ' ', parse).strip()
        parse = re.sub(r'\.', '', parse)
        lexInputText += parse
        parse = hangul.sub('', news['content'])
        parse = re.sub(' +', ' ', parse).strip()
        parse = re.sub(r'\.', '', parse)
        lexInputText += parse
        lexInputText += ". "
    print("=" * 10 + "LEXRANK" + "=" * 10)
    lexrank = LexRank()
    lexrank.summarize(lexInputText)
    summaries = lexrank.probe(10)
    # each article was prefixed with its numeric id, so recover the ids
    # of the top-ranked articles from the summary sentences
    searchId = re.compile('[0-9]{5,6}')
    idList = []
    for summary in summaries:
        idList.append(searchId.search(summary).group())
    return idList
def post(self):
    import jpype
    if jpype.isJVMStarted():
        jpype.attachThreadToJVM()
    result = []
    corpus = self.params['corpus']
    if corpus:
        result_count = self.params['result_count']
        data_save = self.params['data_save']
        if data_save.upper() == 'Y':
            self.getDataApi().WordPool.add_word_pool(corpus)
        config = {
            # 'useful_tags': ['Noun', 'Verb', 'Adjective', 'Determiner', 'Adverb', 'Conjunction', 'Josa', 'PreEomi',
            #                 'Eomi', 'Suffix', 'Alpha', 'Number'],
            'useful_tags': ['Noun', 'ProperNoun'],
            'min_token_length': 5
        }
        lexRank = LexRank(**config)
        lexRank.summarize(corpus)
        result_count = min(result_count, lexRank.num_sentences - 1)
        if result_count == 0:
            result_count = 1
        result = lexRank.probe(result_count)
    return result
def insertSumma(self): lexrank = LexRank() curs = self.conn.cursor() summa_total = "" sql = "SELECT id,raw,sentence_cnt FROM " + self.table + " ORDER BY id ASC limit 3;" curs.execute(sql) rows = curs.fetchall() for row in rows: print("insert summa to id=" + str(row[0])) text = "".join(row[1]) if (lexrank.summarize(text) == 0): continue if (row[2] <= 20): summaries = lexrank.probe(5) else: summaries = lexrank.probe(10) for summa in summaries: summa_total += "".join(summa) summa_total += ". " print(" summa = " + summa_total + "\n") curs.execute("UPDATE " + self.table + " SET summa=%s WHERE id=%s;", (summa_total, row[0])) self.conn.commit() summa_total = ""
def insertSumma(self): lexrank = LexRank() curs = self.conn.cursor() summa_total = "" sql = "SELECT id,raw,sentence_cnt FROM " + self.table + " WHERE sentence_cnt > 5 ORDER BY id ASC;" curs.execute(sql) rows = curs.fetchall() for row in rows: raw = row[1] sentence_cnt = row[2] text = "".join(raw) if (lexrank.summarize(text) == 0): continue if (sentence_cnt <= 20): summaries = lexrank.probe(5) else: summaries = lexrank.probe(10) for summa in summaries: summa_total += "".join(summa) summa_total += ". " curs.execute("UPDATE " + self.table + " SET summa=%s WHERE id=%s;", (summa_total, row[0])) self.conn.commit() summa_total = ""
def summarize_text_with_lexrank(text):
    lexrank = LexRank(n_clusters=1)
    lexrank.summarize(text)
    summaries = lexrank.probe(3)
    if len(summaries) == 0:
        return ""
    return ". ".join(summaries)
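# A minimal usage sketch for the helper above, reusing the sentence fixture
# from the test snippets in this collection; any period-delimited text works.
if __name__ == "__main__":
    sample = "사과 배 감 귤. 배 감 귤 수박. 감 귤 수박 딸기. 오이 참외 오징어. 참외 오징어 달팽이."
    print(summarize_text_with_lexrank(sample))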
def summarize(self, n_summary):
    print("summarize in %d sentences" % n_summary)
    lexrank = LexRank()
    lexrank.summarize(" ".join(self.preprocessed))
    summaries = lexrank.probe(n_summary)  # `num_summaries` can be `None` (using auto-detected topics)
    for summary in summaries:
        print(summary)
    self.summaries = summaries
    return summaries
def sum_talker(self, cluster):
    per_talker_dict = {}
    lexrank = LexRank(clustering=None)
    for sen in cluster.sentences:
        if sen.talker not in per_talker_dict:
            per_talker_dict[sen.talker] = ""
        per_talker_dict[sen.talker] += sen.text + "\n"
    for k, v in per_talker_dict.items():
        lexrank.summarize(v)
        self.sum_per_talker[k].append(lexrank.probe(1))
def sum_cluster(self):
    self.clusters = self.clustering()
    lexrank = LexRank(clustering=None)
    self.summaries = []
    for c in self.clusters:
        lexrank.summarize(c.sen2txt())
        for i, s in enumerate(c.sentences):
            print(i, s.text)
        if len(c.sentences) < 20:
            self.summaries.append(lexrank.probe(0.3))
        else:
            self.summaries.append(lexrank.probe(0.2))
        self.sum_talker(c)  # use the current cluster to summarize per talker
def wordRank(text):
    text = text.replace('\n', '.')
    # keep Hangul (가-힣), alphanumerics, whitespace, and periods
    new = re.sub(r'[^가-힣0-9a-zA-Z\s.]', '', text)
    lexrank = LexRank()
    lexrank.summarize(new)
    summaries = lexrank.probe(3)
    word = Twitter()  # konlpy's Twitter (Okt) tagger
    out = []
    for summary in summaries:
        out += word.nouns(summary)
    out = list(set(out))
    print(out)
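# Illustrative call for wordRank(); it prints the de-duplicated nouns of the
# top-3 LexRank sentences. konlpy (with a JVM) is required for Twitter().nouns(),
# and the newline-separated sample text is made up.
wordRank("사과를 먹었다.\n배를 먹었다.\n감을 먹었다.\n귤을 먹었다.")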
def insertSentenceCount(self):
    lexrank = LexRank()
    curs = self.conn.cursor()
    sql = "SELECT id,raw FROM " + self.table + " ORDER BY id ASC;"
    curs.execute(sql)
    rows = curs.fetchall()
    for row in rows:
        text = "".join(row[1])
        sc = lexrank.factory.text2sentences(text)
        print("insert sentence_cnt to id=" + str(row[0]))
        print("sc=" + str(len(sc)) + "\n")
        curs.execute("UPDATE " + self.table + " SET sentence_cnt=%s WHERE id=%s;", (str(len(sc)), str(row[0])))
        self.conn.commit()
def videoSumalyze(pk):
    post = get_object_or_404(VideoPost, pk=pk)
    lang = 'en-US'
    if post.lang == '한국어':  # '한국어' means Korean
        lang = 'ko-KR'
    from lexrankr import LexRank
    lexrank = LexRank()
    chunk = []
    chunk2 = []
    url = post.url
    path = youtubeToAudio(url)
    # post.title = re.sub(r'[^가-힣\s]', '', str(path))
    chunk = splitandSTT(path, lang)
    text = " ".join(chunk)
    os.remove(path + '.mp3')
    os.remove(path)
    idxToDB = ''
    # apply summarization to each chunk
    idx = 0
    while idx != len(chunk):
        try:
            lexrank.summarize(chunk[idx])
            summaries = lexrank.probe(3)
        except Exception:
            idx += 1
            continue
        summaries[0] = summaries[0] + '. '
        summaries[1] = summaries[1] + '. '
        summaries[2] = summaries[2] + '. '
        indexStr = ''.join(summaries)
        chunk[idx] = indexStr
        idxToDB += ibmIndex(indexStr, summaries)
        idxToDB += '#'
        chunk2.append(chunk[idx])
        idx += 1
    post.index = idxToDB
    chunk = []
    chunkToDB = ''
    for c in chunk2:
        chunkToDB += c + '\n'
    post.content = chunkToDB
    # run IBM Natural Language Understanding on the original text, not the summary
    post.keyword, post.relevance, post.category_ibm = ibmContent(text)
    post.save()
def lexlank_function(result):
    # Reference: https://wikidocs.net/72820
    # LexRank: https://github.com/theeluwin/lexrankr
    try:
        lexrank = LexRank()
        lexrank.summarize(result)
        summarize_data = []
        print("Summarizing...")
        summaries = lexrank.probe(10)
        for i, summary in enumerate(summaries):
            summarize_data.append(summary)
        return summarize_data
    except Exception:
        print("Not enough content to summarize.")
        return []
def summarize(self):
    lex = LexRank()
    lex.summarize(self.paragraph)
    # scale the number of summary sentences with the paragraph length
    if len(self.paragraph) < 100:
        self.probe_num = 1
    elif len(self.paragraph) < 200:
        self.probe_num = 2
    elif len(self.paragraph) < 300:
        self.probe_num = 3
    elif len(self.paragraph) < 400:
        self.probe_num = 4
    else:
        self.probe_num = 5
    summaries = lex.probe(self.probe_num)
    return summaries
def imageSumalyze(pk):
    post = get_object_or_404(ImagePost, pk=pk)
    from lexrankr import LexRank
    lexrank = LexRank()
    path = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + '\\media\\' + str(post.pdf)
    text = img2Text(path)
    os.remove(path)
    chunk = []
    if len(text) < 650:
        chunk.append(text)
    else:
        # split into 650-character chunks; a trailing remainder shorter than 650 is dropped
        chunk = list(map(''.join, zip(*[iter(text)] * 650)))
    chunk2 = []
    idxToDB = ''
    # apply summarization to each chunk
    idx = 0
    while idx != len(chunk):
        try:
            lexrank.summarize(chunk[idx])
            summaries = lexrank.probe(3)
        except Exception:
            idx += 1
            continue
        summaries[0] = summaries[0] + '. '
        summaries[1] = summaries[1] + '. '
        summaries[2] = summaries[2] + '. '
        indexStr = ''.join(summaries)
        chunk[idx] = indexStr
        idxToDB += ibmIndex(indexStr, summaries)
        idxToDB += '#'
        chunk2.append(chunk[idx])
        idx += 1
    post.index = idxToDB
    chunk = []
    chunkToDB = ''
    for c in chunk2:
        chunkToDB += c + '\n'
    post.content = chunkToDB
    post.pdf = None
    post.index = idxToDB
    # run IBM Natural Language Understanding on the original text, not the summary
    post.keyword, post.relevance, post.category_ibm = ibmContent(text)
    post.save()
def smry(q):
    global count
    while True:
        try:
            print("smry start")
            data = q.get()
            count += 1
            # print("get{}!".format(count))
            lexrank = LexRank()
            lexrank.summarize(data[4])  # data[4] holds the article body
            summaries = lexrank.probe(3)  # 3-sentence summary; `summaries` is a list
            data[4] = '. '.join(summaries) + '.'  # re-join the summary sentences and store them back
            print(data)  # final record to be written to the DB
            db_store(data)
            # for summary in summaries:
            #     print(summary)
        except (IndexError, ValueError, AttributeError):
            pass
def audioSumalyze(pk):
    post = get_object_or_404(AudioPost, pk=pk)
    lang = 'en-US'
    if post.lang == '한국어':  # '한국어' means Korean
        lang = 'ko-KR'
    from lexrankr import LexRank
    lexrank = LexRank()
    chunk2 = []
    chunk = []
    # speechtotext(str(post.pdf), lang, chunk)
    chunk = splitandSTT(path + str(post.pdf), lang)  # `path` is assumed to be a module-level media directory
    text = " ".join(chunk)
    idxToDB = ''
    # apply summarization to each chunk
    idx = 0
    while idx != len(chunk):
        try:
            lexrank.summarize(chunk[idx])
            summaries = lexrank.probe(3)
        except Exception:
            idx += 1
            continue
        summaries[0] = summaries[0] + '. '
        summaries[1] = summaries[1] + '. '
        summaries[2] = summaries[2] + '. '
        indexStr = ''.join(summaries)
        chunk[idx] = indexStr
        idxToDB += ibmIndex(indexStr, summaries)
        idxToDB += '#'
        chunk2.append(chunk[idx])
        idx += 1
    post.index = idxToDB
    chunk = []
    chunkToDB = ''
    for c in chunk2:
        chunkToDB += c + '\n'
    post.content = chunkToDB
    # run IBM Natural Language Understanding on the original text, not the summary
    post.keyword, post.relevance, post.category_ibm = ibmContent(text)
    post.save()
def textSumalyze(pk):
    post = get_object_or_404(TextPost, pk=pk)
    from lexrankr import LexRank
    lexrank = LexRank()
    text = post.text
    chunk = []
    if len(text) < 650:
        chunk.append(text)
    else:
        # split into 650-character chunks; a trailing remainder shorter than 650 is dropped
        chunk = list(map(''.join, zip(*[iter(text)] * 650)))
    chunk2 = []
    idxToDB = ''
    # apply summarization to each chunk
    idx = 0
    while idx != len(chunk):
        try:
            lexrank.summarize(chunk[idx])
            summaries = lexrank.probe(3)
        except Exception:
            idx += 1
            continue
        summaries[0] = summaries[0] + '. '
        summaries[1] = summaries[1] + '. '
        summaries[2] = summaries[2] + '. '
        indexStr = ''.join(summaries)
        chunk[idx] = indexStr
        idxToDB += ibmIndex(indexStr, summaries)
        idxToDB += '#'
        chunk2.append(chunk[idx])
        idx += 1
    post.index = idxToDB
    chunk = []
    chunkToDB = ''
    for c in chunk2:
        chunkToDB += c + '\n'
    post.content = chunkToDB
    post.text = 'clear'
    # run IBM Natural Language Understanding on the original text, not the summary
    post.keyword, post.relevance, post.category_ibm = ibmContent(text)
    post.save()
def lexranker(text, code, date):
    text = text.replace('\n', '.')
    # keep Hangul (가-힣), alphanumerics, whitespace, and periods
    text2 = re.sub(r'[^가-힣0-9a-zA-Z\s.]', '', text)
    lexrank = LexRank()
    # print(text2)
    lexrank.summarize(text2)
    summaries = lexrank.probe(3)
    word = Twitter()
    out = []
    print(summaries)
    for summary in summaries:
        out += word.nouns(summary)
    word = list(set(out))
    share = Share(code)
    startprice = share.get_open()
    endprice = share.get_price()
    for part in word:
        save_record(part, code, startprice, endprice, date)
def highlight(self, threshold=0.5):
    print("highlight : return list of chats and scores")
    lexrank = LexRank()
    lexrank.summarize(" ".join(self.preprocessed))
    lexrank_sentences = [x.text for x in lexrank.sentences]
    scores = lexrank.sentence_score_pair()
    preprocessed = self.preprocessed[:]
    preprocessed = [x.strip().strip(".").strip() for x in preprocessed]
    lex_idx = 0
    skip_amount = 0
    jump = 0
    for ts_sentence in self.ts.chat_to_sentence_mapping:
        ts_idx, chat_idxs, sentence = ts_sentence
        if lex_idx >= len(scores):
            break
        if len(sentence.strip()) == 0:
            jump += 1
        else:
            if preprocessed[lex_idx + skip_amount] != scores[lex_idx][1]:
                skip_amount += 1
            else:
                scores[lex_idx] = list(scores[lex_idx])
                scores[lex_idx][0] = lex_idx + jump + skip_amount
                scores[lex_idx].append(chat_idxs)
                lex_idx += 1
    self.highlight_lexrank = scores[:]
    print("highlight result")
    return_list = self._map_to_chat(self.highlight_lexrank)
    for chat in return_list:
        if chat[0] == 1:
            print(chat)
    return return_list
def getsentence(table):
    conn = pymysql.connect(host='localhost', user='******', password='******',
                           db='KETI', use_unicode=True, charset='utf8')
    lexrank = LexRank()
    curs = conn.cursor()
    # sql = "SELECT id,raw,sentence_cnt FROM " + table + " where id>179460;"
    sql = "select count(*)*0.1 from " + table + ";"
    curs.execute(sql)
    rows = curs.fetchone()
    percentile = int(rows[0])
    sql = "SELECT id,raw,sentence_cnt FROM " + table + " order by sentence_cnt desc;"
    curs.execute(sql)
    rows = curs.fetchall()
    i = 0
    for row in rows:
        i = i + 1
        if i == percentile:
            percentile_cnt = row[2]
            break
    sql = "SELECT avg(sentence_cnt) FROM " + table + " where sentence_cnt <=" + str(percentile_cnt) + ";"
    curs.execute(sql)
    rows = curs.fetchone()
    percentile_avg = int(rows[0])
    print(i, percentile_cnt, percentile_avg)
from __future__ import print_function

from lexrankr import LexRank
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
import sys

filePath = sys.argv[1]
probNumber = sys.argv[2]
lexrank = LexRank()
all_txt = []
with open(filePath, "r", encoding="UTF-8") as f:
    all_txt = f.readlines()
all_string = ""
for i in all_txt:
    all_string += i
with open("sentence_set.txt", "r") as f:
    sent_analy = f.readlines()
for i in range(len(sent_analy)):
    sent_analy[i] = sent_analy[i].split(' ')
    for j in range(len(sent_analy[i])):
        if '\n' in sent_analy[i][j]:
            sent_analy[i][j] = sent_analy[i][j].replace('\n', '')
token_all_string = pos_tag(word_tokenize(all_string))
process_str = ""
index = 0
while (len(token_all_string)):
def setUp(self):
    self.text = "사과 배 감 귤. 배 감 귤 수박. 감 귤 수박 딸기. 오이 참외 오징어. 참외 오징어 달팽이."
    self.lexrank = LexRank(min_keyword_length=0, no_below_word_count=0, min_cluster_size=1)
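# A hedged sketch of a test method that could follow the setUp above; the
# method name and assertion are illustrative, not from the original suite.
def test_probe(self):
    self.lexrank.summarize(self.text)
    summaries = self.lexrank.probe(2)
    # probe(k) yields at most k summary sentences
    self.assertTrue(0 < len(summaries) <= 2)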
def simpleLexrank(arg):
    lexrank = LexRank()
    lexrank.summarize(arg)
    summaries = lexrank.probe(3)
    return summaries
def setUp(self) -> None:
    tokenizer: OktTokenizer = OktTokenizer()
    self.lexrank: LexRank = LexRank(tokenizer)
    self.text: str = "사과 배 감 귤. 배 감 귤 수박. 감 귤 수박 딸기. 오이 참외 오징어. 참외 오징어 달팽이. 빨강 파랑 초록. 파랑 초록 노랑. 노랑 노랑 빨강. 검정 파랑 빨강 초록. /"
def summarizeLexRank(text, num=3):
    lr = LexRank()
    lr.summarize(text)
    summaries = lr.probe(num)
    return summaries
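# Usage sketch for summarizeLexRank(); the Korean sample text is hypothetical,
# and num=1 asks probe() for a single-sentence digest.
digest = summarizeLexRank("오늘 날씨가 맑다. 내일은 비가 온다. 주말에는 눈이 내린다. 다음 주는 춥다.", num=1)
print(digest)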
from flask import Flask, request, jsonify
from flask_restful import Resource, Api
from flask_restful import reqparse
from lexrankr import LexRank
from time import time

app = Flask(__name__)
api = Api(app)
app.config['JSON_AS_ASCII'] = False

lexrank = LexRank()  # can init with various settings


@app.route('/summary', methods=['POST'])
def post():
    try:
        start = time()
        parser = reqparse.RequestParser()
        parser.add_argument('contents', type=str)
        args = parser.parse_args()
        contents = args['contents']
        print("==== contents ==== : ", contents)
        # lexrank.summarize(contents)
        # originalText = "이 업체 물건 절대 사지마세요. 형편없습니다. 직원들이 고객은 안중에도 없습니다. 열 받게 만드는 데 선수입니다. 박지원 직원인지 대표인지 몰라도 열받게 하는 제주가 보통이 넘습니다. 다시 한 번 강조하지만 절대 네버네버 스위트피 사지 마세요. 절대 비추입니다."
        lex = LexRank()
        lex.summarize(contents)
        summaries = lex.probe(2)
        strd = summaries[1] + " " + summaries[0]
        resp = {"sum": strd}
        return jsonify(resp)
    except Exception as e:
        return {'error': str(e)}
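# Hedged client-side sketch for the /summary endpoint above, assuming the
# Flask app runs on localhost:5000 (Flask's default); the review text reuses
# the sample commented out in the handler.
import requests

resp = requests.post(
    'http://localhost:5000/summary',
    data={'contents': '이 업체 물건 절대 사지마세요. 형편없습니다. 직원들이 고객은 안중에도 없습니다. 절대 비추입니다.'},
)
print(resp.json())  # {"sum": "..."} on success, {"error": "..."} on failure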
def makeEvents(self, summaries, rows, x_event):
    lexrank = LexRank()
    events = []
    events_size = 0
    for summa in summaries:
        find = False
        text = {}
        summa = str(summa)
        db_date = ""
        if summa.find("기자]") != -1:  # strip reporter-byline noise
            summa = self.deleteNoise(summa)
        for row in rows:
            # when the summa column is empty, match against the raw article
            if row[2] is None:
                temp = "".join(row[1])
                if temp.find("".join(summa)) != -1:
                    find = True
                    db_raw = str(row[1])
                    db_summa = str(row[2])
                    db_title = str(row[3])
                    db_date = str(row[4])
                    db_sentenceCnt = row[6]
                    db_link = str(row[7])
                    if db_raw.find("기자]") != -1:
                        db_raw = self.deleteNoise(db_raw)
                    if db_summa.find("기자]") != -1:
                        db_summa = self.deleteNoise(db_summa)
                    if db_title.find("기자]") != -1:
                        db_title = self.deleteNoise(db_title)
                    text["headline"] = db_title
                    text["text"] = db_raw
                    text["text2"] = db_raw
                    if db_sentenceCnt > self.summa_count:
                        raw = "".join(row[1])
                        lexrank.summarize(raw)
                        raw_summaries = lexrank.probe(self.summa_count)
                        text3_summa = ""
                        for raw_summa in raw_summaries:
                            text3_summa += str(raw_summa) + ". "
                        if text3_summa.find("기자]") != -1:
                            text3_summa = self.deleteNoise(text3_summa)
                        text["text3"] = text3_summa
                    else:
                        text["text3"] = ""
                    break
            # otherwise match against the stored summary
            else:
                temp = "".join(row[2])
                if temp.find("".join(summa)) != -1:
                    find = True
                    db_raw = str(row[1])
                    db_summa = str(row[2])
                    db_title = str(row[3])
                    db_date = str(row[4])
                    db_sentenceCnt = row[6]
                    db_link = str(row[7])
                    if db_raw.find("기자]") != -1:
                        db_raw = self.deleteNoise(db_raw)
                    if db_summa.find("기자]") != -1:
                        db_summa = self.deleteNoise(db_summa)
                    if db_title.find("기자]") != -1:
                        db_title = self.deleteNoise(db_title)
                    text["headline"] = db_title
                    text["text"] = db_raw
                    text["text2"] = db_summa
                    if db_sentenceCnt > self.summa_count:
                        raw = "".join(row[1])
                        lexrank.summarize(raw)
                        raw_summaries = lexrank.probe(self.summa_count)
                        text3_summa = ""
                        for raw_summa in raw_summaries:
                            text3_summa += str(raw_summa) + ". "
                        if text3_summa.find("기자]") != -1:
                            text3_summa = self.deleteNoise(text3_summa)
                        text["text3"] = text3_summa
                    else:
                        text["text3"] = ""
                    break
        if find:
            event = {
                "start_date": {
                    "year": db_date.split('-')[0],
                    "month": db_date.split('-')[1],
                    "day": db_date.split('-')[2],
                    "hour": "",
                    "minute": "",
                    "second": "",
                    "millisecond": "",
                    "format": ""
                },
                "group": db_date.split('-')[1],
                "media": {
                    "caption": "",
                    "credit": "",
                    "url": db_link,
                    "thumb": ""
                }
            }
            event["text"] = text
            events.append(event)
            events_size += 1
            if events_size == x_event:
                break
    return events
def createJSON(self):
    start_time_createJSON = time.time()
    if self.main_event > len(self.main_idx):
        sys.exit("main_event must be smaller than main_idx")
    elif (self.total_event - self.main_event) > len(self.outlier_idx):
        sys.exit("total_event-main_event must be smaller than outlier_idx")
    lexrank = LexRank()  # can init with various settings
    curs = self.conn.cursor()
    ##############################################################################
    #                              MAKE TITLE : START                            #
    ##############################################################################
    main_outlier_text = ""
    main_outlier_idx = self.main_idx + self.outlier_idx
    main_outlier_idx_size = len(main_outlier_idx)
    tag = ""
    main_outlier_sentence_cnt = 0
    main_outlier_sql = "SELECT id,raw,summa,title,date,tag,sentence_cnt,link FROM " + self.table + " WHERE id in("
    for i in range(main_outlier_idx_size):
        if i != (main_outlier_idx_size - 1):
            main_outlier_sql += str(main_outlier_idx[i]) + ","
        else:
            main_outlier_sql += str(main_outlier_idx[i]) + ");"
    curs.execute(main_outlier_sql)
    main_outlier_rows = curs.fetchall()
    for row in main_outlier_rows:
        tag += "".join(row[5])
        main_outlier_sentence_cnt += row[6]
        if row[2] is None:
            main_outlier_text += "".join(row[1])
        else:
            main_outlier_text += "".join(row[2])
    # main_outlier_tag = self.selectTag(tag)
    print("===============================================================")
    print("====================== START CREATE_JSON ======================")
    print("===============================================================\n")
    start_time = time.time()
    lexrank.summarize(main_outlier_text)
    print("first total : ")
    print(" %.02f sec" % (time.time() - start_time))
    summaries = lexrank.probe(self.total_event)
    find = 0
    db_link = ""
    main_outlier_summa = ""
    for summa in summaries:
        summa = str(summa)
        if summa.find("기자]") != -1:
            summa = self.deleteNoise(summa)
        for row in main_outlier_rows:
            if row[2] is None:
                temp = "".join(row[1])
                if temp.find("".join(summa)) != -1:
                    db_link = str(row[7])
                    find += 1
                    main_outlier_summa += summa + ". "
                    break
            else:
                temp = "".join(row[2])
                if temp.find("".join(summa)) != -1:
                    db_link = str(row[7])
                    find += 1
                    main_outlier_summa += summa + ". "
                    break
        if find == 1:
            break
    main_outlier_tag = self.selectTag(tag, main_outlier_summa)
    total_data = OrderedDict()
    total_data["scale"] = ""
    total_data["title"] = {
        "media": {
            "caption": "",
            "credit": "",
            "url": db_link,
            "thumb": ""
        },
        "text": {
            "headline": main_outlier_tag,
            "text": main_outlier_summa
        }
    }
    ##############################################################################
    #                               MAKE TITLE : END                             #
    ##############################################################################
    ##############################################################################
    #                             MAKE EVENTS : START                            #
    ##############################################################################
    events = []
    main_idx_size = len(self.main_idx)
    main_text = ""
    outlier_idx_size = len(self.outlier_idx)
    outlier_text = ""
    main_sql = "SELECT id,raw,summa,title,date,tag,sentence_cnt,link FROM " + self.table + " WHERE id in("
    for i in range(main_idx_size):
        if i != (main_idx_size - 1):
            main_sql += str(self.main_idx[i]) + ","
        else:
            main_sql += str(self.main_idx[i]) + ");"
    curs.execute(main_sql)
    main_rows = curs.fetchall()
    for row in main_rows:
        if row[2] is None:
            main_text += "".join(row[1])
        else:
            main_text += "".join(row[2])
    start_time = time.time()
    lexrank.summarize(main_text)
    summaries = self.getDistinctArticleSummaries(lexrank, main_rows, self.main_event)
    events += self.makeEvents(summaries, main_rows, self.main_event)
    outlier_sql = "SELECT id,raw,summa,title,date,tag,sentence_cnt,link FROM " + self.table + " WHERE id in("
    for i in range(outlier_idx_size):
        if i != (outlier_idx_size - 1):
            outlier_sql += str(self.outlier_idx[i]) + ","
        else:
            outlier_sql += str(self.outlier_idx[i]) + ");"
    curs.execute(outlier_sql)
    outlier_rows = curs.fetchall()
    for row in outlier_rows:
        if row[2] is None:
            outlier_text += "".join(row[1])
        else:
            outlier_text += "".join(row[2])
    start_time = time.time()
    lexrank.summarize(outlier_text)
    summaries = self.getDistinctArticleSummaries(lexrank, outlier_rows, self.total_event - self.main_event)
    events += self.makeEvents(summaries, outlier_rows, self.total_event - self.main_event)
    total_data["events"] = events
    print(" <TOTAL TIME> ")
    print(" %.02f sec" % (time.time() - start_time_createJSON) + "\n")
    ##############################################################################
    #                              MAKE EVENTS : END                             #
    ##############################################################################
    numTag = 3
    fileTag = ""
    for i in range(0, numTag):
        fileTag += "".join(main_outlier_tag.split(' ')[i])
        if i != numTag - 1:
            fileTag += "-"
    file_path = "JSON/" + fileTag + "_" + str(self.total_event) + ".json"
    with codecs.open(file_path, 'w', encoding="utf-8") as make_file:
        json.dump(total_data, make_file, ensure_ascii=False, indent=4, sort_keys=False)
def insertSumma(self):
    lexrank = LexRank()
    curs = self.conn.cursor()
    # 1. get the row-count cutoff for the top slice of the table
    #    (you can change the percent value to 80, 70, ...)
    percent = 90
    sql = "select count(*)*" + str(1 - percent * 0.01) + " from " + self.table + ";"
    curs.execute(sql)
    rows = curs.fetchone()
    percentile = int(rows[0])
    # 2. get the sentence count located at that percentile
    sql = "SELECT id,raw,sentence_cnt FROM " + self.table + " order by sentence_cnt desc;"
    curs.execute(sql)
    rows = curs.fetchall()
    i = 0
    for row in rows:
        i = i + 1
        if i == percentile:
            percentile_cnt = row[2]
            break
    # 3. get the average sentence count within that percentile
    sql = "SELECT avg(sentence_cnt) FROM " + self.table + " where sentence_cnt <=" + str(percentile_cnt) + ";"
    curs.execute(sql)
    rows = curs.fetchone()
    percentile_avg = int(rows[0])
    # the minimum average value is 6
    if percentile_avg < 6:
        percentile_avg = 6
    # 4. fetch every row except those with fewer than half the average sentence count
    summa_total = ""
    sql = "SELECT id,raw,sentence_cnt FROM KETI." + self.table + " WHERE sentence_cnt > " + str(int(percentile_avg / 2)) + " ORDER BY id ASC;"
    curs.execute(sql)
    rows = curs.fetchall()
    # 5. read raw text and sentence count from each row
    #    and update the summa column using the rule above
    for row in rows:
        raw = row[1]
        sentence_cnt = row[2]
        text = "".join(raw)
        if lexrank.summarize(text) == 0:
            continue
        if sentence_cnt <= percentile_cnt:
            summaries = lexrank.probe(int(percentile_avg / 2))
        else:
            summaries = lexrank.probe(percentile_avg)
        for summa in summaries:
            summa_total += "".join(summa)
            summa_total += ". "
        curs.execute("UPDATE " + self.table + " SET summa=%s WHERE id=%s;", (summa_total, row[0]))
        self.conn.commit()
        summa_total = ""