def insertSumma(self):
    """Summarize every article with more than 5 sentences and store the result.

    Reads (id, raw, sentence_cnt) rows from self.table, runs LexRank on the
    raw text, and writes the joined summary back into the row's `summa` column.
    """
    lexrank = LexRank()
    cursor = self.conn.cursor()
    # NOTE(review): table name is interpolated into the SQL — assumed trusted.
    cursor.execute("SELECT id,raw,sentence_cnt FROM " + self.table + " WHERE sentence_cnt > 5 ORDER BY id ASC;")
    for article_id, raw, sentence_cnt in cursor.fetchall():
        document = "".join(raw)
        # summarize() returning 0 signals the text could not be summarized.
        if lexrank.summarize(document) == 0:
            continue
        # Short articles get a 5-sentence digest, longer ones 10.
        probe_count = 5 if sentence_cnt <= 20 else 10
        pieces = []
        for sentence in lexrank.probe(probe_count):
            pieces.append("".join(sentence) + ". ")
        summary_text = "".join(pieces)
        cursor.execute("UPDATE " + self.table + " SET summa=%s WHERE id=%s;",
                       (summary_text, article_id))
        self.conn.commit()
def post(self):
    """Summarize self.params['corpus'] into up to result_count sentences.

    Optionally records the corpus in the word pool when data_save == 'Y'.
    Returns a (possibly empty) list of summary sentences.
    """
    import jpype
    # Konlpy taggers run on the JVM; attach this worker thread if it is up.
    if jpype.isJVMStarted():
        jpype.attachThreadToJVM()
    corpus = self.params['corpus']
    if not corpus:
        return []
    if self.params['data_save'].upper() == 'Y':
        self.getDataApi().WordPool.add_word_pool(corpus)
    # Only nouns / proper nouns of length >= 5 participate in the ranking.
    summarizer = LexRank(useful_tags=['Noun', 'ProperNoun'], min_token_length=5)
    summarizer.summarize(corpus)
    # Never ask for more sentences than the corpus has; at least one.
    count = min(self.params['result_count'], summarizer.num_sentences - 1)
    if count == 0:
        count = 1
    return summarizer.probe(count)
def lexrank(rst):
    """Summarize a list of news dicts and return the ids of the top articles.

    Each item in `rst` must have 'id', 'title' and 'content' keys.  The id is
    prepended to each article's cleaned text so it can be recovered from the
    summary sentences afterwards via a 5-6 digit regex.
    """
    hangul = re.compile('[^ ㄱ-ㅣ가-힣]+')

    def _clean(raw):
        # Keep Hangul only, collapse runs of spaces, and drop periods so the
        # article body does not introduce extra sentence boundaries.
        parsed = hangul.sub('', raw)
        parsed = re.sub(' +', ' ', parsed).strip()
        return re.sub(r'\.', '', parsed)

    print("=" * 20)
    print(len(rst))
    print(rst[0])
    print("=" * 20)
    lexInputText = ""
    for news in rst:
        lexInputText += str(news['id'])
        lexInputText += _clean(news['title'])
        lexInputText += _clean(news['content'])
        lexInputText += ". "
    print("=" * 10 + "LEXRANK" + "=" * 10)
    summarizer = LexRank()
    summarizer.summarize(lexInputText)
    summaries = summarizer.probe(10)
    searchId = re.compile('[0-9]{5,6}')
    idList = []
    for summary in summaries:
        match = searchId.search(summary)
        # BUG FIX: search() returns None when a summary sentence carries no
        # id; the original crashed with AttributeError on .group() here.
        if match:
            idList.append(match.group())
    return idList
def insertSumma(self):
    """Summarize the first 3 articles of self.table and store the result.

    Debug variant of insertSumma: limited to 3 rows and prints progress.
    """
    lexrank = LexRank()
    curs = self.conn.cursor()
    summa_total = ""
    # NOTE(review): table name is concatenated into the SQL — assumed trusted.
    sql = "SELECT id,raw,sentence_cnt FROM " + self.table + " ORDER BY id ASC limit 3;"
    curs.execute(sql)
    rows = curs.fetchall()
    for row in rows:
        print("insert summa to id=" + str(row[0]))
        # row[1] is the raw article text.
        text = "".join(row[1])
        # summarize() returning 0 signals the text could not be summarized.
        if (lexrank.summarize(text) == 0):
            continue
        # row[2] is sentence_cnt: short articles get 5 summary sentences, long ones 10.
        if (row[2] <= 20):
            summaries = lexrank.probe(5)
        else:
            summaries = lexrank.probe(10)
        for summa in summaries:
            summa_total += "".join(summa)
            summa_total += ". "
        print(" summa = " + summa_total + "\n")
        curs.execute("UPDATE " + self.table + " SET summa=%s WHERE id=%s;", (summa_total, row[0]))
        self.conn.commit()
        # Reset the accumulator for the next article.
        summa_total = ""
def summarize_text_with_lexrank(text):
    """Return up to three LexRank summary sentences of `text`, ". "-joined.

    Returns an empty string when no summary sentence could be produced.
    """
    summarizer = LexRank(n_clusters=1)
    summarizer.summarize(text)
    sentences = summarizer.probe(3)
    return ". ".join(sentences) if sentences else ""
def summarize(self, n_summary):
    """Run LexRank over the preprocessed chats; print, keep, and return the summaries."""
    print("summarize in %d sentences" % n_summary)
    ranker = LexRank()
    ranker.summarize(" ".join(self.preprocessed))
    # probe() also accepts None to fall back to auto-detected topic count.
    picked = ranker.probe(n_summary)
    for sentence in picked:
        print(sentence)
    self.summaries = picked
    return picked
class TestLexRank(unittest.TestCase):
    """Unit test for LexRank: a two-cluster toy corpus should yield 2 summaries."""

    def setUp(self):
        # Five pseudo-sentences: the first three share words (one cluster),
        # the last two share words (a second cluster).
        self.text = "사과 배 감 귤. 배 감 귤 수박. 감 귤 수박 딸기. 오이 참외 오징어. 참외 오징어 달팽이."
        # Thresholds lowered so the tiny corpus is not filtered away.
        self.lexrank = LexRank(min_keyword_length=0, no_below_word_count=0, min_cluster_size=1)

    def test_summarized(self):
        self.lexrank.summarize(self.text)
        # probe() with no argument uses the auto-detected topic count.
        summaries = self.lexrank.probe()
        self.assertEqual(len(summaries), 2)
        self.assertEqual(summaries[0], "배 감 귤 수박")
def sum_talker(self, cluster):
    """Produce a one-sentence summary per speaker within `cluster`.

    Appends each speaker's probe(1) result to self.sum_per_talker[talker].
    """
    per_talker_dict = {}
    # Clustering disabled: each speaker's text is summarized as one document.
    lexrank = LexRank(clustering=None)
    for sen in cluster.sentences:
        if sen.talker not in per_talker_dict:
            per_talker_dict[sen.talker] = ""
        per_talker_dict[sen.talker] += sen.text+"\n"
    for k,v in per_talker_dict.items():
        lexrank.summarize(v)
        # NOTE(review): assumes self.sum_per_talker[k] already holds a list for
        # every talker — confirm it is pre-populated by the caller.
        self.sum_per_talker[k].append(lexrank.probe(1))
def wordRank(text):
    """Print (and return) the distinct nouns in a 3-sentence summary of `text`.

    `text` is expected to carry literal "\\n" escape sequences, which are
    turned into sentence boundaries before cleaning.
    """
    text = text.replace('\\n', '.')
    # BUG FIX: the Hangul range used to end at '힝' (U+D795), silently dropping
    # syllables between 힝 and 힣 (U+D7A3); '가-힣' covers the full block.
    cleaned = re.sub(r'[^가-힣0-9a-zA-Z\s\.]', '', text)
    lexrank = LexRank()
    lexrank.summarize(cleaned)
    summaries = lexrank.probe(3)
    tagger = Twitter()
    nouns = []
    for summary in summaries:
        nouns += tagger.nouns(summary)
    nouns = list(set(nouns))
    print(nouns)
    # Return the nouns too (previously only printed); callers that ignored the
    # implicit None are unaffected.
    return nouns
def imageSumalyze(pk):
    """OCR an ImagePost's uploaded file, summarize it in 650-char chunks, and
    attach summaries, index string, and IBM NLU metadata to the post.
    """
    post = get_object_or_404(ImagePost, pk=pk)
    from lexrankr import LexRank
    lexrank = LexRank()
    # Windows-style path into the project's media directory.
    path = os.path.dirname(os.path.dirname(
        os.path.abspath(__file__))) + '\\media\\' + str(post.pdf)
    text = img2Text(path)  # OCR the image file
    os.remove(path)        # the upload is no longer needed after OCR
    chunk = []
    if len(text) < 650:
        chunk.append(text)
    else:
        # Split into fixed 650-char chunks; a trailing partial chunk is dropped
        # by zip() — NOTE(review): up to 649 chars of text can be lost here.
        chunk = list(map(''.join, zip(*[iter(text)] * 650)))
    chunk2 = []
    idxToDB = ''
    # Apply summarization
    idx = 0
    while idx != (len(chunk)):
        try:
            lexrank.summarize(chunk[idx])
            summaries = lexrank.probe(3)
        except:
            # NOTE(review): bare except — chunks that fail to summarize
            # (e.g. too few sentences) are skipped entirely.
            idx += 1
            continue
        summaries[0] = summaries[0] + '. '
        summaries[1] = summaries[1] + '. '
        summaries[2] = summaries[2] + '. '
        indexStr = ''.join(summaries)
        chunk[idx] = indexStr          # replace the chunk with its 3-line summary
        idxToDB += ibmIndex(indexStr, summaries)
        idxToDB += '#'                 # '#' separates per-chunk index entries
        chunk2.append(chunk[idx])
        idx += 1
    post.index = idxToDB
    chunk = []
    chunkToDB = ''
    for c in chunk2:
        chunkToDB += c + '\n'
    post.content = chunkToDB
    post.pdf = None
    post.index = idxToDB  # NOTE(review): assigned twice; this second write is redundant
    # Run IBM Natural Language Understanding on the original text, not the summary
    post.keyword, post.relevance, post.category_ibm = ibmContent(text)
    post.save()
def lexlank_function(result):
    """Return up to 10 LexRank summary sentences of `result`, or [] on failure.

    References:
      https://wikidocs.net/72820
      https://github.com/theeluwin/lexrankr
    """
    try:
        lexrank = LexRank()
        lexrank.summarize(result)
        print("요약 진행중!")
        # probe() raises when the text has too few sentences to summarize;
        # materialize the result as a plain list (was a manual append loop).
        return list(lexrank.probe(10))
    except Exception:
        # Was a bare `except:`; keep the best-effort contract but stop
        # swallowing SystemExit/KeyboardInterrupt.
        print("요약 내용이 부족합니다.")
        return []
def videoSumalyze(pk):
    """Transcribe a VideoPost's YouTube audio, summarize it chunk-wise, and
    attach summaries, index string, and IBM NLU metadata to the post.
    """
    post = get_object_or_404(VideoPost, pk=pk)
    lang = 'en-US'
    if post.lang == '한국어':  # Korean UI label selects the ko-KR STT model
        lang = 'ko-KR'
    from lexrankr import LexRank
    lexrank = LexRank()
    chunk = []
    chunk2 = []
    url = post.url
    path = youtubeToAudio(url)  # download audio; returns the local file path
    #post.title = re.sub('[^가-힣\\s]', '', str(path))
    chunk = splitandSTT(path, lang)  # speech-to-text, one string per segment
    text = " ".join(chunk)
    os.remove(path + '.mp3')
    os.remove(path)
    idxToDB = ''
    # Apply summarization
    idx = 0
    while idx != (len(chunk)):
        try:
            lexrank.summarize(chunk[idx])
            summaries = lexrank.probe(3)
        except:
            # NOTE(review): bare except — chunks that fail to summarize
            # (e.g. too few sentences) are skipped entirely.
            idx += 1
            continue
        summaries[0] = summaries[0] + '. '
        summaries[1] = summaries[1] + '. '
        summaries[2] = summaries[2] + '. '
        indexStr = ''.join(summaries)
        chunk[idx] = indexStr          # replace the chunk with its 3-line summary
        idxToDB += ibmIndex(indexStr, summaries)
        idxToDB += '#'                 # '#' separates per-chunk index entries
        chunk2.append(chunk[idx])
        idx += 1
    post.index = idxToDB
    chunk = []
    chunkToDB = ''
    for c in chunk2:
        chunkToDB += c + '\n'
    post.content = chunkToDB
    # Run IBM Natural Language Understanding on the original text, not the summary
    post.keyword, post.relevance, post.category_ibm = ibmContent(text)
    post.save()
def summarize(self):
    """Summarize self.paragraph; longer paragraphs get more sentences (1-5)."""
    ranker = LexRank()
    ranker.summarize(self.paragraph)
    # One summary sentence per started block of 100 characters, capped at 5.
    # Equivalent to the original <100/<200/<300/<400 ladder.
    self.probe_num = min(len(self.paragraph) // 100 + 1, 5)
    return ranker.probe(self.probe_num)
class LexRankforSummarization():
    """Thin wrapper around lexrankr.LexRank producing 2-3 summary sentences."""

    def __init__(self):
        # Thresholds lowered so even very small inputs are not filtered away.
        self.lexrank = LexRank(min_keyword_length=0, no_below_word_count=0, min_cluster_size=1)

    def test_summarized(self, text):
        """Summarize `text`; try 3 sentences, fall back to 2 if probe fails."""
        self.lexrank.summarize(text)
        try:
            summaries = self.lexrank.probe(3)
        except Exception:
            # Was a bare `except:` — narrow it so SystemExit/KeyboardInterrupt
            # are no longer swallowed; probe(3) fails on very short texts.
            summaries = self.lexrank.probe(2)
        # Materialize as a plain list (was a manual append loop).
        return list(summaries)
def smry(q):
    """Worker loop: pull scraped records off queue `q`, summarize to 3 lines,
    and store them in the DB via db_store().

    Runs forever; uses the module-level `count` for progress accounting.
    """
    while True:
        try:
            print("smry start")
            global count
            data = q.get()  # blocks until a scraped record is available
            count += 1
            #print("get{}!".format(count))
            lexrank = LexRank()
            lexrank.summarize(data[4])  # data[4] holds the article body
            summaries = lexrank.probe(3)  # 3-line summary; `summaries` is a list
            # Re-join the summary sentences with '. ' and store back in place.
            data[4] = '. '.join(summaries)+'.'
            print(data)  # final record that will be written to the DB
            db_store(data)
            # for summary in summaries:
            #     print(summary)
        except (IndexError,ValueError,AttributeError):
            # Malformed records are dropped silently; the loop keeps running.
            pass
def audioSumalyze(pk):
    """Transcribe an AudioPost's uploaded file, summarize it chunk-wise, and
    attach summaries, index string, and IBM NLU metadata to the post.
    """
    post = get_object_or_404(AudioPost, pk=pk)
    lang = 'en-US'
    if post.lang == '한국어':  # Korean UI label selects the ko-KR STT model
        lang = 'ko-KR'
    from lexrankr import LexRank
    lexrank = LexRank()
    chunk2 = []
    chunk = []
    #speechtotext(str(post.pdf), lang, chunk)
    # NOTE(review): `path` is not defined in this function — presumably a
    # module-level media-directory prefix; confirm.
    chunk = splitandSTT(path + str(post.pdf), lang)
    text = " ".join(chunk)
    idxToDB = ''
    # Apply summarization
    idx = 0
    while idx != (len(chunk)):
        try:
            lexrank.summarize(chunk[idx])
            summaries = lexrank.probe(3)
        except:
            # NOTE(review): bare except — chunks that fail to summarize
            # (e.g. too few sentences) are skipped entirely.
            idx += 1
            continue
        summaries[0] = summaries[0] + '. '
        summaries[1] = summaries[1] + '. '
        summaries[2] = summaries[2] + '. '
        indexStr = ''.join(summaries)
        chunk[idx] = indexStr          # replace the chunk with its 3-line summary
        idxToDB += ibmIndex(indexStr, summaries)
        idxToDB += '#'                 # '#' separates per-chunk index entries
        chunk2.append(chunk[idx])
        idx += 1
    post.index = idxToDB
    chunk = []
    chunkToDB = ''
    for c in chunk2:
        chunkToDB += c + '\n'
    post.content = chunkToDB
    # Run IBM Natural Language Understanding on the original text, not the summary
    post.keyword, post.relevance, post.category_ibm = ibmContent(text)
    post.save()
def textSumalyze(pk):
    """Summarize a TextPost's text in 650-char chunks and attach summaries,
    index string, and IBM NLU metadata to the post.
    """
    post = get_object_or_404(TextPost, pk=pk)
    from lexrankr import LexRank
    lexrank = LexRank()
    text = post.text
    chunk = []
    if len(text) < 650:
        chunk.append(text)
    else:
        # Split into fixed 650-char chunks; a trailing partial chunk is dropped
        # by zip() — NOTE(review): up to 649 chars of text can be lost here.
        chunk = list(map(''.join, zip(*[iter(text)] * 650)))
    chunk2 = []
    idxToDB = ''
    # Apply summarization
    idx = 0
    while idx != (len(chunk)):
        try:
            lexrank.summarize(chunk[idx])
            summaries = lexrank.probe(3)
        except:
            # NOTE(review): bare except — chunks that fail to summarize
            # (e.g. too few sentences) are skipped entirely.
            idx += 1
            continue
        summaries[0] = summaries[0] + '. '
        summaries[1] = summaries[1] + '. '
        summaries[2] = summaries[2] + '. '
        indexStr = ''.join(summaries)
        chunk[idx] = indexStr          # replace the chunk with its 3-line summary
        idxToDB += ibmIndex(indexStr, summaries)
        idxToDB += '#'                 # '#' separates per-chunk index entries
        chunk2.append(chunk[idx])
        idx += 1
    post.index = idxToDB
    chunk = []
    chunkToDB = ''
    for c in chunk2:
        chunkToDB += c + '\n'
    post.content = chunkToDB
    post.text = 'clear'  # the original text is discarded once summarized
    # Run IBM Natural Language Understanding on the original text, not the summary
    post.keyword, post.relevance, post.category_ibm = ibmContent(text)
    post.save()
def lexranker(text, code, date):
    """Summarize `text`, extract nouns from the summary, and record each noun
    with stock `code`'s open/close prices for `date` via save_record().
    """
    text = text.replace('\\n', '.')
    # BUG FIX: the Hangul range used to end at '힝' (U+D795), silently dropping
    # syllables between 힝 and 힣 (U+D7A3); '가-힣' covers the full block.
    cleaned = re.sub(r'[^가-힣0-9a-zA-Z\s\.]', '', text)
    lexrank = LexRank()
    lexrank.summarize(cleaned)
    summaries = lexrank.probe(3)
    tagger = Twitter()
    out = []
    print(summaries)
    for summary in summaries:
        out += tagger.nouns(summary)
    # FIX: the original rebound `word` (the tagger) to this list — use a
    # distinct name to avoid the confusing shadowing.
    nouns = list(set(out))
    share = Share(code)
    startprice = share.get_open()
    endprice = share.get_price()
    for part in nouns:
        save_record(part, code, startprice, endprice, date)
def post():
    """Flask endpoint: summarize the request's 'contents' into two sentences.

    Returns {"sum": "<2nd sentence> <1st sentence>"} as JSON, or
    {'error': ...} on any failure (including texts too short to yield
    two summary sentences).
    """
    try:
        start = time()  # NOTE: currently unused; kept as a timing hook
        parser = reqparse.RequestParser()
        parser.add_argument('contents', type=str)
        args = parser.parse_args()
        contents = args['contents']
        print("==== contents ==== : ", contents)
        print(contents)
        summarizer = LexRank()
        summarizer.summarize(contents)
        # BUG FIX: probe(2) was called twice (once for the result, once just
        # to print) — reuse a single result.  Also stop shadowing builtin `sum`.
        summaries = summarizer.probe(2)
        print(summaries)
        joined = summaries[1] + " " + summaries[0]
        return jsonify({"sum": joined})
    except Exception as e:
        return {'error': str(e)}
def highlight(self, threshold=0.5):
    """Return a list of [index, sentence, chat_idxs] score entries aligned to chats.

    Walks self.ts.chat_to_sentence_mapping in lockstep with lexrank's scored
    sentences, compensating for sentences lexrank dropped (`skip_amount`) and
    for empty sentences in the mapping (`jump`).  `threshold` is currently
    unused in this body.
    """
    print("highlight : return list of chats and scores ")
    lexrank = LexRank()
    lexrank.summarize(" ".join(self.preprocessed))
    lexrank_sentences = [x.text for x in lexrank.sentences]
    scores = lexrank.sentence_score_pair()
    # Normalize the preprocessed sentences the same way lexrank stores them.
    preprocessed = self.preprocessed[:]
    preprocessed = [x.strip().strip(".").strip() for x in preprocessed]
    lex_idx = 0       # cursor into `scores`
    skip_amount = 0   # sentences present in preprocessed but dropped by lexrank
    jump = 0          # empty sentences encountered in the chat mapping
    for ts_sentence in self.ts.chat_to_sentence_mapping:
        ts_idx, chat_idxs, sentence = ts_sentence
        if lex_idx >= len(scores):
            break
        if len(sentence.strip()) == 0 :
            jump += 1
        else :
            # If the next preprocessed sentence is not the one scored, lexrank
            # skipped it — bump the skip counter and retry on the next mapping.
            if preprocessed[lex_idx + skip_amount] != scores[lex_idx][1] :
                skip_amount += 1
            else :
                scores[lex_idx] = list(scores[lex_idx])
                # Rewrite the score's index to an absolute sentence position.
                scores[lex_idx][0] = lex_idx + jump + skip_amount
                scores[lex_idx].append(chat_idxs)
                lex_idx += 1;
    self.highlight_lexrank = scores[:]
    print("highlight result")
    return_list = self._map_to_chat(self.highlight_lexrank)
    for chat in return_list :
        if chat[0] == 1 :
            print(chat)
    return return_list
def insertSentenceCount(self):
    """Count the sentences of each article's raw text and persist the count
    into the table's sentence_cnt column.
    """
    splitter = LexRank()
    cursor = self.conn.cursor()
    cursor.execute("SELECT id,raw FROM "+ self.table +" ORDER BY id ASC;")
    for article_id, raw in cursor.fetchall():
        # Reuse lexrank's own sentence splitter for consistent counting.
        sentences = splitter.factory.text2sentences("".join(raw))
        print("insert sentence_cnt to id=" + str(article_id))
        print("sc=" + str(len(sentences)) + "\n")
        cursor.execute("UPDATE "+ self.table +" SET sentence_cnt=%s WHERE id=%s;",
                       (str(len(sentences)), str(article_id)))
        self.conn.commit()
def sum_cluster(self):
    """Summarize every conversation cluster and each talker inside it.

    Fills self.summaries with one probe() result per cluster, then delegates
    per-speaker summaries to sum_talker().
    """
    self.clusters = self.clustering()
    lexrank = LexRank(clustering=None)
    self.summaries = []
    for c in self.clusters:
        lexrank.summarize(c.sen2txt())
        for i, s in enumerate(c.sentences):
            print(i, s.text)
        # probe() is called with a float: 0.3 for small clusters, 0.2 for
        # clusters of 20+ sentences. NOTE(review): presumably interpreted as a
        # ratio of sentences to keep — confirm against lexrankr's probe().
        if len(c.sentences)<20:
            self.summaries.append(lexrank.probe(0.3))
        else:
            self.summaries.append(lexrank.probe(0.2))
        self.sum_talker(c)  # use current cluster to summarize per talker
def getsentence(table):
    """Compute the sentence-count value at the top-10% row of `table` and the
    average sentence count of all rows at or below it.

    NOTE(review): this is Python 2 code (print statement); the trailing '''
    opens a commented-out block that continues beyond this excerpt.
    """
    conn = pymysql.connect(host='localhost', user='******', password='******',
                           db='KETI', use_unicode=True, charset='utf8')
    lexrank = LexRank()
    curs = conn.cursor()
    #sql = "SELECT id,raw,sentence_cnt FROM "+ table +" where id>179460;"
    # 10% of the row count = the rank of the percentile row.
    sql = "select count(*)*0.1 from "+ table +";"
    curs.execute(sql)
    rows = curs.fetchone()
    percentile = int(rows[0])
    sql = "SELECT id,raw,sentence_cnt FROM "+ table +" order by sentence_cnt desc;"
    curs.execute(sql)
    rows = curs.fetchall()
    i=0
    # Walk rows (sorted by sentence_cnt desc) down to the percentile rank.
    for row in rows:
        i = i+1
        if i == percentile :
            percentile_cnt = row[2]
            break;
    sql = "SELECT avg(sentence_cnt) FROM "+ table +" where sentence_cnt <="+str(percentile_cnt) +";"
    curs.execute(sql)
    rows = curs.fetchone()
    percentile_avg = int(rows[0])
    print i, percentile_cnt, percentile_avg
    '''
def makeEvents(self, summaries, rows, x_event):
    """Build up to `x_event` TimelineJS event dicts from `summaries`.

    For each summary sentence, find the DB row whose raw text (or stored
    summary) contains it, then assemble headline/text fields.  Articles with
    more than self.summa_count sentences additionally get a freshly generated
    `text3` digest.  Row layout (by index): (id, raw, summa, title, date, tag,
    sentence_cnt, link).
    """
    lexrank = LexRank()
    events = []
    events_size = 0
    for summa in summaries:
        find = False
        text = {}
        summa = str(summa)
        db_date = ""
        # "기자]" marks a reporter byline; strip that noise before matching.
        if (summa.find("기자]") != -1):
            summa = self.deleteNoise(summa)
        for row in rows:
            # When Summa value is none
            if (row[2] is None):
                temp = "".join(row[1])
                if (temp.find("".join(summa)) != -1):
                    find = True
                    db_raw = str(row[1])
                    db_summa = str(row[2])
                    db_title = str(row[3])
                    db_date = str(row[4])
                    db_sentenceCnt = str(row[6])
                    db_link = str(row[7])
                    if (db_raw.find("기자]") != -1):
                        db_raw = self.deleteNoise(db_raw)
                    if (db_summa.find("기자]") != -1):
                        db_summa = self.deleteNoise(db_summa)
                    if (db_title.find("기자]") != -1):
                        db_title = self.deleteNoise(db_title)
                    text["headline"] = db_title
                    text["text"] = db_raw
                    # No stored summary: fall back to the raw text.
                    text["text2"] = db_raw
                    # NOTE(review): db_sentenceCnt is a *str* here, so this
                    # compares str against self.summa_count — confirm intended.
                    if (db_sentenceCnt > self.summa_count):
                        raw = "".join(row[1])
                        lexrank.summarize(raw)
                        raw_summaries = lexrank.probe(self.summa_count)
                        text3_summa = ""
                        for raw_summa in raw_summaries:
                            text3_summa += str(raw_summa) + ". "
                        if (text3_summa.find("기자]") != -1):
                            text3_summa = self.deleteNoise(text3_summa)
                        text["text3"] = text3_summa
                    else:
                        text["text3"] = ""
                    break
            # When Summa value isn't none
            else:
                temp = "".join(row[2])
                if (temp.find("".join(summa)) != -1):
                    find = True
                    db_raw = str(row[1])
                    db_summa = str(row[2])
                    db_title = str(row[3])
                    db_date = str(row[4])
                    db_sentenceCnt = str(row[6])
                    db_link = str(row[7])
                    if (db_raw.find("기자]") != -1):
                        db_raw = self.deleteNoise(db_raw)
                    if (db_summa.find("기자]") != -1):
                        db_summa = self.deleteNoise(db_summa)
                    if (db_title.find("기자]") != -1):
                        db_title = self.deleteNoise(db_title)
                    text["headline"] = db_title
                    text["text"] = db_raw
                    # Stored summary available: use it for text2.
                    text["text2"] = db_summa
                    if (db_sentenceCnt > self.summa_count):
                        raw = "".join(row[1])
                        lexrank.summarize(raw)
                        raw_summaries = lexrank.probe(self.summa_count)
                        text3_summa = ""
                        for raw_summa in raw_summaries:
                            text3_summa += str(raw_summa) + ". "
                        if (text3_summa.find("기자]") != -1):
                            text3_summa = self.deleteNoise(text3_summa)
                        text["text3"] = text3_summa
                    else:
                        text["text3"] = ""
                    break
        if (find == True):
            # db_date is "YYYY-MM-DD"; events are grouped by month.
            event = {
                "start_date": {
                    "year": db_date.split('-')[0],
                    "month": db_date.split('-')[1],
                    "day": db_date.split('-')[2],
                    "hour": "",
                    "minute": "",
                    "second": "",
                    "millisecond": "",
                    "format": ""
                },
                "group": db_date.split('-')[1],
                "media": {
                    "caption": "",
                    "credit": "",
                    "url": db_link,
                    "thumb": ""
                }
            }
            event["text"] = text
            events.append(event)
            events_size += 1
            if (events_size == x_event):
                break
    return events
from __future__ import print_function
from lexrankr import LexRank  # not this one

# Script: summarize a hard-coded Korean news article with LexRank and push the
# top summary sentence to a Firebase database.
lexrank = LexRank()  # can init with various settings
lexrank.summarize(''' 경찰이 잠실야구장에서 벌어진 '현대판 노예사건'에 대해 본격 수사에 착수했다. 앞서 서울시장애인인권센터는 이곳 분리수거장에서 A씨(60)를 구조하고, 그가 임금을 받지 못한 채 노예처럼 일해온 것으로 추정된다며 수사를 의뢰했다. 서울 송파경찰서는 A씨를 고용한 고물업체 사장 B씨를 최근 불구속 입건하고 인권센터가 제기한 사기·폭행·가혹행위 등의 의혹에 대해 수사 중이라고 20일 밝혔다. 인권센터와 경찰 등에 따르면 B씨는 지난 17년 동안 A씨에게 제대로 된 임금을 주지 않고 분리수거 업무를 시킨 의혹을 받는다. 특히 인권센터는 A씨가 떨어진 빵으로 끼니를 채우면서 하루 16시간 동안 일하는 등 노동착취를 당했다고 주장했다. 지적장애를 가진 A씨는 분리수거장 내부 컨테이너에서 지금껏 생활해온 것으로 알려졌다. 현재 경찰은 인권센터와 언론에서 제기한 의혹에 대해 사실관계를 확인하는 중이다. 피해자와 고물업체 사장, 잠실야구장 시설관리자 등에 대해 한차례씩 조사도 진행했다. 경찰관계자는 "피해자의 계좌 입출금 내역을 분석하고 있다"며 "학대 등의 혐의가 있는지 확인하기 위해 추가로 관계자들을 조사할 계획"이라고 밝혔다. ''')
summaries = lexrank.probe(
    1)  # `num_summaries` can be `None` (using auto-detected topics)
for summary in summaries:  # just print
    print(summary)

from firebase import firebase
firebase = firebase.FirebaseApplication('https://chatbot-c6606.firebaseio.com')
# NOTE(review): relies on `summary` holding the last loop value (here: the
# single probed sentence); stores it under the 'news' key at the DB root.
result = firebase.post('/', {'news': summary})
print(result)
def simpleLexrank(arg):
    """Return the top three LexRank summary sentences of `arg`."""
    ranker = LexRank()
    ranker.summarize(arg)
    return ranker.probe(3)
def createJSON(self):
    """Build a TimelineJS-style JSON file from main and outlier article clusters.

    Title section: summarize the union of main+outlier articles and keep the
    first summary sentence that can be traced back to a DB row (plus its link),
    with a tag headline from selectTag().  Events section: makeEvents() over
    the main cluster (self.main_event events) and the outlier cluster (the
    remainder up to self.total_event).  The result is written to
    JSON/<tag>_<total_event>.json.
    """
    start_time_createJSON = time.time()
    # Sanity-check the requested event counts against the available ids.
    if (self.main_event > len(self.main_idx)):
        sys.exit("main_event must be smaller than main_idx")
    elif ((self.total_event - self.main_event) > len(self.outlier_idx)):
        sys.exit("total_event-main_event must be smaller than outlier_idx")
    lexrank = LexRank()  # can init with various settings
    curs = self.conn.cursor()
    ##############################################################################
    #                                MAKE TITLE                                  #
    ##############################################################################
    main_outlier_text = ""
    main_outlier_idx = self.main_idx + self.outlier_idx
    main_outlier_idx_size = len(main_outlier_idx)
    tag = ""
    main_outlier_sentence_cnt = 0
    # Build "... WHERE id in(id1,id2,...);" over all main+outlier articles.
    main_outlier_sql = "SELECT id,raw,summa,title,date,tag,sentence_cnt,link FROM " + self.table + " WHERE id in("
    for i in range(main_outlier_idx_size):
        if (i != (main_outlier_idx_size - 1)):
            main_outlier_sql += str(main_outlier_idx[i]) + ","
        else:
            main_outlier_sql += str(main_outlier_idx[i]) + ");"
    curs.execute(main_outlier_sql)
    main_outlier_rows = curs.fetchall()
    for row in main_outlier_rows:
        tag += "".join(row[5])                 # accumulate tag text
        main_outlier_sentence_cnt += row[6]
        # Prefer the stored summary (row[2]) over the raw text (row[1]).
        if (row[2] is None):
            main_outlier_text += "".join(row[1])
        else:
            main_outlier_text += "".join(row[2])
    #main_outlier_tag=self.selectTag(tag)
    print(
        "===============================================================")
    print(
        "====================== START CREATE_JSON ======================")
    print(
        "===============================================================\n"
    )
    start_time = time.time()
    lexrank.summarize(main_outlier_text)
    print("first total : ")
    print(" %.02f sec" % (time.time() - start_time))
    summaries = lexrank.probe(self.total_event)
    find = 0
    db_link = ""
    main_outlier_summa = ""
    # Keep only the first summary sentence traceable to a source row.
    for summa in summaries:
        summa = str(summa)
        if (summa.find("기자]") != -1):  # strip reporter-byline noise
            summa = self.deleteNoise(summa)
        for row in main_outlier_rows:
            if (row[2] is None):
                temp = "".join(row[1])
                if (temp.find("".join(summa)) != -1):
                    db_link = str(row[7])
                    find += 1
                    main_outlier_summa += summa + ". "
                    break
            else:
                temp = "".join(row[2])
                if (temp.find("".join(summa)) != -1):
                    db_link = str(row[7])
                    find += 1
                    main_outlier_summa += summa + ". "
                    break
        if (find == (1)):
            break
    main_outlier_tag = self.selectTag(tag, main_outlier_summa)
    total_data = OrderedDict()
    total_data["scale"] = ""
    total_data["title"] = {
        "media": {
            "caption": "",
            "credit": "",
            "url": db_link,
            "thumb": ""
        },
        "text": {
            "headline": main_outlier_tag,
            "text": main_outlier_summa
        }
    }
    ##############################################################################
    #                                MAKE TITLE                                  #
    # END
    ##############################################################################
    ##############################################################################
    #                                MAKE EVENTS                                 #
    # START
    ##############################################################################
    events = []
    main_idx_size = len(self.main_idx)
    main_text = ""
    outlier_idx_size = len(self.outlier_idx)
    outlier_text = ""
    main_sql = "SELECT id,raw,summa,title,date,tag,sentence_cnt,link FROM " + self.table + " WHERE id in("
    for i in range(main_idx_size):
        if (i != (main_idx_size - 1)):
            main_sql += str(self.main_idx[i]) + ","
        else:
            main_sql += str(self.main_idx[i]) + ");"
    curs.execute(main_sql)
    main_rows = curs.fetchall()
    for row in main_rows:
        if (row[2] is None):
            main_text += "".join(row[1])
        else:
            main_text += "".join(row[2])
    start_time = time.time()
    lexrank.summarize(main_text)
    start_time = time.time()
    summaries = self.getDistinctArticleSummaries(lexrank, main_rows, self.main_event)
    start_time = time.time()
    events += self.makeEvents(summaries, main_rows, self.main_event)
    outlier_sql = "SELECT id,raw,summa,title,date,tag,sentence_cnt,link FROM " + self.table + " WHERE id in("
    for i in range(outlier_idx_size):
        if (i != (outlier_idx_size - 1)):
            outlier_sql += str(self.outlier_idx[i]) + ","
        else:
            outlier_sql += str(self.outlier_idx[i]) + ");"
    curs.execute(outlier_sql)
    outlier_rows = curs.fetchall()
    for row in outlier_rows:
        if (row[2] is None):
            outlier_text += "".join(row[1])
        else:
            outlier_text += "".join(row[2])
    start_time = time.time()
    lexrank.summarize(outlier_text)
    summaries = self.getDistinctArticleSummaries(
        lexrank, outlier_rows, self.total_event - self.main_event)
    events += self.makeEvents(summaries, outlier_rows,
                              self.total_event - self.main_event)
    total_data["events"] = events
    print(" <TOTAL TIME> ")
    print(" %.02f sec" % (time.time() - start_time_createJSON) + "\n")
    ##############################################################################
    #                                MAKE EVENTS                                 #
    # END
    ##############################################################################
    # File-name tag: first 3 words of the headline joined with '-'.
    numTag = 3
    fileTag = ""
    for i in range(0, numTag):
        fileTag += "".join(main_outlier_tag.split(' ')[i])
        if (i != numTag - 1):
            fileTag += "-"
    file_path = "JSON/" + fileTag + "_" + str(self.total_event) + ".json"
    with codecs.open(file_path, 'w', encoding="utf-8") as make_file:
        json.dump(total_data, make_file, ensure_ascii=False, indent=4, sort_keys=False)
def setUp(self):
    """Prepare a tiny two-cluster Korean corpus and a permissive LexRank."""
    # Five pseudo-sentences: the first three share words (one cluster),
    # the last two share words (a second cluster).
    self.text = "사과 배 감 귤. 배 감 귤 수박. 감 귤 수박 딸기. 오이 참외 오징어. 참외 오징어 달팽이."
    # Thresholds lowered so the tiny corpus is not filtered away.
    self.lexrank = LexRank(min_keyword_length=0, no_below_word_count=0, min_cluster_size=1)
from __future__ import print_function from lexrankr import LexRank from nltk.tokenize import word_tokenize from nltk.tag import pos_tag import sys filePath = sys.argv[1] probNumber = sys.argv[2] lexrank = LexRank() all_txt = [] with open(filePath, "r", encoding="UTF-8") as f: all_txt = f.readlines() all_string = "" for i in all_txt: all_string += i with open("sentence_set.txt", "r") as f: sent_analy = f.readlines() for i in range(len(sent_analy)): sent_analy[i] = sent_analy[i].split(' ') for j in range(len(sent_analy[i])): if '\n' in sent_analy[i][j]: sent_analy[i][j] = sent_analy[i][j].replace('\n', '') token_all_string = pos_tag(word_tokenize(all_string)) process_str = "" index = 0 while (len(token_all_string)):
from flask import Flask, request, jsonify from flask_restful import Resource, Api from flask_restful import reqparse from lexrankr import LexRank from time import time app = Flask(__name__) api = Api(app) app.config['JSON_AS_ASCII'] = False lexrank = LexRank() # can init with various settings @app.route('/summary', methods=['POST']) def post(): try: start = time() parser = reqparse.RequestParser() parser.add_argument('contents', type=str) args = parser.parse_args() contents = args['contents'] print("==== contents ==== : ", contents) print(contents) # lexrank.summarize(contents) # originalText = "이 업체 물건 절대 사지마세요. 형편없습니다. 직원들이 고객은 안중에도 없습니다. 열 받게 만드는 데 선수입니다. 박지원 직원인지 대표인지 몰라도 열받게 하는 제주가 보통이 넘습니다. 다시 한 번 강조하지만 절대 네버네버 스위트피 사지 마세요. 절대 비추입니다." lex = LexRank() lex.summarize(contents)