def search_news():
    # Get the query field.
    query_line = request.args.get("query")
    # Return the matching URLs.
    data = TextRank.Get_sample_news(query_line, url_list, dictionary, tfidf_vectors)
    return jsonify(data)
def main(url, log):
    # Find an unused directory name for this run's logs.
    log1 = log
    while True:
        if not os.path.exists(log1):
            os.makedirs(log1)
            break
        else:
            log1 = log1 + '_'
    content = content_extractor.get_content(url)
    logFile = open(log1 + '/metadata', "a")
    logFile.write("URL : " + url + "\n\n")
    logFile.write("Title : " + content['title'] + "\n\n")
    logFile.write("Meta Desc. : " + content['meta'] + "\n\n")
    logFile.write("Content : " + content['content'].encode("utf8") + "\n\n")
    logFile.close()
    d = TextRank.text_rank(content['content'])
    sortd = sorted(d.iteritems(), key=operator.itemgetter(1), reverse=True)
    logtext = open(log1 + '/textrank_result', "a")
    logtext.write(str(sortd))
    logtext.close()
    # Weight every TextRank score by every source probability, then re-rank.
    final = []
    for i in sortd:
        for j in range(len(source_probs)):
            final.append((i[0], j, i[1] * source_probs[j]))
    fsort = sorted(final, key=operator.itemgetter(2), reverse=True)
    logres = open(log1 + '/result', "a")
    logres.write(str(fsort))
    logres.close()
    server.run_server(fsort[:10])
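# Tiny worked example of the re-ranking step in main() above: each
# (term, score) pair from TextRank is crossed with each source probability,
# and the weighted triples are sorted descending. The values are made up
# purely for illustration.
sortd = [("battery", 0.9), ("screen", 0.4)]
source_probs = [0.7, 0.3]
final = [(term, j, score * source_probs[j])
         for term, score in sortd
         for j in range(len(source_probs))]
fsort = sorted(final, key=lambda t: t[2], reverse=True)
# -> [('battery', 0, 0.63), ('screen', 0, 0.28), ('battery', 1, 0.27), ('screen', 1, 0.12)]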
def get_keyphrases_textrank():
    text = request.form['textrank_text']
    top_n = request.form['top_n_textrank']
    top_keywords = TextRank.extractKeyphrases(text, int(top_n))
    context = dict()
    context['keywords'] = top_keywords
    return render_template("keyword_textrank.html", **context)
def init(self):
    self.__linkDict = dict()
    self.__sentenceDict = dict()
    self.__keywordDict = dict()
    self.__distanceDict = dict()
    self.__validation = Validation.Validation()
    self.__validation.init_dic()
    self.__validation.init_base_normalized()
    self.__sentenceTokenizer = TextRank.SentenceTokenizer()
def get_keyphrases_textrank():
    filename = request.form['textrank_name']
    top_n = request.form['top_n_textrank']
    surveys = pd.read_excel(filename, header=0)
    col_name = request.form['textrank_question']
    # Concatenate every response in the selected survey column into one text.
    text = ""
    col = surveys[col_name]
    for i in range(len(col)):
        text = text + " " + col[i]
    top_keywords = TextRank.extractKeyphrases(text, int(top_n))
    context = dict()
    context['keywords'] = top_keywords
    return render_template("keyword_textrank.html", **context)
def base_vectorize(self, index, link):
    try:
        Basesummarizes = []
        print(link)
        textrank = TextRank.TextRank(link)
        summarizes = textrank.summarize(10)
        keywords = textrank.keywords()
        for sentence in summarizes:
            Basesummarizes.append(sentence)
        # Also keep any sentence that mentions the search keyword.
        for sentence in textrank.sentences:
            for word in sentence.split(" "):
                if word in self.__keyword:
                    Basesummarizes.append(sentence)
                    break
        flag = 0
        for keyword in keywords:
            if keyword in self.__keyword:
                flag = 1
                break
        if flag == 0:
            print("The search term is not among the extracted keywords.")
            return
        self.__validation.sum_str(self.__sentenceTokenizer.get_nouns(Basesummarizes))
        self.__validation.set_dic(index, 0)
    except Exception as e:
        print(e)
        print('textrank not working')
        return
    self.printCommand(index, link, summarizes, keywords)
    self.__linkDict[index] = link
    self.__sentenceDict[index] = summarizes
    self.__keywordDict[index] = keywords
    self.__distanceDict = self.__validation.get_dic()
    self.__observer.resultToGui()
def summarymain(domain, prodID, choice, ch_token, token=4):
    summary = ""
    if choice == 1:
        rankedText = TextRank.summaryGen(prodID, domain, debugging=True)
    if choice == 2:
        if ch_token == "y" or ch_token == "yes":
            rankedText = TFIDFSummary.summaryGen(prodID, domain, gram=token, debug=True)
        else:
            rankedText = TFIDFSummary.summaryGen(prodID, domain, debug=True)
    return rankedText
    '''keys=keywords.extract_keywords(domain,prodslist[ch])
def textRank(inpath, outpath):
    string = codecs.open(inpath, 'r', 'utf-8', errors='ignore').read()
    textrank_results = TextRank.extractKeyphrases(string)
    sorted_keywords = sorted(textrank_results.items(), key=lambda x: x[1], reverse=True)
    print(sorted_keywords)
    outString = ''
    for i in range(len(sorted_keywords)):
        print(sorted_keywords[i])
        print(sorted_keywords[i][0])
        print(sorted_keywords[i][1])
        outString += sorted_keywords[i][0]
        outString += ':'
        outString += str(sorted_keywords[i][1])
        outString += '\n'
    # out_path='output/5AbstractsGroup-test1/Business/0401.txt'
    with open(outpath, 'w', encoding='utf-8') as f:
        f.write(outString)
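# Minimal usage sketch for textRank() above, assuming TextRank.extractKeyphrases
# returns a {keyphrase: score} dict; both file paths are hypothetical.
textRank('input/Business/0401.txt', 'output/0401_keywords.txt')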
def ezLaunchDoc(selection):
    """Uses the Docs API. Makes a new document with a summary from a POST request."""
    creds = None
    # The file token.pickle stores the user's access and refresh tokens, and is
    # created automatically when the authorization flow completes for the first
    # time.
    if os.path.exists(os.path.join(THIS_FOLDER, 'token.pickle')):
        with open(os.path.join(THIS_FOLDER, 'token.pickle'), 'rb') as token:
            creds = pickle.load(token)
    # If there are no (valid) credentials available, let the user log in.
    if not creds or not creds.valid:
        if creds and creds.expired and creds.refresh_token:
            creds.refresh(Request())
        else:
            flow = InstalledAppFlow.from_client_secrets_file(
                os.path.join(THIS_FOLDER, 'credentials.txt'), SCOPES)
            creds = flow.run_local_server()
        # Save the credentials for the next run.
        with open(os.path.join(THIS_FOLDER, 'token.pickle'), 'wb') as token:
            pickle.dump(creds, token)
    service = build('docs', 'v1', credentials=creds)
    # Build the new document and insert the summary text at its start.
    summs = eZ.ezRank(selection)
    body = {'title': "New Summary"}
    requests = [
        {
            'insertText': {
                'location': {
                    'index': 1,
                },
                'text': "\n" + summs
            }
        }
    ]
    document = service.documents().create(body=body).execute()
    result = service.documents().batchUpdate(
        documentId=document.get("documentId"),
        body={'requests': requests}).execute()
def run(self):
    # Read the corpus and run the summarization routine.
    doc, index = dataset.read_sogou(1)
    # debug module
    # doc, index = dataset.read_test()
    summary = tr.summary(doc, index)
    # Clear the text widgets.
    self.text_answer.delete(1.0, tk.END)
    self.text_summary.delete(1.0, tk.END)
    self.result_text.delete(1.0, tk.END)
    # Fill the widgets with the document and its summary.
    self.fill_doc(doc)
    self.fill_summary(summary)
    self.window.update()
    # Show the result.
    self.write_file_name = None
    self.write_flag = False
    return
def textrank(init_prob, strings, log):
    """
    Returns the updated priorities based on the initial priorities
    ('init_prob') and a list of strings ('strings'), using TextRank.
    'log' is the name of the file in which the log should be saved.
    """
    d = {}
    # Merge the outputs of TextRank applied to each string in 'strings'.
    for j in range(len(strings)):
        d1 = TextRank.text_rank(strings[j])
        for i in d1.keys():
            temp = i.lower()
            if temp in d:
                d[temp][j] = d[temp][j] + d1[i]
            else:
                d[temp] = np.zeros(len(strings))
                d[temp][j] = d1[i]
    source_probs = update(init_prob, dict2term_doc_matx(d), 0, log + "_update_results")
    sorted_dict = sorted(d.iteritems(),
                         key=lambda x: x[1].dot(np.array(source_probs)),
                         reverse=True)
    logFile = open(log + '_sorted_dict', 'a')
    print >>logFile, sorted_dict
    logFile.close()
    return source_probs
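# Minimal usage sketch for textrank() above, in the module's own Python 2
# style. Assumptions: TextRank.text_rank returns a {term: score} dict,
# update() and dict2term_doc_matx() are this module's own helpers, and the
# file names below are hypothetical.
docs = [open(name).read() for name in ["source_a.txt", "source_b.txt"]]
uniform_prior = [0.5, 0.5]  # one initial probability per source
probs = textrank(uniform_prior, docs, "run1_log")
print probs  # updated source probabilities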
def target_vectorize(self, targetIndex, targetLink):
    try:
        textrank = TextRank.TextRank(targetLink)
        summarizes = textrank.summarize(10)
        keywords = textrank.keywords()
        flag = 0
        for keyword in keywords:
            if keyword in self.__keyword:
                flag = 1
                break
        if flag == 0:
            print("The search term is not among the extracted keywords.")
            return
        self.__validation.target_vectorizing(self.__sentenceTokenizer.get_nouns(summarizes))
        distance = self.__validation.dist_norm()
        if math.isnan(distance):
            raise ValueError
        self.__validation.set_dic(targetIndex, distance)
    except:
        print('textrank not working')
        return
    self.printCommand(targetIndex, targetLink, summarizes, keywords, distance)
    self.__linkDict[targetIndex] = targetLink
    self.__sentenceDict[targetIndex] = summarizes
    self.__keywordDict[targetIndex] = keywords
    self.__distanceDict = self.__validation.get_dic()
    self.__observer.resultToGui()
def test_summary_gen(self):
    self.assertEqual(TextRank.summaryGen("abc.txt", "abc"),
                     "Domain not in dataset")
def summarize_with_TextRank(sentences, matrix):
    # Note: 'matrix' is unused here; TextRank ranks the sentence tokens directly.
    return TextRank.extractSentencesFromSentenceTokens(sentences)
def main(dom_choice, domain_list):
    if dom_choice > len(domain_list):
        print "Wrong choice"
        return "Wrong choice"
    domain = domain_list[dom_choice - 1]
    f = open("../datasets/Brands/" + domain.lower() + ".pickle", 'rb')
    object_file = pickle.load(f)
    c = 0
    brandslist = {}
    prodslist = {}
    for brand in object_file.keys():
        # brand.append(line.split('|')[0])
        brandslist[c + 1] = brand
        print str(c + 1) + ". " + brand + "\n"
        c += 1
    print "Enter your choice"
    ch = int(raw_input())
    # ch = ch - 1
    selectedBrand = brandslist[ch]
    print selectedBrand
    c = 0
    for prods in range(len(object_file[selectedBrand])):
        for prod in object_file[selectedBrand][prods].keys():
            prodslist[c + 1] = object_file[selectedBrand][prods][prod]
            print str(c + 1) + ". " + prod + "\n"
            c += 1
    print "Enter your choice"
    ch = int(raw_input())
    # ch = ch - 1
    print "1.Summary using Text Rank"
    print "2.Summary using TF-IDF"
    print "Enter your choice"
    choice = int(raw_input())
    summary = ""
    if choice == 1:
        print "Do you want to enable debugging (Y/N)?"
        ch_debug = raw_input().lower()
        if ch_debug == "y" or ch_debug == "yes":
            rankedText = TextRank.summaryGen(prodslist[ch], domain, debugging=True)
        else:
            rankedText = TextRank.summaryGen(prodslist[ch], domain)
    f.close()
    sleep(3)
    # rankedText = rankedText[:len(rankedText)/3]
    if choice == 2:
        print "Do you want to enable debugging (Y/N)?"
        ch_debug = raw_input().lower()
        print "Do you want to enter the token size (Y/N)?"
        ch_token = raw_input().lower()
        if ch_debug == "y" or ch_debug == "yes":
            if ch_token == "y" or ch_token == "yes":
                print "Enter token size"
                token = int(raw_input())
                rankedText = TFIDFSummary.summaryGen(prodslist[ch], domain, gram=token, debug=True)
            else:
                rankedText = TFIDFSummary.summaryGen(prodslist[ch], domain, debug=True)
        else:
            if ch_token == "y" or ch_token == "yes":
                print "Enter token size"
                token = int(raw_input())
                rankedText = TFIDFSummary.summaryGen(prodslist[ch], domain, gram=token)
            else:
                rankedText = TFIDFSummary.summaryGen(prodslist[ch], domain)
    keys = keywords.extract_keywords(domain, prodslist[ch])
    rankedSummary = ""
    for i in range(len(rankedText)):
        rankedSummary += rankedText[i]
    # Token-overlap precision/recall/F-measure against the extracted keywords.
    stopwords = load_stop_words("../stoplist.txt")
    tokenizer = RegexpTokenizer("[\w']+", flags=re.UNICODE)
    tokens = tokenizer.tokenize(rankedSummary)
    tokens = [token for token in tokens if token.lower() not in stopwords]
    precision = float(len(set(tokens).intersection(set(keys)))) / float(len(tokens))
    recall = float(len(set(tokens).intersection(set(keys)))) / float(len(keys))
    fmeasure = 2 * (precision * recall) / (precision + recall)
    print "\n\n"
    print "Precision =", precision
    print "Recall =", recall
    print "F-Measure =", fmeasure
""" author: Tongxin Wong, Jie Liu create time: 2020-07-19 update time: 2020-07-24 """ from flask import Flask, request, jsonify import web import TextRank import opinion_perception app = Flask(__name__) # 加载字典 # 加载模型 dictionary, tfidf_vectors = TextRank.load_source() # 加载url列表 url_list = TextRank.get_url_list() @app.route('/api/hot_news', methods=['POST']) def hot_news(): received_data = request.get_json() tag_values = received_data['tag_values'] data = web.get_hot_news(tag_values) return jsonify(data) @app.route('/api/news_content', methods=['POST']) def news_content(): received_data = request.get_json()
except:
    print("Input must be a natural number 0-100!")
    continue
if not (compression > 0 and compression < 100):
    print("Out of bounds, try again")
    continue
num_of_sentences = int((compression / 100) * article_dict["LENGTH"])
if num_of_sentences == 0:
    print("The desired compression rate for this article resulted in a zero sentence summary. Please try"
          " again with a higher rate of compression")
    exit()
edmundson = Edmundson(article_dict)
rhetoric = ExtractedArticle(article_dict)
textrank = TextRank(article_dict["BODY"])
# Sum the three scorers sentence-by-sentence, then keep the top-scoring
# sentences in their original document order.
master_scores = list(map(sum, zip(edmundson.get_sent_scores(custom_settings),
                                  rhetoric.get_sent_scores(custom_settings),
                                  textrank.get_sent_scores())))
preliminary_indices = sorted(range(len(master_scores)),
                             key=lambda i: master_scores[i])[-num_of_sentences:]
master_indices = sorted(preliminary_indices)
print("Display Summary: \n")
for index in master_indices:
    print(article_dict["BODY"][index])
    summary += article_dict["BODY"][index]
summary = (summary + "\n\nThis summary was generated using: " + active_pickle_file
           + "\n" + "Source shrunk from " + str(article_dict['LENGTH'])
           + ' sentences to ' + str(num_of_sentences) + " sentences"
           + " (" + str(compression) + "%)")
os.chdir(Summarypath)
nerInPyltp = loadNerDictFromPyltp('pyltp_savebox.txt')
partOfSpeechDict = loadWordsPartOfSpeech("spdict.txt")
nerDict = loadPreTrainEntityDict('lexiconAndNerDictWithInfo.txt')
# Open the training dataset.
f = codecs.open("coreEntityEmotion_train.txt", 'r', 'utf-8')
# Set up the output files.
outputname = "entityOutPut_originCut-pyltp_full_v3"
fout = codecs.open(outputname + ".txt", 'w', 'utf-8')
fout_cache = codecs.open(outputname + "_datacache.txt", 'w', 'utf-8')
# Load TextRank.
trDemo = TextRank.TextRank()
# Analysis loop: process the dataset line by line.
i = 0
for rawline in f.readlines():
    rawline_json = json.loads(rawline)
    # Get the title line.
    titleline = rawline_json['title']
    # Collect the core entities.
    entity = set()
    eec = rawline_json["coreEntityEmotions"]
    for key in eec:
        entity.add(key["entity"])
    # Segment the title into words.
    titleWords = segmentor.segment(titleline)
def runTest(self):
    print "\nRunning TextRank Summary test cases...\n"
    self.assertEqual(TextRank.summaryGen("abc.txt", "abc"),
                     "Domain not in dataset")
__author__ = 'nikhil'
import tfidf
import TextRank, utilities_tests

ltf = utilities_tests.test_corpus_probs_update(
    ["test_corpus_file_1.txt", "test_corpus_file_2.txt", "test_corpus_file_3.txt"],
    mode_of_operation=2, return_term=0)
atf = utilities_tests.test_corpus_probs_update(
    ["test_corpus_file_1.txt", "test_corpus_file_2.txt", "test_corpus_file_3.txt"],
    mode_of_operation=3, return_term=0)
text = open("test_corpus_file_1.txt").read()
tr_list = TextRank.text_rank(text)
print ltf
print atf
print tr_list
# ---- Imported TextRank module ---- #
# print(tr_list)
# for k, g in tr_list:
#     print k, g
reviewNo += 1

"""
Notes on using Gensim:
- Reviews are already pretty short. Stripping stopwords can reduce sentences
  to too small a corpus to discern (and train) the topic; that's why many
  generated topics do not make sense.
- Ideally, a corpus (a sentence or more) should contain several words.
"""
# if not TEXTRANK:
#     import gensim
#     import hw2module as LDA
#     # run preprocess(), which takes a list of words (sentence) and removes
#     # all punctuation and stopwords from each word, returning the same structure.
#     preprocessed_sentences_raw = [LDA.preprocess(s) for s in sentences]
#     # create a gensim dictionary, save it to file
#     gdict = LDA.saveInitialDictionary(preprocessed_sentences_raw)
#     # experiment with number of topics
#     LDA.make_and_show_lda_model(sentences, gdict, 15, show_docs=True)

import TextRank as tr

for asin, reviewlist in reviews.items():
    print("********* " + asin + " **********")
    for scoring in tr.score_keyphrases_by_textrank(' '.join(reviewlist), n_keywords=0.25):
        print(scoring)
    print()
def cal_textrank(window, alpha):
    # with open('停用词表.txt', 'r', encoding='utf-8') as ban:
    #     banlist = ban.read().splitlines()
    win = int(window)
    alpha = float(alpha)
    with open('./original/corpus1.txt', 'r', encoding='utf-8') as f:
        s = f.read().replace('\n', '').strip()
    tr = TextRank(s, win, alpha, 700)
    tr.cutSentence()
    tr.createNodes()
    tr.createMatrix()
    tr.calPR()
    tr.output_matrix()
    res = tr.printResult()
    textrank = ''
    for item in res:
        # if item[0].strip() in banlist:
        #     continue
        s = (str(tr.word_index[item[0]]) + ',' +
             str(item).replace('(', '').replace(')', '').replace('\'', '') + '\n')
        textrank += s
    with open('./textrank.txt', 'w', encoding='utf-8') as w:
        w.write(textrank)
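# Minimal usage sketch for cal_textrank() above; the window size and damping
# factor shown are conventional TextRank defaults, not values from the source.
cal_textrank(window=5, alpha=0.85)  # writes ranked keywords to ./textrank.txt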
    # ch = ch - 1
    print "1.Summary using Text Rank"
    print "2.Summary using TF-IDF"
    print "Enter your choice"
    choice = int(raw_input())
    summary = ""
    if choice == 1:
        print "Do you want to enable debugging (Y/N)?"
        ch_debug = raw_input().lower()
        if ch_debug == "y" or ch_debug == "yes":
            rankedText = TextRank.summaryGen(prodslist[ch], domain, debugging=True)
        else:
            rankedText = TextRank.summaryGen(prodslist[ch], domain)
    f.close()
    sleep(3)
    # rankedText = rankedText[:len(rankedText)/3]
    if choice == 2:
def main(dom_choice, domain_list):
    if dom_choice > len(domain_list):
        print("Wrong choice")
        return "Wrong choice"
    domain = domain_list[dom_choice - 1]
    object_file = load(BRANDS_PARSED_PATH + '/' + domain.lower() + ".npz", allow_pickle=True)
    object_file = object_file['arr_0'].tolist()
    print(object_file)
    c = 0
    brandslist = {}
    prodslist = {}
    for brand in object_file.keys():
        brandslist[c + 1] = brand
        print(str(c + 1) + ". " + brand + "\n")
        c += 1
    print("Enter your choice")
    ch = int(input())
    selectedBrand = brandslist[ch]
    print(selectedBrand)
    c = 0
    for prods in range(len(object_file[selectedBrand])):
        for prod in object_file[selectedBrand][prods].keys():
            prodslist[c + 1] = object_file[selectedBrand][prods][prod]
            print(str(c + 1) + ". " + prod + "\n")
            c += 1
    print("Enter your choice")
    ch = int(input())
    print("1.Summary using Text Rank")
    print("2.Summary using TF-IDF")
    print("Enter your choice")
    choice = int(input())
    summary = ""
    if choice == 1:
        print("Do you want to enable debugging (Y/N)?")
        ch_debug = input().lower()
        if ch_debug == "y" or ch_debug == "yes":
            rankedText = TextRank.summaryGen(prodslist[ch], domain, debugging=True)
        else:
            rankedText = TextRank.summaryGen(prodslist[ch], domain)
    sleep(3)
    if choice == 2:
        print("Do you want to enable debugging (Y/N)?")
        ch_debug = input().lower()
        print("Do you want to enter the token size (Y/N)?")
        ch_token = input().lower()
        if ch_debug == "y" or ch_debug == "yes":
            if ch_token == "y" or ch_token == "yes":
                print("Enter token size")
                token = int(input())
                rankedText = TFIDFSummary.summaryGen(prodslist[ch], domain, gram=token, debug=True)
            else:
                rankedText = TFIDFSummary.summaryGen(prodslist[ch], domain, debug=True)
        else:
            if ch_token == "y" or ch_token == "yes":
                print("Enter token size")
                token = int(input())
                rankedText = TFIDFSummary.summaryGen(prodslist[ch], domain, gram=token)
            else:
                rankedText = TFIDFSummary.summaryGen(prodslist[ch], domain)
    keys = keywords.extract_keywords(domain, prodslist[ch])
    rankedSummary = ""
    for i in range(len(rankedText)):
        rankedSummary += rankedText[i]
    # Token-overlap precision/recall/F-measure against the extracted keywords.
    stopwords = load_stop_words("../stoplist.txt")
    tokenizer = RegexpTokenizer("[\w']+", flags=re.UNICODE)
    tokens = tokenizer.tokenize(rankedSummary)
    tokens = [token for token in tokens if token.lower() not in stopwords]
    precision = float(len(set(tokens).intersection(set(keys)))) / float(len(tokens))
    recall = float(len(set(tokens).intersection(set(keys)))) / float(len(keys))
    fmeasure = 2 * (precision * recall) / (precision + recall)
    print("\n\n")
    print("Precision =", precision)
    print("Recall =", recall)
    print("F-Measure =", fmeasure)
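# Standalone sketch of the evaluation step above: token-overlap precision,
# recall, and F-measure between a generated summary and a set of reference
# keywords. The token and keyword values are made-up illustration data.
def overlap_f_measure(summary_tokens, reference_keys):
    overlap = len(set(summary_tokens) & set(reference_keys))
    precision = overlap / len(summary_tokens)
    recall = overlap / len(reference_keys)
    if precision + recall == 0:
        return precision, recall, 0.0
    return precision, recall, 2 * precision * recall / (precision + recall)

p, r, f = overlap_f_measure(["battery", "screen", "camera", "price"],
                            ["battery", "camera", "weight"])
print("Precision =", p, "Recall =", r, "F-Measure =", f)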
    if not sentence_raw:
        break
    category_raw = inf.readline()[:-1]
    if category_raw == "\n":
        category_raw = None  # indicates that no category was assigned
    else:
        categorized_sentences[category_raw] += " " + sentence_raw
    sentences_tagged.append((sentence_raw, category_raw))

# List of all sentences that have assigned categories. These sentences are
# converted into lists, split by whitespace.
split_sentences_raw = [x[0].split() for x in sentences_tagged if x[1] is not None]

if not TEXTRANK:
    # run preprocess(), which takes a list of words (sentence) and removes
    # all punctuation and stopwords from each word, returning the same structure.
    preprocessed_sentences_raw = [LDA.preprocess(s) for s in split_sentences_raw]
    # create a gensim dictionary, save it to file
    gdict = LDA.saveInitialDictionary(preprocessed_sentences_raw)
    # experiment with number of topics
    LDA.make_and_show_lda_model(preprocessed_sentences_raw, gdict, 20, show_docs=True)
else:
    import TextRank as tr
    for category, conjoined_sentences in categorized_sentences.items():
        print("********* " + category + " *********")
        for scoring in tr.score_keyphrases_by_textrank(conjoined_sentences):
            print(scoring)
        print()