def base_vectorize(self, index, link):
    try:
        Basesummarizes = []
        print(link)

        # Summarize the page behind `link` and extract its keywords.
        textrank = TextRank.TextRank(link)
        summarizes = textrank.summarize(10)
        keywords = textrank.keywords()

        for sentence in summarizes:
            Basesummarizes.append(sentence)

        # Also keep any sentence whose tokens match the search keyword.
        for sentence in textrank.sentences:
            for word in sentence.split(" "):
                if word in self.__keyword:
                    Basesummarizes.append(sentence)
                    break

        # Abort when the search keyword is absent from the extracted keywords.
        flag = 0
        for keyword in keywords:
            if keyword in self.__keyword:
                flag = 1
                break
        if flag == 0:
            print("The search keyword is not among the extracted keywords.")
            return

        # Build the base document vector; the base's distance to itself is 0.
        self.__validation.sum_str(
            self.__sentenceTokenizer.get_nouns(Basesummarizes))
        self.__validation.set_dic(index, 0)
    except Exception as e:
        print(e)
        print('textrank not working')
        return

    self.printCommand(index, link, summarizes, keywords)
    self.__linkDict[index] = link
    self.__sentenceDict[index] = summarizes
    self.__keywordDict[index] = keywords
    self.__distanceDict = self.__validation.get_dic()
    self.__observer.resultToGui()
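# The nested scan above keeps any sentence containing a whitespace-split token
# that appears in the search term. A standalone sketch of that same selection,
# assuming self.__keyword is the search string (names here are illustrative):

def select_matching_sentences(sentences, keyword):
    """Keep sentences containing a token that appears in the search string."""
    return [s for s in sentences if any(w in keyword for w in s.split(" "))]

# select_matching_sentences(["textrank ranks sentences", "unrelated line"], "textrank")
# -> ['textrank ranks sentences']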
def cal_textrank(window, alpha):
    # Optionally filter against a stopword list (停用词表.txt):
    # with open('停用词表.txt', 'r', encoding='utf-8') as ban:
    #     banlist = ban.read().splitlines()
    win = int(window)
    alpha = float(alpha)

    # Read the corpus as one continuous string.
    with open('./original/corpus1.txt', 'r', encoding='utf-8') as f:
        s = f.read().replace('\n', '').strip()

    # Build the co-occurrence graph and run PageRank.
    tr = TextRank(s, win, alpha, 700)
    tr.cutSentence()
    tr.createNodes()
    tr.createMatrix()
    tr.calPR()
    tr.output_matrix()
    res = tr.printResult()

    # Write one "index,word,score" row per ranked word.
    textrank = ''
    for item in res:
        # if item[0].strip() in banlist:
        #     continue
        s = (str(tr.word_index[item[0]]) + ',' +
             str(item).replace('(', '').replace(')', '').replace('\'', '') + '\n')
        textrank += s
    with open('./textrank.txt', 'w', encoding='utf-8') as w:
        w.write(textrank)
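# A usage sketch for cal_textrank: it takes its parameters as strings and casts
# them internally, so GUI or CLI input can be passed through unchanged. Assumes
# the corpus file exists at the hard-coded ./original/corpus1.txt path:

cal_textrank('5', '0.85')  # co-occurrence window of 5 words, damping factor 0.85
with open('./textrank.txt', 'r', encoding='utf-8') as f:
    print(f.read().splitlines()[:10])  # top-ranked rows: "index,word, score"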
def target_vectorize(self, targetIndex, targetLink):
    try:
        # Summarize the target page and extract its keywords.
        textrank = TextRank.TextRank(targetLink)
        summarizes = textrank.summarize(10)
        keywords = textrank.keywords()

        # Abort when the search keyword is absent from the extracted keywords.
        flag = 0
        for keyword in keywords:
            if keyword in self.__keyword:
                flag = 1
                break
        if flag == 0:
            print("The search keyword is not among the extracted keywords.")
            return

        # Vectorize the target and measure its distance from the base document.
        self.__validation.target_vectorizing(
            self.__sentenceTokenizer.get_nouns(summarizes))
        distance = self.__validation.dist_norm()
        if math.isnan(distance):  # requires `import math` at module level
            raise ValueError
        self.__validation.set_dic(targetIndex, distance)
    except Exception as e:
        print(e)
        print('textrank not working')
        return

    self.printCommand(targetIndex, targetLink, summarizes, keywords, distance)
    self.__linkDict[targetIndex] = targetLink
    self.__sentenceDict[targetIndex] = summarizes
    self.__keywordDict[targetIndex] = keywords
    self.__distanceDict = self.__validation.get_dic()
    self.__observer.resultToGui()
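# The NaN guard above matters because cosine-style distances are undefined when
# either vector has zero norm. A minimal sketch of one plausible dist_norm,
# assuming it compares base and target noun-frequency vectors (this
# implementation is an assumption, not the actual Validation code):

import math

def dist_norm(base_vec, target_vec):
    """Cosine distance between two equal-length frequency vectors.

    Returns NaN for a zero-norm vector, which the caller above
    converts into a ValueError.
    """
    dot = sum(a * b for a, b in zip(base_vec, target_vec))
    norms = math.sqrt(sum(a * a for a in base_vec)) * math.sqrt(sum(b * b for b in target_vec))
    return float('nan') if norms == 0 else 1 - dot / norms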
nerInPyltp = loadNerDictFromPyltp('pyltp_savebox.txt')
partOfSpeechDict = loadWordsPartOfSpeech("spdict.txt")
nerDict = loadPreTrainEntityDict('lexiconAndNerDictWithInfo.txt')

# Open the training dataset
f = codecs.open("coreEntityEmotion_train.txt", 'r', 'utf-8')

# Set up the output files
outputname = "entityOutPut_originCut-pyltp_full_v3"
fout = codecs.open(outputname + ".txt", 'w', 'utf-8')
fout_cache = codecs.open(outputname + "_datacache.txt", 'w', 'utf-8')

# Load TextRank
trDemo = TextRank.TextRank()

# Analysis loop
i = 0
for rawline in f.readlines():
    # Parse one JSON record per line
    rawline_json = json.loads(rawline)
    # Get the title
    titleline = rawline_json['title']
    # Collect the labeled core entities
    entity = set()
    eec = rawline_json["coreEntityEmotions"]
    for key in eec:
        entity.add(key["entity"])
    # Segment the title into words
    titleWords = segmentor.segment(titleline)
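# The loop assumes one JSON object per line with at least the two fields it
# reads, 'title' and 'coreEntityEmotions'. An illustrative record (field
# values hypothetical) and the set-comprehension form of the entity collection:

import json

sample = '{"title": "...", "coreEntityEmotions": [{"entity": "ExampleCorp", "emotion": "POS"}]}'
record = json.loads(sample)
entities = {e["entity"] for e in record["coreEntityEmotions"]}  # equivalent to the entity.add loop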
except:  # guards the int() conversion of the user's compression input
    print("Input must be a natural number 0-100!")
    continue
if not (compression > 0 and compression < 100):
    print("Out of bounds, try again")
    continue  # re-prompt instead of proceeding with an invalid rate

# Convert the compression rate into a sentence budget.
num_of_sentences = int((compression / 100) * article_dict["LENGTH"])
if num_of_sentences == 0:
    print("The desired compression rate for this article resulted in a zero sentence summary. Please try"
          " again with a higher rate of compression")
    exit()

# Score every sentence with all three extractors and sum the scores.
edmundson = Edmundson(article_dict)
rhetoric = ExtractedArticle(article_dict)
textrank = TextRank(article_dict["BODY"])
master_scores = list(map(sum, zip(edmundson.get_sent_scores(custom_settings),
                                  rhetoric.get_sent_scores(custom_settings),
                                  textrank.get_sent_scores())))

# Take the top-scoring sentences, then restore document order.
preliminary_indices = sorted(range(len(master_scores)), key=lambda i: master_scores[i])[-num_of_sentences:]
master_indices = sorted(preliminary_indices)

print("Display Summary: \n")
for index in master_indices:
    print(article_dict["BODY"][index])
    summary += article_dict["BODY"][index]
summary = (summary + "\n\nThis summary was generated using: " + active_pickle_file + "\n"
           + "Source shrunk from " + str(article_dict['LENGTH']) + ' sentences to '
           + str(num_of_sentences) + " sentences (" + str(compression) + "%)")
os.chdir(Summarypath)
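# Note on the combination step: summing the three scorers' outputs assumes they
# are on comparable scales. If they are not, one option (not in the original
# code) is to min-max normalize each score list before zipping:

def min_max(scores):
    """Scale a score list into [0, 1]; a constant list maps to all zeros."""
    lo, hi = min(scores), max(scores)
    return [0.0] * len(scores) if hi == lo else [(s - lo) / (hi - lo) for s in scores]

# Hypothetical drop-in:
# master_scores = list(map(sum, zip(min_max(edmundson.get_sent_scores(custom_settings)),
#                                   min_max(rhetoric.get_sent_scores(custom_settings)),
#                                   min_max(textrank.get_sent_scores()))))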