Example #1
    def base_vectorize(self, index, link):
        try:
            base_summarizes = []
            print(link)
            textrank = TextRank.TextRank(link)

            # Top-10 summary sentences and the extracted keywords.
            summarizes = textrank.summarize(10)
            keywords = textrank.keywords()

            base_summarizes.extend(summarizes)

            # Also keep every sentence that contains the search term.
            for sentence in textrank.sentences:
                for word in sentence.split(" "):
                    if word in self.__keyword:
                        base_summarizes.append(sentence)
                        break

            # Skip this document if no extracted keyword matches the search term.
            if not any(keyword in self.__keyword for keyword in keywords):
                print("The search term is not among the keywords.")
                return

            # The base document itself is registered with distance 0.
            self.__validation.sum_str(
                self.__sentenceTokenizer.get_nouns(base_summarizes))
            self.__validation.set_dic(index, 0)
        except Exception as e:
            print(e)
            print('textrank not working')
            return

        self.printCommand(index, link, summarizes, keywords)

        # Cache the results and refresh the distance table for the GUI.
        self.__linkDict[index] = link
        self.__sentenceDict[index] = summarizes
        self.__keywordDict[index] = keywords

        self.__distanceDict = self.__validation.get_dic()

        self.__observer.resultToGui()
Example #2
def cal_textrank(window, alpha):
    # Optional stop-word filtering, disabled in the original
    # ('停用词表.txt' is the stop-word list file):
    # with open('停用词表.txt', 'r', encoding='utf-8') as ban:
    #     banlist = ban.read().splitlines()
    win = int(window)
    alpha = float(alpha)
    with open('./original/corpus1.txt', 'r', encoding='utf-8') as f:
        s = f.read().replace('\n', '').strip()
        # Build the TextRank graph over the corpus and run PageRank.
        tr = TextRank(s, win, alpha, 700)
        tr.cutSentence()
        tr.createNodes()
        tr.createMatrix()
        tr.calPR()
        tr.output_matrix()
        res = tr.printResult()
    # Emit one "index,word,score" line per ranked word.
    textrank = ''
    for item in res:
        # if item[0].strip() in banlist:
        #     continue
        # Each item is a (word, score) pair.
        textrank += '{},{},{}\n'.format(tr.word_index[item[0]], item[0], item[1])
    with open('./textrank.txt', 'w', encoding='utf-8') as w:
        w.write(textrank)
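A rough usage sketch for the function above; the corpus path is the one hard-coded in the example, and the window size and damping factor are just illustrative values:

# Window of 5, damping factor 0.85; reads ./original/corpus1.txt and
# writes "index,word,score" lines to ./textrank.txt.
cal_textrank('5', '0.85')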
Example #3
    def target_vectorize(self, targetIndex, targetLink):
        try:
            textrank = TextRank.TextRank(targetLink)
            summarizes = textrank.summarize(10)
            keywords = textrank.keywords()

            # Skip this document if no extracted keyword matches the search term.
            if not any(keyword in self.__keyword for keyword in keywords):
                print("The search term is not among the keywords.")
                return

            self.__validation.target_vectorizing(
                self.__sentenceTokenizer.get_nouns(summarizes))

            # Distance between this document and the base document.
            distance = self.__validation.dist_norm()

            if math.isnan(distance):
                raise ValueError

            self.__validation.set_dic(targetIndex, distance)
        except Exception:
            print('textrank not working')
            return

        self.printCommand(targetIndex, targetLink, summarizes, keywords,
                          distance)

        # Cache the results and refresh the distance table for the GUI.
        self.__linkDict[targetIndex] = targetLink
        self.__sentenceDict[targetIndex] = summarizes
        self.__keywordDict[targetIndex] = keywords
        self.__distanceDict = self.__validation.get_dic()

        self.__observer.resultToGui()
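Examples #1 and #3 are two halves of one comparison pipeline: base_vectorize registers the reference document with distance 0, and target_vectorize scores each further document by its dist_norm distance from that base. A minimal calling sketch, assuming a hypothetical `vectorizer` instance of the (unnamed) class these methods belong to and an illustrative `target_links` list, neither of which appears in the excerpts:

# `vectorizer` and `target_links` are assumed names for illustration only.
vectorizer.base_vectorize(0, "https://example.com/base-article")
for i, link in enumerate(target_links, start=1):
    vectorizer.target_vectorize(i, link)
# The internal distance dict now maps each index to its distance
# from the base document (index 0 maps to 0).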
Example #4
import codecs
import json

# Project-local dictionary loaders (defined elsewhere in the original file).
nerInPyltp = loadNerDictFromPyltp('pyltp_savebox.txt')

partOfSpeechDict = loadWordsPartOfSpeech("spdict.txt")
nerDict = loadPreTrainEntityDict('lexiconAndNerDictWithInfo.txt')

# Open the training dataset.
f = codecs.open("coreEntityEmotion_train.txt", 'r', 'utf-8')

# Set up the output files.
outputname = "entityOutPut_originCut-pyltp_full_v3"
fout = codecs.open(outputname + ".txt", 'w', 'utf-8')
fout_cache = codecs.open(outputname + "_datacache.txt", 'w', 'utf-8')

# Load TextRank.
trDemo = TextRank.TextRank()

# Analysis loop: one JSON record per line.
i = 0
for rawline in f.readlines():
    rawline_json = json.loads(rawline)
    # Get the title.
    titleline = rawline_json['title']
    # Collect the annotated core entities.
    entity = set()
    eec = rawline_json["coreEntityEmotions"]
    for key in eec:
        entity.add(key["entity"])
    # Segment the title into words (`segmentor` is a pyltp word
    # segmenter set up earlier in the original file).
    titleWords = segmentor.segment(titleline)
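From the fields this loop reads, each line of coreEntityEmotion_train.txt must be a JSON object shaped roughly like this; the values and the "emotion" field are illustrative, since the excerpt only accesses "title" and the "entity" keys:

# Illustrative shape of one training line (values are made up).
sample_line = {
    "title": "Some news headline",
    "coreEntityEmotions": [
        {"entity": "EntityA", "emotion": "POS"},   # "emotion" is assumed
        {"entity": "EntityB", "emotion": "NORM"},
    ],
}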
Example #5
# NOTE: the beginning of this example -- the loop that prompts for a
# compression rate -- was lost in extraction. The `while` header, the
# prompt line, and the trailing continue/break are reconstructed to make
# the excerpt runnable; the prompt text is assumed.
while True:
    try:
        compression = int(input("Enter a compression rate (0-100): "))
    except ValueError:
        print("Input must be a natural number 0-100!")
        continue

    if not (compression > 0 and compression < 100):
        print("Out of bounds, try again")
        continue
    break

num_of_sentences = int((compression/100) * article_dict["LENGTH"])
if num_of_sentences == 0:
    print("The desired compression rate for this article resulted in a zero sentence summary. Please try"
          " again with a higher rate of compression")
    exit()

edmundson = Edmundson(article_dict)
rhetoric = ExtractedArticle(article_dict)
textrank = TextRank(article_dict["BODY"])

# Sum the per-sentence scores from the three scorers into one master score.
master_scores = list(map(sum, zip(edmundson.get_sent_scores(custom_settings),
                                  rhetoric.get_sent_scores(custom_settings),
                                  textrank.get_sent_scores())))
# Take the indices of the top-scoring sentences, then restore document order.
preliminary_indices = sorted(range(len(master_scores)),
                             key=lambda i: master_scores[i])[-num_of_sentences:]
master_indices = sorted(preliminary_indices)


print("Display Summary: \n")

summary = ""  # assumed: the excerpt uses `summary` without showing its initialization
for index in master_indices:
    print(article_dict["BODY"][index])
    summary += article_dict["BODY"][index]

summary += ("\n\nThis summary was generated using: " + active_pickle_file
            + "\nSource shrunk from " + str(article_dict['LENGTH'])
            + " sentences to " + str(num_of_sentences)
            + " sentences (" + str(compression) + "%)")

os.chdir(Summarypath)
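The index-selection step in Example #5 is a pattern worth isolating: sort sentence indices by combined score, keep the k best, then re-sort those indices by position so the summary reads in document order. A self-contained sketch with made-up scores:

scores = [0.12, 0.91, 0.33, 0.78, 0.05]  # combined score per sentence (made up)
k = 2
# Indices sorted ascending by score; the last k are the top scorers.
top_by_score = sorted(range(len(scores)), key=lambda i: scores[i])[-k:]   # [3, 1]
# Re-sorting restores original sentence order for a readable summary.
top_in_order = sorted(top_by_score)                                       # [1, 3]
print(top_in_order)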