def main():
    """Print an extractive summary of ``data/01.txt``.

    Prompts for a compression ratio, ranks the document's sentences with
    ``compute_sentences_weigths`` and prints the best-ranked ones joined
    together in the order they appear in the document.
    """
    # Ask for the compression ratio (fraction of ranked sentences to keep).
    ratio = raw_input("Please enter the compressed ratio: ")

    title, paragraghs = load_data("data/01.txt")

    # Record the position of every sentence in the original text.
    ordered_sentences = []
    for block in paragraghs:
        ordered_sentences += sentence_cut(block, punctuation_list='!!。')
    # A repeated sentence keeps the index of its last occurrence.
    position_of = {text: idx for idx, text in enumerate(ordered_sentences)}

    # Extract keywords, then rank sentences by their computed weight.
    keywords = get_key_words(title, paragraghs)
    ranked = compute_sentences_weigths(keywords, paragraghs)

    # Turn the compression ratio into a number of sentences to extract.
    keep = int(len(ranked) * float(ratio))
    chosen = {}
    for text in ranked[:keep]:
        chosen[text] = position_of[text]

    # Emit the chosen sentences in their original document order.
    in_document_order = sorted(chosen, key=chosen.get)
    summary = ''.join(in_document_order)
    print(summary)
def get_key_sentences(content_weights, content, p_weight=1.2, s_bias=1, s_weight=1.2):
    """Register every sentence of ``content`` in ``content_weights``.

    ``content`` is split with ``sentence_cut`` and each sentence becomes a key
    of ``content_weights`` whose value is a dict with three entries:
    ``'weight'`` (initialised to 0), ``'p_weight'`` and ``'s_weight'``.

    Args:
        content_weights: dict updated in place and also returned.
        content: text handed to ``sentence_cut``.
        p_weight: value stored as ``'p_weight'`` for every sentence.
        s_bias: how many sentences at each end of ``content`` receive
            ``s_weight``; the sentences in between receive an ``'s_weight'``
            of 1.
        s_weight: ``'s_weight'`` given to the leading/trailing sentences.

    Returns:
        The same ``content_weights`` dict.
    """
    sentences = sentence_cut(content, punctuation_list='!!。')
    total = len(sentences)
    # Clamp the edge width so empty or very short content no longer raises
    # IndexError when s_bias exceeds the number of sentences.
    edge = max(0, min(s_bias, total))

    for i in range(edge):
        content_weights[sentences[i]] = {'weight': 0, 'p_weight': p_weight, 's_weight': s_weight}
        content_weights[sentences[-i-1]] = {'weight': 0, 'p_weight': p_weight, 's_weight': s_weight}

    # Use an explicit end index: ``sentences[s_bias:-s_bias]`` is an empty
    # slice when s_bias == 0, which silently dropped every sentence.
    # Middle sentences are written last, so a middle sentence whose text
    # equals an edge sentence ends up with the neutral s_weight.
    for sentence in sentences[edge:total - edge]:
        content_weights[sentence] = {'weight': 0, 'p_weight': p_weight, 's_weight': 1}

    return content_weights
def compute_sentences_weigths(keywords, paragraphs, p_bias=1, p_weight=1.2, s_bias=1, s_weight=1.2):
    """Rank all sentences of ``paragraphs`` by keyword-based weight.

    Each sentence's score is the sum of ``keywords[word]`` over every keyword
    that occurs in it, multiplied by the sentence's paragraph and position
    weights, and divided by the number of clauses ``sentence_cut`` finds in
    the sentence.

    Args:
        keywords: mapping of keyword -> numeric weight.
        paragraphs: sequence of paragraph texts.
        p_bias: how many paragraphs at each end receive ``p_weight``; the
            paragraphs in between are registered with a ``p_weight`` of 1.
        p_weight: paragraph weight for the leading/trailing paragraphs.
        s_bias: forwarded to ``get_key_sentences``.
        s_weight: forwarded to ``get_key_sentences``.

    Returns:
        List of sentences ordered from highest to lowest score.
    """
    content_weights = {}
    total = len(paragraphs)
    # Clamp so fewer paragraphs than p_bias (or none at all) cannot raise
    # IndexError.
    edge = max(0, min(p_bias, total))

    for i in range(edge):
        content_weights = get_key_sentences(content_weights, paragraphs[i], p_weight=p_weight,
                                            s_bias=s_bias, s_weight=s_weight)
        content_weights = get_key_sentences(content_weights, paragraphs[-i-1], p_weight=p_weight,
                                            s_bias=s_bias, s_weight=s_weight)

    # Explicit end index: ``paragraphs[p_bias:-p_bias]`` is empty when
    # p_bias == 0, which skipped every paragraph.
    for paragraph in paragraphs[edge:total - edge]:
        content_weights = get_key_sentences(content_weights, paragraph, p_weight=1,
                                            s_bias=s_bias, s_weight=s_weight)

    # Snapshot the keys; values are replaced in place below.
    for sentence in list(content_weights):
        info = content_weights[sentence]
        info['weight'] += sum(value for word, value in keywords.items() if word in sentence)

        inner_num = len(sentence_cut(sentence, punctuation_list=',;,::;… '))
        # float(): avoid Python 2 integer truncation when every operand is an
        # int.  max(..., 1): avoid ZeroDivisionError should the splitter
        # return no clauses for a sentence.
        divisor = float(max(inner_num, 1))
        content_weights[sentence] = info['weight'] * info['p_weight'] * info['s_weight'] / divisor

    # Highest score first; ties keep the dict's iteration order (stable sort).
    return sorted(content_weights, key=content_weights.get, reverse=True)
def main():
    """Summarise ``data/01.txt`` using the ``textrank`` helper and print it.

    The GBK-encoded file is read line by line, non-blank lines are split into
    sentences, the sentences are scored via ``get_keywords`` /
    ``construct_matrix`` / ``textrank``, and after the user enters a
    compression ratio the result of ``summaly`` is printed.
    """
    sentences = []
    with codecs.open("data/01.txt", 'r', encoding='GBK') as fr:
        for raw_line in fr.readlines():
            stripped = raw_line.strip()
            if not stripped:
                # Blank lines contribute no sentences.
                continue
            sentences += sentence_cut(stripped, punctuation_list='。!!')

    words, text = get_keywords(sentences, ['ns', 'nr', 'n'])
    weights = construct_matrix(words)

    # Scoring parameters passed to textrank().
    damping = 0.85
    max_iters = 100
    sentence_count = len(text)
    initial_scores = np.ones((1, sentence_count))
    scores = textrank(initial_scores, max_iters, damping, weights).tolist()[0]

    # The ratio is requested only after scoring has finished.
    ratio = float(raw_input("Please enter the compressed ratio: "))
    keep = int(sentence_count * ratio)

    summary = summaly(text, scores, keep)
    print(summary)
def main():
    """Entry point: print a summary of ``data/01.txt`` sized by a user ratio.

    Steps: read the GBK-encoded file, cut each non-empty line into sentences,
    obtain per-sentence scores from ``textrank``, prompt for the compression
    ratio, and print whatever ``summaly`` returns for the top-scoring share.
    """
    with codecs.open("data/01.txt", 'r', encoding='GBK') as fr:
        cleaned_lines = [row.strip() for row in fr.readlines()]

    # Flatten the per-line sentence lists, skipping empty lines.
    sentences = [
        piece
        for row in cleaned_lines if row
        for piece in sentence_cut(row, punctuation_list='。!!')
    ]

    words, text = get_keywords(sentences, ['ns', 'nr', 'n'])
    weights = construct_matrix(words)
    num = len(text)

    # One starting score of 1.0 per entry of ``text``; textrank() is called
    # with iters=100 and d=0.85.
    ranks = textrank(np.ones((1, num)), 100, 0.85, weights).tolist()[0]

    ratio = float(raw_input("Please enter the compressed ratio: "))
    summary = summaly(text, ranks, int(num * ratio))
    print(summary)