def lda_test(x, n_topics, n_top_sent, sent_length, num_best, vocab, sentences):
    # Fit an LDA topic model on the document-term matrix x
    model = lda.LDA(n_topics=n_topics, n_iter=800, random_state=1)
    model.fit_transform(x)
    topic_word = model.topic_word_
    n_top_words = 5
    final1 = ''
    final_out = ''

    # Print the top words of each topic
    for i, topic_dist in enumerate(topic_word):
        topic_words = np.array(vocab)[np.argsort(topic_dist)][:-n_top_words:-1]
        print('Topic {}: {}'.format(i, ' '.join(topic_words)))

    # For each topic, collect the sentences that are most representative of it
    s = []
    for i, doc_dist in enumerate(model.doc_topic_.transpose()):
        s1 = []
        topic_words = np.array(sentences)[np.argsort(doc_dist)][:-n_top_words:-1]
        print('{}: {}'.format(i, ' '.join(topic_words)))
        # ff = f1_score(x, topic_words)
        # print(ff)
        t = textblob.TextBlob(' '.join(topic_words))
        for sent in t.sentences:
            t = parse_sentence(sent)
            s1.append(t)
        s.append(s1)

    # Compress each cluster of sentences with takahe
    for s1 in s:
        print(s1)
        print("------")
        compresser = takahe.word_graph(s1, nb_words=sent_length, lang='en',
                                       punct_tag="PUNCT")

        # Get the num_best best paths
        candidates = compresser.get_compression(num_best)

        # 1. Rerank compressions by path length (Filippova's method)
        maxim1 = 0
        best_cand1 = ''
        for cummulative_score, path in candidates:
            if cummulative_score > maxim1:
                maxim1 = cummulative_score
                best_cand1 = ' '.join([u[0] for u in path])
            # Normalize path score by path length
            normalized_score = cummulative_score / len(path)
        print('Best: ')
        print(best_cand1)
        final1 += best_cand1

        # Write the word graph in the dot format
        compresser.write_dot('test.dot')

        # 2. Rerank compressions by keyphrases (Boudin and Morin's method)
        reranker = takahe.keyphrase_reranker(s1, candidates, lang='en')
        reranked_candidates = reranker.rerank_nbest_compressions()

        # Loop over the reranked candidates and keep the best one
        maxim = 0
        best_cand = ''
        for score, path in reranked_candidates:
            if score > maxim:
                maxim = score
                best_cand = ' '.join([u[0] for u in path])
        print('Best: ')
        print(best_cand)
        final_out += best_cand

    print(final1)
    print(final_out)
    return final1
def keyphrases_based_msc(sentences, output_sent_num=50):
    """
    Multi-sentence compression reranked by keyphrases
    :param sentences: the set of POS-tagged input sentences to compress
    :param output_sent_num: number of output sentences, 50 by default
    :return: a list of 'score#sentence' strings
    """
    # Build the word graph and run the compression
    # (minimal number of words in a compression: 8)
    compresser = takahe.word_graph(sentences, nb_words=8, lang='en',
                                   punct_tag="PUNCT")

    # Get the compression candidates
    candidates = compresser.get_compression(output_sent_num)

    # Rescore the candidates with the keyphrase-based reranker
    reranker = takahe.keyphrase_reranker(sentences, candidates, lang='en')
    reranked_candidates = reranker.rerank_nbest_compressions()

    results = []
    for score, path in reranked_candidates:
        results.append(str(round(score, 6)) + "#" + ' '.join([u[0] for u in path]) + '\n')

    return results
def protogenesis_msc(sentences, output_sent_num=50):
    """
    Plain multi-sentence compression (no keyphrase reranking)
    :param sentences: the set of POS-tagged input sentences to compress
    :param output_sent_num: number of output sentences, 50 by default
    :return: a list of 'score#sentence' strings
    """
    # Build the word graph and run the compression
    # (minimal number of words in a compression: 8)
    compresser = takahe.word_graph(sentences, nb_words=8, lang='en',
                                   punct_tag="PUNCT")

    # Get the compression candidates
    candidates = compresser.get_compression(output_sent_num)

    # Normalize each candidate's score by its path length
    tmp = []
    for score, path in candidates:
        tmp.append((score / len(path), path))

    # Sort by normalized score
    tmp = sorted(tmp, key=lambda t: t[0])

    # Package the results as 'score#sentence' strings
    results = []
    for score, path in tmp:
        results.append(str(round(score, 6)) + "#" + ' '.join([u[0] for u in path]) + '\n')

    return results
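# A minimal usage sketch for the two helpers above (not part of the original
# source): it assumes takahe is importable and reuses two of the POS-tagged
# sentences from the Hillary Clinton example further down. The output depends
# on the word graph built from this small illustrative cluster.
tagged_sentences = [
    "Hillary/NNP Clinton/NNP paid/VBD a/DT visit/NN to/TO the/DT People/NNP "
    "Republic/NNP of/IN China/NNP on/IN Monday/NNP ./PUNCT",
    "Last/JJ week/NN the/DT Secretary/NNP of/IN State/NNP Ms./NNP Clinton/NNP "
    "visited/VBD Chinese/JJ officials/NNS ./PUNCT",
]

# Plain compression, scores normalized by path length
for line in protogenesis_msc(tagged_sentences, output_sent_num=10):
    print(line.strip())

# Compression reranked by keyphrases
for line in keyphrases_based_msc(tagged_sentences, output_sent_num=10):
    print(line.strip())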
def _dofuse(cluster):
    """
    Extracts the call to takahe to interrupt it if it's taking too long.
    """
    fuser = takahe.word_graph(cluster, nb_words=6, lang="en",
                              punct_tag="PUNCT")
    # get fusions
    fusions = fuser.get_compression(50)
    # rerank and keep top 10
    reranker = takahe.keyphrase_reranker(cluster, fusions, lang="en")
    rerankedfusions = reranker.rerank_nbest_compressions()[0:10]
    return rerankedfusions
def _dofuse(sentenceL):
    """
    Extracts the call to takahe to interrupt it if it's taking too long.
    """
    fuser = takahe.word_graph(sentenceL, nb_words=6, lang="en",
                              punct_tag="PUNCT")
    # get fusions
    fusions = fuser.get_compression(50)
    # rerank and keep top 5
    reranker = takahe.keyphrase_reranker(sentenceL, fusions, lang="en")
    rerankedfusions = reranker.rerank_nbest_compressions()[0:5]
    return rerankedfusions
def get_compressed_sen(sentences, nb_words):
    compresser = takahe.word_graph(sentences, nb_words=nb_words, lang='en',
                                   punct_tag=".")
    candidates = compresser.get_compression(3)
    # print("-------------------- Top 3 candidates ---------------", candidates)
    reranker = takahe.keyphrase_reranker(sentences, candidates, lang='en')
    # print("reranker: ", reranker)
    # print("finished initialising reranker ------------")
    reranked_candidates = reranker.rerank_nbest_compressions()
    # print(reranked_candidates)
    if len(reranked_candidates) > 0:
        score, path = reranked_candidates[0]
        result = ' '.join([u[0] for u in path])
    else:
        result = ' '
    # print("---------------- selected candidate as final output -------------- ", result)
    return result
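# A minimal usage sketch for get_compressed_sen (not part of the original
# source): the two tagged sentences and the 6-word minimum are only
# illustrative, and the punctuation tag "." matches the punct_tag="." used
# inside the function.
tagged_sentences = [
    "Hillary/NNP Clinton/NNP visited/VBD China/NNP last/JJ Monday/NNP ./.",
    "Hillary/NNP Clinton/NNP paid/VBD a/DT visit/NN to/TO China/NNP on/IN Monday/NNP ./.",
]
print(get_compressed_sen(tagged_sentences, nb_words=6))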
def worker(system_name, param):
    param_id = param['index']
    remove_stopwords = param['remove_stopwords']
    pos_filtering = param['pos_filtering']
    stemming = param['stemming']
    cr_w = param['cr_w']
    cr_weighted = param['cr_weighted']
    cr_overspanning = param['cr_overspanning']
    nb_words = param['nb_words']
    diversity_n_clusters = param['diversity_n_clusters']

    # ##########################
    # ### LOOP OVER MEETINGS ###
    # ##########################
    for meeting_id in ids:
        # print "\t\tmeeting_id:", meeting_id

        # #############################
        # ### IDFS (meeting level)  ###
        # #############################
        # consider community as DOCUMENT, meeting as CORPUS
        # idf is based on lower_case form
        tokenized_document_list = []
        for tagged_community in tagged_corpus[meeting_id]:
            tagged_document = ' '.join(tagged_community)
            cleaned_tagged_document = utils.clean_tagged_text(
                tagged_document, stopwords,
                remove_stopwords=remove_stopwords,
                pos_filtering=pos_filtering,
                stemming=stemming,
                lower_case=True,
                pos_separator=pos_separator,
                punct_tag=punct_tag)
            cleaned_document = utils.remove_tags_from_text(
                cleaned_tagged_document)
            tokenized_document_list.append(cleaned_document.split(' '))
        meeting_idf_dict = tf_idf.inverse_document_frequencies(
            tokenized_document_list)

        # #############################
        # ### LOOP OVER COMMUNITIES ###
        # #############################
        meeting_summary = []
        for tagged_community in tagged_corpus[meeting_id]:
            # print "\t\t\ttagged_community_id:", tagged_corpus[meeting_id].index(tagged_community)
            compresser = takahe.word_graph(
                system_name=system_name,
                tagged_community=copy.copy(tagged_community),
                language=language,
                punct_tag=punct_tag,
                pos_separator=pos_separator,
                lm=lm,
                wv=wv,
                stopwords=stopwords,
                meeting_idf_dict=meeting_idf_dict,
                remove_stopwords=remove_stopwords,
                pos_filtering=pos_filtering,
                stemming=stemming,
                cr_w=cr_w,
                cr_weighted=cr_weighted,
                cr_overspanning=cr_overspanning,
                nb_words=nb_words,
                diversity_n_clusters=diversity_n_clusters,
                keyphrase_reranker_window_size=0,
                common_hyp_threshold_verb=0.9,
                common_hyp_threshold_nonverb=0.3)

            # Write the word graph in the dot format
            # compresser.write_dot('new.dot')

            loose_verb_constraint = False
            while True:
                # Get the 200 best paths
                candidates = compresser.get_compression(
                    nb_candidates=200,
                    loose_verb_constraint=loose_verb_constraint)
                if len(candidates) > 0:
                    final_paths = compresser.final_score(candidates, 1)  # n_results
                    meeting_summary.append(final_paths[0][1])
                    break
                # Possible reasons for having no candidate:
                # 1. the minimum number of words allowed in a compression is larger
                #    than the longest path in the graph; in that case decrease nb_words
                else:
                    compresser.nb_words -= 1
                    if compresser.nb_words == 0:
                        # 2. a path must contain at least one verb, but the community
                        #    contains no verb; in that case relax the verb constraint
                        loose_verb_constraint = True
                        # raise RuntimeError("MSC failed")

        # ######################
        # ### OUTPUT SUMMARY ###
        # ######################
        output_path = path_to_root + 'results/' + domain + '/' + dataset_id + '_' + str(
            corpus_id) + '/' + development_or_test + '/' + system_name + '/' + str(
            param_id) + '/'
        if not os.path.exists(output_path):
            os.makedirs(output_path)

        output = '\n'.join(meeting_summary)
        # output = ''.join([l for l in output if l not in string.punctuation])
        output = re.sub(' +', ' ', output).strip()  # .lower()

        # write full summary
        file_path_name = output_path + meeting_id + '_' + system_name + '.txt'
        f = open(file_path_name, 'w')
        f.write(output)
        f.close()

        # # write truncated summaries with different sizes
        # for summary_size in range(50, 550, 50):
        #     file_path_name = output_path + meeting_id + '_' + system_name + '-' + str(summary_size) + '.txt'
        #     f = open(file_path_name, 'w')
        #     cut = ' '.join(output.split(' ')[:summary_size]).replace(' \n', '\n')
        #     f.write(cut)
        #     f.close()

    print '\t' + system_name, param_id
def compression(communities, stopwords, wv, lm, config, language):
    param = config['MSC']
    pos_separator = '/'
    punct_tag = 'PUNCT'

    # #############################
    # ### IDFS (meeting level)  ###
    # #############################
    # consider community as DOCUMENT, meeting as CORPUS
    # idf is based on lower_case form
    tokenized_document_list = []
    for tagged_community in communities:
        tagged_document = ' '.join(tagged_community)
        cleaned_tagged_document = utils.clean_tagged_text(
            tagged_document, stopwords,
            remove_stopwords=param.getboolean('remove_stopwords'),
            pos_filtering=param.getboolean('pos_filtering'),
            stemming=param.getboolean('stemming'),
            lower_case=True,
            pos_separator=pos_separator,
            punct_tag=punct_tag)
        cleaned_document = utils.remove_tags_from_text(cleaned_tagged_document)
        tokenized_document_list.append(cleaned_document.split(' '))
    meeting_idf_dict = utils.inverse_document_frequencies(tokenized_document_list)

    # #############################
    # ### LOOP OVER COMMUNITIES ###
    # #############################
    compressions = []
    graphs = []
    for tagged_community in communities:
        # print "\t\t\ttagged_community_id:", tagged_corpus[meeting_id].index(tagged_community)
        compresser = takahe.word_graph(
            system_name=param.get('system_name'),
            tagged_community=copy.copy(tagged_community),
            language=language,
            punct_tag=punct_tag,
            pos_separator=pos_separator,
            lm=lm,
            wv=wv,
            stopwords=stopwords,
            meeting_idf_dict=meeting_idf_dict,
            remove_stopwords=param.getboolean('remove_stopwords'),
            pos_filtering=param.getboolean('pos_filtering'),
            stemming=param.getboolean('stemming'),
            cr_w=param.getint('w'),
            cr_weighted=param.getboolean('weighted'),
            cr_overspanning=param.getboolean('overspanning'),
            nb_words=param.getint('nb_words'),
            diversity_n_clusters=param.getint('diversity_n_clusters'),
            keyphrase_reranker_window_size=0,
            common_hyp_threshold_verb=0.9,
            common_hyp_threshold_nonverb=0.3)

        # Write the word graph in the dot format
        # compresser.write_dot('new.dot')

        loose_verb_constraint = False
        while True:
            # Get the 200 best paths
            candidates = compresser.get_compression(
                nb_candidates=200,
                loose_verb_constraint=loose_verb_constraint)
            if len(candidates) > 0:
                final_paths = compresser.final_score(candidates, 1)  # n_results
                compressions.append(final_paths[0][1])
                graphs.append({
                    'nodes': compresser.graph.nodes(),
                    'edges': compresser.graph.edges()
                })
                break
            # Possible reasons for having no candidate:
            # 1. the minimum number of words allowed in a compression is larger
            #    than the longest path in the graph; in that case decrease nb_words
            else:
                compresser.nb_words -= 1
                if compresser.nb_words == 0:
                    # 2. a path must contain at least one verb, but the community
                    #    contains no verb; in that case relax the verb constraint
                    loose_verb_constraint = True
                    # raise RuntimeError("MSC failed")

    return compressions, graphs
################################################################################
sentences = ["The/DT wife/NN of/IN a/DT former/JJ U.S./NNP president/NN \
Bill/NNP Clinton/NNP Hillary/NNP Clinton/NNP visited/VBD China/NNP last/JJ \
Monday/NNP ./PUNCT",
             "Hillary/NNP Clinton/NNP wanted/VBD to/TO visit/VB China/NNP \
last/JJ month/NN but/CC postponed/VBD her/PRP$ plans/NNS till/IN Monday/NNP \
last/JJ week/NN ./PUNCT",
             "Hillary/NNP Clinton/NNP paid/VBD a/DT visit/NN to/TO \
the/DT People/NNP Republic/NNP of/IN China/NNP on/IN Monday/NNP ./PUNCT",
             "Last/JJ week/NN the/DT Secretary/NNP of/IN State/NNP Ms./NNP Clinton/NNP \
visited/VBD Chinese/JJ officials/NNS ./PUNCT"]
################################################################################

# Create a word graph from the set of sentences with parameters :
#  - minimal number of words in the compression : 6
#  - language of the input sentences : en (english)
#  - POS tag for punctuation marks : PUNCT
compresser = takahe.word_graph(sentences, nb_words=6, lang='en',
                               punct_tag="PUNCT")

# Get the 50 best paths
candidates = compresser.get_compression(50)

# 1. Rerank compressions by path length (Filippova's method)
for cummulative_score, path in candidates:

    # Normalize path score by path length
    normalized_score = cummulative_score / len(path)

    # Print normalized score and compression
    print round(normalized_score, 3), ' '.join([u[0] for u in path])

# Write the word graph in the dot format
compresser.write_dot('test.dot')
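# 2. Rerank compressions by keyphrases (Boudin and Morin's method). This block
#    is not part of the snippet above; it is a hedged continuation sketch whose
#    keyphrase_reranker calls mirror those used in the other snippets in this
#    section.
reranker = takahe.keyphrase_reranker(sentences, candidates, lang='en')
reranked_candidates = reranker.rerank_nbest_compressions()

# Print score and compression for each reranked candidate
for score, path in reranked_candidates:
    print round(score, 3), ' '.join([u[0] for u in path])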
sentences = ["The/DT wife/NN of/IN a/DT former/JJ U.S./NNP president/NN \
Bill/NNP Clinton/NNP Hillary/NNP Clinton/NNP visited/VBD China/NNP last/JJ \
Monday/NNP ./PUNCT",
             "Hillary/NNP Clinton/NNP wanted/VBD to/TO visit/VB China/NNP \
last/JJ month/NN but/CC postponed/VBD her/PRP$ plans/NNS till/IN Monday/NNP \
last/JJ week/NN ./PUNCT",
             "Hillary/NNP Clinton/NNP paid/VBD a/DT visit/NN to/TO \
the/DT People/NNP Republic/NNP of/IN China/NNP on/IN Monday/NNP ./PUNCT",
             "Last/JJ week/NN the/DT Secretary/NNP of/IN State/NNP Ms./NNP Clinton/NNP \
visited/VBD Chinese/JJ officials/NNS ./PUNCT"]
################################################################################

# Create a word graph from the set of sentences with parameters :
#  - minimal number of words in the compression : 6
#  - language of the input sentences : en (english)
#  - POS tag for punctuation marks : PUNCT
compresser = takahe.word_graph(sentences,
                               nb_words=6,
                               lang='en',
                               punct_tag="PUNCT")

# Get the single best path
candidates = compresser.get_compression(1)

# 1. Rerank compressions by path length (Filippova's method)
for cummulative_score, path in candidates:

    # Normalize path score by path length
    normalized_score = cummulative_score / len(path)

    print(path)

    # Print normalized score and compression
    # print round(normalized_score, 3), ' '.join([u[0] for u in path])

# Write the word graph in the dot format
meeting_summary = []
for tagged_community in tagged_corpus[meeting_id]:
    # print "\t\t\ttagged_community_id:", tagged_corpus[meeting_id].index(tagged_community)
    compresser = takahe.word_graph(
        system_name=system_name,
        tagged_community=copy.copy(tagged_community),
        language=language,
        punct_tag=punct_tag,
        pos_separator=pos_separator,
        lm=lm,
        wv=wv,
        stopwords=stopwords,
        meeting_idf_dict=meeting_idf_dict,
        remove_stopwords=remove_stopwords,
        pos_filtering=pos_filtering,
        stemming=stemming,
        cr_w=cr_w,
        cr_weighted=cr_weighted,
        cr_overspanning=cr_overspanning,
        nb_words=nb_words,
        diversity_n_clusters=diversity_n_clusters,
        keyphrase_reranker_window_size=0,
        common_hyp_threshold_verb=0.9,
        common_hyp_threshold_nonverb=0.3)

    # Write the word graph in the dot format
    # compresser.write_dot('new.dot')

    loose_verb_constraint = False
    while True:
del textList[-1]
# textList[1] = textList[1].lstrip()
print(textList[2])

# In[55]:

#!/usr/bin/python
# sample code provided by "boudinfl/takahe", GitHub, github.com/boudinfl/takahe

# Create a word graph from the set of sentences with parameters :
#  - minimal number of words in the compression : 20
#  - language of the input sentences : en (english)
#  - POS tag for punctuation marks : PUNCT
compresser = takahe.word_graph(textList, nb_words=20, lang='en',
                               punct_tag="PUNCT")

# Get the 50 best paths
candidates = compresser.get_compression(50)

# 1. Rerank compressions by path length (Filippova's method)
for cummulative_score, path in candidates:

    # Normalize path score by path length
    normalized_score = cummulative_score / len(path)

    # Print normalized score and compression
    print round(normalized_score, 3), ' '.join([u[0] for u in path])

# Write the word graph in the dot format