Example #1
import lda                      # the "lda" topic-model package (provides lda.LDA)
import numpy as np
import takahe
import textblob

# parse_sentence() is assumed to be defined elsewhere in the original project (not shown here).
def lda_test(x, n_topics, n_top_sent, sent_length, num_best, vocab, sentences):
    model = lda.LDA(n_topics=n_topics, n_iter=800, random_state=1)
    model.fit_transform(x)
    topic_word = model.topic_word_
    n_top_words = 5
    final1 = ''
    final_out = ''
    for i, topic_dist in enumerate(topic_word):
        topic_words = np.array(vocab)[np.argsort(topic_dist)][:-n_top_words:-1]
        print('Topic {}: {}'.format(i, ' '.join(topic_words)))
    n_top_sent = n_top_sent
    s = []
    for i, doc_dist in enumerate(model.doc_topic_.transpose()):
        s1 = []
        topic_words = np.array(sentences)[np.argsort(doc_dist)][:-n_top_words:-1]
        print('{}: {}'.format(i, ' '.join(topic_words)))
        # ff = f1_score(x, topic_words)
        # print(ff)
        t = textblob.TextBlob(' '.join(topic_words))
        for sent in t.sentences:
            t = parse_sentence(sent)
            s1.append(t)
        s.append(s1)
    for s1 in s:
        print(s1)
        print("------")
        compresser = takahe.word_graph(s1, nb_words=sent_length, lang='en', punct_tag="PUNCT")
        # Get the num_best best paths
        candidates = compresser.get_compression(num_best)
        # 1. Rerank compressions by path length (Filippova's method)
        maxim1 = 0
        best_cand1 = ''
        for cummulative_score, path in candidates:
            if cummulative_score > maxim1:
                maxim1 = cummulative_score
                best_cand1 = ' '.join([u[0] for u in path])
            # Normalize path score by path length
            normalized_score = cummulative_score / len(path)
        print('Best: ')
        print(best_cand1)
        final1 += best_cand1
        # Write the word graph in the dot format
        compresser.write_dot('test.dot')
        # 2. Rerank compressions by keyphrases (Boudin and Morin's method)
        reranker = takahe.keyphrase_reranker(s1, candidates, lang='en')
        reranked_candidates = reranker.rerank_nbest_compressions()
        # Loop over the best reranked candidates
        maxim = 0
        best_cand = ''
        for score, path in reranked_candidates:
            # Print the best reranked candidates
            if score > maxim:
                maxim = score
                best_cand = ' '.join([u[0] for u in path])
        print('Best: ')
        print(best_cand)
        final_out += best_cand
    print(final1)
    print(final_out)
    return final1
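For context, lda_test() expects a document-term count matrix plus the matching vocabulary and raw sentences. A minimal, hypothetical driver is sketched below; the CountVectorizer setup and the toy sentences are illustrative, and parse_sentence(), takahe and textblob must be importable for the call to run.

from sklearn.feature_extraction.text import CountVectorizer

docs = ["Hillary Clinton visited China last Monday .",
        "Hillary Clinton paid a visit to the People Republic of China on Monday ."]

vectorizer = CountVectorizer()
x = vectorizer.fit_transform(docs).toarray()   # document-term count matrix for lda.LDA
vocab = vectorizer.get_feature_names_out()     # use get_feature_names() on scikit-learn < 1.0

summary = lda_test(x, n_topics=2, n_top_sent=3, sent_length=6,
                   num_best=50, vocab=vocab, sentences=docs)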
def keyphrases_based_msc(sentences, output_sent_num = 50):

    """
    经过keyphrases重排序后的多语句压缩
    :param sentences:
    :param output_sent_num:
    :return:
    """

    # Build the word graph and run the compression
    # nb_words = 8: minimum number of words required in a compression
    compresser = takahe.word_graph(sentences, nb_words=8, lang='en', punct_tag="PUNCT")

    # Get the compression candidates
    candidates = compresser.get_compression(output_sent_num)

    # Rescore the compressions using keyphrases
    reranker = takahe.keyphrase_reranker(sentences, candidates, lang='en')
    reranked_candidates = reranker.rerank_nbest_compressions()

    results = []
    for score, path in reranked_candidates:
        results.append(str(round(score, 6)) + "#" + ' '.join([u[0] for u in path]) + '\n')

    return results
def protogenesis_msc(sentences, output_sent_num = 50):

    """
    原生多语句压缩
    :param sentences: 待压缩的输入语句集合
    :param output_sent_num: 输出语句的个数,默认50句
    :return: 分数#句子
    """

    # Build the word graph and run the compression
    # nb_words = 8: minimum number of words required in a compression
    compresser = takahe.word_graph(sentences, nb_words=8, lang='en', punct_tag="PUNCT")

    # Get the compression candidates
    candidates = compresser.get_compression(output_sent_num)

    # Normalize each path score by its path length
    tmp = []
    for score, path in candidates:
        tmp.append((score / len(path), path))

    # Sort by score
    tmp = sorted(tmp, key=lambda item: item[0])

    # Assemble the "score#sentence" results and return them
    results = []
    for score, path in tmp:
        results.append(str(round(score, 6)) + "#" + ' '.join([u[0] for u in path]) + '\n')

    return results
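Both helpers expect takahe-style POS-tagged input (word/TAG tokens ending in ./PUNCT, as in Example #9 below). An illustrative call, with a made-up output_sent_num:

tagged_sentences = [
    "The/DT wife/NN of/IN a/DT former/JJ U.S./NNP president/NN Bill/NNP Clinton/NNP "
    "Hillary/NNP Clinton/NNP visited/VBD China/NNP last/JJ Monday/NNP ./PUNCT",
    "Hillary/NNP Clinton/NNP paid/VBD a/DT visit/NN to/TO the/DT People/NNP Republic/NNP "
    "of/IN China/NNP on/IN Monday/NNP ./PUNCT",
]

for line in keyphrases_based_msc(tagged_sentences, output_sent_num=10):
    print(line.strip())          # each line is "score#compressed sentence"

for line in protogenesis_msc(tagged_sentences, output_sent_num=10):
    print(line.strip())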
Example #4
def _dofuse(cluster):
    """
    Extracts the call to takahe to interrupt it if it's taking too long.
    """
    fuser = takahe.word_graph(cluster,
                              nb_words=6,
                              lang="en",
                              punct_tag="PUNCT")
    # get fusions
    fusions = fuser.get_compression(50)
    # rerank and keep top 10
    reranker = takahe.keyphrase_reranker(cluster, fusions, lang="en")
    rerankedfusions = reranker.rerank_nbest_compressions()[0:10]
    return rerankedfusions
def _dofuse(sentenceL):
    """
	Extracts the call to takahe to interrupt it if it's taking too long.
	"""
    fuser = takahe.word_graph(sentenceL,
                              nb_words=6,
                              lang="en",
                              punct_tag="PUNCT")
    # get fusions
    fusions = fuser.get_compression(50)
    # rerank and keep top 5
    reranker = takahe.keyphrase_reranker(sentenceL, fusions, lang="en")
    rerankedfusions = reranker.rerank_nbest_compressions()[0:5]
    return rerankedfusions
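According to the docstrings, _dofuse() was pulled out so the takahe call can be interrupted when it runs too long. One possible way to do that (a sketch, not taken from the source, with an arbitrary 30-second limit) is to run it in a worker process:

import multiprocessing

def fuse_with_timeout(sentence_list, timeout=30):
    pool = multiprocessing.Pool(processes=1)
    async_result = pool.apply_async(_dofuse, (sentence_list,))
    try:
        # raises multiprocessing.TimeoutError if takahe takes longer than `timeout` seconds
        return async_result.get(timeout=timeout)
    except multiprocessing.TimeoutError:
        return []                 # give up on this cluster
    finally:
        pool.terminate()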
Example #6
def get_compressed_sen(sentences, nb_words):
    compresser = takahe.word_graph(sentences, nb_words = nb_words, lang = 'en', punct_tag = "." )
    candidates = compresser.get_compression(3)
    # print("--------------------Top 3 candicate---------------", candidates)
    reranker = takahe.keyphrase_reranker(sentences,
                                      candidates,
                                      lang = 'en')
    # print("reranker: ", reranker)
    # print("finish initialising reranker------------")

    reranked_candidates = reranker.rerank_nbest_compressions()
    # print(reranked_candidates)
    if len(reranked_candidates) > 0:
        score, path = reranked_candidates[0]
        result = ' '.join([u[0] for u in path])
    else:
        result = ' '
    # print("----------------selected candidate as final output-------------- ", result)
    return result
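This example passes punct_tag=".", which matches the tag NLTK assigns to sentence-final punctuation. A hypothetical call that tags raw sentences with nltk.pos_tag (the averaged_perceptron_tagger data must be downloaded first):

import nltk

raw = ["Hillary Clinton visited China last Monday .",
       "Hillary Clinton paid a visit to China on Monday ."]

# build takahe-style word/TAG strings; the period comes out as "./."
tagged = [' '.join('/'.join(pair) for pair in nltk.pos_tag(sentence.split()))
          for sentence in raw]

print(get_compressed_sen(tagged, nb_words=6))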
def worker(system_name, param):
    param_id = param['index']
    remove_stopwords = param['remove_stopwords']
    pos_filtering = param['pos_filtering']
    stemming = param['stemming']
    cr_w = param['cr_w']
    cr_weighted = param['cr_weighted']
    cr_overspanning = param['cr_overspanning']
    nb_words = param['nb_words']
    diversity_n_clusters = param['diversity_n_clusters']

    # ##########################
    # ### LOOP OVER MEETINGS ###
    # ##########################
    for meeting_id in ids:
        # print "\t\tmeeting_id:", meeting_id

        # #############################
        # ### IDFS (meeting level)  ###
        # #############################
        # consider community as DOCUMENT, meeting as CORPUS
        # idf is based on lower_case form
        tokenized_document_list = []
        for tagged_community in tagged_corpus[meeting_id]:
            tagged_document = ' '.join(tagged_community)
            cleaned_tagged_document = utils.clean_tagged_text(
                tagged_document,
                stopwords,
                remove_stopwords=remove_stopwords,
                pos_filtering=pos_filtering,
                stemming=stemming,
                lower_case=True,
                pos_separator=pos_separator,
                punct_tag=punct_tag)
            cleaned_document = utils.remove_tags_from_text(
                cleaned_tagged_document)
            tokenized_document_list.append(cleaned_document.split(' '))
        meeting_idf_dict = tf_idf.inverse_document_frequencies(
            tokenized_document_list)

        # #############################
        # ### LOOP OVER COMMUNITIES ###
        # #############################
        meeting_summary = []

        for tagged_community in tagged_corpus[meeting_id]:
            # print "\t\t\ttagged_community_id:", tagged_corpus[meeting_id].index(tagged_community)

            compresser = takahe.word_graph(
                system_name=system_name,
                tagged_community=copy.copy(tagged_community),
                language=language,
                punct_tag=punct_tag,
                pos_separator=pos_separator,
                lm=lm,
                wv=wv,
                stopwords=stopwords,
                meeting_idf_dict=meeting_idf_dict,
                remove_stopwords=remove_stopwords,
                pos_filtering=pos_filtering,
                stemming=stemming,
                cr_w=cr_w,
                cr_weighted=cr_weighted,
                cr_overspanning=cr_overspanning,
                nb_words=nb_words,
                diversity_n_clusters=diversity_n_clusters,
                keyphrase_reranker_window_size=0,
                common_hyp_threshold_verb=0.9,
                common_hyp_threshold_nonverb=0.3)

            # Write the word graph in the dot format
            # compresser.write_dot('new.dot')
            loose_verb_constraint = False
            while True:
                # Get the 200 best paths
                candidates = compresser.get_compression(
                    nb_candidates=200,
                    loose_verb_constraint=loose_verb_constraint)
                if len(candidates) > 0:
                    final_paths = compresser.final_score(candidates,
                                                         1)  # n_results
                    meeting_summary.append(final_paths[0][1])
                    break
                # Possible reasons for having no candidates:
                # 1. The minimum number of words allowed in the compression is larger than the
                #    maximum path length in the graph; in that case, decrease nb_words (and diversity_n_clusters)
                else:
                    compresser.nb_words -= 1
                    if compresser.nb_words == 0:
                        # 2. Every path must contain at least one verb, but the community contains no verb;
                        #    in that case, loosen the verb constraint
                        loose_verb_constraint = True
                        # raise RuntimeError("MSC failed")

        # ######################
        # ### OUTPUT SUMMARY ###
        # ######################
        output_path = path_to_root + 'results/' + domain + '/' + dataset_id + '_' + str(
            corpus_id
        ) + '/' + development_or_test + '/' + system_name + '/' + str(
            param_id) + '/'
        if not os.path.exists(output_path):
            os.makedirs(output_path)

        output = '\n'.join(meeting_summary)
        # output = ''.join([l for l in output if l not in string.punctuation])
        output = re.sub(' +', ' ', output).strip()  # .lower()

        # write full summary
        file_path_name = output_path + meeting_id + '_' + system_name + '.txt'
        f = open(file_path_name, 'w')
        f.write(output)
        f.close()

        # # write truncated summaries with different sizes
        # for summary_size in range(50, 550, 50):
        #     file_path_name = output_path + meeting_id + '_' + system_name + '-' + str(summary_size) + '.txt'
        #     f = open(file_path_name, 'w')
        #     cut = ' '.join(output.split(' ')[:summary_size]).replace(' \n', '\n')
        #     f.write(cut)
        #     f.close()

    print('\t' + system_name, param_id)
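worker() also relies on module-level globals (ids, tagged_corpus, stopwords, lm, wv, language, the output path pieces, ...). The param dict it reads has exactly the keys accessed at the top of the function, so an entry might look like this; the values and the system name are purely illustrative:

example_param = {
    'index': 0,
    'remove_stopwords': True,
    'pos_filtering': True,
    'stemming': True,
    'cr_w': 3,
    'cr_weighted': True,
    'cr_overspanning': True,
    'nb_words': 6,
    'diversity_n_clusters': 1,
}
# worker('some_system', example_param)   # hypothetical call; assumes the globals above are initialised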
Example #8
def compression(communities, stopwords, wv, lm, config, language):
    param = config['MSC']

    pos_separator = '/'
    punct_tag = 'PUNCT'

    # #############################
    # ### IDFS (meeting level)  ###
    # #############################
    # consider community as DOCUMENT, meeting as CORPUS
    # idf is based on lower_case form
    tokenized_document_list = []
    for tagged_community in communities:
        tagged_document = ' '.join(tagged_community)
        cleaned_tagged_document = utils.clean_tagged_text(
            tagged_document, stopwords,
            remove_stopwords=param.getboolean('remove_stopwords'), pos_filtering=param.getboolean('pos_filtering'),
            stemming=param.getboolean('stemming'), lower_case=True,
            pos_separator=pos_separator, punct_tag=punct_tag
        )
        cleaned_document = utils.remove_tags_from_text(cleaned_tagged_document)
        tokenized_document_list.append(cleaned_document.split(' '))
    meeting_idf_dict = utils.inverse_document_frequencies(tokenized_document_list)

    # #############################
    # ### LOOP OVER COMMUNITIES ###
    # #############################
    compressions = []
    graphs = []

    for tagged_community in communities:
        # print "\t\t\ttagged_community_id:", tagged_corpus[meeting_id].index(tagged_community)

        compresser = takahe.word_graph(
            system_name=param.get('system_name'),
            tagged_community=copy.copy(tagged_community),
            language=language,
            punct_tag=punct_tag,
            pos_separator=pos_separator,

            lm=lm,
            wv=wv,
            stopwords=stopwords,
            meeting_idf_dict=meeting_idf_dict,

            remove_stopwords=param.getboolean('remove_stopwords'),
            pos_filtering=param.getboolean('pos_filtering'),
            stemming=param.getboolean('stemming'),
            cr_w=param.getint('w'),
            cr_weighted=param.getboolean('weighted'),
            cr_overspanning=param.getboolean('overspanning'),
            nb_words=param.getint('nb_words'),
            diversity_n_clusters=param.getint('diversity_n_clusters'),

            keyphrase_reranker_window_size=0,
            common_hyp_threshold_verb=0.9,
            common_hyp_threshold_nonverb=0.3
        )

        # Write the word graph in the dot format
        # compresser.write_dot('new.dot')
        loose_verb_constraint = False
        while True:
            # Get the 200 best paths
            candidates = compresser.get_compression(nb_candidates=200, loose_verb_constraint=loose_verb_constraint)
            if len(candidates) > 0:
                final_paths = compresser.final_score(candidates, 1)  # n_results
                compressions.append(final_paths[0][1])
                graphs.append({
                    'nodes': compresser.graph.nodes(),
                    'edges': compresser.graph.edges()
                })
                break
            # Possible reasons for having no candidates:
            # 1. The minimum number of words allowed in the compression is larger than the
            #    maximum path length in the graph; in that case, decrease nb_words (and diversity_n_clusters)
            else:
                compresser.nb_words -= 1
                if compresser.nb_words == 0:
                    # 2. Every path must contain at least one verb, but the community contains no verb;
                    #    in that case, loosen the verb constraint
                    loose_verb_constraint = True
                    # raise RuntimeError("MSC failed")

    return compressions, graphs
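compression() pulls its parameters from a configparser section named MSC. A hedged sketch of building such a config in code; the key names mirror the param.get*/getboolean/getint calls above, while the values and the commented call are illustrative:

import configparser

config = configparser.ConfigParser()
config.read_dict({'MSC': {
    'system_name': 'baseline',        # illustrative value, not taken from the source
    'remove_stopwords': 'yes',
    'pos_filtering': 'yes',
    'stemming': 'yes',
    'w': '3',
    'weighted': 'yes',
    'overspanning': 'yes',
    'nb_words': '6',
    'diversity_n_clusters': '1',
}})

# compressions, graphs = compression(communities, stopwords, wv, lm, config, 'en')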
Example #9
################################################################################
sentences = ["The/DT wife/NN of/IN a/DT former/JJ U.S./NNP president/NN \
Bill/NNP Clinton/NNP Hillary/NNP Clinton/NNP visited/VBD China/NNP last/JJ \
Monday/NNP ./PUNCT", "Hillary/NNP Clinton/NNP wanted/VBD to/TO visit/VB China/NNP \
last/JJ month/NN but/CC postponed/VBD her/PRP$ plans/NNS till/IN Monday/NNP \
last/JJ week/NN ./PUNCT", "Hillary/NNP Clinton/NNP paid/VBD a/DT visit/NN to/TO \
the/DT People/NNP Republic/NNP of/IN China/NNP on/IN Monday/NNP ./PUNCT",
"Last/JJ week/NN the/DT Secretary/NNP of/IN State/NNP Ms./NNP Clinton/NNP \
visited/VBD Chinese/JJ officials/NNS ./PUNCT"]
################################################################################

# Create a word graph from the set of sentences with parameters :
# - minimal number of words in the compression : 6
# - language of the input sentences : en (english)
# - POS tag for punctuation marks : PUNCT
compresser = takahe.word_graph(sentences, nb_words=6, lang='en', punct_tag="PUNCT")

# Get the 50 best paths
candidates = compresser.get_compression(50)

# 1. Rerank compressions by path length (Filippova's method)
for cummulative_score, path in candidates:
    # Normalize path score by path length
    normalized_score = cummulative_score / len(path)

    # Print normalized score and compression
    print(round(normalized_score, 3), ' '.join([u[0] for u in path]))

# Write the word graph in the dot format
compresser.write_dot('test.dot')
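The same candidate list can also be reranked with keyphrases, exactly as in step 2 of Example #1; a short continuation:

# 2. Rerank compressions by keyphrases (Boudin and Morin's method)
reranker = takahe.keyphrase_reranker(sentences, candidates, lang='en')
reranked_candidates = reranker.rerank_nbest_compressions()

# Print reranked score and compression
for score, path in reranked_candidates:
    print(round(score, 3), ' '.join([u[0] for u in path]))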
Example #10
sentences = ["The/DT wife/NN of/IN a/DT former/JJ U.S./NNP president/NN \
Bill/NNP Clinton/NNP Hillary/NNP Clinton/NNP visited/VBD China/NNP last/JJ \
Monday/NNP ./PUNCT", "Hillary/NNP Clinton/NNP wanted/VBD to/TO visit/VB China/NNP \
last/JJ month/NN but/CC postponed/VBD her/PRP$ plans/NNS till/IN Monday/NNP \
last/JJ week/NN ./PUNCT", "Hillary/NNP Clinton/NNP paid/VBD a/DT visit/NN to/TO \
the/DT People/NNP Republic/NNP of/IN China/NNP on/IN Monday/NNP ./PUNCT",
"Last/JJ week/NN the/DT Secretary/NNP of/IN State/NNP Ms./NNP Clinton/NNP \
visited/VBD Chinese/JJ officials/NNS ./PUNCT"]
################################################################################

# Create a word graph from the set of sentences with parameters :
# - minimal number of words in the compression : 6
# - language of the input sentences : en (english)
# - POS tag for punctuation marks : PUNCT
compresser = takahe.word_graph(sentences,
                               nb_words=6,
                               lang='en',
                               punct_tag="PUNCT")

# Get the single best path
candidates = compresser.get_compression(1)

# 1. Rerank compressions by path length (Filippova's method)
for cummulative_score, path in candidates:

    # Normalize path score by path length
    normalized_score = cummulative_score / len(path)
    print(path)
    # Print normalized score and compression
    # print round(normalized_score, 3), ' '.join([u[0] for u in path])

# Write the word graph in the dot format
compresser.write_dot('test.dot')
Example #11
                meeting_summary = []

                for tagged_community in tagged_corpus[meeting_id]:
                    # print "\t\t\ttagged_community_id:", tagged_corpus[meeting_id].index(tagged_community)

                    compresser = takahe.word_graph(
                        system_name=system_name,
                        tagged_community=copy.copy(tagged_community),
                        language=language,
                        punct_tag=punct_tag,
                        pos_separator=pos_separator,
                        lm=lm,
                        wv=wv,
                        stopwords=stopwords,
                        meeting_idf_dict=meeting_idf_dict,
                        remove_stopwords=remove_stopwords,
                        pos_filtering=pos_filtering,
                        stemming=stemming,
                        cr_w=cr_w,
                        cr_weighted=cr_weighted,
                        cr_overspanning=cr_overspanning,
                        nb_words=nb_words,
                        diversity_n_clusters=diversity_n_clusters,
                        keyphrase_reranker_window_size=0,
                        common_hyp_threshold_verb=0.9,
                        common_hyp_threshold_nonverb=0.3)

                    # Write the word graph in the dot format
                    # compresser.write_dot('new.dot')
                    loose_verb_constraint = False
                    while True:
Example #12
del textList[-1]
#textList[1]=textList[1].lstrip()
print(textList[2])


#!/usr/bin/python

# sample code provided by boudinfl/takahe on GitHub: github.com/boudinfl/takahe

# Create a word graph from the set of sentences with parameters :
# - minimal number of words in the compression : 20
# - language of the input sentences : en (english)
# - POS tag for punctuation marks : PUNCT
compresser = takahe.word_graph(textList,
                               nb_words=20,
                               lang='en',
                               punct_tag="PUNCT")

# Get the 50 best paths
candidates = compresser.get_compression(50)

# 1. Rerank compressions by path length (Filippova's method)
for cummulative_score, path in candidates:

    # Normalize path score by path length
    normalized_score = cummulative_score / len(path)

    # Print normalized score and compression
    print(round(normalized_score, 3), ' '.join([u[0] for u in path]))

# Write the word graph in the dot format