Code example #1
import json
import re

import pytextrank

def text_rank(json_request):
	# Extract the title (TI) and abstract (AB) fields from the RIS payload
	pattern = re.compile(r"TI  - (.*?)\r|AB  - (.*?)\r")
	matches = re.findall(pattern, json_request['ris'])
	all_inputs = []
	for section in matches:
		all_inputs.append(' '.join(section).strip())

	input_json = {}
	input_json['id'] = "0"
	input_json['text'] = '.'.join(all_inputs)

	with open('ris_extracted.json', 'w') as output:
	    json.dump(input_json, output)

	# path_stage1 and path_stage2 are assumed to be module-level globals
	with open(path_stage1, 'w') as f:
	    for graf in pytextrank.parse_doc(pytextrank.json_iter('ris_extracted.json')):
	        f.write("%s\n" % pytextrank.pretty_print(graf._asdict()))

	graph, ranks = pytextrank.text_rank(path_stage1)

	with open(path_stage2, 'w') as f:
	    for rl in pytextrank.normalize_key_phrases(path_stage1, ranks):
	        f.write("%s\n" % pytextrank.pretty_print(rl._asdict()))

	phrases = list(pytextrank.limit_keyphrases(path_stage2, phrase_limit=20))

	return phrases
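
A minimal usage sketch for the function above; the RIS payload and the path_stage1/path_stage2 values here are illustrative, not taken from the original source:

path_stage1 = 'o1.json'  # parsed-paragraph output
path_stage2 = 'o2.json'  # normalized key-phrase output

request = {'ris': 'TI  - A sample title\rAB  - A sample abstract\r'}
print(text_rank(request))  # a list of up to 20 ranked key phrases
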
Code example #2
    def summarize(self, _id, content_text, word_limit):
        self.logger.log("_id: " + _id)
        self.logger.log("word_limit: " + str(word_limit))

        # File names
        path_stage0 = 'process/' + _id + '.json'
        path_stage1 = 'process/' + _id + '_o1.json'
        path_stage2 = 'process/' + _id + '_o2.json'
        path_stage3 = 'process/' + _id + '_o3.json'
        path_stage4 = 'process/' + _id + '_o4.json'  # defined for symmetry; not used below

        # Create input file
        with open(path_stage0, 'w') as outfile:
            json.dump({"id": "123", "text": content_text}, outfile)

        # Statistical Parsing - Stage 1
        # Perform statistical parsing/tagging on a document in JSON format
        with open(path_stage1, 'w') as f:
            for graf in pytextrank.parse_doc(
                    pytextrank.json_iter(path_stage0)):
                f.write("%s\n" % pytextrank.pretty_print(graf._asdict()))

        # Ranked Keyphrases - Stage 2
        # Collect and normalize the key phrases from a parsed document
        graph, ranks = pytextrank.text_rank(path_stage1)
        pytextrank.render_ranks(graph, ranks)

        with open(path_stage2, 'w') as f:
            for rl in pytextrank.normalize_key_phrases(path_stage1, ranks):
                f.write("%s\n" % pytextrank.pretty_print(rl._asdict()))

        # Extractive Summarization - Stage 3
        # Calculate a significance weight for each sentence, using MinHash to approximate a Jaccard distance from key phrases determined by TextRank
        kernel = pytextrank.rank_kernel(path_stage2)

        with open(path_stage3, 'w') as f:
            for s in pytextrank.top_sentences(kernel, path_stage1):
                f.write(pytextrank.pretty_print(s._asdict()))
                f.write("\n")

        # Final Output - Stage 4
        # Summarize a document based on most significant sentences and key phrases
        phrases = ", ".join(
            set([
                p for p in pytextrank.limit_keyphrases(path_stage2,
                                                       phrase_limit=12)
            ]))
        sent_iter = sorted(pytextrank.limit_sentences(path_stage3,
                                                      word_limit=word_limit),
                           key=lambda x: x[1])
        s = []

        for sent_text, idx in sent_iter:
            s.append(pytextrank.make_sentence(sent_text))

        graf_text = " ".join(s)

        return {'excerpts': graf_text, 'keywords': phrases}
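
A hedged usage sketch for summarize(); the enclosing class is not shown, so the summarizer instance below is hypothetical, and the process/ directory must exist first:

import os

os.makedirs('process', exist_ok=True)  # the method writes its stage files here
result = summarizer.summarize('doc42', 'Full text of the article ...', word_limit=150)
print(result['keywords'])
print(result['excerpts'])
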
Code example #3
def stage_2():
    cur_dir = os.path.dirname(__file__)
    data_dir = stage_1_dir
    ids = os.listdir(data_dir)

    result_dir = stage_2_dir
    if os.path.exists(result_dir):
        shutil.rmtree(result_dir, ignore_errors=True)
    os.mkdir(result_dir)
    os.chdir(result_dir)

    if not os.path.exists('pictures'):
        os.mkdir('pictures')

    for cur_id in ids:
        if os.path.exists(cur_id):
            continue

        cur_file_name = os.path.join(data_dir, cur_id)
        print(cur_id)
        graph, ranks = pytextrank.text_rank(cur_file_name)
        pytextrank.render_ranks(graph, ranks, cur_id)

        with codecs.open(cur_id, "w+", "utf_8_sig") as file:
            for rl in pytextrank.normalize_key_phrases(cur_file_name, ranks):
                file.write("%s\n" % pytextrank.pretty_print(rl._asdict()))

    os.chdir(cur_dir)
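
stage_1_dir and stage_2_dir (and stage_3_dir in example #9) are module-level globals that the snippet does not show; a plausible definition, assuming per-stage folders next to the script:

import os

cur_dir = os.path.dirname(__file__)
stage_1_dir = os.path.join(cur_dir, 'stage1')
stage_2_dir = os.path.join(cur_dir, 'stage2')
stage_3_dir = os.path.join(cur_dir, 'stage3')
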
Code example #4
def summarize_text(input_file):
    # note: this multi-stage, file-based API is awkward to drive, hence the temp files
    path_stage0 = input_file
    path_stage1 = 'stage1.txt'
    with open(path_stage1, 'w') as f:
        for graf in pytextrank.parse_doc(pytextrank.json_iter(path_stage0)):
            f.write("%s\n" % pytextrank.pretty_print(graf._asdict()))
            # to view output in this notebook
            #print(pytextrank.pretty_print(graf))

    graph, ranks = pytextrank.text_rank(path_stage1)
    pytextrank.render_ranks(graph, ranks)

    path_stage2 = 'stage2.txt'
    with open(path_stage2, 'w') as f:
        for rl in pytextrank.normalize_key_phrases(path_stage1, ranks):
            f.write("%s\n" % pytextrank.pretty_print(rl._asdict()))
            # to view output in this notebook
            #print(pytextrank.pretty_print(rl))

    path_stage3 = 'stage3.txt'
    kernel = pytextrank.rank_kernel(path_stage2)

    with open(path_stage3, 'w') as f:
        for s in pytextrank.top_sentences(kernel, path_stage1):
            f.write(pytextrank.pretty_print(s._asdict()))
            f.write("\n")
            # to view output in this notebook
            #print(pytextrank.pretty_print(s._asdict()))

    phrases = ", ".join(
        set([
            p
            for p in pytextrank.limit_keyphrases(path_stage2, phrase_limit=12)
        ]))
    sent_iter = sorted(pytextrank.limit_sentences(path_stage3, word_limit=120),
                       key=lambda x: x[1])
    s = []

    for sent_text, idx in sent_iter:
        s.append(pytextrank.make_sentence(sent_text))

    graf_text = " ".join(s)
    #print("**excerpts:** %s\n\n**keywords:** %s" % (graf_text, phrases,))

    return ' '.join(s)
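
summarize_text() reads its input through pytextrank.json_iter(), so the input file should use the same {"id": ..., "text": ...} JSON format seen in examples #1 and #2; for instance:

import json

with open('input.json', 'w') as f:
    json.dump({'id': '777', 'text': 'Long document text to summarize ...'}, f)

print(summarize_text('input.json'))
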
Code example #5
    def perform_statistical_parsing_tagging(self, text_file, paragraph_output):
        """
            Perform statistical parsing and tagging of
            sentences in the text (aka JSON document)
        """

        with open(paragraph_output, 'w') as temp_file:
            for paragraph in pytextrank.parse_doc(pytextrank.json_iter(text_file)):
                temp_file.write("%s\n" % pytextrank.pretty_print(paragraph._asdict()))
Code example #6
    def RankedGraph(self, parse_list):
        graph, ranks = xang.text_rank(parse_list)
        norm_rank = xang.normalize_key_phrases(parse_list, ranks, self.stopwords,
                                               self.spacy_nlp, self.skip_ner)
        norm_rank_list = [json.loads(pretty_print(rl._asdict())) for rl in norm_rank]
        phrases = ", ".join(set(xang.limit_keyphrases(norm_rank_list, self.phrase_limit)))

        # build a matrix-like rank kernel for the top keywords
        kernel = xang.rank_kernel(norm_rank_list)
        self.phrases = phrases
        self.kernel = kernel

        return kernel
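
Neither xang nor the bare pretty_print is defined in the snippet; the calls match the pytextrank 1.x API, so the setup was presumably something like:

import json

import pytextrank as xang
from pytextrank import pretty_print
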
Code example #7
    def collect_and_normalise_key_phrases(self, paragraph_output, key_phrases_output):
        """
            Collect and normalise key phrases from the sentences in
            the paragraph (in the JSON doc)
            Rank them using PyTextRank, return a graph and ranked tokens
        """

        graph, token_ranks = pytextrank.text_rank(paragraph_output)
        pytextrank.render_ranks(graph, token_ranks)

        with open(key_phrases_output, 'w') as temp_file:
            for relationship in pytextrank.normalize_key_phrases(paragraph_output, token_ranks):
                temp_file.write("%s\n" % pytextrank.pretty_print(relationship._asdict()))

        return graph, token_ranks
Code example #8
    def calculate_sentence_significance(self, paragraph_output, key_phrases_output,
                                        top_sentences_output, top_n_sentences):
        """
            Calculate a significance weight for each sentence based on the
            key-phrase ranking, keeping only the top n sentences (filter)
        """

        kernel = pytextrank.rank_kernel(key_phrases_output)

        with open(top_sentences_output, 'w') as temp_file:
            counter = 0
            for sentence in pytextrank.top_sentences(kernel, paragraph_output):
                if counter >= top_n_sentences:
                    break

                temp_file.write(pytextrank.pretty_print(sentence._asdict()))
                temp_file.write("\n")
                counter += 1
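
Examples #5, #7, and #8 read like three stages of the same class; a hedged driver sketch (the TextPipeline name and its constructor are hypothetical):

pipeline = TextPipeline()
pipeline.perform_statistical_parsing_tagging('input.json', 'o1.json')
graph, token_ranks = pipeline.collect_and_normalise_key_phrases('o1.json', 'o2.json')
pipeline.calculate_sentence_significance('o1.json', 'o2.json', 'o3.json',
                                         top_n_sentences=5)
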
Code example #9
def stage_3():
    cur_dir = os.path.dirname(__file__)
    data_dir = stage_1_dir
    ids = os.listdir(data_dir)

    result_dir = stage_3_dir
    if os.path.exists(result_dir):
        shutil.rmtree(result_dir, ignore_errors=True)
    os.mkdir(result_dir)
    os.chdir(result_dir)

    for cur_id in ids:
        print(cur_id)
        kernel = pytextrank.rank_kernel(os.path.join(stage_2_dir, cur_id))
        with codecs.open(cur_id, "w+", "utf_8_sig") as file:
            for s in pytextrank.top_sentences(kernel,
                                              os.path.join(stage_1_dir, cur_id)):
                file.write(pytextrank.pretty_print(s._asdict()))
                file.write("\n")
    os.chdir(cur_dir)
Code example #10
def _get_keywords(path_stage0, path_stage2):
    # Stage 1: parse doc
    path_stage1 = 'o1.json'
    with open(path_stage1, 'w') as f:
        for graf in pytextrank.parse_doc(pytextrank.json_iter(path_stage0)):
            f.write("%s\n" % pytextrank.pretty_print(graf._asdict()))

    # Stage 2: rank words
    graph, ranks = pytextrank.text_rank(path_stage1)
    pytextrank.render_ranks(graph, ranks)

    result_dict = dict()
    # note: path_stage2 is truncated but left empty while the dump below stays commented out
    with open(path_stage2, 'w') as f2:
        for rl in pytextrank.normalize_key_phrases(path_stage1, ranks):
            _ro = rl._asdict()
            # {text: rank} dict, only needed if the pretty-print dump is re-enabled
            ro = {_ro['text']: _ro['rank']}
            #f2.write("%s\n" % pytextrank.pretty_print(ro))

            result_dict[_ro['text']] = _ro['rank']

    return result_dict
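
A usage sketch for _get_keywords(), assuming a stage-0 file in the {"id": ..., "text": ...} format shown earlier:

keywords = _get_keywords('input.json', 'o2.json')
for text, rank in sorted(keywords.items(), key=lambda kv: kv[1], reverse=True):
    print(rank, text)
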
Code example #11
def stage_1():
    cur_dir = os.path.dirname(__file__)
    data_dir = os.path.join(cur_dir, "..", "cnn")
    db = StoriesCollection(data_dir)

    result_dir = stage_1_dir
    if os.path.exists(result_dir):
        shutil.rmtree(result_dir, ignore_errors=True)
    os.mkdir(result_dir)
    os.chdir(result_dir)

    while not db.was_cycle():
        cur_corpus = db.get_next_corpus_textrank()
        cur_doc_name = db.get_cur_doc_name()

        print(cur_doc_name)

        # drop the original extension (assumed to be 6 chars, e.g. ".story") and write JSON
        with codecs.open(cur_doc_name[:-6] + '.json', "w+",
                         "utf_8_sig") as file:
            # parse_next_doc appears to be a project-local wrapper, not part of pytextrank
            for graf in parse_next_doc(cur_corpus, db.get_cur_id()):
                file.write("%s\n" % pytextrank.pretty_print(graf._asdict()))

    os.chdir(cur_dir)
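
The stage_* functions (examples #11, #3, and #9) appear to form one pipeline over the CNN stories corpus; a plausible driver, assuming the directory globals sketched after example #3:

if __name__ == '__main__':
    stage_1()  # parse each story into a per-document JSON file
    stage_2()  # rank and normalize key phrases per document
    stage_3()  # score and save the top sentences per document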