import json
import re

import pytextrank

# NOTE: path_stage1 and path_stage2 are referenced below but never defined in
# this function; they are assumed to be module-level file paths.

def text_rank(json_request):
    # Pull the title (TI) and abstract (AB) fields out of the RIS record.
    pattern = re.compile("TI - (.*?)\\r|AB - (.*?)\\r")
    matches = re.findall(pattern, json_request['ris'])

    all_inputs = []
    for section in matches:
        # Each match is a (title, abstract) tuple; one group is always empty.
        all_inputs.append((''.join([word + ' ' for word in section])).strip())

    input_json = {'id': "0", 'text': '.'.join(all_inputs)}
    with open('ris_extracted.json', 'w') as output:
        json.dump(input_json, output)

    # Stage 1: statistical parsing/tagging of the JSON document.
    with open(path_stage1, 'w') as f:
        for graf in pytextrank.parse_doc(pytextrank.json_iter('ris_extracted.json')):
            f.write("%s\n" % pytextrank.pretty_print(graf._asdict()))

    # Stage 2: rank, then collect and normalize the key phrases.
    graph, ranks = pytextrank.text_rank(path_stage1)

    with open(path_stage2, 'w') as f:
        for rl in pytextrank.normalize_key_phrases(path_stage1, ranks):
            f.write("%s\n" % pytextrank.pretty_print(rl._asdict()))

    phrases = list(pytextrank.limit_keyphrases(path_stage2, phrase_limit=20))
    return phrases
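# A minimal usage sketch for text_rank() above. The RIS payload is an
# illustrative assumption; path_stage1 and path_stage2 must be set at module
# level before calling.
sample_request = {
    'ris': "TI - Sample study title\rAB - Sample abstract text for ranking.\r"
}
for phrase in text_rank(sample_request):
    print(phrase)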
def summarize(self, _id, content_text, word_limit):
    self.logger.log("_id: " + _id)
    self.logger.log("word_limit: " + str(word_limit))

    # File names
    path_stage0 = 'process/' + _id + '.json'
    path_stage1 = 'process/' + _id + '_o1.json'
    path_stage2 = 'process/' + _id + '_o2.json'
    path_stage3 = 'process/' + _id + '_o3.json'
    path_stage4 = 'process/' + _id + '_o4.json'

    # Create input file
    with open(path_stage0, 'w') as outfile:
        json.dump({"id": "123", "text": content_text}, outfile)

    # Statistical Parsing - Stage 1
    # Perform statistical parsing/tagging on a document in JSON format
    with open(path_stage1, 'w') as f:
        for graf in pytextrank.parse_doc(pytextrank.json_iter(path_stage0)):
            f.write("%s\n" % pytextrank.pretty_print(graf._asdict()))

    # Ranked Keyphrases - Stage 2
    # Collect and normalize the key phrases from a parsed document
    graph, ranks = pytextrank.text_rank(path_stage1)
    pytextrank.render_ranks(graph, ranks)

    with open(path_stage2, 'w') as f:
        for rl in pytextrank.normalize_key_phrases(path_stage1, ranks):
            f.write("%s\n" % pytextrank.pretty_print(rl._asdict()))

    # Extractive Summarization - Stage 3
    # Calculate a significance weight for each sentence, using MinHash to
    # approximate a Jaccard distance from key phrases determined by TextRank
    kernel = pytextrank.rank_kernel(path_stage2)

    with open(path_stage3, 'w') as f:
        for s in pytextrank.top_sentences(kernel, path_stage1):
            f.write(pytextrank.pretty_print(s._asdict()))
            f.write("\n")

    # Final Output - Stage 4
    # Summarize a document based on most significant sentences and key phrases
    phrases = ", ".join(
        set(pytextrank.limit_keyphrases(path_stage2, phrase_limit=12)))
    sent_iter = sorted(pytextrank.limit_sentences(path_stage3, word_limit=word_limit),
                       key=lambda x: x[1])
    s = []

    for sent_text, idx in sent_iter:
        s.append(pytextrank.make_sentence(sent_text))

    graf_text = " ".join(s)
    return {'excerpts': graf_text, 'keywords': phrases}
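# Hypothetical call of summarize() above: `summarizer` stands in for whatever
# class defines it (the class must provide self.logger), and the 'process/'
# output directory is assumed to exist.
result = summarizer.summarize("demo", "Long input text to condense ...", word_limit=100)
print(result['keywords'])
print(result['excerpts'])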
import codecs
import os
import shutil

import pytextrank

def stage_2():
    cur_dir = os.path.dirname(__file__)
    data_dir = stage_1_dir  # assumed module-level path to the stage-1 output
    ids = os.listdir(data_dir)
    result_dir = stage_2_dir  # assumed module-level path for stage-2 output

    if os.path.exists(result_dir):
        shutil.rmtree(result_dir, ignore_errors=True)
    os.mkdir(result_dir)
    os.chdir(result_dir)

    if not os.path.exists('pictures'):
        os.mkdir('pictures')

    for cur_id in ids:
        if os.path.exists(cur_id):
            continue
        # Build a portable path to this document's stage-1 file.
        cur_file_name = os.path.join(data_dir, cur_id)
        print(cur_id)
        graph, ranks = pytextrank.text_rank(cur_file_name)
        pytextrank.render_ranks(graph, ranks, cur_id)

        with codecs.open(cur_id, "w+", "utf_8_sig") as file:
            for rl in pytextrank.normalize_key_phrases(cur_file_name, ranks):
                file.write("%s\n" % pytextrank.pretty_print(rl._asdict()))

    os.chdir(cur_dir)
import pytextrank

def summarize_text(input_file):
    # seriously f**k this API
    path_stage0 = input_file
    path_stage1 = 'stage1.txt'

    with open(path_stage1, 'w') as f:
        for graf in pytextrank.parse_doc(pytextrank.json_iter(path_stage0)):
            f.write("%s\n" % pytextrank.pretty_print(graf._asdict()))
            # to view output in this notebook
            #print(pytextrank.pretty_print(graf))

    graph, ranks = pytextrank.text_rank(path_stage1)
    pytextrank.render_ranks(graph, ranks)

    path_stage2 = 'stage2.txt'
    with open(path_stage2, 'w') as f:
        for rl in pytextrank.normalize_key_phrases(path_stage1, ranks):
            f.write("%s\n" % pytextrank.pretty_print(rl._asdict()))
            # to view output in this notebook
            #print(pytextrank.pretty_print(rl))

    path_stage3 = 'stage3.txt'
    kernel = pytextrank.rank_kernel(path_stage2)

    with open(path_stage3, 'w') as f:
        for s in pytextrank.top_sentences(kernel, path_stage1):
            f.write(pytextrank.pretty_print(s._asdict()))
            f.write("\n")
            # to view output in this notebook
            #print(pytextrank.pretty_print(s._asdict()))

    phrases = ", ".join(
        set(pytextrank.limit_keyphrases(path_stage2, phrase_limit=12)))
    sent_iter = sorted(pytextrank.limit_sentences(path_stage3, word_limit=120),
                       key=lambda x: x[1])
    s = []

    for sent_text, idx in sent_iter:
        s.append(pytextrank.make_sentence(sent_text))

    graf_text = " ".join(s)
    #print("**excerpts:** %s\n\n**keywords:** %s" % (graf_text, phrases,))
    return graf_text
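# Example call of summarize_text(); the input file name is illustrative and
# must hold a stage-0 JSON document such as {"id": "1", "text": "..."}:
print(summarize_text('input.json'))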
def perform_statistical_parsing_tagging(self, text_file, paragraph_output):
    """
    Perform statistical parsing and tagging of sentences in the text
    (aka JSON document)
    """
    with open(paragraph_output, 'w') as temp_file:
        for paragraph in pytextrank.parse_doc(pytextrank.json_iter(text_file)):
            temp_file.write("%s\n" % pytextrank.pretty_print(paragraph._asdict()))
def RankedGraph(self, parse_list):
    # `xang` is assumed to be this module's alias for pytextrank, and
    # `pretty_print` a bare import of its helper.
    graph, ranks = xang.text_rank(parse_list)
    norm_rank = xang.normalize_key_phrases(parse_list, ranks, self.stopwords,
                                           self.spacy_nlp, self.skip_ner)
    norm_rank_list = [json.loads(pretty_print(rl._asdict())) for rl in norm_rank]
    phrases = ", ".join(
        set(xang.limit_keyphrases(norm_rank_list, self.phrase_limit)))

    # return a matrix-like result for the top keywords
    kernel = xang.rank_kernel(norm_rank_list)

    self.phrases = phrases
    self.kernel = kernel
    return kernel
def collect_and_normalise_key_phrases(self, paragraph_output, key_phrases_output):
    """
    Collect and normalise key phrases from the sentences in the paragraph
    (in the JSON doc). Rank them using PyTextRank; return a graph and
    ranked tokens.
    """
    graph, token_ranks = pytextrank.text_rank(paragraph_output)
    pytextrank.render_ranks(graph, token_ranks)

    with open(key_phrases_output, 'w') as temp_file:
        for relationship in pytextrank.normalize_key_phrases(paragraph_output, token_ranks):
            temp_file.write("%s\n" % pytextrank.pretty_print(relationship._asdict()))

    return graph, token_ranks
def calculate_sentence_significance(self, paragraph_output, key_phrases_output,
                                    top_sentences_output, top_n_sentences):
    """
    Calculate the significance of each sentence based on the ranking.
    Output is limited to the top n sentences.
    """
    kernel = pytextrank.rank_kernel(key_phrases_output)

    with open(top_sentences_output, 'w') as temp_file:
        counter = 0
        for sentence in pytextrank.top_sentences(kernel, paragraph_output):
            if counter < top_n_sentences:
                temp_file.write(pytextrank.pretty_print(sentence._asdict()))
                temp_file.write("\n")
            else:
                return
            counter += 1
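# A hypothetical end-to-end call of the three methods above, assuming they
# belong to one summarizer class whose instance is `summarizer`, and that the
# intermediate file names are free to use:
summarizer.perform_statistical_parsing_tagging('doc.json', 'o1.json')
graph, token_ranks = summarizer.collect_and_normalise_key_phrases('o1.json', 'o2.json')
summarizer.calculate_sentence_significance('o1.json', 'o2.json', 'o3.json',
                                           top_n_sentences=5)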
import codecs
import os
import shutil

import pytextrank

def stage_3():
    cur_dir = os.path.dirname(__file__)
    data_dir = stage_1_dir  # assumed module-level path to the stage-1 output
    ids = os.listdir(data_dir)
    result_dir = stage_3_dir  # assumed module-level path for stage-3 output

    if os.path.exists(result_dir):
        shutil.rmtree(result_dir, ignore_errors=True)
    os.mkdir(result_dir)
    os.chdir(result_dir)

    for cur_id in ids:
        print(cur_id)
        kernel = pytextrank.rank_kernel(os.path.join(stage_2_dir, cur_id))

        with codecs.open(cur_id, "w+", "utf_8_sig") as file:
            for s in pytextrank.top_sentences(kernel, os.path.join(stage_1_dir, cur_id)):
                file.write(pytextrank.pretty_print(s._asdict()))
                file.write("\n")

    os.chdir(cur_dir)
import pytextrank

def _get_keywords(path_stage0, path_stage2):
    # Stage 1: parse doc
    path_stage1 = 'o1.json'
    with open(path_stage1, 'w') as f:
        for graf in pytextrank.parse_doc(pytextrank.json_iter(path_stage0)):
            f.write("%s\n" % pytextrank.pretty_print(graf._asdict()))

    # Stage 2: rank words
    graph, ranks = pytextrank.text_rank(path_stage1)
    pytextrank.render_ranks(graph, ranks)

    result_dict = dict()
    with open(path_stage2, 'w') as f2:
        for rl in pytextrank.normalize_key_phrases(path_stage1, ranks):
            _ro = rl._asdict()
            ro = dict()
            ro[_ro['text']] = _ro['rank']
            #f2.write("%s\n" % pytextrank.pretty_print(ro))
            result_dict[_ro['text']] = _ro['rank']

    return result_dict
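# Illustrative use of _get_keywords(); 'stage0.json' must already contain a
# {"id": ..., "text": ...} document, and 'o2.json' receives the normalized
# key phrases.
keywords = _get_keywords('stage0.json', 'o2.json')
for text, rank in sorted(keywords.items(), key=lambda kv: kv[1], reverse=True):
    print("%0.4f  %s" % (rank, text))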
import codecs
import os
import shutil

import pytextrank

def stage_1():
    cur_dir = os.path.dirname(__file__)
    data_dir = os.path.join(cur_dir, "..", "cnn")
    # StoriesCollection and parse_next_doc are project-local helpers.
    db = StoriesCollection(data_dir)
    result_dir = stage_1_dir  # assumed module-level path for stage-1 output

    if os.path.exists(result_dir):
        shutil.rmtree(result_dir, ignore_errors=True)
    os.mkdir(result_dir)
    os.chdir(result_dir)

    while not db.was_cycle():
        cur_corpus = db.get_next_corpus_textrank()
        cur_doc_name = db.get_cur_doc_name()
        print(cur_doc_name)

        with codecs.open(cur_doc_name[:-6] + '.json', "w+", "utf_8_sig") as file:
            for graf in parse_next_doc(cur_corpus, db.get_cur_id()):
                file.write("%s\n" % pytextrank.pretty_print(graf._asdict()))

    os.chdir(cur_dir)
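# Hypothetical driver tying the three stage functions together; stage_1_dir,
# stage_2_dir, and stage_3_dir are assumed module-level directory paths.
if __name__ == "__main__":
    stage_1()   # parse the CNN stories into stage-1 JSON
    stage_2()   # rank and normalize key phrases per document
    stage_3()   # score top sentences from the stage-2 kernels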