def generateGraph(text, outputfile, outputdir, plotGraph=False):
    """Run the first two PyTextRank stages on *text* and store the key phrases.

    Stage 1 (parsing/tagging) is written to ``<prefix>_o1.json`` inside
    *outputdir*, where ``<prefix>`` is the part of *outputfile* before its
    first underscore.  Stage 2 (ranked, normalized key phrases) is written to
    ``outputdir/outputfile``, one serialized record per line.

    Parameters
    ----------
    text
        Raw text handed to ``txtToJson.textTojson``.
    outputfile
        File name of the stage-2 output, created inside *outputdir*.
    outputdir
        Directory receiving the temporary, stage-1 and stage-2 files.
    plotGraph
        When true, draw the TextRank graph with matplotlib after ranking.
    """
    print('Generating Graph...')

    temp_file = os.path.join(outputdir, 'temp.json')
    path_stage1 = os.path.join(outputdir, outputfile.split("_")[0] + '_o1.json')
    path_stage2 = os.path.join(outputdir, outputfile)

    try:
        # Stage 1: statistical parsing/tagging of the document.
        txtToJson.textTojson(text, temp_file)
        with open(path_stage1, 'w') as f:
            for graf in pytextrank.parse_doc(pytextrank.json_iter(temp_file)):
                f.write("%s\n" % pytextrank.pretty_print(graf._asdict()))

        # Stage 2: collect and normalize the key phrases from the parsed doc.
        graph, ranks = pytextrank.text_rank(path_stage1)
        pytextrank.render_ranks(graph, ranks)

        # Drop a stale result first.  This must target the file inside
        # outputdir; removing the bare ``outputfile`` name would delete an
        # unrelated file in the current working directory.
        try:
            os.remove(path_stage2)
        except OSError:
            pass

        with open(path_stage2, 'w') as f:
            for rl in pytextrank.normalize_key_phrases(path_stage1, ranks):
                f.write("%s\n" % pytextrank.pretty_print(rl._asdict()))
    finally:
        # The temporary JSON input is removed even when a stage fails.
        try:
            os.remove(temp_file)
        except OSError:
            pass

    if plotGraph:
        matplotlib.rcParams['figure.figsize'] = (15.0, 15.0)
        networkx.draw_networkx(graph)
        plt.show()
        nx.draw(graph, with_labels=True)
        plt.show()
def one(text):
    """Return the ranked key phrases of *text* as ``(phrase, rank)`` tuples.

    The text is written to ``tempfile.json``, parsed into ``o1.json`` and
    ranked with TextRank.  ``o2.json`` is created (empty) as in the original
    pipeline layout.  ``tempfile.json`` is removed before returning.

    Parameters
    ----------
    text
        Raw document text; may contain quotes, backslashes or newlines.

    Returns
    -------
    list
        One ``(text, rank)`` tuple per normalized key phrase, in the order
        produced by ``normalize_key_phrases``.
    """
    import json  # local import: only this block is known to need it

    path_stage0 = "tempfile.json"
    path_stage1 = "o1.json"
    path_stage2 = "o2.json"

    # Serialize with json so that special characters in ``text`` are escaped;
    # concatenating the raw text into a JSON literal yields invalid JSON.
    with open(path_stage0, "w") as f:
        json.dump({"id": "777", "text": text}, f)

    try:
        with open(path_stage1, 'w') as f:
            for graf in parse_doc(json_iter(path_stage0)):
                f.write("%s\n" % pretty_print(graf._asdict()))

        graph, ranks = text_rank(path_stage1)
        render_ranks(graph, ranks)

        outputs = []
        # Nothing is written to the stage-2 file; it is only created/truncated.
        with open(path_stage2, 'w'):
            for rl in normalize_key_phrases(path_stage1, ranks):
                # Read the fields directly instead of serializing the record
                # and parsing it back as a Python literal.
                record = rl._asdict()
                outputs.append((record["text"], record["rank"]))
    finally:
        try:
            os.remove(path_stage0)
        except OSError:
            pass

    return outputs


# Example:
# text = "The earliest recorded model for planetary motions proposed by Ptolemy about 2000 years ago was a ‘geocentric’ model in which all celestial objects, stars, the sun and the planets, all revolved around the earth."
# print(one(text))
def stage_2():
    """Rank every stage-1 file in ``stage_1_dir`` and write results to ``stage_2_dir``.

    ``stage_2_dir`` is deleted and recreated, together with a ``pictures``
    sub-directory.  For each entry of ``stage_1_dir`` a file of the same name
    is written into ``stage_2_dir`` (UTF-8 with BOM), holding one serialized
    key-phrase record per line.  The working directory is switched to
    ``stage_2_dir`` while processing and set to this module's directory
    afterwards, also when a file fails.
    """
    # Resolve both paths before chdir: a relative path (or an empty dirname
    # when the script is run from its own directory) is unusable afterwards.
    cur_dir = os.path.dirname(os.path.abspath(__file__))
    data_dir = os.path.abspath(stage_1_dir)
    ids = os.listdir(data_dir)

    result_dir = stage_2_dir
    if os.path.exists(result_dir):
        shutil.rmtree(result_dir, ignore_errors=True)
    os.mkdir(result_dir)

    os.chdir(result_dir)
    try:
        if not os.path.exists('pictures'):
            os.mkdir('pictures')

        for cur_id in ids:
            if os.path.exists(cur_id):
                continue
            # os.path.join instead of a hard-coded "\\" keeps this portable.
            cur_file_name = os.path.join(data_dir, cur_id)
            print(cur_id)

            graph, ranks = pytextrank.text_rank(cur_file_name)
            pytextrank.render_ranks(graph, ranks, cur_id)

            with codecs.open(cur_id, "w+", "utf_8_sig") as file:
                for rl in pytextrank.normalize_key_phrases(cur_file_name, ranks):
                    file.write("%s\n" % pytextrank.pretty_print(rl._asdict()))
    finally:
        os.chdir(cur_dir)
def collect_and_normalise_key_phrases(self, paragraph_output, key_phrases_output):
    """
    Rank the tokens of a parsed document and store its normalised key phrases.

    Parameters
    ==========
    paragraph_output:
        path of the tagged and parsed JSON document (stage-1 output)
    key_phrases_output:
        path of the text file that receives one serialized key phrase per line

    Return
    ======
    The ``(graph, token_ranks)`` pair produced by ``pytextrank.text_rank``
    """
    graph, token_ranks = pytextrank.text_rank(paragraph_output)
    pytextrank.render_ranks(graph, token_ranks)

    key_phrases = pytextrank.normalize_key_phrases(paragraph_output, token_ranks)
    with open(key_phrases_output, 'w') as sink:
        for phrase in key_phrases:
            serialized = pytextrank.pretty_print(phrase._asdict())
            sink.write("%s\n" % serialized)

    return graph, token_ranks
def stage2(path_stage1, path_stage2):
    """Stage 2: rank the parsed document and write its key phrases.

    Reads the stage-1 file at *path_stage1*, runs TextRank on it and writes
    one serialized key-phrase record per line to *path_stage2*.
    """
    graph, ranks = pytextrank.text_rank(path_stage1)
    pytextrank.render_ranks(graph, ranks)

    ranked_phrases = pytextrank.normalize_key_phrases(path_stage1, ranks)
    with open(path_stage2, 'w') as out:
        for phrase in ranked_phrases:
            record = pytextrank.pretty_print(phrase._asdict())
            out.write("%s\n" % record)
def summarize(self, _id, content_text, word_limit):
    """Summarize *content_text* with the four-stage PyTextRank pipeline.

    Intermediate files are written under ``process/`` and are named after
    *_id* (``<_id>.json``, ``<_id>_o1.json`` ... ``<_id>_o3.json``).

    Parameters
    ----------
    _id
        String identifier used for logging and for the intermediate file names.
    content_text
        The text to summarize.
    word_limit
        Passed to ``pytextrank.limit_sentences`` as ``word_limit``.

    Returns
    -------
    dict
        ``{'excerpts': <selected sentences joined by spaces>,
        'keywords': <comma-separated key phrases>}``.
    """
    self.logger.log("_id: " + _id)
    self.logger.log("word_limit: " + str(word_limit))

    # File names
    path_stage0 = 'process/' + _id + '.json'
    path_stage1 = 'process/' + _id + '_o1.json'
    path_stage2 = 'process/' + _id + '_o2.json'
    path_stage3 = 'process/' + _id + '_o3.json'

    # Create input file
    with open(path_stage0, 'w') as outfile:
        json.dump({"id": "123", "text": content_text}, outfile)

    # Stage 1: statistical parsing/tagging of the JSON document.
    with open(path_stage1, 'w') as f:
        for graf in pytextrank.parse_doc(pytextrank.json_iter(path_stage0)):
            f.write("%s\n" % pytextrank.pretty_print(graf._asdict()))

    # Stage 2: collect and normalize the ranked key phrases.
    graph, ranks = pytextrank.text_rank(path_stage1)
    pytextrank.render_ranks(graph, ranks)
    with open(path_stage2, 'w') as f:
        for rl in pytextrank.normalize_key_phrases(path_stage1, ranks):
            f.write("%s\n" % pytextrank.pretty_print(rl._asdict()))

    # Stage 3: score each sentence against the key-phrase kernel.
    kernel = pytextrank.rank_kernel(path_stage2)
    with open(path_stage3, 'w') as f:
        for s in pytextrank.top_sentences(kernel, path_stage1):
            f.write(pytextrank.pretty_print(s._asdict()))
            f.write("\n")

    # Stage 4: assemble the summary.
    # dict.fromkeys removes duplicates while keeping the order in which the
    # phrases were produced; a set would make the keyword order vary from
    # run to run.
    phrases = ", ".join(
        dict.fromkeys(
            pytextrank.limit_keyphrases(path_stage2, phrase_limit=12)))

    # Sort the selected sentences by the second element of each pair.
    sent_iter = sorted(
        pytextrank.limit_sentences(path_stage3, word_limit=word_limit),
        key=lambda x: x[1])
    graf_text = " ".join(
        pytextrank.make_sentence(sent_text) for sent_text, _ in sent_iter)

    return {'excerpts': graf_text, 'keywords': phrases}
def pred_net(sample_case):
    """Classify a text by feeding an image of its TextRank graph to a Keras model.

    The text is parsed and ranked with PyTextRank, the resulting graph is
    drawn to ``sample_case.png``, and the greyscale, resized image is passed
    to the model loaded from ``graph_conv_autoencoder.hdf5``.  The label with
    the highest predicted score is looked up in ``external_resources.json``.

    Parameters
    ----------
    sample_case
        The text to classify.

    Returns
    -------
    tuple
        ``(diagnosis, about, treatment)``.  ``about`` and ``treatment`` are
        ``None`` when ``external_resources.json`` has no entry whose
        ``'diagnosis'`` equals the predicted label.
    """
    import numpy as np
    import keras
    from pathlib import Path
    from spacy import displacy
    from PIL import Image
    import json
    import pytextrank
    import networkx as nx
    import matplotlib.pyplot as plt

    path_stage0 = "o0.json"
    path_stage1 = "o1.json"

    with open(path_stage0, 'w') as outfile:
        json.dump({"id": 0, "text": sample_case}, outfile)

    # Stage 1: parse/tag the document, echoing every paragraph record.
    with open(path_stage1, 'w') as f:
        for graf in pytextrank.parse_doc(pytextrank.json_iter(path_stage0)):
            f.write("%s\n" % pytextrank.pretty_print(graf._asdict()))
            print(pytextrank.pretty_print(graf._asdict()))

    # Stage 2: build the TextRank graph and render it to an image file.
    graph, ranks = pytextrank.text_rank(path_stage1)
    pytextrank.render_ranks(graph, ranks)
    nx.draw(graph, with_labels=True)
    plt.savefig("sample_case.png", dpi=200, format='png', bbox_inches='tight')
    plt.close()

    # Greyscale, resize, then add a leading batch axis and a trailing
    # single-channel axis before prediction.
    im = Image.open("sample_case.png").convert('L').resize((300, 200))
    sample_image = np.array([np.array(im)])
    sample_image = sample_image.reshape(sample_image.shape + (1,))

    model = keras.models.load_model("graph_conv_autoencoder.hdf5")
    y_pred = model.predict(sample_image)

    labels = [
        'Major Depressive Disorder',
        'Attention Deficit Hyperactivity Disorder',
        'Oppositional Defiant Disorder',
        'Conduct Disorder',
        'Pervasive Developmental Disorder',
        'Intellectual Disability (Mental Retardation)',
        'Psychotic Disorder',
        'Adjustment Disorder',
        'Mood Disorder',
        'General Anxiety Disorder',
        'Social Anxiety Disorder',
        'Seasonal Affective Disorder',
        'Substance Abuse',
        'Autism Spectrum Disorder',
    ]
    max1 = labels[np.argmax(y_pred)]

    # Defaults prevent an UnboundLocalError when no resource entry matches.
    about1 = None
    treatment1 = None
    with open('external_resources.json') as data_file:
        for v in json.load(data_file):
            if v['diagnosis'] == max1:
                about1, treatment1 = v['about'], v['treatment']

    return (max1, about1, treatment1)
def execute_stage_two(path_stage1):
    """Rank the stage-1 file and write its key phrases to a derived stage-2 file.

    *path_stage1* is resolved relative to ``PATH_PREFIX``.  The stage-2 name
    is built by splitting *path_stage1* on ``'.'``, replacing the first
    ``'stage1'`` component with ``'stage2'`` and joining the parts with
    ``'-'``.  A ``ValueError`` is raised if no component equals ``'stage1'``.

    Returns the stage-2 file name (relative to ``PATH_PREFIX``).
    """
    stage1_location = os.path.join(PATH_PREFIX, path_stage1)

    graph, ranks = pytextrank.text_rank(stage1_location)
    pytextrank.render_ranks(graph, ranks)

    # NOTE(review): components are split on '.' but re-joined with '-', so
    # the extension separator changes too -- confirm this is intended.
    name_parts = path_stage1.split('.')
    stage_position = name_parts.index('stage1')
    name_parts[stage_position] = 'stage2'
    path_stage2 = '-'.join(name_parts)

    ranked_phrases = pytextrank.normalize_key_phrases(
        stage1_location, ranks, stopwords=stopwords)
    with open(os.path.join(PATH_PREFIX, path_stage2), 'w') as out:
        for phrase in ranked_phrases:
            out.write("%s\n" % pytextrank.pretty_print(phrase._asdict()))

    return path_stage2
def collect_and_normalise_key_phrases(self, paragraph_output, key_phrases_output):
    """Run TextRank on a parsed document and write out its key phrases.

    *paragraph_output* is the path of the parsed (stage-1) document;
    *key_phrases_output* is the path that receives one serialized key phrase
    per line.  Returns the ``(graph, token_ranks)`` pair obtained from
    ``pytextrank.text_rank``.
    """
    graph, token_ranks = pytextrank.text_rank(paragraph_output)
    pytextrank.render_ranks(graph, token_ranks)

    normalised = pytextrank.normalize_key_phrases(paragraph_output, token_ranks)
    with open(key_phrases_output, 'w') as out_file:
        for entry in normalised:
            line = pytextrank.pretty_print(entry._asdict())
            out_file.write("%s\n" % line)

    return graph, token_ranks
def collect_and_normalise_key_phrases(self, paragraph_output, key_phrases_output):
    """
    Rank the tokens of the parsed paragraph document with PyTextRank, write
    the normalised key phrases (one serialized record per line) to
    ``key_phrases_output`` and return the graph together with the token ranks.
    """
    ranking = pytextrank.text_rank(paragraph_output)
    graph, token_ranks = ranking
    pytextrank.render_ranks(graph, token_ranks)

    with open(key_phrases_output, 'w') as phrase_file:
        for key_phrase in pytextrank.normalize_key_phrases(paragraph_output,
                                                           token_ranks):
            as_text = pytextrank.pretty_print(key_phrase._asdict())
            phrase_file.write("%s\n" % as_text)

    return graph, token_ranks
def summarize_text(input_file):
    """Return an extractive summary of the JSON document at *input_file*.

    Runs PyTextRank stages 1-3, writing the intermediate results to
    ``stage1.txt``, ``stage2.txt`` and ``stage3.txt`` in the current
    directory, then joins the sentences selected with ``word_limit=120``.

    Parameters
    ----------
    input_file
        Path of the input document, read with ``pytextrank.json_iter``.

    Returns
    -------
    str
        The selected sentences, joined by single spaces.
    """
    path_stage0 = input_file
    path_stage1 = 'stage1.txt'
    path_stage2 = 'stage2.txt'
    path_stage3 = 'stage3.txt'

    # Stage 1: parse and tag the document.
    with open(path_stage1, 'w') as f:
        for graf in pytextrank.parse_doc(pytextrank.json_iter(path_stage0)):
            f.write("%s\n" % pytextrank.pretty_print(graf._asdict()))

    # Stage 2: rank and normalize the key phrases.
    graph, ranks = pytextrank.text_rank(path_stage1)
    pytextrank.render_ranks(graph, ranks)
    with open(path_stage2, 'w') as f:
        for rl in pytextrank.normalize_key_phrases(path_stage1, ranks):
            f.write("%s\n" % pytextrank.pretty_print(rl._asdict()))

    # Stage 3: score the sentences against the key-phrase kernel.
    kernel = pytextrank.rank_kernel(path_stage2)
    with open(path_stage3, 'w') as f:
        for s in pytextrank.top_sentences(kernel, path_stage1):
            f.write(pytextrank.pretty_print(s._asdict()))
            f.write("\n")

    # Stage 4: only the sentence summary is returned, so the key-phrase list
    # that was previously built here and discarded is no longer computed.
    sent_iter = sorted(
        pytextrank.limit_sentences(path_stage3, word_limit=120),
        key=lambda x: x[1])
    return ' '.join(
        pytextrank.make_sentence(sent_text) for sent_text, _ in sent_iter)
def _get_keywords(path_stage0, path_stage2):
    """Map each key phrase of the document at *path_stage0* to its rank.

    The parsed document is written to ``o1.json``.  *path_stage2* is opened
    for writing (and therefore created/truncated) but nothing is written to it.

    Returns a dict ``{phrase text: rank}``.
    """
    # Stage 1: parse doc
    path_stage1 = 'o1.json'
    with open(path_stage1, 'w') as parsed_out:
        for paragraph in pytextrank.parse_doc(pytextrank.json_iter(path_stage0)):
            parsed_out.write(
                "%s\n" % pytextrank.pretty_print(paragraph._asdict()))

    # Stage 2: rank words
    graph, ranks = pytextrank.text_rank(path_stage1)
    pytextrank.render_ranks(graph, ranks)

    keyword_ranks = {}
    with open(path_stage2, 'w'):
        for lexeme in pytextrank.normalize_key_phrases(path_stage1, ranks):
            fields = lexeme._asdict()
            keyword_ranks[fields['text']] = fields['rank']

    return keyword_ranks
def rank_bill(bill):
    """Run TextRank stages 1-3 for one bill and write the stage files.

    The files ``<prefix>/<bill_id>_stage1``, ``..._stage2`` and ``..._stage3``
    are written with one serialized record per line.

    Parameters
    ----------
    bill
        Mapping with a ``'bill_id'`` key; it is passed as the only document
        to ``parse_doc``.
    """
    bill_id = bill['bill_id']
    path_stage1 = prefix + '/{}_stage1'.format(bill_id)
    path_stage2 = prefix + '/{}_stage2'.format(bill_id)
    path_stage3 = prefix + '/{}_stage3'.format(bill_id)

    # Stage 1: parsed paragraphs.
    with open(path_stage1, 'w') as f:
        for graf in parse_doc([bill]):
            f.write(pretty_print(graf._asdict()))
            f.write('\n')

    # Stage 2: ranked key phrases.  The file is opened once so that every
    # phrase is kept; reopening it in 'w' mode per phrase left only the last
    # one.  It is also created when there are no phrases at all.
    graph, ranks = text_rank(path_stage1)
    render_ranks(graph, ranks)
    with open(path_stage2, 'w') as f:
        for rl in normalize_key_phrases(path_stage1, ranks):
            f.write(pretty_print(rl._asdict()))
            f.write('\n')

    # Stage 3: top sentences, newline-separated like the other stage files.
    kernel = rank_kernel(path_stage2)
    with open(path_stage3, 'w') as f:
        for s in top_sentences(kernel, path_stage1):
            f.write(pretty_print(s._asdict()))
            f.write('\n')
def do_pytextrank(data):
    """Extract key phrases for every sub-item in *data* and print heuristics.

    For each sub-item the ``'description'`` text is run through PyTextRank
    stages 1 and 2 using scratch files in the current directory, which are
    deleted afterwards.  The ranked phrases are split by whether the
    second-to-last field of each record contains ``'nn'``, and each group is
    passed to ``do_heuristic``.

    Parameters
    ----------
    data
        Mapping whose values are iterables of sub-items; each sub-item
        provides ``'id'`` and ``'description'``.
    """
    scratch_files = ('sub_item.json', 'stage1_output.json',
                     'stage2_output.json', 'graph.dot')

    for item in data:
        for subItem in data[item]:
            print('###############')
            print('description:', subItem['description'])

            # using pytextrank
            # reference https://github.com/ceteri/pytextrank/issues/18
            # raw input
            subItemJSON = json.dumps({
                'id': subItem['id'],
                'text': subItem['description']
            })
            with open('sub_item.json', 'w') as outFile:
                outFile.write(subItemJSON)

            # stage 1
            with open('stage1_output.json', 'w') as outFile:
                for graf in pytextrank.parse_doc(
                        pytextrank.json_iter('sub_item.json')):
                    outFile.write("%s\n" %
                                  pytextrank.pretty_print(graf._asdict()))

            # stage 2
            graph, ranks = pytextrank.text_rank('stage1_output.json')
            pytextrank.render_ranks(graph, ranks)

            rlLists = []
            print('key phrases:')
            with open('stage2_output.json', 'w') as outFile:
                for rl in pytextrank.normalize_key_phrases(
                        'stage1_output.json', ranks):
                    # Parse the serialized record as data rather than
                    # executing it with eval().
                    rlList = json.loads(pytextrank.pretty_print(rl))
                    rlLists.append(rlList)
                    print(rlList)

            # cleanup: portable replacement for `rm -f`; missing files are
            # ignored.
            for scratch in scratch_files:
                try:
                    os.remove(scratch)
                except OSError:
                    pass

            # input: phrases whose second-to-last field lacks 'nn'
            # (presumably the part-of-speech tag -- this is a heuristic)
            filteredRlLists = [x for x in rlLists if 'nn' not in x[-2]]
            if not filteredRlLists:
                # invalid case
                continue
            heuristic, iOItem = do_heuristic(subItem, filteredRlLists)
            print('heuristic:', heuristic)
            print('i/o input:', iOItem)

            # output: phrases whose second-to-last field contains 'nn'
            filteredRlLists = [x for x in rlLists if 'nn' in x[-2]]
            if not filteredRlLists:
                # invalid case
                continue
            heuristic, iOItem = do_heuristic(subItem, filteredRlLists)
            print('heuristic:', heuristic)
            print('i/o output:', iOItem)

            print('###############')
def extract_phrasesfrom_textrank(corpus):
    """Extract TextRank key phrases for each text in *corpus*.

    Each text is processed separately through PyTextRank stages 1-3, reusing
    the scratch files ``celebrity1_tweet.json`` .. ``celebrity3_tweet.json``.
    Both result tables are also saved as CSV to ``all_phrasefile_path`` (all
    texts) and ``phrase_filepath`` (celebrity-related texts only).

    Parameters
    ----------
    corpus
        Sequence of texts; it is wrapped in a DataFrame whose index supplies
        the document ids.

    Returns
    -------
    tuple
        ``(new_df_tweet, cleaned_df_tweet)``: a frame with columns
        ``text``/``keywords`` for every text, and a frame with columns
        ``sentences``/``keywords`` limited to rows whose keyword string
        contains one of the celebrity names.
    """
    record_data = pd.DataFrame({'sentences': corpus})
    record_data = pd.DataFrame({
        'id': record_data.index.tolist(),
        'text': record_data['sentences'].tolist()
    })
    tweet_items = record_data.to_dict(orient='records')

    path_stage1 = "celebrity1_tweet.json"
    path_stage2 = "celebrity2_tweet.json"
    path_stage3 = "celebrity3_tweet.json"

    # Rows are collected in a list and turned into a frame once:
    # DataFrame.append was removed in pandas 2.0 and copied the whole frame
    # on every call.
    keyword_rows = []
    for item in tweet_items:
        with open(path_stage1, 'w') as f:
            for graf in pytextrank.parse_doc([item]):
                f.write("%s\n" % pytextrank.pretty_print(graf._asdict()))

        graph, ranks = pytextrank.text_rank(path_stage1)
        pytextrank.render_ranks(graph, ranks)
        with open(path_stage2, 'w') as f:
            for rl in pytextrank.normalize_key_phrases(path_stage1, ranks):
                f.write("%s\n" % pytextrank.pretty_print(rl._asdict()))

        kernel = pytextrank.rank_kernel(path_stage2)
        with open(path_stage3, 'w') as f:
            for s in pytextrank.top_sentences(kernel, path_stage1):
                f.write(pytextrank.pretty_print(s._asdict()))
                f.write("\n")

        # dict.fromkeys de-duplicates while keeping the produced order; a
        # set would make the keyword order differ between runs.
        phrases = ", ".join(
            dict.fromkeys(
                pytextrank.limit_keyphrases(path_stage2, phrase_limit=5)))
        keyword_rows.append({'text': item.get('text'), 'keywords': phrases})

    new_df_tweet = pd.DataFrame(keyword_rows, columns=['text', 'keywords'])

    celeb_list = [
        'Bradley Cooper', 'Chris Kyle', 'Clint Eastwood', 'bradley cooper',
        'bradley', 'cooper', 'chris kyle', 'chris', 'kyle', 'clint eastwood',
        'clint', 'eastwood'
    ]
    # Substring match against the joined keyword string, as before.
    cleaned_rows = [
        {'sentences': row['text'], 'keywords': row['keywords']}
        for row in keyword_rows
        if any(celeb in row['keywords'] for celeb in celeb_list)
    ]
    cleaned_df_tweet = pd.DataFrame(cleaned_rows,
                                    columns=['sentences', 'keywords'])

    cleaned_df_tweet.to_csv(phrase_filepath,
                            sep=',',
                            encoding='utf-8',
                            index=False)
    new_df_tweet.to_csv(all_phrasefile_path,
                        sep=',',
                        encoding='utf-8',
                        index=False)
    return new_df_tweet, cleaned_df_tweet
#!/usr/bin/env python
# encoding: utf-8

from pytextrank import (
    normalize_key_phrases,
    pretty_print,
    render_ranks,
    text_rank,
)
import sys

## Stage 2:
##  * collect and normalize the key phrases from a parsed document
##
## INPUTS: <stage1>
## OUTPUT: JSON format `RankedLexeme(text, rank, ids, pos)`

if __name__ == "__main__":
    # The only command-line argument is the path of the stage-1 output.
    path_stage1 = sys.argv[1]

    graph, ranks = text_rank(path_stage1)
    render_ranks(graph, ranks)

    # Emit one serialized key-phrase record per line on stdout.
    for rl in normalize_key_phrases(path_stage1, ranks):
        serialized = pretty_print(rl._asdict())
        print(serialized)
# Stage 1: parse the input document and store the tagged paragraphs,
# echoing each one to stdout.
path_stage0 = "../tests/pytextrank_dat/mih.json"
path_stage1 = "../tests/pytextrank_dat/o1.json"

with open(path_stage1, 'w') as f:
    for graf in ptr.parse_doc(ptr.json_iter(path_stage0)):
        print(ptr.pretty_print(graf._asdict()), file=f)
        print(ptr.pretty_print(graf))

# Stage 2: run TextRank on the parsed document and store the normalized
# key phrases, echoing each one to stdout.
path_stage2 = "../tests/pytextrank_dat/o2.json"

graph, ranks = ptr.text_rank(path_stage1)
ptr.render_ranks(graph, ranks)

with open(path_stage2, 'w') as f:
    for rl in ptr.normalize_key_phrases(path_stage1, ranks):
        print(ptr.pretty_print(rl._asdict()), file=f)
        print(ptr.pretty_print(rl))

# Stage 3: draw the ranked graph.  Displaying it is left disabled:
# import pylab as plt
# plt.show()
import networkx as nx

nx.draw(graph, with_labels=True)

path_stage3 = "../tests/pytextrank_dat/o3.json"
def keyPhrases():
    """Rank the parsed document in ``temp2.json`` and write its key phrases.

    One serialized key-phrase record per line is written to ``temp3.json``.
    """
    parsed_path = 'temp2.json'

    graph, ranks = pytextrank.text_rank(parsed_path)
    pytextrank.render_ranks(graph, ranks)

    ranked_phrases = pytextrank.normalize_key_phrases(parsed_path, ranks)
    with open('temp3.json', 'w') as out:
        for phrase in ranked_phrases:
            record = pytextrank.pretty_print(phrase._asdict())
            out.write("%s\n" % record)
json.dump(loaded_file_dic, outfile) path_stage0 = "test.json" path_stage1 = "o1.json" # Extract keyword using pytextrank with open(path_stage1, 'w') as f: for graf in pytextrank.parse_doc(pytextrank.json_iter(path_stage0)): f.write("%s\n" % pytextrank.pretty_print(graf._asdict())) #print(pytextrank.pretty_print(graf._asdict())) path_stage1 = "o1.json" path_stage2 = "o2.json" graph, ranks = pytextrank.text_rank(path_stage1) pytextrank.render_ranks(graph, ranks) with open(path_stage2, 'w') as f: for rl in pytextrank.normalize_key_phrases(path_stage1, ranks): f.write("%s\n" % pytextrank.pretty_print(rl._asdict())) #print(pytextrank.pretty_print(rl)) path_stage1 = "o1.json" path_stage2 = "o2.json" path_stage3 = "o3.json" kernel = pytextrank.rank_kernel(path_stage2) with open(path_stage3, 'w') as f: for s in pytextrank.top_sentences(kernel, path_stage1): f.write(pytextrank.pretty_print(s._asdict()))
def insert_key_phrases_into_db(list_of_doc_dicts, doctype, collection):
    '''
    Takes in list of doc dictionaries and a doctype ('comment' or 'post'),
    processes each doc with PyTextRank, obtains key phrases and inserts key
    phrases into document in Mongodb as 'key_phrases' field.

    Each doc dictionary must provide 'id' and 'text'.  Its 'text' is
    truncated in place at the first '\\n_____\\n\\n' marker before processing.
    The scratch files stage0.json .. stage3.json in the current directory
    are overwritten for every document.  A document that fails is reported
    on stdout and skipped; processing continues with the next one.
    '''
    path_stage0 = 'stage0.json'
    path_stage1 = 'stage1.json'
    path_stage2 = 'stage2.json'
    path_stage3 = 'stage3.json'

    total_docs = len(list_of_doc_dicts)
    # NOTE(review): failed ids are collected here but the visible code never
    # returns or reports the list as a whole.
    failed_ids = []

    for i, doc_dict in enumerate(list_of_doc_dicts):
        if i % 50 == 0:
            print(f'processing {i} of {total_docs} documents')

        doc_dict['text'] = doc_dict['text'].split('\n_____\n\n')[0]

        try:
            with open(path_stage0, 'w') as f:
                json.dump(doc_dict, f)

            # Stage 1: parse and tag the document.
            with open(path_stage1, 'w') as f:
                for graf in pytextrank.parse_doc(
                        pytextrank.json_iter(path_stage0)):
                    f.write("%s\n" % pytextrank.pretty_print(graf._asdict()))

            # Stage 2: rank and normalize the key phrases.
            graph, ranks = pytextrank.text_rank(path_stage1)
            pytextrank.render_ranks(graph, ranks)
            with open(path_stage2, 'w') as f:
                for rl in pytextrank.normalize_key_phrases(path_stage1, ranks):
                    f.write("%s\n" % pytextrank.pretty_print(rl._asdict()))

            # Stage 3: score the sentences (kept for the stage-3 file).
            kernel = pytextrank.rank_kernel(path_stage2)
            with open(path_stage3, 'w') as f:
                for s in pytextrank.top_sentences(kernel, path_stage1):
                    f.write(pytextrank.pretty_print(s._asdict()))
                    f.write("\n")

            # Stage 4: de-duplicate the phrases while keeping their order;
            # a set would store them in an order that varies between runs.
            phrase_list = list(
                dict.fromkeys(
                    pytextrank.limit_keyphrases(path_stage2,
                                                phrase_limit=15)))

            collection.update_one(
                {f'{doctype}_id': {'$eq': doc_dict['id']}},
                {'$set': {'key_phrases': phrase_list}})
        except Exception:
            # Best effort per document.  Catching Exception instead of a
            # bare except lets KeyboardInterrupt/SystemExit stop the run.
            failed_ids.append(doc_dict['id'])
            print('failed on ', doc_dict['id'])
            continue