def extract_summary(filename): """Print summary text to stdout.""" with open(filename) as f: summary = textrank.extract_sentences(f.read()) print(summary) print("U R HRRE MAIN")
import urllib.request

from bs4 import BeautifulSoup

import textrank


def get_huffington_data():
    url2 = input("Enter a URL: ")
    request2 = urllib.request.Request(url2)
    result2 = urllib.request.urlopen(request2)
    soup2 = BeautifulSoup(result2.read(), 'html.parser')
    prop_add2 = soup2.find("div", {"class": "post-contents yr-entry-text"})
    p2 = prop_add2.text
    phrases2 = textrank.extract_key_phrases(p2)
    summaryV2 = textrank.extract_sentences(p2)
    print("Summary:\n")
    print(summaryV2)
    print("phrases:\n")
    print(phrases2)
def get_toi_data():
    url3 = input("Enter a URL: ")
    request3 = urllib.request.Request(url3)
    result3 = urllib.request.urlopen(request3)
    soup3 = BeautifulSoup(result3.read(), 'html.parser')
    prop_add3 = soup3.find('div', {'class': 'Normal'})
    p3 = prop_add3.text
    phrases3 = textrank.extract_key_phrases(p3)
    summaryV3 = textrank.extract_sentences(p3)
    print("Summary:\n")
    print(summaryV3)
    print("phrases:\n")
    print(phrases3)
def get_hindustan_data():
    url = input("Enter a URL: ")
    request = urllib.request.Request(url)
    result = urllib.request.urlopen(request)
    soup = BeautifulSoup(result.read(), 'html.parser')
    prop_add = soup.find('div', {'class': 'story-details'})
    p = prop_add.text
    # Save the scraped article text before summarising it.
    with open('t1.txt', 'w+') as f:
        f.write("{}".format(p))
    phrases = textrank.extract_key_phrases(p)
    summaryV = textrank.extract_sentences(p)
    print("Summary:\n")
    print(summaryV)
    print("phrases:\n")
    print(phrases)
def create_summary(page):
    resp = urllib.request.urlopen(page)
    soup = bs4.BeautifulSoup(resp, 'html.parser')
    title = soup.find('h1').text
    pgraph = soup.find_all('p')
    text = ""
    for para in pgraph:
        if "Use of this site constitutes" not in para.text:
            text = text + para.text
    text = remove_refs(text)
    summary = textrank.extract_sentences(text)
    summary = remove_trail(summary, ".")
    summary = summary + "."
    print('\n', title, '\n')
    print(summary)
def get_summary(article):
    """Takes a cleaned article as input.

    :returns: summary of article (str)
    """
    article = [buzzclean(line) for line in article]
    sentences = [re.sub(r"\s+", " ", line) for line in article]
    sentences = [punkt.tokenize(sentence.lower()) for sentence in sentences]
    sentences = list(chain.from_iterable(sentences))
    article = ' '.join(sentences)
    summary = textrank.extract_sentences(article, summary_length=150, clean_sentences=True)
    sentsum = punkt.tokenize(summary)
    sentsum = [sent.capitalize() for sent in sentsum]
    summary = ' '.join(sentsum)
    return summary
def summary(df):
    lSum = []
    print('\nSummarising & Extracting: \n', end='')
    for i in range(0, len(df)):
        print('\r', end='')
        print("Completed: " + str(round((i + 1) / len(df) * 100, 1)) + "%", end="", flush=True)
        summary = textrank.extract_sentences(df.iloc[i, 0])
        kwords = textrank.extract_key_phrases(df.iloc[i, 0])
        lSum.append([summary, kwords])
    dfSummaries = pd.DataFrame(lSum)
    dfSummaries.columns = ["summaries", "keywords"]
    return dfSummaries
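# A minimal usage sketch for summary() above, assuming pandas is imported as pd and the
# article text sits in the first column of the DataFrame. The rows here are hypothetical
# placeholders; real input should be full-length articles for the summariser to work well.
dfData = pd.DataFrame({"text": [
    "First article body goes here ...",
    "Second article body goes here ...",
]})
dfSummaries = summary(dfData)
print(dfSummaries[["summaries", "keywords"]].head())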
def textrank():
    body = request.get_json()
    return extract_sentences(body['data'], k=int(body['expectedLen']))
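# Sketch of the JSON payload the handler above expects: a "data" field with the raw text
# and an "expectedLen" field controlling the summary length. The route path and port in
# this client-side example are assumptions for illustration, not taken from the handler.
import requests

payload = {"data": "Long article text to summarise ...", "expectedLen": 100}
resp = requests.post("http://localhost:5000/textrank", json=payload)
print(resp.text)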
def generate_titles(file_name, random=False, use_rake=False, use_summa_text_rank=False, use_text_rank=False):
    logger.info("Opening file")
    text_file = open(file_name)
    logger.info("Reading file")
    raw_text = text_file.read().lower()
    # Remove Unicode characters.
    raw_text = raw_text.decode('unicode_escape').encode('ascii', 'ignore')

    # Convert raw text to word tokens
    logger.info("Tokenizing")
    tokens = nltk.word_tokenize(raw_text.translate(None, string.punctuation))

    # Remove stopwords
    logger.info("Removing stopwords")
    stop_words = set(stopwords.words('english'))
    # NOTE: we need to include some more stopwords, as 'english' doesn't contain some
    # stopwords related to journal articles (e.g., "et" and "al" in "et al.")
    stop_words.update(ADDITIONAL_STOPWORDS)
    filtered_text = [word for word in tokens if word not in stop_words]

    # Create Corpus object for input text
    logger.info("Creating corpus object")
    input_text = Corpus(raw_text, tokens, filtered_text)
    input_text.stop_words = stop_words
    logger.info("Filtered words to use")
    logger.info("\t %s" % input_text.filtered_tokens[:5])

    # NOTE: stopwords are removed before POS tags are assigned; this could potentially
    # degrade POS tagging performance - may want to switch this order.

    # Demonstrate functions
    logger.info("Getting POS tags")
    input_text.pos_tags = pos_tagger(input_text)
    logger.info("\t %s" % input_text.pos_tags[:5])

    logger.info("Finding all used parts of speech.")
    input_text.used_pos = set([tag[1] for tag in input_text.pos_tags])
    logger.info(input_text.used_pos)

    logger.info("Getting stemmed words")
    input_text.stemmed_words = stem_tokens(input_text)
    logger.info("\t %s" % input_text.stemmed_words[:5])

    # Split the stemmed words into ~equal-sized groups.
    logger.info("Splitting the stemmed words into groups")
    #logger.info("There are %s words in this group" % len(input_text.stemmed_words))
    num_splits = 2
    input_text.splits = split_tokens(input_text, num_splits)
    #for s in input_text.splits:
    #    logger.info("%s %s\n\n" % (s, len(s)))

    logger.info("Getting word frequency and proximity")
    cutoff = 0.125
    if len(input_text.filtered_tokens) < 250:
        cutoff = 0.35
    input_text.word_freq_proximity = stems_frequency_proximity(input_text, cutoff)
    #logger.info("\t %s" % (input_text.word_freq_proximity[u'becom'],))

    logger.info("Mapping filtered words and their stemmed forms")
    input_text.filtered_word_and_bases, input_text.filtered_bases_and_words = stems_and_bases(input_text)
    #logger.info("\t %s" % input_text.filtered_word_and_bases[u'becom'])

    logger.info("Mapping POS tags and words")
    input_text.pos_tag_and_words = pos_tags_and_words(input_text)
    #logger.info("\t %s" % input_text.pos_tag_and_words['NNS'][:5])
    logger.info("------ End Processing ------\n\n")

    ##########################
    if use_rake:
        logger.info("------ Begin Rake ------")
        """More information at: https://github.com/fabianvf/python-rake"""
        r = Rake(RAKE.SmartStopList())  # stop_words_list
        sorted_keywords = r.run(input_text.raw_text)
        logger.info("Sorted keywords: %s" % sorted_keywords[:5])
        logger.info("------ End Rake ------\n\n")

    if use_summa_text_rank:
        logger.info("------ Begin SummaTextRank ------")
        """More information at https://github.com/summanlp/textrank"""
        logger.info("Sentence(s) summary: %s" % summarizer.summarize(raw_text))
        logger.info("Keywords: %s" % keywords.keywords(raw_text))
        logger.info("------ End SummaTextRank ------\n\n")

    if use_text_rank:
        logger.info("------ Begin TextRank ------")
        """More information at https://github.com/davidadamojr/TextRank"""
        logger.info("Sentence(s) summary: %s " % textrank.extract_sentences(raw_text))
        logger.info("Keywords: %s" % textrank.extract_key_phrases(raw_text))
        logger.info("------ End TextRank ------\n\n")

    ##########################
    logger.info("------ Begin Weighting ------")
    logger.info("Calculating word weights")
    input_text.word_weights = get_word_weights(input_text, random)
    logger.info("Printing word weights")
    weight_thresh = -1
    print_words_with_weight_above(weight_thresh, input_text.word_weights, input_text)
    logger.info("------ End Weighting ------\n\n")

    ##########################
    logger.info("------ Begin Building ------")
    titles = build_titles(input_text)
    logger.info("------ End Building ------\n\n")

    ##########################
    logger.info("Closing file")
    text_file.close()

    ##########################
    logger.info("------ Begin Ranking ------")
    # NOTE: the scores denote the title rankings relative to one another.
    # 1 denotes the title with the highest rank and 0 denotes the title with the
    # lowest rank (determined by a combination of summed word weights and
    # average word weight).
    titles_ranked = order_titles(titles, input_text)
    logger.info("------ End Ranking ------\n\n")

    ##########################
    return titles_ranked
def __extract_sentences(text):
    sentences = textrank.extract_sentences(text)
    return sentences
def printSummaries(collection):
    for ind, text in enumerate(collection):
        print 'Sample {:d}'.format(ind)
        print textrank.extract_sentences(text, summary_length=200)
        #print textrank.extract_key_phrases(text)
        print ''
def getSummaries(collection):
    return [(row.id, textrank.extract_sentences(row.text, summary_length=200))
            for row in collection.itertuples()]
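# Minimal usage sketch for getSummaries() above. It assumes the collection is a pandas
# DataFrame whose rows expose `id` and `text` attributes via itertuples(); the sample
# rows below are hypothetical placeholders.
import pandas as pd

collection = pd.DataFrame({
    "id": [1, 2],
    "text": ["First long article body ...", "Second long article body ..."],
})
for article_id, article_summary in getSummaries(collection):
    print(article_id, article_summary)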
def extract_summary(filename):
    with open(filename) as f:
        summary = textrank.extract_sentences(f.read())
    with open("summarise.txt", "w") as fr:
        fr.write(summary)
def extract_summary(filename): """Print summary text to stdout.""" with open(filename) as f: summary = textrank.extract_sentences(f.read()) print(summary)
import textrank

textrank.setup_environment()

filename = 'tests/ari/editorial.txt'
with open(filename) as f:
    summary = textrank.extract_sentences(f.read(), 108)
print(summary)

with open("tests/ari/Output.txt", "w") as text_file:
    text_file.write(summary)