def add_new_entry(urls=None):
    """Summarize each URL in *urls* with LSA (1 sentence) and store the
    tokenized summary in MongoDB, then print the whole collection.

    Generalized: the URL list was hard-coded to ``[]`` so the loop never
    ran; it is now a parameter (default keeps the old no-op behavior).

    NOTE(review): relies on module-level ``mongo``, ``loads`` and
    ``dumps`` (bson/json helpers) — confirm they are imported elsewhere
    in this module.
    """
    import nltk
    nltk.download("punkt")  # sumy's Tokenizer needs the punkt model
    from sumy.parsers.html import HtmlParser
    from sumy.nlp.tokenizers import Tokenizer
    from sumy.summarizers.lsa import LsaSummarizer as Summarizer
    from sumy.nlp.stemmers import Stemmer
    from sumy.utils import get_stop_words

    if urls is None:
        urls = []

    # Loop invariants hoisted out of the per-URL loop.
    LANGUAGE = "english"
    SENTENCES_COUNT = 1
    for url in urls:
        parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
        summarizer = Summarizer(Stemmer(LANGUAGE))
        summarizer.stop_words = get_stop_words(LANGUAGE)
        my_summary = list(summarizer(parser.document, SENTENCES_COUNT))
        print(my_summary)
        # Store the first summary sentence as a list of words.
        mongo.db.summaries.insert_one({
            "sentence": str(my_summary[0]).split(),
            "url": url,
        })
    # Dump the whole collection for inspection.
    vals = mongo.db["summaries"]
    cursor = vals.find({})
    print({"vals": loads(dumps(cursor))})
def get_summary(self, text_source: str, num_sentences: int = 5) -> list:
    """Return up to *num_sentences* summary sentences for the page at
    *text_source* (a URL).

    FIX: the return annotation was ``-> []`` (an empty-list *value*, not
    a type); replaced with ``-> list``.
    """
    # Example: "https://www.cbc.ca/news/canada/toronto/skinny-dipping-sharks-ripleys-1.4862945"
    parser = HtmlParser.from_url(text_source, self.Tokenizer)
    doc = parser.document
    return self.Summarizer(doc, num_sentences)
def summarize_article(article, vibe_description_file_path):
    """Download *article*, store its text, 3-sentence LSA summary and
    keywords into the JSON file at *vibe_description_file_path*.

    Returns True on success, False if any step failed.

    FIX: the bare ``except:`` (which also swallows KeyboardInterrupt /
    SystemExit) is narrowed to ``Exception`` and the error is printed.
    """
    try:
        article_url = article['alternate'][0]['href']
        article_title = article['title']  # NOTE(review): unused — kept for parity
        article_newspaper = Article(article_url)
        article_newspaper.download()
        article_newspaper.parse()
        article_newspaper.nlp()
        text_content = article_newspaper.text
        update_json_file(vibe_description_file_path, 'textContent', text_content)
        LANGUAGE = 'english'
        parser = HtmlParser.from_url(article_url, Tokenizer('english'))
        stemmer = Stemmer(LANGUAGE)
        summarizer = Summarizer(stemmer)
        summarizer.stop_words = get_stop_words(LANGUAGE)
        article_summary = []
        for sentence in summarizer(parser.document, 3):
            article_summary.append(sentence._text)  # NOTE(review): private attr; str(sentence) preferred
    except Exception as exc:
        print('Error summarizing article')
        return False
    update_json_file(vibe_description_file_path, 'summary', article_summary)
    update_json_file(vibe_description_file_path, 'keywords', article_newspaper.keywords)
    return True
def summarize_url(url, summarizer):
    """Summarize the page at *url* with the sumy algorithm named by
    *summarizer* ('luhn', 'edmundson', 'lsa', 'lex', 'text', 'sb';
    anything else falls back to KL). Returns a list of sentence strings.

    FIX: Python-2 ``print`` statements converted to the function form
    used by the rest of this file; if/elif chain replaced by a
    dispatch table.
    """
    # E.G. url = "http://www.cnn.com/2016/06/12/politics/hillary-clinton-bernie-sanders-meeting-tuesday/index.html"
    print('Summarizing ', url)
    parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
    # or for plain text files
    # parser = PlaintextParser.from_file("document.txt", Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    engines = {
        'luhn': LuhnSummarizer,
        'edmundson': ESummarizer,
        'lsa': LsaSummarizer,
        'lex': LexSummarizer,
        'text': TextSummarizer,
        'sb': SumBasicSummarizer,
    }
    summarizer = engines.get(summarizer, KLSummarizer)(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    print(summarizer)
    sentences = []
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        print(sentence)
        sentences.append(str(sentence))
    return sentences
def summarize(url=None, LANGUAGE='English', SENTENCES_COUNT=2):
    """Fetch *url* and return a space-joined LSA summary string, or an
    error message if the link cannot be fetched/parsed.

    FIX: the original appended the last sentence a second time after the
    loop, and its try/except sat *after* the risky network/parse work so
    it could never catch those failures. The try now wraps the whole
    summarization.
    """
    try:
        parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
        # or for plain text files
        # parser = PlaintextParser.from_file("document.txt", Tokenizer(LANGUAGE))
        stemmer = Stemmer(LANGUAGE)
        summarizer = Summarizer(stemmer)
        summarizer.stop_words = get_stop_words(LANGUAGE)
        result = ''
        for sentence in summarizer(parser.document, SENTENCES_COUNT):
            result = result + ' ' + str(sentence)
    except Exception:
        print(
            '\n\n Invalid Entry!, please Ensure you enter a valid web link \n\n'
        )
        sys.stdout.flush()
        return (
            '\n\n Invalid Entry!, please Ensure you enter a valid web link \n\n'
        )
    print('\n\n' + str(url) + '\n\n' + str(result))
    sys.stdout.flush()
    return result
def sum_from_url(url, language="english", sentences_cout=100):
    """Return up to *sentences_cout* LSA summary sentences for *url*.

    FIX: the stop-word list was assigned to a non-existent
    ``stem_words`` attribute, so stop words were silently never applied;
    corrected to ``stop_words``. (Param name ``sentences_cout`` kept —
    callers may pass it by keyword.)
    """
    parser = HtmlParser.from_url(url, Tokenizer(language))
    stemmer = Stemmer(language)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(language)
    sentences = summarizer(parser.document, sentences_cout)
    return sentences
def get_summary(self, summary_length: int = 10) -> Iterator[str]:
    """Lazily yield up to *summary_length* summary sentences for the
    page at ``self.link``.
    """
    engine = Summarizer(Stemmer(LANGUAGE))
    engine.stop_words = get_stop_words(LANGUAGE)
    document = HtmlParser.from_url(self.link, Tokenizer(LANGUAGE)).document
    yield from engine(document, summary_length)
def store_summary(self):
    """Fetch each known document URL and cache a 5-sentence summary in
    ``self.summary`` keyed by document id.

    FIXES: Python-2 ``print`` converted to function form; the original
    concatenated sumy ``Sentence`` objects with ``+`` (TypeError as soon
    as a second sentence was added for the same id) — sentences are now
    accumulated as strings; the requests Session is created once, not
    per document; dead commented-out BeautifulSoup code removed.

    NOTE(review): values in ``self.summary`` are now plain strings —
    confirm no caller expected Sentence objects.
    """
    session = requests.Session()
    for item in self.doc_id_url:
        if item >= len(self.document_info):
            continue
        response = session.get(self.doc_id_url[item])
        if response.status_code == 404:
            continue
        parser = HtmlParser.from_url(self.doc_id_url[item], Tokenizer("english"))
        summarizer = Summarizer(Stemmer("english"))
        summarizer.stop_words = get_stop_words("english")
        for sentence in summarizer(parser.document, 5):
            print(sentence)
            text = str(sentence)
            if item in self.summary:
                self.summary[item] = self.summary[item] + text
            else:
                self.summary[item] = text
def summarizeFile(inputFile):
    """Summarize the plain-text content of *inputFile*; if it contains a
    URL, append a 3-sentence summary of that page. Returns None when the
    text summarization fails its internal assertions.

    FIXES: bare ``except:`` narrowed to ``Exception``; ``!= None`` →
    ``is not None``; ``url[-1]`` guarded via ``endswith`` (IndexError on
    an empty match).
    """
    summarizer = LsaSummarizer(stem_word)
    summarizer.stop_words = get_stop_words("english")
    urlContent = ''
    url = findURLS(inputFile)
    if url is not None:
        # Strip a trailing period that sentence-splitting may have attached.
        if url.endswith('.'):
            url = url[:-1]
        try:
            parser = HtmlParser.from_url(url, Tokenizer("english"))
            for sentence in summarizer(parser.document, 3):
                urlContent = urlContent + str(sentence) + '\n'
        except Exception:
            urlContent = ''
    content = inputFile.read()
    parser = PlaintextParser.from_string(content, Tokenizer(LANGUAGE))
    summary = ''
    try:
        for sentence in summarizer(parser.document, SENTENCES_COUNT_1):
            summary = summary + str(sentence) + '\n'
    except AssertionError:
        # sumy asserts on degenerate documents; treat as "no summary".
        return None
    if url is not None:
        return summary + urlContent
    return summary
def ExtractivelySummarizeCorpus(self, corpus_path: str, HTML: bool = True, sentence_count: int = 20):
    """Summarize *corpus_path* (a URL when *HTML*, else a plain-text
    file path) and return the top *sentence_count* sentences as strings.
    Side effect: stores the parser on ``self.parser``.
    """
    tokenizer = Tokenizer(LANGUAGE)
    if HTML:
        self.parser = HtmlParser.from_url(corpus_path, tokenizer)
    else:
        # plain text files
        self.parser = PlaintextParser.from_file(corpus_path, tokenizer)
    ranked = self.summarizer(self.parser.document, sentence_count)
    if DEBUG:
        logger.info(
            "DEBUG::ExtractivelySummarizeCorpus::top n=%d sentences:"
            % sentence_count)
        for item in ranked:
            logger.info(str(item))
    return [str(item) for item in ranked]
def get_sentences(url, sentences_count=10):
    """Return the important sentences of the page at *url* (at most
    *sentences_count* of them), using the module-level summarizer.
    """
    document = HtmlParser.from_url(url, Tokenizer(language)).document
    return summarizer(document, sentences_count)
def get_data_list(URL, file_type=""):
    """Summarize *URL* with the Luhn algorithm and return up to five
    sentences as strings. *file_type* selects the parser: "txt" treats
    *URL* as raw content, "pdf" extracts text via ``read_pdf``, anything
    else fetches it as a web page. Errors are printed and an empty (or
    partial) list is returned.
    """
    SUMMARY_SENTENCES_COUNT = 5
    sentences = []
    try:
        LANGUAGE = "english"
        tokenizer = Tokenizer(LANGUAGE)
        if file_type == "txt":
            parser = HtmlParser.from_string(URL, None, tokenizer)
        elif file_type == "pdf":
            parser = HtmlParser.from_string(read_pdf(URL), None, tokenizer)
        else:
            parser = HtmlParser.from_url(URL, tokenizer)
        from sumy.summarizers.luhn import LuhnSummarizer
        luhn = LuhnSummarizer(Stemmer(LANGUAGE))
        luhn.stop_words = get_stop_words(LANGUAGE)
        print("\nSummary using Luhn Summarizer")
        print("*******************************")
        for sentence in luhn(parser.document, SUMMARY_SENTENCES_COUNT):
            sentences.append(str(sentence))
    except Exception as e:
        print(str(e))
    finally:
        # NOTE: return-in-finally preserved from the original contract.
        return sentences
def main(url, num_sentences=10, language='english'):
    """Print a *num_sentences*-sentence summary of the page at *url*."""
    document = HtmlParser.from_url(url, Tokenizer(language)).document
    engine = Summarizer(Stemmer(language))
    engine.stop_words = get_stop_words(language)
    for line in engine(document, num_sentences):
        print(line)
def summarize_url(url, summarizer):
    """Summarize the page at *url* with the sumy algorithm named by
    *summarizer*; unknown names fall back to the KL summarizer. Returns
    the chosen sentences as a list of strings.

    FIX: Python-2 ``print`` statements converted to the function form
    used by the rest of this file; if/elif chain replaced by a
    dispatch table.
    """
    # E.G. url = "http://www.cnn.com/2016/06/12/politics/hillary-clinton-bernie-sanders-meeting-tuesday/index.html"
    print('Summarizing ', url)
    parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
    # or for plain text files
    # parser = PlaintextParser.from_file("document.txt", Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    engines = {
        'luhn': LuhnSummarizer,
        'edmundson': ESummarizer,
        'lsa': LsaSummarizer,
        'lex': LexSummarizer,
        'text': TextSummarizer,
        'sb': SumBasicSummarizer,
    }
    summarizer = engines.get(summarizer, KLSummarizer)(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    print(summarizer)
    sentences = []
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        print(sentence)
        sentences.append(str(sentence))
    return sentences
def summarizer(input_obj, SENTENCES_COUNT=2, op='url'):
    """Summarize either raw text (op='text', reads ``input_obj['text']``)
    or a web page (op='url', reads ``input_obj['link']``) and return a
    list of sentence strings.

    FIX: an unknown *op* used to print 'OP ERROR' and then crash with a
    confusing ``AttributeError: 'NoneType' object has no attribute
    'document'``; it now raises a clear ValueError.
    """
    LANGUAGE = "english"
    if op == 'text':
        parser = PlaintextParser.from_string(input_obj['text'], Tokenizer(LANGUAGE))
    elif op == 'url':
        parser = HtmlParser.from_url(input_obj['link'], Tokenizer(LANGUAGE))
    else:
        print('OP ERROR')
        raise ValueError("op must be 'text' or 'url', got %r" % (op,))
    stemmer = Stemmer(LANGUAGE)
    engine = Summarizer(stemmer)
    engine.stop_words = get_stop_words(LANGUAGE)
    sentences = []
    for sentence in engine(parser.document, SENTENCES_COUNT):
        sentences.append(str(sentence))
    return sentences
def summarizer(request):
    """Django view: summarize the URL posted as ``geturl`` and render it
    into home.html as ``data1``.

    FIXES: ``sentence`` was referenced *after* the loop, so only the
    last sentence was rendered and an empty summary raised NameError —
    all sentences are now collected and joined; the log file is opened
    with a context manager so the handle is always closed. (A stray
    ``'''`` trailing the original line was mangling residue and is
    dropped.)
    """
    inp = request.POST['geturl']
    LANGUAGE = "english"
    SENTENCES_COUNT = 10
    url = str(inp)
    with open("denemedosyasiU3.txt", "w") as f:
        f.write(url)
    parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
    engine = Summarizer(Stemmer(LANGUAGE))
    engine.stop_words = get_stop_words(LANGUAGE)
    sentences = []
    for sentence in engine(parser.document, SENTENCES_COUNT):
        print(sentence)
        sentences.append(str(sentence))
    return render(request, 'home.html', {'data1': ' '.join(sentences)})
def url(request):
    """View: for a real ``url`` query parameter, return a dict whose
    'content' is a 5-sentence summary followed by the page's image URLs
    (root-relative ones made absolute). Implicitly returns None when the
    parameter is absent/placeholder — preserved from the original.

    FIX: ``i[0]`` crashed with IndexError on ``<img>`` tags whose
    ``src`` was empty or missing (``link.get("src")`` → None); such
    entries are now skipped.
    """
    if (request.GET.get('url', 'url').lower() not in ['url', 'image']):
        url = request.GET.get('url', 'url')
        print(url)
        LANGUAGE = "english"
        SENTENCES_COUNT = 5
        out = []
        parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
        engine = Summarizer(Stemmer(LANGUAGE))
        engine.stop_words = get_stop_words(LANGUAGE)
        for sentence in engine(parser.document, SENTENCES_COUNT):
            out.append(str(sentence))
        r = requests.get(url)
        # scheme://host prefix for fixing root-relative image paths
        urlval = str('/'.join(url.split("/")[:3]))
        soup = BeautifulSoup(r.text, "lxml")
        temp = [link.get("src") for link in soup.find_all('img')]
        temp = [src for src in temp if src]  # drop None / "" entries
        for loc, src in enumerate(temp):
            if src[0] == "/":
                temp[loc] = urlval + src
        return ({'content': str("\n".join(out)) + ' '.join(temp)})
def CreateDataSet(w):
    """Crawl the links of *w*, collect pages whose path contains an
    "about" segment, and print a 2-sentence summary of the first one.

    FIX: the summary was built from ``link`` (the last URL of the outer
    crawl loop) instead of the "about" page being iterated.

    NOTE(review): relies on module-level ``url`` (link extractor),
    ``allExtLinks``, ``scrapped_about`` and ``summarizer`` — confirm
    their definitions; original nesting was ambiguous in the mangled
    source.
    """
    try:
        urls = url(w)
        for link in urls:
            if link not in allExtLinks:
                # Split into path segments to look for an "about" page.
                slashparts = link.split('/')
                dirname = '/'.join(slashparts[:-1]) + '/'  # NOTE(review): unused, kept for parity
                if "about" in slashparts:
                    scrapped_about.append(link)
                    print('\n', link)
        for about in scrapped_about:
            parser = HtmlParser.from_url(about, Tokenizer("english"))
            for sentence in summarizer(parser.document, 2):
                print(sentence, '\n')
            break  # original behavior: only the first about-page
        else:
            print('There are no "about" linked pages in this url')
    except Exception:
        print('there seem to be an issue with the Url you entered')
        quit()
def sumySummary(url):
    """Return the cleaned text of SENTENCES_COUNT summary sentences for
    the page at *url*.
    """
    document = HtmlParser.from_url(url, Tokenizer(LANGUAGE)).document
    engine = Summarizer(Stemmer(LANGUAGE))
    engine.stop_words = get_stop_words(LANGUAGE)
    cleaned = []
    for sentence in engine(document, SENTENCES_COUNT):
        cleaned.append(cleanText(str(sentence)))
    return cleaned
def get_summ(url, func=Summarizer3):
    """Summarize the page at *url* with the summarizer class *func*
    (default ``Summarizer3``) and return the sentences as strings.
    """
    engine = func(Stemmer(LANGUAGE))
    engine.stop_words = get_stop_words(LANGUAGE)
    document = HtmlParser.from_url(url, Tokenizer(LANGUAGE)).document
    return [str(part) for part in engine(document, SENTENCES_COUNT)]
def get_summary(url):
    """Print a three-sentence LSA summary of the page at *url*.

    FIXES: Python-2 ``print`` statements converted to functions;
    language changed 'English' → 'english' — sumy's stopword files are
    named in lowercase (``data/stopwords/english.txt``), so the
    capitalized name fails on case-sensitive filesystems.
    """
    parser = HtmlParser.from_url(url, Tokenizer('english'))
    stemmer = Stemmer('english')
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words('english')
    print("\nThree Sentence Summary:\n")
    for sentence in summarizer(parser.document, 3):
        print(sentence)
def summarize_url(self, url, sentences=3, language="english"):
    """Return a whitespace-normalized string of *sentences* summary
    sentences for the page at *url*.
    """
    document = HtmlParser.from_url(url, Tokenizer(language)).document
    engine = Summarizer(Stemmer(language))
    engine.stop_words = get_stop_words(language)
    joined = " ".join(str(part) for part in engine(document, sentences))
    # Collapse any internal runs of whitespace to single spaces.
    return " ".join(joined.split())
def summarize(url):
    """Return a space-joined summary (``sent`` sentences, module-level)
    of the page at *url*.

    FIX: replaced access to the private ``Sentence._text`` attribute
    with the public ``str()`` conversion.
    """
    summary = []
    parser = HtmlParser.from_url(url, Tokenizer(lang))
    stemmer = Stemmer(lang)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(lang)
    for sentence in summarizer(parser.document, sent):
        summary.append(str(sentence))
    return ' '.join(summary)
def getSentencesFrom(url):
    """Return the sentences of the page at *url* as a list of strings;
    an empty list when the page cannot be fetched/parsed.

    FIXES: the original bare ``except`` printed an error and then fell
    through to a NameError on ``parser``; it now returns []. Python-2
    ``print``/``unicode``/lazy ``map`` replaced with their Python-3
    equivalents (a concrete list is returned).
    """
    lang = "english"
    try:
        parser = HtmlParser.from_url(url, Tokenizer(lang))
    except Exception:
        print("HTTP ERROR @ " + url)
        return []
    return [str(sentence) for sentence in parser.document.sentences]
def main():
    """Run all five summarization algorithms over a fixed Spiegel
    article and let each ``run_*`` helper report its result.
    """
    url = "http://www.spiegel.de/international/europe/as-brexit-nears-harrassment-of-eu-citizens-in-uk-rises-a-1181845.html"
    document = HtmlParser.from_url(url, Tokenizer(LANGUAGE)).document
    stemmer = Stemmer(LANGUAGE)
    # Same algorithms, same order as before.
    for runner in (run_LSA, run_LexRank, run_TextRank, run_Luhn, run_SumBasic):
        runner(stemmer, document)
def urlDoc_summarize(url):
    """Return a 15-sentence summary of the page at *url* as one string
    (each sentence followed by a trailing space, as before).
    """
    document = HtmlParser.from_url(url, Tokenizer('english')).document
    engine = Summarizer(Stemmer('english'))
    engine.stop_words = get_stop_words('english')
    pieces = [str(part) + ' ' for part in engine(document, 15)]
    return ''.join(pieces)
def summCallback(self, url2open):
    """Tk callback: clear the text area and fill it with a 10-sentence
    summary of the page at *url2open*.
    """
    engine = Summarizer(Stemmer("english"))
    engine.stop_words = get_stop_words("english")
    document = HtmlParser.from_url(url2open, Tokenizer("english")).document
    self.area.delete("0.0", END)
    for sentence in engine(document, 10):
        self.area.insert(END, sentence)
def retreive_sumy(url):
    """Return SENTENCES_COUNT LSA summary sentences for the page at
    *url*. (Function name typo kept — callers depend on it.)
    """
    # e.g. "http://en.wikipedia.org/wiki/Automatic_summarization"
    document = HtmlParser.from_url(url, Tokenizer(LANGUAGE)).document
    # For plain text files use PlaintextParser.from_file(...) instead.
    engine = Summarizer(Stemmer(LANGUAGE))
    engine.stop_words = get_stop_words(LANGUAGE)
    return engine(document, SENTENCES_COUNT)
def do_stuff():
    """Summarize the module-level *url* using the language and sentence
    count from ``Config``; returns the sentences as one string, each
    preceded by a space (matching the original accumulation).
    """
    lang = Config.sumy_lang
    document = HtmlParser.from_url(url, Tokenizer(lang)).document
    engine = Summarizer(Stemmer(lang))
    engine.stop_words = get_stop_words(lang)
    parts = [" " + str(s) for s in engine(document, Config.sumy_num_sentences)]
    return "".join(parts)
def summarize(url, sent_count=10):
    """Automatic text summarizer
    https://pypi.python.org/pypi/sumy

    Returns a list of up to *sent_count* sentence strings for *url*.
    """
    language = "english"
    document = HtmlParser.from_url(url, Tokenizer(language)).document
    engine = Summarizer(Stemmer(language))
    engine.stop_words = get_stop_words(language)
    return [str(part) for part in engine(document, sent_count)]
def summarizeUrl(self, url, numSentences = 10):
    """Summarizes text at a given url to numSentences; returns one
    space-joined string.

    FIX: ``str(sentence).decode('utf-8')`` is Python-2-only — in
    Python 3 ``str`` has no ``decode`` and this raised AttributeError;
    ``str(sentence)`` already yields text.
    """
    # For raw text use: PlaintextParser.from_string(body, Tokenizer(self.LANG))
    parser = HtmlParser.from_url(url, Tokenizer(self.LANG))
    stemmer = Stemmer(self.LANG)
    summarizer = SumySummarizer(stemmer)
    summarizer.stop_words = get_stop_words(self.LANG)
    summary = ' '.join(str(sentence) for sentence in summarizer(parser.document, numSentences))
    return summary
def summarize_url(url):
    """Return SENTENCES_COUNT summary sentences for *url*; on fetch
    failure return a list of SENTENCES_COUNT empty strings so callers
    always get a fixed-length list.

    FIXES: ``["" * SENTENCES_COUNT]`` built a ONE-element list (an empty
    string repeated is still ""), not SENTENCES_COUNT elements —
    corrected to ``[""] * SENTENCES_COUNT``; bare ``except:`` narrowed
    to ``Exception``.
    """
    try:
        parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
    except Exception:
        return [""] * SENTENCES_COUNT
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    outs = []
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        outs.append(str(sentence))
    return outs
def summarize(url, sent_len=SENTENCES_COUNT):
    """Summarize the page at *url* and return ``{'summary': [...]}``
    with up to *sent_len* sentence strings.
    """
    # e.g. "https://en.wikipedia.org/wiki/Automatic_summarization"
    document = HtmlParser.from_url(url, Tokenizer(LANGUAGE)).document
    # For plain text files use PlaintextParser.from_file(...) instead.
    engine = Summarizer(Stemmer(LANGUAGE))
    engine.stop_words = get_stop_words(LANGUAGE)
    chosen = [str(part) for part in engine(document, sent_len)]
    return {'summary': chosen}
def readSum(first, second, third):
    """Fetch ``url + "play?fname=" + first + second + third``, summarize
    it, and wrap the concatenated sentences in a voice ``statement``.
    """
    target = url + "play?fname=" + str(first) + str(second) + str(third)
    document = HtmlParser.from_url(target, Tokenizer(LANGUAGE)).document
    engine = Summarizer(Stemmer(LANGUAGE))
    engine.stop_words = get_stop_words(LANGUAGE)
    parts = [str(sentence) for sentence in engine(document, SENTENCES_COUNT)]
    return statement("".join(parts))
def getText(self, sentence_count=None):
    """Return a newline-joined summary of ``self.url``. A truthy
    *sentence_count* also updates ``self.SENTENCE_COUNT`` (side effect
    preserved from the original).
    """
    if sentence_count:
        self.SENTENCE_COUNT = sentence_count
    document = HtmlParser.from_url(self.url, Tokenizer(self.LANGUAGE)).document
    # For plain text files use PlaintextParser.from_file(...) instead.
    engine = Summarizer(Stemmer(self.LANGUAGE))
    engine.stop_words = get_stop_words(self.LANGUAGE)
    lines = [str(part) for part in engine(document, self.SENTENCE_COUNT)]
    return "\n".join(lines)
def index():
    """Flask view: on POST, parse the submitted URL (summarization is
    still commented out in the original); always responds with a fixed
    JSON message.

    FIX: ``url``/``line_count`` were only assigned inside the POST
    branch but used unconditionally — any non-POST request raised
    NameError. Non-POST now returns immediately (guard clause).
    """
    # e.g. url = "http://www.dawn.com/news/1216282"
    if request.method != 'POST':
        return jsonify(dict(message='stuff'))
    url = request.json.get('url')
    line_count = request.json.get('line_count')  # NOTE(review): unused, kept for parity
    parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
    print(parser)
    # Summarization left disabled, as in the original:
    # summarizer = Summarizer(Stemmer(LANGUAGE))
    # summarizer.stop_words = get_stop_words(LANGUAGE)
    # s = "".join(str(sentence) for sentence in summarizer(parser.document, SENTENCES_COUNT))
    return jsonify(dict(message='stuff'))
from __future__ import absolute_import
from __future__ import division, print_function, unicode_literals

from sumy.parsers.html import HtmlParser
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer as Summarizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words

# Czech demo document (switch to "english" for English pages).
LANGUAGE = "czech"
SENTENCES_COUNT = 10

if __name__ == "__main__":
    # Plain-text alternative:
    # parser = PlaintextParser.from_file("yelp1.txt", Tokenizer(LANGUAGE))
    url = "http://www.zsstritezuct.estranky.cz/clanky/predmety/cteni/jak-naucit-dite-spravne-cist.html"
    parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
    summarizer = Summarizer(Stemmer(LANGUAGE))
    summarizer.stop_words = get_stop_words(LANGUAGE)
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        print(sentence)
def summarize(url, sent_count=10):
    """Return a `sent_count`-sentence summary of the page at `url`,
    joined into a single space-separated string.

    NOTE(review): relies on module-level `LANGUAGE` and `summarizer` —
    confirm they are defined in this module.
    """
    parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
    return " ".join([str(sentence) for sentence in summarizer(parser.document, sent_count)])
def getSummaryFromWebsite(url, sentences_count):
    """Parse the page at *url* and delegate to the module-level
    ``summarize`` helper for a *sentences_count*-sentence summary.
    """
    tokenizer = Tokenizer(LANGUAGE)
    return summarize(HtmlParser.from_url(url, tokenizer), sentences_count)
from __future__ import absolute_import
from __future__ import division, print_function, unicode_literals

from sumy.parsers.html import HtmlParser
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer as Summarizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words
import lxml.html

list_of_pages = ['http://postach.us10.list-manage1.com/track/click?u=819841bd24897de296a130d94&id=1fbd285a11&e=01afa4fcef']

# FIX: language changed 'English' -> 'english': sumy's stopword data
# files are lowercase (data/stopwords/english.txt), so the capitalized
# name fails on case-sensitive filesystems.
stemmer = Stemmer('english')
summarizer = Summarizer(stemmer)
summarizer.stop_words = get_stop_words('english')

if __name__ == "__main__":
    for url in list_of_pages:
        parser = HtmlParser.from_url(url, Tokenizer('english'))
        # Page <title> first, then the URL, then a 2-sentence summary.
        print(lxml.html.parse(url).find(".//title").text)
        # FIX: dropped the stray Python-2 trailing commas after print(...)
        # calls, which only built throwaway tuples in Python 3.
        print(url)
        for sentence in summarizer(parser.document, 2):
            print(sentence)
def analyze_web_site(url):
    """Announce *url* and hand its parsed document to the module-level
    ``analyze`` helper.
    """
    print("Main Points: %s \n" % url)
    page = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
    analyze(page)