def get_data_list(URL, file_type=""):
    SUMMARY_SENTENCES_COUNT = 5
    sentences = []
    try:
        LANGUAGE = "english"
        if file_type == "txt":
            parser = HtmlParser.from_string(URL, None, Tokenizer(LANGUAGE))
        elif file_type == "pdf":
            content = read_pdf(URL)
            parser = HtmlParser.from_string(content, None, Tokenizer(LANGUAGE))
        else:
            parser = HtmlParser.from_url(URL, Tokenizer(LANGUAGE))
        document = parser.document
        stemmer = Stemmer(LANGUAGE)

        from sumy.summarizers.luhn import LuhnSummarizer
        LHS = LuhnSummarizer(stemmer)
        LHS.stop_words = get_stop_words(LANGUAGE)
        print("\nSummary using Luhn Summarizer")
        print("*******************************")
        for sentence in LHS(document, SUMMARY_SENTENCES_COUNT):
            sentences.append(str(sentence))
    except Exception as e:
        print(str(e))
    finally:
        return sentences
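# A hedged usage sketch for get_data_list above: the sumy imports are assumed
# to be in scope, the "pdf" branch assumes an external read_pdf() helper that
# returns the file's text, and the URL is only a placeholder.
for line in get_data_list("https://en.wikipedia.org/wiki/Automatic_summarization"):
    print(line)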
def get_summary(self, text_source: str, num_sentences: int = 5) -> list:
    # e.g. text_source = "https://www.cbc.ca/news/canada/toronto/skinny-dipping-sharks-ripleys-1.4862945"
    parser = HtmlParser.from_url(text_source, self.Tokenizer)
    doc = parser.document
    return self.Summarizer(doc, num_sentences)
def add_new_entry():
    import nltk
    nltk.download("punkt")
    from sumy.parsers.html import HtmlParser
    from sumy.parsers.plaintext import PlaintextParser
    from sumy.nlp.tokenizers import Tokenizer
    from sumy.summarizers.lsa import LsaSummarizer as Summarizer
    from sumy.nlp.stemmers import Stemmer
    from sumy.utils import get_stop_words

    urls = []
    for url in urls:
        LANGUAGE = "english"
        SENTENCES_COUNT = 1
        parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
        stemmer = Stemmer(LANGUAGE)
        summarizer = Summarizer(stemmer)
        summarizer.stop_words = get_stop_words(LANGUAGE)
        my_summary = []
        for sentence in summarizer(parser.document, SENTENCES_COUNT):
            my_summary.append(sentence)
        print(my_summary)
        mongo.db.summaries.insert_one({
            "sentence": str(my_summary[0]).split(),
            "url": url
        })

    vals = mongo.db["summaries"]
    cursor = vals.find({})
    print({"vals": loads(dumps(cursor))})
def get_doc_summary(html, url):
    '''
    Parse document text and extract summary with summarization algorithms.
    This is helpful when meta-desc tag is not available.
    '''
    from sumy.parsers.html import HtmlParser
    from sumy.nlp.tokenizers import Tokenizer
    from sumy.summarizers.text_rank import TextRankSummarizer as Summarizer
    from sumy.nlp.stemmers import Stemmer
    from sumy.utils import get_stop_words

    LANGUAGE = "english"
    SENTENCES_COUNT = 3

    parser = HtmlParser.from_string(html, url, Tokenizer(LANGUAGE))
    # or for plain text files:
    # parser = PlaintextParser.from_file("document.txt", Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    res = ""
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        res += str(sentence)
    return res
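# Usage sketch for get_doc_summary: fetch a page first (requests is an
# assumption here; any HTTP client works), then summarize when the page lacks
# a meta-description. The URL is a placeholder.
import requests

page_url = "https://example.com/article"
response = requests.get(page_url)
print(get_doc_summary(response.text, page_url))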
def url(request):
    if request.GET.get('url', 'url').lower() not in ['url', 'image']:
        url = request.GET.get('url', 'url')
        print(url)
        LANGUAGE = "english"
        SENTENCES_COUNT = 5
        out = []
        parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
        stemmer = Stemmer(LANGUAGE)
        summarizer = Summarizer(stemmer)
        summarizer.stop_words = get_stop_words(LANGUAGE)
        for sentence in summarizer(parser.document, SENTENCES_COUNT):
            out.append(str(sentence))

        r = requests.get(url)
        test = url.split("/")
        urlval = str('/'.join(test[:3]))
        data = r.text
        soup = BeautifulSoup(data, "lxml")
        temp = []
        for link in soup.find_all('img'):
            image = link.get("src")
            temp.append(image)
        # Make root-relative image paths absolute.
        for loc, i in enumerate(temp):
            if i[0] == "/":
                temp[loc] = urlval + temp[loc]
        return {'content': str("\n".join(out)) + ' '.join(temp)}
def summarize_article(article, vibe_description_file_path):
    try:
        article_url = article['alternate'][0]['href']
        article_title = article['title']
        article_newspaper = Article(article_url)
        article_newspaper.download()
        article_newspaper.parse()
        article_newspaper.nlp()
        text_content = article_newspaper.text
        update_json_file(vibe_description_file_path, 'textContent', text_content)

        LANGUAGE = 'english'
        parser = HtmlParser.from_url(article_url, Tokenizer(LANGUAGE))
        stemmer = Stemmer(LANGUAGE)
        summarizer = Summarizer(stemmer)
        summarizer.stop_words = get_stop_words(LANGUAGE)
        article_summary = []
        for sentence in summarizer(parser.document, 3):
            article_summary.append(sentence._text)
    except Exception:
        print('Error summarizing article')
        return False
    update_json_file(vibe_description_file_path, 'summary', article_summary)
    update_json_file(vibe_description_file_path, 'keywords', article_newspaper.keywords)
    return True
def ExtractivelySummarizeCorpus(self, corpus_path: str, HTML: bool = True, sentence_count: int = 20):
    if HTML:
        self.parser = HtmlParser.from_url(corpus_path, Tokenizer(LANGUAGE))
    else:
        # or for plain text files
        self.parser = PlaintextParser.from_file(corpus_path, Tokenizer(LANGUAGE))
    sentences = self.summarizer(self.parser.document, sentence_count)

    if DEBUG:
        logger.info("DEBUG::ExtractivelySummarizeCorpus::top n=%d sentences:" % sentence_count)
        for sentence in sentences:
            logger.info(str(sentence))
    sentences = [str(sentence) for sentence in sentences]
    return sentences
def sum_from_url(url, language="english", sentences_count=100):
    parser = HtmlParser.from_url(url, Tokenizer(language))
    stemmer = Stemmer(language)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(language)  # was mistakenly assigned to `stem_words`
    sentences = summarizer(parser.document, sentences_count)
    return sentences
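# Note that sum_from_url returns sumy Sentence objects, not strings; a minimal
# conversion sketch for display (the URL is a placeholder):
summary_text = " ".join(str(s) for s in sum_from_url("https://example.com", sentences_count=5))
print(summary_text)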
def auto_summarize_comment(request):
    comment_ids = request.POST.getlist('d_ids[]')
    sent_list = []
    for comment_id in comment_ids:
        comment = Comment.objects.get(id=comment_id)
        text = comment.text
        text = re.sub('<br>', ' ', text, flags=re.IGNORECASE)
        parser = HtmlParser.from_string(text, '', Tokenizer("english"))
        num_sents = request.GET.get('num_sents', None)
        if not num_sents:
            # Default to roughly a third of the comment's sentences.
            all_sents = parser.tokenize_sentences(text)
            num_sents = floor(float(len(all_sents)) / 3.0)
        sents = summarizer(parser.document, num_sents)
        for sent in sents:
            sent_list.append(sent._text)
    return JsonResponse({"sents": sent_list})
def summarize(url=None, LANGUAGE='english', SENTENCES_COUNT=2):
    try:
        parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
        # or for plain text files:
        # parser = PlaintextParser.from_file("document.txt", Tokenizer(LANGUAGE))
        stemmer = Stemmer(LANGUAGE)
        summarizer = Summarizer(stemmer)
        summarizer.stop_words = get_stop_words(LANGUAGE)
        result = ''
        for sentence in summarizer(parser.document, SENTENCES_COUNT):
            result = result + ' ' + str(sentence)
    except Exception:
        print('\n\n Invalid entry! Please ensure you enter a valid web link \n\n')
        sys.stdout.flush()
        return '\n\n Invalid entry! Please ensure you enter a valid web link \n\n'
    print('\n\n' + str(url) + '\n\n' + str(result))
    sys.stdout.flush()
    return result
def summarizeFile(inputFile):
    summarizer = LsaSummarizer(stem_word)
    summarizer.stop_words = get_stop_words("english")

    url = findURLS(inputFile)
    urlContent = ''
    if url is not None:
        if url[-1] == '.':
            url = url[:-1]
        try:
            parser = HtmlParser.from_url(url, Tokenizer("english"))
            for sentence in summarizer(parser.document, 3):
                urlContent = urlContent + str(sentence) + '\n'
        except Exception:
            urlContent = ''

    content = inputFile.read()
    parser = PlaintextParser.from_string(content, Tokenizer(LANGUAGE))
    summary = ''
    try:
        for sentence in summarizer(parser.document, SENTENCES_COUNT_1):
            summary = summary + str(sentence) + '\n'
    except AssertionError:
        return None

    if url is not None:
        return summary + urlContent
    return summary
def summarize(self, method='luhn'):
    """Summarize text."""
    method = self._check_method(method)

    if self.url:
        parser = HtmlParser.from_url(self.url, Tokenizer(self.LANGUAGE))
    elif self.html:
        parser = HtmlParser(self.html, Tokenizer(self.LANGUAGE))
    else:
        raise ValueError('Either a url or html content is required')

    stemmer = Stemmer(self.LANGUAGE)
    summarizer = method(stemmer)
    summarizer.stop_words = get_stop_words(self.LANGUAGE)
    sumy = summarizer(parser.document, self.SENTENCES_COUNT)
    summary = ''.join([str(i) for i in list(sumy)])
    return summary
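# A hypothetical minimal class context that the summarize() method above
# assumes; the class name, attributes, and the _check_method() mapping are
# illustrative only, not the original implementation.
from sumy.summarizers.luhn import LuhnSummarizer
from sumy.summarizers.lsa import LsaSummarizer


class TextSummary(object):
    LANGUAGE = "english"
    SENTENCES_COUNT = 5

    def __init__(self, url=None, html=None):
        self.url = url
        self.html = html

    def _check_method(self, method):
        # Map a method name to a sumy summarizer class.
        return {"luhn": LuhnSummarizer, "lsa": LsaSummarizer}[method]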
def CreateDataSet(w):
    try:
        urls = url(w)
        for link in urls:
            if link not in allExtLinks:
                find_about = link
                # Create a list of each bit between slashes.
                slashparts = find_about.split('/')
                dirname = '/'.join(slashparts[:-1]) + '/'
                if "about" in slashparts:
                    scrapped_about.append(link)
                    print('\n', link)
        for about in scrapped_about:
            parser = HtmlParser.from_url(about, Tokenizer("english"))
            summary = summarizer(parser.document, 2)
            # Save the summary to a dataframe.
            for sentence in summary:
                print(sentence, '\n')
            break
        else:
            print('There are no "about" linked pages in this url')
    except Exception:
        print('there seems to be an issue with the URL you entered')
        quit()
def summarize_url(url, summarizer):
    # e.g. url = "http://www.cnn.com/2016/06/12/politics/hillary-clinton-bernie-sanders-meeting-tuesday/index.html"
    print('Summarizing', url)
    parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
    # or for plain text files:
    # parser = PlaintextParser.from_file("document.txt", Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    if summarizer == 'luhn':
        summarizer = LuhnSummarizer(stemmer)
    elif summarizer == 'edmundson':
        summarizer = ESummarizer(stemmer)
    elif summarizer == 'lsa':
        summarizer = LsaSummarizer(stemmer)
    elif summarizer == 'lex':
        summarizer = LexSummarizer(stemmer)
    elif summarizer == 'text':
        summarizer = TextSummarizer(stemmer)
    elif summarizer == 'sb':
        summarizer = SumBasicSummarizer(stemmer)
    else:
        summarizer = KLSummarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    print(summarizer)

    sentences = []
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        print(sentence)
        sentences.append(str(sentence))
    return sentences
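# Usage sketch: the second argument picks the ranking algorithm by name, and
# anything unrecognized falls through to the KL-sum summarizer. The URL is a
# placeholder.
for name in ("luhn", "lsa", "lex", "text", "sb", "kl"):
    summarize_url("https://example.com/article", name)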
def summarizer(input_obj, SENTENCES_COUNT=2, op='url'):
    LANGUAGE = "english"
    # e.g. url = "https://sea.pcmag.com/smartphones/17424/apple-iphone-x"
    parser = None
    if op == 'text':
        text = input_obj['text']
        parser = PlaintextParser.from_string(text, Tokenizer(LANGUAGE))
    elif op == 'url':
        url = input_obj['link']
        parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
    else:
        print('OP ERROR')

    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    sentences = []
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        sentences.append(str(sentence))
    return sentences
def get_summary(self, summary_length: int = 10) -> Iterator[str]:
    parser = HtmlParser.from_url(self.link, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    for sentence in summarizer(parser.document, summary_length):
        # Yield strings to match the Iterator[str] annotation.
        yield str(sentence)
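# Because get_summary is a generator, callers can stream or join the yielded
# strings; a sketch assuming an instance with a .link attribute (the `article`
# name is hypothetical):
summary = " ".join(article.get_summary(summary_length=5))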
def get_sentences(url, sentences_count=10):
    """Returns the important sentences given a url."""
    parser = HtmlParser.from_url(url, Tokenizer(language))
    sentences = summarizer(parser.document, sentences_count)
    return sentences
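# get_sentences relies on module-level `language` and `summarizer` globals; a
# minimal setup sketch (the choice of LexRank here is an assumption):
from sumy.nlp.stemmers import Stemmer
from sumy.summarizers.lex_rank import LexRankSummarizer
from sumy.utils import get_stop_words

language = "english"
summarizer = LexRankSummarizer(Stemmer(language))
summarizer.stop_words = get_stop_words(language)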
def summarizer(request):
    inp = request.POST['geturl']
    LANGUAGE = "english"
    SENTENCES_COUNT = 10
    url = str(inp)

    f = open("denemedosyasiU3.txt", "w")
    f.write(url)
    f.close()

    parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    sentences = []
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        print(sentence)
        sentences.append(str(sentence))
    # Send the whole summary to the template, not just the last sentence.
    return render(request, 'home.html', {'data1': " ".join(sentences)})
def main(url, num_sentences=10, language='english'):
    parser = HtmlParser.from_url(url, Tokenizer(language))
    stemmer = Stemmer(language)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(language)
    for sentence in summarizer(parser.document, num_sentences):
        print(sentence)
def test_annotated_text(self):
    path = expand_resource_path("snippets/paragraphs.html")
    url = "http://www.snippet.org/paragraphs.html"
    parser = HtmlParser.from_file(path, url, Tokenizer("czech"))
    document = parser.document

    self.assertEqual(len(document.paragraphs), 2)

    self.assertEqual(len(document.paragraphs[0].headings), 1)
    self.assertEqual(len(document.paragraphs[0].sentences), 1)
    self.assertEqual(to_unicode(document.paragraphs[0].headings[0]),
                     "Toto je nadpis prvej úrovne")
    self.assertEqual(to_unicode(document.paragraphs[0].sentences[0]),
                     "Toto je prvý odstavec a to je fajn.")

    self.assertEqual(len(document.paragraphs[1].headings), 0)
    self.assertEqual(len(document.paragraphs[1].sentences), 2)
    self.assertEqual(to_unicode(document.paragraphs[1].sentences[0]),
                     "Tento text je tu aby vyplnil prázdne miesto v srdci súboru.")
    self.assertEqual(to_unicode(document.paragraphs[1].sentences[1]),
                     "Aj súbory majú predsa city.")
def store_summary(self):
    for item in self.doc_id_url:
        if item < len(self.document_info):
            s = requests.Session()
            response = s.get(self.doc_id_url[item])
            if response.status_code != 404:
                parser = HtmlParser.from_url(self.doc_id_url[item], Tokenizer("english"))
                stemmer = Stemmer("english")
                summarizer = Summarizer(stemmer)
                summarizer.stop_words = get_stop_words("english")
                for sentence in summarizer(parser.document, 5):
                    print(sentence)
                    # Concatenate as text; Sentence objects do not support "+".
                    if item in self.summary:
                        self.summary[item] = self.summary[item] + " " + str(sentence)
                    else:
                        self.summary[item] = str(sentence)
def sumySummary(url):
    parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    return [cleanText(str(s)) for s in summarizer(parser.document, SENTENCES_COUNT)]
def get_summ(url, func=Summarizer3):
    parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = func(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    sumy = summarizer(parser.document, SENTENCES_COUNT)
    result = [str(i) for i in list(sumy)]
    return result
def urlDoc_summarize(url):
    parser = HtmlParser.from_url(url, Tokenizer('english'))
    stemmer = Stemmer('english')
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words('english')
    summary = ''
    for sentence in summarizer(parser.document, 15):
        summary += str(sentence) + ' '
    return summary
def main():
    url = "http://www.spiegel.de/international/europe/as-brexit-nears-harrassment-of-eu-citizens-in-uk-rises-a-1181845.html"
    parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)

    run_LSA(stemmer, parser.document)
    run_LexRank(stemmer, parser.document)
    run_TextRank(stemmer, parser.document)
    run_Luhn(stemmer, parser.document)
    run_SumBasic(stemmer, parser.document)
def summarize_url(self, url, sentences=3, language="english"):
    parser = HtmlParser.from_url(url, Tokenizer(language))
    stemmer = Stemmer(language)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(language)
    text = " ".join(map(str, summarizer(parser.document, sentences)))
    return " ".join(text.split())
def get_all_variations(text):
    # Runs six sumy summarizers over the same text and collects one sentence
    # from each.
    LANGUAGE = "english"
    SENTENCES_COUNT = 1
    stemmer = Stemmer(LANGUAGE)

    summarizers = [
        Lsa(stemmer),
        Luhn(stemmer),
        LexRank(stemmer),
        TxtRank(stemmer),
        SumBasic(stemmer),
        KL(stemmer),
    ]
    # An Edmundson summarizer would additionally need bonus_words configured.
    for summarizer in summarizers:
        summarizer.stop_words = get_stop_words(LANGUAGE)

    parser = HtmlParser.from_string(text, None, Tokenizer(LANGUAGE))
    allvariations = []
    for summarizer in summarizers:
        for sentence in summarizer(parser.document, SENTENCES_COUNT):
            print(str(sentence))
            allvariations.append(sentence)
    return allvariations
def get_summary(url):
    # sumy's language resources are keyed by lowercase names such as "english".
    parser = HtmlParser.from_url(url, Tokenizer('english'))
    stemmer = Stemmer('english')
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words('english')
    print("\nThree Sentence Summary:\n")
    for sentence in summarizer(parser.document, 3):
        print(sentence)
def get_summary(html):
    parser = HtmlParser.from_string(html, tokenizer=Tokenizer(LANGUAGE), url=None)
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        print(sentence)
def getSentencesFrom(url):
    lang = "english"
    try:
        parser = HtmlParser.from_url(url, Tokenizer(lang))
    except Exception:
        # Return early so `parser` is never referenced unbound.
        print("HTTP ERROR @ " + url)
        return []
    return [str(sentence) for sentence in parser.document.sentences]
def summarize(url):
    summary = []
    parser = HtmlParser.from_url(url, Tokenizer(lang))
    stemmer = Stemmer(lang)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(lang)
    for sentence in summarizer(parser.document, sent):
        summary.append(sentence._text)
    return ' '.join(summary)
def summCallback(self, url2open):
    parser = HtmlParser.from_url(url2open, Tokenizer("english"))
    stemmer = Stemmer("english")
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words("english")
    self.area.delete("0.0", END)
    for sentence in summarizer(parser.document, 10):
        self.area.insert(END, sentence)
def summarize(doc, SENTENCES_COUNT):
    parser = HtmlParser.from_string(doc, None, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    summary = ""
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        # Skip image-attribution lines that leak into article text.
        if not str(sentence).strip().startswith("Image copyright"):
            summary += " " + str(sentence)
    return summary
def retreive_sumy(url):
    # e.g. "http://en.wikipedia.org/wiki/Automatic_summarization"
    parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
    # or for plain text files:
    # parser = PlaintextParser.from_file("document.txt", Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    return summarizer(parser.document, SENTENCES_COUNT)
def summarizeUrl(self, url, numSentences=10):
    """Summarizes text at a given url to numSentences."""
    # parser = PlaintextParser.from_string(body, Tokenizer(self.LANG))
    parser = HtmlParser.from_url(url, Tokenizer(self.LANG))
    stemmer = Stemmer(self.LANG)
    summarizer = SumySummarizer(stemmer)
    summarizer.stop_words = get_stop_words(self.LANG)
    summary = ' '.join(str(sentence) for sentence in summarizer(parser.document, numSentences))
    return summary
def summarize(method, length, url):
    html_content = fetch_url(url)
    iso_lang = detect_language(html_content)
    language = SUMY_LANGUAGES[iso_lang]
    stemmer = Stemmer(language)
    parser = HtmlParser.from_string(html_content, url, Tokenizer(language))
    summarizer_class = AVAILABLE_METHODS[method]
    summarizer = build_summarizer(summarizer_class, get_stop_words(language), stemmer, parser)

    sentences = summarizer(parser.document, ItemsCount(length))
    summary = ' '.join(str(sentence) for sentence in sentences)
    return summary, iso_lang
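# build_summarizer is project code rather than part of sumy; one plausible
# sketch of what it does (the signature is taken from the call above, but the
# body is an assumption):
def build_summarizer(summarizer_class, stop_words, stemmer, parser):
    # `parser` is accepted for interface parity but is not needed to build the
    # summarizer itself.
    summarizer = summarizer_class(stemmer)
    summarizer.stop_words = stop_words
    return summarizer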
def getText(self, sentence_count=None):
    if sentence_count:
        self.SENTENCE_COUNT = sentence_count
    parser = HtmlParser.from_url(self.url, Tokenizer(self.LANGUAGE))
    # or for plain text files:
    # parser = PlaintextParser.from_file("document.txt", Tokenizer(LANGUAGE))
    stemmer = Stemmer(self.LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(self.LANGUAGE)
    text_list = []
    for sentence in summarizer(parser.document, self.SENTENCE_COUNT):
        text_list.append(str(sentence))
    return "\n".join(text_list)
def do():
    rows = store.get_row_by_status(1)
    for row in rows:
        parser = HtmlParser.from_string(row["content_origin"], row["url"], Tokenizer(LANGUAGE))
        stemmer = Stemmer(LANGUAGE)
        summarizer = Summarizer(stemmer)
        summarizer.stop_words = get_stop_words(LANGUAGE)
        sentences = list()
        for sentence in summarizer(parser.document, SENTENCES_COUNT):
            sentences.append(str(sentence))
        summary = "\n".join(sentences)
        store.update_row(row["id"], {"summary_origin": summary, "status": 2})
def test_annotated_text():
    path = expand_resource_path("snippets/paragraphs.html")
    url = "http://www.snippet.org/paragraphs.html"
    parser = HtmlParser.from_file(path, url, Tokenizer("czech"))
    document = parser.document

    assert len(document.paragraphs) == 2

    assert len(document.paragraphs[0].headings) == 1
    assert len(document.paragraphs[0].sentences) == 1
    assert to_unicode(document.paragraphs[0].headings[0]) == "Toto je nadpis prvej úrovne"
    assert to_unicode(document.paragraphs[0].sentences[0]) == "Toto je prvý odstavec a to je fajn."

    assert len(document.paragraphs[1].headings) == 0
    assert len(document.paragraphs[1].sentences) == 2
    assert to_unicode(document.paragraphs[1].sentences[0]) == "Tento text je tu aby vyplnil prázdne miesto v srdci súboru."
    assert to_unicode(document.paragraphs[1].sentences[1]) == "Aj súbory majú predsa city."
def index():
    # url = "http://www.dawn.com/news/1216282"
    if request.method == 'POST':
        url = request.json.get('url')
        line_count = request.json.get('line_count')
        parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
        print(parser)
        # stemmer = Stemmer(LANGUAGE)
        # summarizer = Summarizer(stemmer)
        # summarizer.stop_words = get_stop_words(LANGUAGE)
        # s = ""
        # for sentence in summarizer(parser.document, SENTENCES_COUNT):
        #     s += str(sentence)
    return jsonify(dict(message='stuff'))
def _get_summary(self):
    if self.readable == '':
        return
    language = self.language.lower()
    if language == '':
        language = 'english'

    parser = HtmlParser.from_string(self.readable, self.url, Tokenizer(language))
    stemmer = Stemmer(language)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(language)

    summary = []
    for sentence in summarizer(parser.document, 10):
        if sentence.is_heading:
            summary.append('<h2>%s</h2>' % str(sentence))
        else:
            summary.append('<p>%s</p>' % str(sentence))
    self.summary = ''.join(summary)
def do_work(self, worker_id, work):
    """Greenlet to fetch and analyze URL content."""
    url = work
    print('[+] {0}: Starting crawl of {1}'.format(worker_id, url))

    # Using urllib2 via geventhttpclient. Selenium with PhantomJS or a real
    # browser would probably be better, but slower and more expensive. Scrapy
    # would also work, but that's way too heavy for this use-case.
    body = urlopen(url).read()

    # Using Sumy (built on nltk) for page summaries since it supports a number
    # of ranking algorithms. It's not perfect though: it was written for Czech,
    # so it's missing some important English-specific things (e.g.
    # bonus/significant words for Edmundson summarizers).
    # https://pypi.python.org/pypi/sumy
    # TextBlob might be a better alternative, but it didn't seem to provide
    # overall summary information. https://textblob.readthedocs.org/en/latest/
    parser = HtmlParser.from_string(body, None, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    words = []
    for sentence in summarizer(parser.document, 10):
        # Accumulate words from every summary sentence, not just the last one.
        words.extend(str(sentence).split())

    # Send the results.
    self.work_done(worker_id, words)
def download_sources(summarize=True, sources=currentFeeds):
    raw_documents = []
    complete_urls = []

    # Download news stories, stripping markup down to plain text.
    converter = html2text.HTML2Text()
    converter.ignore_links = True
    converter.ignore_images = True
    converter.bypass_tables = True

    count_error = 0
    document_count = 0
    feed_count = -1
    for url in sources:
        feed_count += 1
        current_feed_document = 0
        currentStories = []
        feed = feedparser.parse(url[1])
        for story in feed.entries:
            current_feed_document += 1
            if story.title.startswith(u'VIDEO:') or story.title.startswith(u'AUDIO'):
                continue
            if story.link in complete_urls:
                continue
            try:
                res = requests.get(story.link)
                html = res.text
                title = story.title
                completion = ((feed_count + (current_feed_document / float(len(feed.entries)))) / float(len(sources))) * 100
                print("[%.2f%%] \t %s - %s" % (completion, feed.feed.title, title))
                raw_text = converter.handle(html)
                if summarize:
                    parser = HtmlParser.from_string(html, None, Tokenizer("english"))
                    summarizer = LsaSummarizer(stem_word)
                    summarizer.stop_words = get_stop_words("english")
                    sum_text = [sentence for sentence in summarizer(parser.document, 20)]
                    raw_text = " ".join(str(sentence) for sentence in sum_text)
                stats = TextBlob(raw_text)
                currentStories.append((title, raw_text, story.link, stats.sentiment, story.published_parsed))
                complete_urls.append(story.link)
                document_count += 1
            except KeyboardInterrupt:
                print("Quitting from Keyboard Interrupt.")
                sys.exit(0)
            except Exception:
                count_error += 1
                print("\t Error occurred while processing that story:", sys.exc_info()[0])
                traceback.print_exc()
        raw_documents.append((url[0], currentStories))

    print("Received", document_count, "documents with", count_error, "errors")
    return raw_documents
def analyze_web_site(url):
    print("Main Points: %s \n" % url)
    parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
    analyze(parser)
def summarize(entry, count):
    # Strip the stray space sumy leaves before punctuation (lookahead keeps
    # the punctuation itself).
    clean = lambda sentence: re.sub(r' (?=[;,:.!?])', '', str(sentence))
    parser = HtmlParser.from_string(entry.content, entry.url, tokenizer)
    sentences = map(clean, summarizer(parser.document, count))
    return '<ul>{}</ul>'.format(''.join(
        '<li>{}</li>'.format(sentence) for sentence in sentences))
from __future__ import absolute_import
from __future__ import division, print_function, unicode_literals

from sumy.parsers.html import HtmlParser
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer as Summarizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words

# LANGUAGE = "english"
LANGUAGE = "czech"
SENTENCES_COUNT = 10

if __name__ == "__main__":
    # parser = PlaintextParser.from_file("yelp1.txt", Tokenizer(LANGUAGE))
    url = "http://www.zsstritezuct.estranky.cz/clanky/predmety/cteni/jak-naucit-dite-spravne-cist.html"
    parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        print(sentence)
from __future__ import absolute_import
from __future__ import division, print_function, unicode_literals

from sumy.parsers.html import HtmlParser
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer as Summarizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words
import lxml.html

list_of_pages = ['http://postach.us10.list-manage1.com/track/click?u=819841bd24897de296a130d94&id=1fbd285a11&e=01afa4fcef']

# sumy's language resources are keyed by lowercase names such as "english".
stemmer = Stemmer('english')
summarizer = Summarizer(stemmer)
summarizer.stop_words = get_stop_words('english')

if __name__ == "__main__":
    for url in list_of_pages:
        parser = HtmlParser.from_url(url, Tokenizer('english'))
        print(lxml.html.parse(url).find(".//title").text)
        print(url)
        for sentence in summarizer(parser.document, 2):
            print(sentence)
def summarize(url, sent_count=10):
    "Produces `sent_count` sentence summaries of `url`."
    parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
    return " ".join([str(sentence) for sentence in summarizer(parser.document, sent_count)])
def getSummaryFromWebsite(url, sentences_count):
    parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
    return summarize(parser, sentences_count)