def summarize(text):
    if isvalid(text):
        all_capital = False
        # An all-capital input can give empty output, so we lowercase the
        # text here and uppercase the summary again at the end.
        if text.upper() == text:
            text = text.lower()
            all_capital = True
        if PY2:
            parser = PlaintextParser.from_string(
                text.decode('ascii', errors='ignore'), Tokenizer(LANGUAGE))
        else:
            parser = PlaintextParser.from_string(
                text.encode().decode('ascii', errors='ignore'),
                Tokenizer(LANGUAGE))
        stemmer = Stemmer(LANGUAGE)
        summarizer = Summarizer(stemmer)
        summarizer.stop_words = get_stop_words(LANGUAGE)
        sentences = [
            str(s)
            for s in summarizer(parser.document, sentences_count=n_sentences)
        ]
        if all_capital:
            output_sentences = ' '.join(sentences).upper()
        else:
            output_sentences = ' '.join(sentences)
        return output_sentences
    else:
        return ''

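# The function above leans on module-level names that are not shown
# (PY2, LANGUAGE, n_sentences, isvalid, and the sumy imports). A minimal
# sketch of plausible context follows; the choice of the LSA summarizer,
# the n_sentences value, and the isvalid() helper are assumptions for
# illustration, not part of the original source.
import sys

from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.nlp.stemmers import Stemmer
from sumy.summarizers.lsa import LsaSummarizer as Summarizer
from sumy.utils import get_stop_words

PY2 = sys.version_info[0] == 2
LANGUAGE = "english"
n_sentences = 3  # assumed default; the original value is not shown


def isvalid(text):
    # Hypothetical stand-in: treat any non-empty string as valid input.
    return bool(text and text.strip())
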
def sum_from_string(string, language="english", sentences_count=100):
    parser = PlaintextParser.from_string(string, Tokenizer(language))
    stemmer = Stemmer(language)
    summarizer = Summarizer(stemmer)
    # The original assigned the stop words to a non-existent `stem_words`
    # attribute; sumy summarizers expect `stop_words`.
    summarizer.stop_words = get_stop_words(language)
    sentences = summarizer(parser.document, sentences_count)
    return sentences

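# Note that sum_from_string() returns sumy Sentence objects rather than
# plain strings. A hedged usage sketch, assuming the imports above:
summary_sentences = sum_from_string("Some long text ...", sentences_count=2)
summary_text = ' '.join(str(s) for s in summary_sentences)
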
def get_doc_summary(html, url):
    '''
    Parse the document text and extract a summary with a summarization
    algorithm. This is helpful when the meta-description tag is not
    available.
    '''
    from sumy.parsers.html import HtmlParser
    # from sumy.parsers.plaintext import PlaintextParser
    from sumy.nlp.tokenizers import Tokenizer
    from sumy.summarizers.text_rank import TextRankSummarizer as Summarizer
    from sumy.nlp.stemmers import Stemmer
    from sumy.utils import get_stop_words

    LANGUAGE = "english"
    SENTENCES_COUNT = 3

    parser = HtmlParser.from_string(html, url, Tokenizer(LANGUAGE))
    # or for plain text files
    # parser = PlaintextParser.from_file("document.txt", Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)

    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    res = ""
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        # Join with a space so sentences do not run together.
        res += str(sentence) + ' '
    return res.strip()

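# A hedged usage sketch for get_doc_summary(); fetching the page with
# `requests` is an assumption, and the URL is hypothetical. Any HTML
# source works, since the parser takes the markup as a string:
import requests

page_url = "https://example.com/article"
response = requests.get(page_url)
print(get_doc_summary(response.text, page_url))
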
def textSummary(data, SENTENCES_COUNT):
    LANGUAGE = "english"
    parser = PlaintextParser.from_string(data, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    x = ''
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        x += ' {}'.format(str(sentence))
    return x

def summarize(url, sent_count=10):
    """Automatic text summarizer

    https://pypi.python.org/pypi/sumy
    """
    lang = "english"
    parser = HtmlParser.from_url(url, Tokenizer(lang))
    stemmer = Stemmer(lang)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(lang)
    summary = [str(sent) for sent in summarizer(parser.document, sent_count)]
    return summary

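# Hedged usage sketch for the URL-based summarize() above; the URL is
# hypothetical and network access is assumed:
for line in summarize("https://example.com/news-story", sent_count=5):
    print(line)
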
def summarize(text):
    # `text.decode(...)` assumes a Python 2 byte string (or Python 3
    # bytes); on a Python 3 str this raises AttributeError.
    parser = PlaintextParser.from_string(
        text.decode('ascii', errors='ignore'), Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    sentences = [
        str(s)
        for s in summarizer(parser.document, sentences_count=n_sentences)
    ]
    return ' '.join(sentences)

def run_summarizer(parser, sentences, language='english'):
    """
    :param parser: Parser for the selected document type.
    :param sentences: Maximum number of sentences for the summarizer.
    :returns: Summarized page as a list of sentence strings.
    """
    summarizer = Summarizer(Stemmer(language))
    summarizer.stop_words = get_stop_words(language)
    return [
        str(sentence) for sentence in summarizer(parser.document, sentences)
    ]

def summarizer(parser, sentences, language='english'):
    """
    :param parser: Parser for the selected document type.
    :param sentences: Maximum number of sentences for the summarizer.
    :returns: Summarized page as a single string.
    """
    stemmer = Stemmer(language)
    # Note: this local name shadows the enclosing function's own name.
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(language)
    output = [
        str(sentence) for sentence in summarizer(parser.document, sentences)
    ]
    return ' '.join(output)

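# Hedged usage sketch for the two helpers above, assuming sumy's
# PlaintextParser and Tokenizer are imported at module level:
parser = PlaintextParser.from_string("Some long text ...",
                                     Tokenizer("english"))
print(summarizer(parser, sentences=2))
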
def apply(text, interest, top_k=5):
    """
    Return a tuple of two lists of tuples.

    The first list contains every sentence mentioning the interest and
    its corresponding weight in the document. The second list contains
    the top_k sentences of the document.
    """
    LANGUAGE = "english"
    parser = PlaintextParser(text, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)

    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    sent_importance = summarizer.rate_sentences(parser.document)
    interesting_sent = []
    for sent in sent_importance:
        if interest.lower() in sent._text.lower():
            interesting_sent.append((sent._text, sent_importance[sent]))

    top_sent = summarizer(parser.document, top_k)
    top_sent = [(s._text, sent_importance[s]) for s in top_sent]
    return (interesting_sent, top_sent)

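# Hedged usage sketch for apply(); the text and interest are made up.
# Note the function relies on a Summarizer exposing rate_sentences(),
# e.g. sumy's LexRankSummarizer:
document = "Rates rose again. Markets fell. Analysts expect more hikes."
by_interest, top = apply(document, interest="rates", top_k=2)
print(by_interest)
print(top)
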
def main():
    # Note: this snippet is Python 2 only (bare `print` statement and the
    # `str.translate(None, ...)` deletion form).
    LANGUAGE = "english"
    SENTENCES_COUNT = 2
    stop = set(stopwords.words('english'))

    # Retrieve each of the articles.
    articles = os.listdir("../data/articles")
    count = 0
    for article in articles:
        stdout.write("\rProgress: {:02.0f}%".format(
            float(count) / len(articles) * 100))
        stdout.flush()
        # print 'Reading articles/' + article
        # articleFile = io.open('articles/' + article, 'r')
        parser = PlaintextParser.from_file(
            os.path.abspath(os.path.join("../data/articles", article)),
            Tokenizer(LANGUAGE))
        stemmer = Stemmer(LANGUAGE)
        summarizer = Summarizer(stemmer)
        summarizer.stop_words = get_stop_words(LANGUAGE)

        summary = ""
        file_name = os.path.splitext(article)[0].split('.')[0]
        for sentence in summarizer(parser.document, SENTENCES_COUNT):
            summary += str(sentence)

        summary_tokens = [
            token.lower().translate(None, punctuation)
            for token in word_tokenize(summary)
            if token not in punctuation and token.lower() not in stop
            and token != "'s"
        ]

        with open(os.path.join("results", file_name + ".txt"),
                  "w") as keywords_file:
            keywords_file.write('\n'.join(set(summary_tokens)))
        count += 1

    print "\nDone..."

def summarize():
    final = []

    # Check the integrity of the url query parameter.
    url = request.args.get('url')
    if url is None or url == "":
        return abort(400)

    # Check the integrity of the num query parameter and clamp it to the
    # allowed range.
    try:
        num = int(request.args.get('num'))
        num = MIN_SENTENCES_COUNT if num < MIN_SENTENCES_COUNT else num
        num = MAX_SENTENCES_COUNT if num > MAX_SENTENCES_COUNT else num
    except (ValueError, TypeError):
        num = MIN_SENTENCES_COUNT

    # Handle the case where url is not a valid URL by retrying with an
    # explicit scheme.
    try:
        parser = Parser.from_url(url, Tokenizer(LANGUAGE))
    except (requests.exceptions.MissingSchema,
            requests.exceptions.HTTPError):
        try:
            parser = Parser.from_url("http://" + url, Tokenizer(LANGUAGE))
        except Exception:
            return "URL is not valid.", 403

    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    # Take each sentence and append it; unidecode converts Unicode
    # characters into ASCII.
    for sentence in summarizer(parser.document, num):
        final.append(unidecode(str(sentence)))

    return json.dumps({"title": parser.get_title(), "content": final})

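# The route above assumes Flask plumbing and module-level names that are
# not shown. A minimal sketch of plausible context; the Parser alias, the
# summarizer choice, the count limits, and the route path are assumptions.
# Also note that parser.get_title() suggests a custom parser subclass,
# since sumy's stock HtmlParser does not document such a method:
import json

import requests
from flask import Flask, request, abort
from unidecode import unidecode
from sumy.parsers.html import HtmlParser as Parser
from sumy.nlp.tokenizers import Tokenizer
from sumy.nlp.stemmers import Stemmer
from sumy.summarizers.lsa import LsaSummarizer as Summarizer
from sumy.utils import get_stop_words

app = Flask(__name__)
LANGUAGE = "english"
MIN_SENTENCES_COUNT = 3
MAX_SENTENCES_COUNT = 20

# The view would then be registered as, e.g.:
# @app.route('/summarize')
# def summarize(): ...
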
from __future__ import absolute_import
from __future__ import division, print_function, unicode_literals

from sumy.parsers.html import HtmlParser
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.text_rank import TextRankSummarizer as Summarizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words

import nltk
# nltk.download()

LANGUAGE = "english"
SENTENCES_COUNT = 1

if __name__ == "__main__":
    url = "http://www.businessinsider.in/Heres-a-super-quick-guide-to-what-traders-are-talking-about-right-now/articleshow/59387381.cms"
    parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
    # or for plain text files
    # parser = PlaintextParser.from_file("document.txt", Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)

    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    print(parser.document)
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        print(sentence)