Example #1
def get_data_list(URL, file_type=""):
    SUMMARY_SENTENCES_COUNT = 5
    sentences = []
    try:
        LANGUAGE = "english"
        # parser = None
        if file_type == "txt":
            parser = HtmlParser.from_string(URL, None, Tokenizer(LANGUAGE))
        elif file_type == "pdf":
            content = read_pdf(URL)
            parser = HtmlParser.from_string(content, None, Tokenizer(LANGUAGE))
        else:
            parser = HtmlParser.from_url(URL, Tokenizer(LANGUAGE))

        document = parser.document
        stemmer = Stemmer(LANGUAGE)

        from sumy.summarizers.luhn import LuhnSummarizer

        LHS = LuhnSummarizer(stemmer)
        LHS.stop_words = get_stop_words(LANGUAGE)
        print("\nSummary using Luhn Summarizer")
        print("*******************************")
        for sentence in LHS(document, SUMMARY_SENTENCES_COUNT):
            sentences.append(str(sentence))
    except Exception as e:
        print(str(e))
    return sentences
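Example #1 calls a read_pdf helper that is not shown. A minimal sketch, assuming the pypdf package (any PDF-to-text function with the same shape would do):

from pypdf import PdfReader

def read_pdf(path):
    # Concatenate the extracted text of every page in the PDF.
    reader = PdfReader(path)
    return "\n".join(page.extract_text() or "" for page in reader.pages)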
Example #2
    def get_summary(self, text_source: str, num_sentences: int = 5) -> list:
        # url = "https://www.cbc.ca/news/canada/toronto/skinny-dipping-sharks-ripleys-1.4862945"
        parser = HtmlParser.from_url(text_source, self.Tokenizer)

        doc = parser.document

        return self.Summarizer(doc, num_sentences)
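Example #2 assumes the enclosing class stores a tokenizer and a summarizer on self. A hypothetical constructor that would make the method work (the LSA choice is an assumption; the original class is not shown):

    def __init__(self, language: str = "english"):
        from sumy.nlp.tokenizers import Tokenizer
        from sumy.nlp.stemmers import Stemmer
        from sumy.summarizers.lsa import LsaSummarizer
        from sumy.utils import get_stop_words
        # Assumed setup: a sumy tokenizer plus an LSA summarizer with stop words.
        self.Tokenizer = Tokenizer(language)
        self.Summarizer = LsaSummarizer(Stemmer(language))
        self.Summarizer.stop_words = get_stop_words(language)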
Example #3
def add_new_entry():
    import nltk
    nltk.download("punkt")
    from sumy.parsers.html import HtmlParser
    from sumy.parsers.plaintext import PlaintextParser
    from sumy.nlp.tokenizers import Tokenizer
    from sumy.summarizers.lsa import LsaSummarizer as Summarizer
    from sumy.nlp.stemmers import Stemmer
    from sumy.utils import get_stop_words
    from bson.json_util import dumps, loads  # serializes the Mongo cursor printed below
    urls = []  # populate with the URLs to summarize
    for url in urls:
        LANGUAGE = "english"
        SENTENCES_COUNT = 1
        parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
        stemmer = Stemmer(LANGUAGE)
        summarizer = Summarizer(stemmer)
        summarizer.stop_words = get_stop_words(LANGUAGE)
        my_summary = []
        for sentence in summarizer(parser.document, SENTENCES_COUNT):
            my_summary.append(sentence)
        print(my_summary)

        # `mongo` is assumed to be a configured Flask-PyMongo instance.
        mongo.db.summaries.insert_one({
            "sentence": str(my_summary[0]).split(),
            "url": url
        })

        #print((str(my_summary[0])).split())
    vals = mongo.db["summaries"]
    cursor = vals.find({})
    print({"vals": loads(dumps(cursor))})
Example #4
def get_doc_summary(html, url):
    '''
    Parse document text and extract a summary with summarization
    algorithms. This is helpful when the meta-description tag is
    not available.
    '''
    from sumy.parsers.html import HtmlParser
    # from sumy.parsers.plaintext import PlaintextParser
    from sumy.nlp.tokenizers import Tokenizer
    from sumy.summarizers.text_rank import TextRankSummarizer as Summarizer
    from sumy.nlp.stemmers import Stemmer
    from sumy.utils import get_stop_words

    LANGUAGE = "english"
    SENTENCES_COUNT = 3

    parser = HtmlParser.from_string(html, url, Tokenizer(LANGUAGE))
    # or for plain text files
    # parser = PlaintextParser.from_file("document.txt", Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)

    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    res = ""
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        res += str(sentence)
    return res
Example #5
def url(request):
    if (request.GET.get('url', 'url').lower() not in ['url', 'image']):
        url = request.GET.get('url', 'url')
        print(url)
        LANGUAGE = "english"
        SENTENCES_COUNT = 5
        out = []
        parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
        stemmer = Stemmer(LANGUAGE)
        summarizer = Summarizer(stemmer)
        summarizer.stop_words = get_stop_words(LANGUAGE)
        for sentence in summarizer(parser.document, SENTENCES_COUNT):
            out.append(str(sentence))
        r = requests.get(url)
        test = url.split("/")
        urlval = str('/'.join(test[:3]))
        data = r.text
        soup = BeautifulSoup(data, "lxml")
        temp = []
        for link in soup.find_all('img'):
            image = link.get("src")
            if image:  # skip img tags without a src attribute
                temp.append(image)
        for loc, i in enumerate(temp):
            if i[0] == "/":
                temp[loc] = urlval + temp[loc]
        return ({'content': str("\n".join(out)) + '  '.join(temp)})
Example #6
def summarize_article(article, vibe_description_file_path):
    try:
        article_url = article['alternate'][0]['href']
        article_title = article['title']

        article_newspaper = Article(article_url)
        article_newspaper.download()
        article_newspaper.parse()
        article_newspaper.nlp()

        text_content = article_newspaper.text
        update_json_file(vibe_description_file_path, 'textContent',
                         text_content)

        LANGUAGE = 'english'
        parser = HtmlParser.from_url(article_url, Tokenizer(LANGUAGE))
        stemmer = Stemmer(LANGUAGE)

        summarizer = Summarizer(stemmer)
        summarizer.stop_words = get_stop_words(LANGUAGE)
        article_summary = []
        for sentence in summarizer(parser.document, 3):
            article_summary.append(sentence._text)

    except Exception as e:
        print('Error summarizing article:', e)
        return False

    update_json_file(vibe_description_file_path, 'summary', article_summary)
    update_json_file(vibe_description_file_path, 'keywords',
                     article_newspaper.keywords)

    return True
Example #7
    def ExtractivelySummarizeCorpus(self,
                                    corpus_path: str,
                                    HTML: bool = True,
                                    sentence_count: int = 20):

        if HTML:
            self.parser = HtmlParser.from_url(corpus_path, Tokenizer(LANGUAGE))
        else:
            # or for plain text files
            self.parser = PlaintextParser.from_file(corpus_path,
                                                    Tokenizer(LANGUAGE))

        sentences = self.summarizer(self.parser.document, sentence_count)

        if DEBUG:
            # logger.info("DEBUG::ExtractivelySummarizeCorpus::these are all the parser.document.sentences")
            # logger.info(self.parser.document.sentences)
            logger.info(
                "DEBUG::ExtractivelySummarizeCorpus::top n=%d sentences:" %
                sentence_count)
            for sentence in sentences:
                logger.info(str(sentence))
        sentences = [str(sentence) for sentence in sentences]

        return sentences
Example #8
def sum_from_url(url, language="english", sentences_count=100):
    parser = HtmlParser.from_url(url, Tokenizer(language))
    stemmer = Stemmer(language)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(language)  # was mistakenly assigned to `stem_words`
    sentences = summarizer(parser.document, sentences_count)
    return sentences
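Note that sum_from_url returns sumy Sentence objects rather than plain strings; a usage sketch (the URL is illustrative):

summary = sum_from_url("https://en.wikipedia.org/wiki/Automatic_summarization",
                       sentences_count=5)
print(" ".join(str(s) for s in summary))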
Example #9
def auto_summarize_comment(request):
    
    comment_ids = request.POST.getlist('d_ids[]')
    
    sent_list = []
    
    for comment_id in comment_ids:
        comment = Comment.objects.get(id=comment_id)
        text = comment.text
        
        text = re.sub('<br>', ' ', text, flags=re.IGNORECASE)

        parser = HtmlParser.from_string(text, '', Tokenizer("english"))

        num_sents = request.GET.get('num_sents', None)
        if num_sents:
            num_sents = int(num_sents)  # query parameters arrive as strings
        else:
            all_sents = parser.tokenize_sentences(text)
            num_sents = int(floor(float(len(all_sents)) / 3.0))

        # `summarizer` is assumed to be configured at module level.
        sents = summarizer(parser.document, num_sents)
         
        
        for sent in sents:
            sent_list.append(sent._text)
     
    return JsonResponse({"sents": sent_list})
Example #10
def summarize(url=None, LANGUAGE='english', SENTENCES_COUNT=2):
    parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
    # or for plain text files
    # parser = PlaintextParser.from_file("document.txt", Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)

    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    result = ''
    try:
        for sentence in summarizer(parser.document, SENTENCES_COUNT):
            result = result + ' ' + str(sentence)
    except Exception:
        print(
            '\n\n Invalid entry! Please ensure you enter a valid web link \n\n'
        )
        sys.stdout.flush()
        return (
            '\n\n Invalid entry! Please ensure you enter a valid web link \n\n'
        )
    print('\n\n' + str(url) + '\n\n' + str(result))
    sys.stdout.flush()
    return result
Example #11
def summarizeFile(inputFile):
	summarizer = LsaSummarizer(stem_word)
	summarizer.stop_words = get_stop_words("english")
	url = findURLS(inputFile)
	if url is not None:
		if url[-1] == '.':
			url = url[0:-1]
		#print (url)
		#urlContent = 'Summary from URL ['+url+']: \n'
		urlContent = ''
		try:
			parser = HtmlParser.from_url(url, Tokenizer("english"))		
			for sentence in summarizer(parser.document, 3):
				urlContent = urlContent + str(sentence) + '\n'
		except Exception:
			#print (sys.exc_info()[0])
			urlContent = ''
	content = inputFile.read()
	parser = PlaintextParser.from_string(content, Tokenizer(LANGUAGE))
	#summarizer = LsaSummarizer(stem_word)
	#summarizer.stop_words = get_stop_words(LANGUAGE)
	#summary = 'Event Summary: \n'
	summary = ''
	try:
		for sentence in summarizer(parser.document, SENTENCES_COUNT_1):
			summary = summary + str(sentence) + '\n'
	except AssertionError:
		return None
	if url is not None:
		return summary + urlContent
	return summary
Example #12
    def summarize(self, method='luhn'):
        """ Summarize text """

        method = self._check_method(method)

        if self.url:
            parser = HtmlParser.from_url(self.url, Tokenizer(self.LANGUAGE))
        elif self.html:
            parser = HtmlParser(self.html, Tokenizer(self.LANGUAGE))
        stemmer = Stemmer(self.LANGUAGE)
        summarizer = method(stemmer)
        summarizer.stop_words = get_stop_words(self.LANGUAGE)
        sumy = summarizer(parser.document, self.SENTENCES_COUNT)
        summary = ' '.join(str(i) for i in sumy)

        return summary
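The _check_method helper called above is not shown; a plausible sketch (an assumption, not the original) that maps method names to sumy summarizer classes:

    def _check_method(self, method):
        from sumy.summarizers.luhn import LuhnSummarizer
        from sumy.summarizers.lsa import LsaSummarizer
        from sumy.summarizers.lex_rank import LexRankSummarizer
        from sumy.summarizers.text_rank import TextRankSummarizer
        methods = {
            'luhn': LuhnSummarizer,
            'lsa': LsaSummarizer,
            'lexrank': LexRankSummarizer,
            'textrank': TextRankSummarizer,
        }
        # Fall back to Luhn for unrecognized names.
        return methods.get(method.lower(), LuhnSummarizer)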
Example #13
def CreateDataSet(w):
    try:
        urls = url(w) 
        for link in urls:
            if link not in allExtLinks:
                find_about = link
                # Create a list of each bit between slashes
                slashparts = find_about.split('/')
                dirname = '/'.join(slashparts[:-1]) + '/'
                if "about" in slashparts:
                    scrapped_about.append(link)
                    print('\n',link)

                    for about in scrapped_about:
                        parser = HtmlParser.from_url(link, Tokenizer("english"))
                        summary = summarizer(parser.document, 2)
        #                 print(l, '\n')
                        #saving the summary to a dataframe
                        for sentence in summary:        
                            print(sentence, '\n')
                        break 
        else:
            print('There are no "about" linked pages in this URL')
    except Exception:
        print('There seems to be an issue with the URL you entered')
        quit()
Example #14
def summarize_url(url, summarizer):
    # E.G. url = "http://www.cnn.com/2016/06/12/politics/hillary-clinton-bernie-sanders-meeting-tuesday/index.html"
    print('Summarizing', url)
    parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
    # or for plain text files
    # parser = PlaintextParser.from_file("document.txt", Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)

    if summarizer == 'luhn':
        summarizer = LuhnSummarizer(stemmer)
    elif summarizer == 'edmundson':
        summarizer = ESummarizer(stemmer)
    elif summarizer == 'lsa':
        summarizer = LsaSummarizer(stemmer)
    elif summarizer == 'lex':
        summarizer = LexSummarizer(stemmer)
    elif summarizer == 'text':
        summarizer = TextSummarizer(stemmer)
    elif summarizer == 'sb':
        summarizer = SumBasicSummarizer(stemmer)
    else:
        summarizer = KLSummarizer(stemmer)

    summarizer.stop_words = get_stop_words(LANGUAGE)
    print(summarizer)

    sentences = []
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        print(sentence)
        sentences.append(str(sentence))

    return sentences
Example #15
def summarizer(input_obj, SENTENCES_COUNT=2, op='url'):
    LANGUAGE = "english"
    # SENTENCES_COUNT = 1
    # url =  "https://sea.pcmag.com/smartphones/17424/apple-iphone-x"

    # text = ' '.join(text.split())
    # print(input_obj)
    # print(type(input_obj))
    parser = None
    if op == 'text':
        text = input_obj['text']
        parser = PlaintextParser.from_string(text, Tokenizer(LANGUAGE))
    elif op == 'url':
        url = input_obj['link']
        parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
    else:
        raise ValueError("op must be 'text' or 'url'")
    stemmer = Stemmer(LANGUAGE)

    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    sentences = []
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        # print(sentence)
        sentences.append(str(sentence))
    return sentences


# print(summarizer({'link': "https://sea.pcmag.com/smartphones/17424/apple-iphone-x"}))
Example #16
    def get_summary(self, summary_length: int = 10) -> Iterator[str]:
        parser = HtmlParser.from_url(self.link, Tokenizer(LANGUAGE))
        stemmer = Stemmer(LANGUAGE)
        summarizer = Summarizer(stemmer)
        summarizer.stop_words = get_stop_words(LANGUAGE)
        for sentence in summarizer(parser.document, summary_length):
            yield str(sentence)  # cast so the Iterator[str] annotation holds
Example #18
def get_sentences(url, sentences_count=10):
    """
    Returns the important sentences given a url
    """
    parser = HtmlParser.from_url(url, Tokenizer(language))
    sentences = summarizer(parser.document, sentences_count)
    return sentences
Example #19
def summarizer(request):
    inp = request.POST['geturl']

    LANGUAGE = "english"
    SENTENCES_COUNT = 10

    url = str(inp)

    with open("denemedosyasiU3.txt", "w") as f:
        f.write(url)

    parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)

    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    sentences = [str(sentence) for sentence in summarizer(parser.document, SENTENCES_COUNT)]
    return render(request, 'home.html', {'data1': " ".join(sentences)})
Example #20
def main(url, num_sentences=10, language='english'):
	parser = HtmlParser.from_url(url, Tokenizer(language))
	stemmer = Stemmer(language)
	summarizer = Summarizer(stemmer)
	summarizer.stop_words = get_stop_words(language)
	for sentence in summarizer(parser.document, num_sentences):
		print(sentence)
Example #21
    def test_annotated_text(self):
        path = expand_resource_path("snippets/paragraphs.html")
        url = "http://www.snippet.org/paragraphs.html"
        parser = HtmlParser.from_file(path, url, Tokenizer("czech"))

        document = parser.document

        self.assertEqual(len(document.paragraphs), 2)

        self.assertEqual(len(document.paragraphs[0].headings), 1)
        self.assertEqual(len(document.paragraphs[0].sentences), 1)

        self.assertEqual(to_unicode(document.paragraphs[0].headings[0]),
                         "Toto je nadpis prvej úrovne")
        self.assertEqual(to_unicode(document.paragraphs[0].sentences[0]),
                         "Toto je prvý odstavec a to je fajn.")

        self.assertEqual(len(document.paragraphs[1].headings), 0)
        self.assertEqual(len(document.paragraphs[1].sentences), 2)

        self.assertEqual(
            to_unicode(document.paragraphs[1].sentences[0]),
            "Tento text je tu aby vyplnil prázdne miesto v srdci súboru.")
        self.assertEqual(to_unicode(document.paragraphs[1].sentences[1]),
                         "Aj súbory majú predsa city.")
Example #22
    def store_summary(self):
        
        for item in self.doc_id_url:
            if item < len(self.document_info):
                #soup = self.document_info[item]
                s = requests.Session()
                response = s.get(self.doc_id_url[item])
                if response.status_code != 404:
                    parser = HtmlParser.from_url(self.doc_id_url[item], Tokenizer("english"))
                    text = ""
                    """
                    for tag in soup.findAll('p'):
                        text = text + tag.text
                    """
                    stemmer = Stemmer("english")

                    summarizer = Summarizer(stemmer)
                    summarizer.stop_words = get_stop_words("english")
                    
                    for sentence in summarizer(parser.document, 5):
                        print(sentence)
                        if item in self.summary:
                            self.summary[item] = self.summary[item] + str(sentence)
                        else:
                            self.summary[item] = str(sentence)
Example #23
def sumySummary(url):
    parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    return [
        cleanText(str(s)) for s in summarizer(parser.document, SENTENCES_COUNT)
    ]
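The cleanText helper assumed by sumySummary above is not shown; a minimal sketch that just normalizes whitespace:

def cleanText(text):
    # Collapse runs of whitespace and trim the ends.
    return " ".join(text.split())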
Example #24
def get_summ(url, func=Summarizer3):
    parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = func(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    sumy = summarizer(parser.document, SENTENCES_COUNT)
    result = [str(i) for i in list(sumy)]
    return result
Example #25
def urlDoc_summarize(url):
    parser = HtmlParser.from_url(url, Tokenizer('english'))
    stemmer = Stemmer('english')
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words('english')
    summary = ''
    for sentence in summarizer(parser.document, 15):
        summary += str(sentence) + ' '
    return summary
Example #26
def main():
    url = "http://www.spiegel.de/international/europe/as-brexit-nears-harrassment-of-eu-citizens-in-uk-rises-a-1181845.html"
    parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    run_LSA(stemmer, parser.document)
    run_LexRank(stemmer, parser.document)
    run_TextRank(stemmer, parser.document)
    run_Luhn(stemmer, parser.document)
    run_SumBasic(stemmer, parser.document)
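The run_* helpers in Example #26 are not shown; a minimal sketch of one of them, assuming each builds the named summarizer and prints its top sentences (LANGUAGE and get_stop_words come from the surrounding module):

from sumy.summarizers.lsa import LsaSummarizer

def run_LSA(stemmer, document, sentences_count=5):
    summarizer = LsaSummarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    print("--- LSA ---")
    for sentence in summarizer(document, sentences_count):
        print(sentence)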
Example #27
    def summarize_url(self, url, sentences=3, language="english"):
        parser = HtmlParser.from_url(url, Tokenizer(language))
        stemmer = Stemmer(language)

        summarizer = Summarizer(stemmer)
        summarizer.stop_words = get_stop_words(language)

        text = " ".join(map(str, summarizer(parser.document, sentences)))
        return " ".join(text.split())
Example #28
def __init__():
    LANGUAGE = "english"
    SENTENCES_COUNT = 1


    stemmer = Stemmer(LANGUAGE)

    lsaSummarizer = Lsa(stemmer)
    lsaSummarizer.stop_words = get_stop_words(LANGUAGE)
    luhnSummarizer = Luhn(stemmer)
    luhnSummarizer.stop_words = get_stop_words(LANGUAGE)
    # edmundsonSummarizer.bonus_words = get_bonus_words

    lexrankSummarizer = LexRank(stemmer)
    lexrankSummarizer.stop_words = get_stop_words(LANGUAGE)

    textrankSummarizer = TxtRank(stemmer)
    textrankSummarizer.stop_words = get_stop_words(LANGUAGE)

    sumbasicSummarizer = SumBasic(stemmer)
    sumbasicSummarizer.stop_words = get_stop_words(LANGUAGE)


    klSummarizer = KL(stemmer)
    klSummarizer.stop_words = get_stop_words(LANGUAGE)

    # `text` is assumed to be defined by the caller / enclosing scope.
    parser = HtmlParser.from_string(text, None, Tokenizer(LANGUAGE))

    allvariations = []

    for sentence in lsaSummarizer(parser.document, SENTENCES_COUNT):
        #print("Summarizing text via LSA: ")
        print(str(sentence))
        allvariations.append(sentence)
    for sentence in luhnSummarizer(parser.document, SENTENCES_COUNT):
        #print("Summarizing text via Luhn: ")
        print(str(sentence))
        allvariations.append(sentence)
    for sentence in lexrankSummarizer(parser.document, SENTENCES_COUNT):
        #print("Summarizing text via Lexrank: ")
        print(str(sentence))
        allvariations.append(sentence)
    for sentence in textrankSummarizer(parser.document, SENTENCES_COUNT):
        #print("Summarizing text via Textrank: ")
        print(str(sentence))
        allvariations.append(sentence)
    for sentence in sumbasicSummarizer(parser.document, SENTENCES_COUNT):
        #print("Summarizing text via Sumbasic: ")
        print(str(sentence))
        allvariations.append(sentence)
    for sentence in klSummarizer(parser.document, SENTENCES_COUNT):
        #print("Summarizing text via klSum: ")
        print(str(sentence))
        allvariations.append(sentence)
    return allvariations
Example #29
def get_summary(url):
    parser = HtmlParser.from_url(url, Tokenizer('english'))
    stemmer = Stemmer('english')
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words('english')

    print("\nThree Sentence Summary:\n")
    for sentence in summarizer(parser.document, 3):
        print(sentence)
Example #30
def get_summary(html):
    parser = HtmlParser.from_string(html, tokenizer=Tokenizer(LANGUAGE), url=None)
    stemmer = Stemmer(LANGUAGE)

    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        print(sentence)
Example #31
def getSentencesFrom(url):
    lang = "english"
    try:
        parser = HtmlParser.from_url(url, Tokenizer(lang))
    except Exception:
        print("HTTP ERROR @ " + url)
        return []
    return [str(sentence) for sentence in parser.document.sentences]
Example #32
def summarize(url):
    summary = []
    parser = HtmlParser.from_url(url, Tokenizer(lang))
    stemmer = Stemmer(lang)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(lang)
    for sentence in summarizer(parser.document, sent):
        summary.append(sentence._text)
    return ' '.join(summary)
Example #33
    def summCallback(self, url2open):
        parser = HtmlParser.from_url(url2open, Tokenizer("english"))
        stemmer = Stemmer("english")

        summarizer = Summarizer(stemmer)
        summarizer.stop_words = get_stop_words("english")

        self.area.delete("0.0", END)
        for sentence in summarizer(parser.document, 10):
            self.area.insert(END, sentence)
Example #34
def summarize(doc, SENTENCES_COUNT):
    parser = HtmlParser.from_string(doc, None, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    summary = ""
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        if not str(sentence).strip().startswith("Image copyright"):
            summary += (" " + str(sentence))
    return summary
Example #35
def retreive_sumy(url):
    # "http://en.wikipedia.org/wiki/Automatic_summarization"
    parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
    # or for plain text files
    # parser = PlaintextParser.from_file("document.txt", Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)

    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)	
    return summarizer(parser.document, SENTENCES_COUNT)
Example #36
    def summarizeUrl(self, url, numSentences=10):
        """Summarizes the text at a given url to numSentences."""
        #parser = PlaintextParser.from_string(body, Tokenizer(self.LANG))
        parser = HtmlParser.from_url(url, Tokenizer(self.LANG))
        stemmer = Stemmer(self.LANG)
        summarizer = SumySummarizer(stemmer)
        summarizer.stop_words = get_stop_words(self.LANG)
        summary = ' '.join(str(sentence) for sentence in summarizer(parser.document, numSentences))
        return summary
Example #37
def summarize(method, length, url):
    html_content = fetch_url(url)
    iso_lang = detect_language(html_content)
    language = SUMY_LANGUAGES[iso_lang]
    stemmer = Stemmer(language)
    parser = HtmlParser.from_string(html_content, url, Tokenizer(language))

    summarizer_class = AVAILABLE_METHODS[method]
    summarizer = build_summarizer(summarizer_class, get_stop_words(language), stemmer, parser)

    sentences = summarizer(parser.document, ItemsCount(length))
    summary = ' '.join(str(sentence) for sentence in sentences)
    return summary, iso_lang
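A plausible sketch of the build_summarizer helper used above (an assumption; the original is not shown). It constructs the chosen summarizer and attaches the stop words; `parser` is accepted for parity with the call site, e.g. for Edmundson-style summarizers that need extra word lists:

def build_summarizer(summarizer_class, stop_words, stemmer, parser):
    summarizer = summarizer_class(stemmer)
    summarizer.stop_words = stop_words
    return summarizer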
Example #38
    def getText(self, sentence_count=None):
        if sentence_count:
            self.SENTENCE_COUNT = sentence_count
        parser = HtmlParser.from_url(self.url, Tokenizer(self.LANGUAGE))
        # or for plain text files
        # parser = PlaintextParser.from_file("document.txt", Tokenizer(LANGUAGE))
        stemmer = Stemmer(self.LANGUAGE)
        summarizer = Summarizer(stemmer)
        summarizer.stop_words = get_stop_words(self.LANGUAGE)
        text_list = []

        for sentence in summarizer(parser.document, self.SENTENCE_COUNT):
            text_list.append(str(sentence))
        return "\n".join(text_list)
Example #39
def do():
    rows = store.get_row_by_status(1)

    for row in rows:
        parser = HtmlParser.from_string(row["content_origin"], row["url"], Tokenizer(LANGUAGE))
        stemmer = Stemmer(LANGUAGE)
        summarizer = Summarizer(stemmer)
        summarizer.stop_words = get_stop_words(LANGUAGE)

        sentences = list()

        for sentence in summarizer(parser.document, SENTENCES_COUNT):
            sentences.append(str(sentence))

        summary = "\n".join(sentences)

        store.update_row(row["id"], {"summary_origin": summary, "status": 2})
Example #40
def test_annotated_text():
    path = expand_resource_path("snippets/paragraphs.html")
    url = "http://www.snippet.org/paragraphs.html"
    parser = HtmlParser.from_file(path, url, Tokenizer("czech"))

    document = parser.document

    assert len(document.paragraphs) == 2

    assert len(document.paragraphs[0].headings) == 1
    assert len(document.paragraphs[0].sentences) == 1

    assert to_unicode(document.paragraphs[0].headings[0]) == "Toto je nadpis prvej úrovne"
    assert to_unicode(document.paragraphs[0].sentences[0]) == "Toto je prvý odstavec a to je fajn."

    assert len(document.paragraphs[1].headings) == 0
    assert len(document.paragraphs[1].sentences) == 2

    assert to_unicode(document.paragraphs[1].sentences[0]) == "Tento text je tu aby vyplnil prázdne miesto v srdci súboru."
    assert to_unicode(document.paragraphs[1].sentences[1]) == "Aj súbory majú predsa city."
Example #41
def index():
    # url = "http://www.dawn.com/news/1216282"
    # -------------------------------------------------------------------------------
    # -------  Need help here ------------------#
    if request.method == 'POST':
         url = request.json.get('url')
         line_count = request.json.get('line_count')
    # ---------------------------------------------------------------------------

         parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
         print(parser)
    # stemmer = Stemmer(LANGUAGE)
    #
    # summarizer = Summarizer(stemmer)
    # summarizer.stop_words = get_stop_words(LANGUAGE)

    # s = ""
    # for sentence in summarizer(parser.document, SENTENCES_COUNT):
    #     s += str(sentence)

    return jsonify(dict(message='stuff'))
Example #42
    def _get_summary(self):
        if self.readable == '':
            return

        language = self.language.lower()
        if language == '':
            language = 'english'

        parser = HtmlParser.from_string(
            self.readable, self.url, Tokenizer(language))
        stemmer = Stemmer(language)
        summarizer = Summarizer(stemmer)
        summarizer.stop_words = get_stop_words(language)
        summary = []
        for sentence in summarizer(parser.document, 10):
            if sentence.is_heading:
                summary.append('<h2>%s</h2>' % str(sentence))
            else:
                summary.append('<p>%s</p>' % str(sentence))

        self.summary = ''.join(summary)
Example #43
    def do_work(self, worker_id, work):
        """Greenlet to fetch and analyze URL content."""
        url = work
        print('[+] {0}: Starting crawl of {1}'.format(worker_id, url))

        """Using urllib2 via geventhttpclient. Selenium with
        PhantomJS or a real browser would probably be better
        but slower and more expensive. Could also have used
        scrapy, but that's way too heavy for this use-case."""
        body = urlopen(url).read()

        """Using Sumy (built on nltk) for page summaries since
        it supports a number of ranking algorithms. It's not
        perfect though: it was written for Czech, so it's
        missing some important English-specific things (e.g.
        bonus/significant words for Edmundson summarizers).

        https://pypi.python.org/pypi/sumy

        TextBlob might be a better alternative, but it didn't
        seem to provide overall summary information.

        https://textblob.readthedocs.org/en/latest/
        """
        parser = HtmlParser.from_string(body, None, Tokenizer(LANGUAGE))
        stemmer = Stemmer(LANGUAGE)

        summarizer = Summarizer(stemmer)
        summarizer.stop_words = get_stop_words(LANGUAGE)

        words = []
        for sentence in summarizer(parser.document, 10):
            words.extend(str(sentence).split())

        # Send the results
        self.work_done(worker_id, words)
Example #44
def download_sources(summarize=True, sources=currentFeeds):
    raw_documents = []
    complete_urls = []

    # Download News Stories
    converter = html2text.HTML2Text()
    converter.ignore_links = True
    converter.ignore_images = True
    converter.bypass_tables = True

    count_error = 0
    document_count = 0

    feed_count = -1

    for url in currentFeeds:
        feed_count += 1
        current_feed_document = 0

        currentStories = []
        feed = feedparser.parse(url[1])
        for story in feed.entries:
            current_feed_document += 1

            if story.title.startswith(u'VIDEO:') or story.title.startswith(u'AUDIO'):
                continue
            if story.link in complete_urls:
                continue

            try:
                res = requests.get(story.link)

                html = res.text
                title = story.title

                completion = ((feed_count + (current_feed_document / float(len(feed.entries)))) / float(len(currentFeeds))) * 100

                print("[" + ("%.2f" % completion) + "%] \t " + feed.feed.title + " - " + title)

                raw_text = converter.handle(html)
                if summarize:
                    parser = HtmlParser.from_string(html, None, Tokenizer("english"))
                
                    summarizer = LsaSummarizer(stem_word)
                    summarizer.stop_words = get_stop_words("english")

                    sum_text = [sentence for sentence in summarizer(parser.document, 20)]
                    raw_text = " ".join(str(sentence) for sentence in sum_text)
                    # print raw_text

                stats = TextBlob(raw_text)
                currentStories.append((title, raw_text, story.link, stats.sentiment, story.published_parsed))
                complete_urls.append(story.link)

                document_count += 1

            except KeyboardInterrupt:
                print("Quitting from Keyboard Interrupt.")
                sys.exit(0)
            except Exception:
                count_error += 1
                print("\t Error occurred while processing that story:", sys.exc_info()[0])
                traceback.print_exc()

        raw_documents.append((url[0], currentStories))

    print("Received", document_count, "documents with", count_error, "errors")
    return raw_documents
Example #45
def analyze_web_site(url):
    print("Main Points: %s \n" % url)
    parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
    analyze(parser)    
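The analyze helper in Example #45 is not shown; a minimal sketch under the assumption that it prints an LSA-style summary of the parsed document (Stemmer, Summarizer, get_stop_words, and LANGUAGE are the module-level names used in the other examples):

def analyze(parser, sentences_count=5):
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    for sentence in summarizer(parser.document, sentences_count):
        print(sentence)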
Example #46
def summarize(entry, count):
    # Drop the stray space left before punctuation (assumed intent of the original pattern).
    clean = lambda sentence: re.sub(r' ([;,:.!?])', r'\1', str(sentence))
    parser = HtmlParser.from_string(entry.content, entry.url, tokenizer)
    sentences = map(clean, summarizer(parser.document, count))
    return '<ul>{}</ul>'.format(''.join(
        '<li>{}</li>'.format(sentence) for sentence in sentences))
Example #47
from __future__ import absolute_import
from __future__ import division, print_function, unicode_literals

from sumy.parsers.html import HtmlParser
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer as Summarizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words

# LANGUAGE = "english"
LANGUAGE = "czech"

SENTENCES_COUNT = 10

if __name__ == "__main__":
    # parser = PlaintextParser.from_file("yelp1.txt", Tokenizer(LANGUAGE))
    url = "http://www.zsstritezuct.estranky.cz/clanky/predmety/cteni/jak-naucit-dite-spravne-cist.html"
    parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        print(sentence)
Example #48
from __future__ import absolute_import
from __future__ import division, print_function, unicode_literals

from sumy.parsers.html import HtmlParser
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer as Summarizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words
import lxml.html


list_of_pages = ['http://postach.us10.list-manage1.com/track/click?u=819841bd24897de296a130d94&id=1fbd285a11&e=01afa4fcef']

stemmer = Stemmer('english')
summarizer = Summarizer(stemmer)
summarizer.stop_words = get_stop_words('english')

if __name__ == "__main__":
    for url in list_of_pages:
        parser = HtmlParser.from_url(url, Tokenizer('english'))
        print(lxml.html.parse(url).find(".//title").text)
        print(url)
        for sentence in summarizer(parser.document, 2):
            print(sentence)
Example #49
def summarize(url, sent_count=10):
    "Produces `sent_count` sentence summaries of `url`."
    parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
    return " ".join([str(sentence) for sentence
                     in summarizer(parser.document, sent_count)])
Example #50
def getSummaryFromWebsite(url, sentences_count):

    parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))

    return summarize(parser, sentences_count)