Example #1
def main(url, num_sentences=10, language='english'):
	parser = HtmlParser.from_url(url, Tokenizer(language))
	stemmer = Stemmer(language)
	summarizer = Summarizer(stemmer)
	summarizer.stop_words = get_stop_words(language)
	for sentence in summarizer(parser.document, num_sentences):
		print(sentence)
Example #2
File: iatv.py Project: mtpain/iatv
def summarize(text, n_sentences, sep='\n'):
    '''
    Args:
        text (str or file): the text itself, or an open file-like object containing it
        n_sentences (int): number of sentences to include in summary

    Kwargs:
        sep (str): separator to join summary sentences

    Returns:
        (str) n_sentences-long, automatically-produced summary of text
    '''

    if isinstance(text, str):
        parser = PlaintextParser.from_string(text, Tokenizer(LANGUAGE))
    elif hasattr(text, 'read'):  # file-like object (the Python 2 'file' builtin is gone in Python 3)
        parser = PlaintextParser.from_file(text, Tokenizer(LANGUAGE))
    else:
        raise TypeError('text must be either str or file')

    stemmer = Stemmer(LANGUAGE)

    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    return sep.join(str(s) for s in summarizer(parser.document, n_sentences))
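A hypothetical usage sketch for the summarize() helper above (not part of iatv.py): it assumes the module-level sumy imports and LANGUAGE constant the function relies on, with LsaSummarizer aliased to Summarizer as in the other examples on this page, and a made-up article string.

from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.nlp.stemmers import Stemmer
from sumy.summarizers.lsa import LsaSummarizer as Summarizer
from sumy.utils import get_stop_words

LANGUAGE = 'english'

article = ("Sumy extracts the most representative sentences from a text. "
           "It ships several algorithms, including LSA. "
           "This toy paragraph exists only so there is something to summarize.")
print(summarize(article, n_sentences=1, sep=' '))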
def summarizeFile(inputFile):
	summarizer = LsaSummarizer(stem_word)
	summarizer.stop_words = get_stop_words("english")
	url = findURLS(inputFile)
	if url is not None:
		if url.endswith('.'):
			url = url[:-1]
		#print (url)
		#urlContent = 'Summary from URL ['+url+']: \n'
		urlContent = ''
		try:
			parser = HtmlParser.from_url(url, Tokenizer("english"))		
			for sentence in summarizer(parser.document, 3):
				urlContent = urlContent + str(sentence) + '\n'
		except Exception:
			#print (sys.exc_info()[0])
			urlContent = ''
	content = inputFile.read()
	parser = PlaintextParser.from_string(content, Tokenizer(LANGUAGE))
	#summarizer = LsaSummarizer(stem_word)
	#summarizer.stop_words = get_stop_words(LANGUAGE)
	#summary = 'Event Summary: \n'
	summary = ''
	try:
		for sentence in summarizer(parser.document, SENTENCES_COUNT_1):
			summary = summary + str(sentence) + '\n'
	except AssertionError:
		return None
	if url is not None:
		return summary + urlContent
	return summary
Example #4
def summarize(string, summary_length = 1, language = "english"):
    string = string.lower() if string.isupper() else string
    parser = PlaintextParser.from_string(string, Tokenizer(language))
    stemmer = Stemmer(language)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(language)

    return ". ".join([str(sentence) for sentence in summarizer(parser.document, summary_length)]) 
Example #5
def test_single_sentence():
    document = build_document(("I am the sentence you like",))
    summarizer = LsaSummarizer()
    summarizer.stop_words = ("I", "am", "the",)

    sentences = summarizer(document, 10)
    assert len(sentences) == 1
    assert to_unicode(sentences[0]) == "I am the sentence you like"
Example #6
    def test_single_sentence(self):
        document = build_document(("I am the sentence you like",))
        summarizer = LsaSummarizer()
        summarizer.stop_words = ("I", "am", "the",)

        sentences = summarizer(document, 10)
        self.assertEqual(len(sentences), 1)
        self.assertEqual(to_unicode(sentences[0]), "I am the sentence you like")
Example #7
def summarize(text):
    total = ""
    parser = PlaintextParser.from_string(text, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        total += str(sentence)
    return total
Example #8
def lsa(comment,parser,num):
	summarizer = LsaSummarizer(stemmer)
	summarizer.stop_words = get_stop_words(LANGUAGE)

	LSAstr = ''
	for sentence in summarizer(parser.document,num):
		LSAstr += str(sentence)

	return LSAstr
def summarize(filename, num_sentences):
    with open (filename, "r") as myfile:
        data=myfile.read()
    parser = PlaintextParser.from_string(data, Tokenizer('english')) 
    summarizer = LsaSummarizer(stem_word)
    summarizer.stop_words = get_stop_words("english")
    summary = ""
    for sentence in summarizer(parser.document, num_sentences):
        summary += str(sentence).encode('ascii', 'ignore').decode('ascii').replace('"', '').replace("'", '').strip() + " "
    return summary
Example #10
def retreive_sumy(url):
    # "http://en.wikipedia.org/wiki/Automatic_summarization"
    parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
    # or for plain text files
    # parser = PlaintextParser.from_file("document.txt", Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)

    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)	
    return summarizer(parser.document, SENTENCES_COUNT)
 def summary(self, int1, int2):
     # int1, int2 are the places between which to look for
     # the summary to be taken (slicing the corpus as a string)
     parser = PlaintextParser(self.corpus[int1:int2], Tokenizer("english"))
     summarizer = LsaSummarizer(stem_word)
     summarizer.stop_words = get_stop_words("english")
     self.summary_text = " ".join(
         map(lambda x:x._text,
             summarizer(parser.document, 20)))
     return self.summary_text
def summarize(content):
    parser = PlaintextParser.from_string(content.body, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    text = '\n'.join(
        [str(sentence) for sentence in summarizer(parser.document, COUNT)]
    )
    summary = Summary(content=content, summary=text)
    summary.save()
Example #13
 def summarizeText(self, body, numSentences = 10):
     """Summarizes body of text to numSentences
     """
     #parser = PlaintextParser.from_string(body, Tokenizer(self.LANG))
     parser = PlaintextParser.from_string(body, Tokenizer(self.LANG))        
     stemmer = Stemmer(self.LANG)
     summarizer = SumySummarizer(stemmer)
     summarizer.stop_words = get_stop_words(self.LANG)
     summary = ' '.join(str(sentence) for sentence in summarizer(parser.document, numSentences))
     return summary
Example #14
def summary(text):

    stemmer = Stemmer(LANGUAGE)
    parser = PlaintextParser(text, Tokenizer(LANGUAGE))
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    short = ""
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        short = short + ">" + "* " + str(sentence).encode('ascii', 'ignore').decode('ascii') + "\n\n"
        #print(sentence)
    return short
Example #15
def summarize(parser, sentences_count):
    stemmer = Stemmer(LANGUAGE)

    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    sentences = ""
    for sentence in summarizer(parser.document, sentences_count):
        sentences += " " + str(sentence)

    return sentences
Example #16
    def test_real_example(self):
        """Source: http://www.prevko.cz/dite/skutecne-pribehy-deti"""
        parser = PlaintextParser.from_string(
            load_resource("snippets/prevko.txt"),
            Tokenizer("czech")
        )
        summarizer = LsaSummarizer(Stemmer("czech"))
        summarizer.stop_words = get_stop_words("czech")

        sentences = summarizer(parser.document, 2)
        self.assertEqual(len(sentences), 2)
Example #17
 def summarizeUrl(self, url, numSentences = 10):
     """Summarizes text at a given url to numSentences
     """
     #parser = PlaintextParser.from_string(body, Tokenizer(self.LANG))
     parser = HtmlParser.from_url(url, Tokenizer(self.LANG))        
     stemmer = Stemmer(self.LANG)
     summarizer = SumySummarizer(stemmer)
     summarizer.stop_words = get_stop_words(self.LANG)
     summary = ' '.join(str(sentence) for sentence in summarizer(parser.document, numSentences))
     return summary
     
Example #18
def test_article_example():
    """Source: http://www.prevko.cz/dite/skutecne-pribehy-deti"""
    parser = PlaintextParser.from_string(
        load_resource("articles/prevko_cz_1.txt"),
        Tokenizer("czech")
    )
    summarizer = LsaSummarizer(Stemmer("czech"))
    summarizer.stop_words = get_stop_words("czech")

    sentences = summarizer(parser.document, 20)
    assert len(sentences) == 20
Example #19
def summarize(text):
    parser = PlaintextParser.from_string(text, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    result = ""

    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        result += str(sentence) + " "

    return result
Example #20
    def test_issue_5_sigma_can_multiply_matrix_v(self):
        """Source: https://github.com/miso-belica/sumy/issues/5"""
        parser = PlaintextParser.from_string(
            load_resource("articles/sigma_can_multiply_matrix_v.txt"),
            Tokenizer("english")
        )
        summarizer = LsaSummarizer(english_stemmer)
        summarizer.stop_words = get_stop_words("english")

        sentences = summarizer(parser.document, 20)
        self.assertEqual(len(sentences), 20)
Example #21
    def lsa(self,text_parser):
        assert isinstance(text_parser,plaintext.PlaintextParser)

        #process the text
        summarizer=LSA()
        #EnglishStemmer())
        #summarizer.stop_words=stopwords.words("english")

        #we have to specify stop words
        summarizer.stop_words=get_stop_words(settings.SUMMARIZER_LANGUAGE)
        return summarizer(text_parser.document,settings.SUMMARIZER_TOP_X_SENTENCES)
Example #22
    def test_document(self):
        document = build_document(
            ("I am the sentence you like", "Do you like me too",),
            ("This sentence is better than that above", "Are you kidding me",)
        )
        summarizer = LsaSummarizer()
        summarizer.stop_words = ("I", "am", "the", "you", "are", "me", "is", "than", "that", "this",)

        sentences = summarizer(document, 2)
        self.assertEqual(len(sentences), 2)
        self.assertEqual(to_unicode(sentences[0]), "I am the sentence you like")
        self.assertEqual(to_unicode(sentences[1]), "This sentence is better than that above")
Example #23
def get_summary(text, max_sentences=5):
	parser = PlaintextParser.from_string(text, Tokenizer("english"))
	stemmer = Stemmer("english")

	summarizer = Summarizer(stemmer)
	summarizer.stop_words = get_stop_words("english")

	summary = []
	for sentence in summarizer(parser.document, max_sentences):  # sentence count capped at max_sentences
		summary.append(sentence._text.encode('ascii', 'ignore').decode('ascii'))

	return summary
Example #24
def lsaReferenceSummary(path):	
	sentencesList=[]
	parser = PlaintextParser.from_file(path, Tokenizer(LANGUAGE))
	stemmer = Stemmer(LANGUAGE)
	summarizer = LsaSummarizer(stemmer)
	summarizer.stop_words = get_stop_words(LANGUAGE)
	

	for sentence in summarizer(parser.document, SENTENCES_COUNT):
		#print(sentence._text)
		sentencesList.append(sentence._text)

	return sentencesList
Example #25
    def test_dictionary_without_stop_words(self):
        summarizer = LsaSummarizer()
        summarizer.stop_words = ["stop", "Halt", "SHUT", "HmMm"]

        document = build_document(
            ("stop halt shut hmmm", "Stop Halt Shut Hmmm",),
            ("StOp HaLt ShUt HmMm", "STOP HALT SHUT HMMM",),
            ("Some relevant sentence", "Some moRe releVant sentEnce",),
        )

        expected = frozenset(["some", "more", "relevant", "sentence"])
        dictionary = summarizer._create_dictionary(document)
        self.assertEqual(expected, frozenset(dictionary.keys()))
Example #26
def test_issue_5_svd_converges():
    """Source: https://github.com/miso-belica/sumy/issues/5"""
    pytest.skip("Can't reproduce the issue.")

    parser = PlaintextParser.from_string(
        load_resource("articles/svd_converges.txt"),
        Tokenizer("english")
    )
    summarizer = LsaSummarizer(Stemmer("english"))
    summarizer.stop_words = get_stop_words("english")

    sentences = summarizer(parser.document, 20)
    assert len(sentences) == 20
def sum_spark(doc):

    parser = PlaintextParser.from_string(doc,Tokenizer('english'))

    summarizer = Summarizer(Stemmer('english'))
    summarizer.stop_words = stop_books
    
    texts=[]

    for sentence in summarizer(parser.document, 2):
        texts.append(str(sentence))

    return texts
Example #28
    def getText(self, sentence_count=None):
        if sentence_count:
            self.SENTENCE_COUNT = sentence_count
        parser = HtmlParser.from_url(self.url, Tokenizer(self.LANGUAGE))
        # or for plain text files
        # parser = PlaintextParser.from_file("document.txt", Tokenizer(LANGUAGE))
        stemmer = Stemmer(self.LANGUAGE)
        summarizer = Summarizer(stemmer)
        summarizer.stop_words = get_stop_words(self.LANGUAGE)
        text_list = []

        for sentence  in summarizer(parser.document, self.SENTENCE_COUNT):
            text_list.append(str(sentence))
        return "\n".join(text_list)
Example #29
def extract_titles (reviews):

    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    for item_id, review in reviews.items():
        print("Review: {}".format(review))
        print("\n")
        #sentences = re.split(r' *[\.\?!][\'"\)\]]* *', review)

        for sentence in summarizer(build_document_from_string(review), SENTENCES_COUNT):
            print(sentence)
        print("\n")
Example #30
def summarize_text(textbody):
    parser = PlaintextParser.from_string(textbody, Tokenizer(LANG))
    stemmer = Stemmer(LANG)

    summarizer = LsaSummarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANG)

    summary = summarizer(parser.document, SENTENCE_COUNT)

    summarized_text = ''
    for sentence in summary:
        summarized_text += str(sentence) + ' '

    return summarized_text
def update_db(stored_result,db,query):
	result = resource.list(q= query, cx = search_engine_id).execute()
	query_json= stored_result[0]
	stored_sources = []
	for news in query_json["News"]:
		news_dict = news[-1]
		url = news_dict["source"]
		response = requests.get(news_dict["source"])
		stored_sources.append(news_dict["source"])
		if 'Last-Modified' in response.headers:
			if time.strptime(response.headers['Last-Modified'],"%a, %d %b %Y %H:%M:%S %Z") > time.strptime(news_dict['last_modified'],"%a, %d %b %Y %H:%M:%S %Z"):
				current_result = {}
				parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
				stemmer = Stemmer(language=LANGUAGE)
				summarizer = Summarizer(stemmer)
				summarizer.stop_words = get_stop_words(LANGUAGE)
				summary = summarizer(parser.document, 5)
				summary = '\n'.join([line._text for line in summary])
				current_result['content'] = []
				current_result['content'].append(summary)
				current_result['source'] = news_dict["source"]
				current_result['last_modified'] = response.headers['Last-Modified']
				news.append(current_result)

		"""else:
			print(url)
			stored_content = news_dict["content"]
			parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
			stemmer = Stemmer(language=LANGUAGE)
			summarizer = Summarizer(stemmer)
			summarizer.stop_words = get_stop_words(LANGUAGE)
			summary = summarizer(parser.document, 5)
			summary = '\n'.join([line._text for line in summary])
			if stored_content[0] != summary:
				current_result = {}
				current_result['content'] = []
				current_result['content'].append(summary)
				current_result['source'] = news_dict["source"]
				current_result['last_modified'] = time.strftime("%a, %d %b %Y %H:%M:%S %Z", time.gmtime())
				news.append(current_result)"""

	for item in result['items']:
		try:
			if item['link'] not in stored_sources:
				url = item['link']
				if 'pdf' in url or 'xml.gz' in url:
					continue
				current_result = {}
				current_result['source'] = url
				current_result['content'] = []

				response = requests.get(url)
				parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
				stemmer = Stemmer(language=LANGUAGE)
				summarizer = Summarizer(stemmer)
				summarizer.stop_words = get_stop_words(LANGUAGE)
				summary = summarizer(parser.document, 5)
				summary = '\n'.join([line._text for line in summary])						
				current_result['content'].append(summary)
				if 'Last-Modified' in response.headers:
					current_result['last_modified'] = response.headers['Last-Modified']
				else:
					current_result['last_modified'] = time.strftime("%a, %d %b %Y %H:%M:%S %Z", time.gmtime())
				query_json['News'].append([current_result])


		except (urllib.error.HTTPError, TypeError, AttributeError, requests.exceptions.SSLError):
			current_result['content'] = ["No results available"]
			continue


		db["news"].save(query_json)
Example #32
#USING LSA
#Based on term-frequency techniques with singular value decomposition to summarize texts
#(see the numpy sketch after this example).
from sumy.parsers.plaintext import PlaintextParser #We're choosing a plaintext parser here, other parsers available for HTML etc.
from sumy.nlp.tokenizers import Tokenizer 
from sumy.summarizers.lsa import LsaSummarizer

file = "plain_text.txt" #name of the plain-text file
parser = PlaintextParser.from_file(file, Tokenizer("english"))
summarizer_lsa = LsaSummarizer()
summary_2 =summarizer_lsa(parser.document,2)
for sentence in summary_2:
    print(sentence)
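The comment at the top of Example #32 says LSA summarization rests on a term-frequency matrix plus singular value decomposition. Below is a minimal numpy sketch of that idea, ranking sentences by their weight on the dominant latent topic; it only illustrates the technique and is not sumy's LsaSummarizer (the sentences and the regex tokenization are made up for the example).

import re
import numpy as np

sentences = [
    "The cat sat on the mat.",
    "Dogs and cats are common pets.",
    "The stock market fell sharply today.",
]

# term-sentence matrix A: rows = terms, columns = sentences (raw term frequency)
tokenized = [re.findall(r"\w+", s.lower()) for s in sentences]
vocab = sorted({w for sent in tokenized for w in sent})
A = np.zeros((len(vocab), len(sentences)))
for j, sent in enumerate(tokenized):
    for w in sent:
        A[vocab.index(w), j] += 1

# SVD: A = U @ diag(S) @ Vt; row k of Vt measures how strongly each sentence
# expresses latent topic k
U, S, Vt = np.linalg.svd(A, full_matrices=False)

# pick the sentence carrying the most weight on the dominant topic
best = int(np.argmax(np.abs(Vt[0])))
print(sentences[best])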
Example #33
from __future__ import absolute_import
from __future__ import division, print_function, unicode_literals

from sumy.parsers.html import HtmlParser
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer as Summarizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words

LANGUAGE = "english"
SENTENCES_COUNT = 10

if __name__ == "__main__":
    url = "https://en.wikipedia.org/wiki/Automatic_summarization"
    parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
    # or for plain text files
    # parser = PlaintextParser.from_file("document.txt", Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)

    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        print(sentence)
Example #34
def clicked():
    file = open('testfile.txt', 'a')
    #website to text file as testfile.txt
    html = requests.get(url1.get()).content
    #1 Recoding
    unicode_str = html.decode("utf8")
    encoded_str = unicode_str.encode("ascii", 'ignore')
    news_soup = BeautifulSoup(encoded_str, "html.parser")
    title = news_soup.find_all('h1')
    z = [re.sub(r'<.+?>', r'', str(b)) for b in title]
    s1 = ''.join(z) + '.' + '\n'
    file.write(s1)

    #finding the summary of text file and again store it into testfile.txt
    LANGUAGE = "english"
    SENTENCES_COUNT = 10

    if __name__ == "__main__":

        url = url1.get()

        parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
        # or for plain text files
        # parser = PlaintextParser.from_file("document.txt", Tokenizer(LANGUAGE))

        print("--LuhnSummarizer--")
        summarizer = LuhnSummarizer()
        summarizer = LsaSummarizer(Stemmer(LANGUAGE))
        summarizer.stop_words = (
            "I",
            "am",
            "the",
            "you",
            "are",
            "me",
            "is",
            "than",
            "that",
            "this",
        )
        for sentence in summarizer(parser.document, SENTENCES_COUNT):
            str1 = str(sentence)
            file.write(str1)
        file.close()

    #open the text file and divide it into 8 parts as 0.txt to 7.txt
    str1 = open('testfile.txt', 'r').read()
    #print(str1)
    l = str1.split(".")
    i = len(l)
    for j in range(8):
        file = open('text/' + str(j) + '.txt', 'a')
        s0 = ''.join(l[j])
        file.write(s0)

    def _patch_faulty_function(self):
        if self.token_key is not None:
            return self.token_key
        timestamp = calendar.timegm(time.gmtime())
        hours = int(math.floor(timestamp / 3600))

        response = requests.get("https://translate.google.com/")
        line = response.text.split('\n')[-1]
        parsed = re.search(r"(?:TKK='(?:(\d+)\.(\d+))';)", line)
        a, b = parsed.groups()
        result = str(hours) + "." + str(int(a) + int(b))
        self.token_key = result
        return result

    # Monkey patch faulty function.
    Token._get_token_key = _patch_faulty_function

    # Then call it normally.
    #with open('testfile.txt', 'r') as myfile:
    #   data=myfile.readlines()

    for k in range(8):
        str1 = open('text/' + str(k) + '.txt', 'r').read()
        #print(str1)
        #str1 = "my name is khan"
        if (len(str1) != 0):
            tts = gTTS(str1)
            tts.save('voice/' + str(k) + '.mp3')

    keyword = open('text/0.txt', 'r').read()
    #print(keyword)
    st = 'googleimagesdownload --keywords "' + keyword + '" --limit 8'

    os.system(st)
    os.system("D:/VideoBeta/VideoBeta.exe")
Example #35
def get_summary(textss, truereq, numofsent):
    output_sentences = []
    hold = ''
    truecount = 0
    store = ''
    store = keywords(
        textss, ratio=0.05)  #extracting the most relevant words from full text
    store1 = str(store)
    holdfirst = nltk.word_tokenize(
        store1)  #storing the tokenized string (keywords) to remove punctuation
    parser = PlaintextParser.from_string(
        textss, Tokenizer(LANGUAGE))  #storing the full text into an object
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    sentencess = []
    compare = []
    TEMP_FOLDER = tempfile.gettempdir()
    documents = sent_tokenize(textss)  #storing sentences of full text
    summalen = len(documents)  #storing the number of sentences
    stoplist = set('for a of the and to in'.split())

    for sentence in summarizer(parser.document, numofsent):
        hold = str(sentence)
        ttt = nltk.word_tokenize(hold)
        count = 0
        for i in range(0, len(ttt)):
            for j in range(0, len(holdfirst)):
                if ttt[i] == holdfirst[j]:
                    count += 1
        compare.append(count)
        sentencess.append(str(sentence))

    texts = [
        [word for word in document.lower().split() if word not in stoplist]
        for document in documents
    ]  #storing an array of sentences where each sentence is a list of words without stopwords
    frequency = defaultdict(
        int
    )  #storing a subclass that calls a factory function to supply missing values

    for text in texts:
        for token in text:
            frequency[token] += 1

    texts = [[token for token in text if frequency[token] > 1]
             for text in texts
             ]  #storing an array of words that occur more than once

    dictionary = corpora.Dictionary(texts)  #storing a map of words
    dictionary.save(os.path.join(TEMP_FOLDER, 'deerwester.dict'))
    new_doc = str(textss.encode(
        'utf-8'))  #storing the utf-8 version of textss (original)
    new_vec = dictionary.doc2bow(
        new_doc.lower().split()
    )  #converting the utf-8 econded textss into a bag-of-words format = list of (token_id, token_count) 2-tuples. Each word is assumed to be a tokenized and normalized string (either unicode or utf8-encoded).

    corpus = [
        dictionary.doc2bow(text) for text in texts
    ]  #applying doc2bow to texts(list of  words that occur more than once) save into an array
    corpora.MmCorpus.serialize(os.path.join(TEMP_FOLDER, 'deerwester.mm'),
                               corpus)
    dictionary = corpora.Dictionary.load(
        os.path.join(TEMP_FOLDER, 'deerwester.dict'))
    corpus = corpora.MmCorpus(os.path.join(TEMP_FOLDER, 'deerwester.mm'))
    lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=2)
    doc = str(textss.encode('utf-8'))
    vec_bow = dictionary.doc2bow(doc.lower().split())
    vec_lsi = lsi[vec_bow]  #converting the query to LSI space
    index = similarities.MatrixSimilarity(lsi[corpus])
    index.save(os.path.join(TEMP_FOLDER, 'deerwester.index'))
    index = similarities.MatrixSimilarity.load(
        os.path.join(TEMP_FOLDER, 'deerwester.index'))
    sims = index[vec_lsi]
    sims = sorted(enumerate(sims), key=lambda item: -item[1])
    newlist = []

    for i in range(0, summalen):
        newlist.append(documents[sims[i][0]])
        if i == 4:
            break

    for sentencez in newlist:
        hold = str(sentencez)
        ttt = nltk.word_tokenize(hold)
        count = 0

        for i in range(0, len(ttt)):
            for j in range(0, len(holdfirst)):
                if ttt[i] == holdfirst[j]:
                    count += 1
        compare.append(count)
        sentencess.append(str(sentencez))
    i = 0
    while i < truereq:
        holdsubs = []
        indexes = compare.index(max(compare))
        doc1 = nlp(u'%s' % str(sentencess[indexes]))
        parse = doc1
        for word in parse:
            if word.dep_ == 'nsubj':
                holdsubs.append(word.text.lower())
        if holdsubs:
            if holdsubs[0] != 'they' and holdsubs[0] != 'their' and holdsubs[
                    0] != 'both' and holdsubs[0] != 'these' and holdsubs[
                        0] != 'this':
                countcomma = str(sentencess[indexes]).count(',')
                if countcomma < 7:
                    output_sentences.append(sentencess[indexes])
                    i += 1
        del sentencess[indexes]
        del compare[indexes]
    return output_sentences
Example #36
# ## Package sumy

# In[8]:

import sumy
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer as Summarizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words

# In[9]:

parser = PlaintextParser.from_string(rawdata.news[1], Tokenizer("english"))
stemmer = Stemmer("english")
summarizer = Summarizer(stemmer)
summarizer.stop_words = get_stop_words("english")

for sentence in summarizer(parser.document, 6):
    print(sentence)

# In[10]:

testchinese = '温客行一眼就看出周子舒使用的是四季山庄的流云九宫步,狠狠教训了顾湘一顿,就带她离开了。张成岭看出周子舒有一身好武功,只是深藏不露,就主动过来和周子舒寒暄,还给他一块名帖,让他有事去镜湖山庄,张成岭着急给母亲买点心,就先行离开了。周子舒听到孩子们在唱那首五湖盟争夺武林盟主以及琉璃甲的歌谣,不禁感慨江湖的风云多变。周子舒叫醒岸边的摆渡船夫,他要乘船去镜湖山庄,摆渡船夫趁机狮子大开口,周子舒也不还价,摆渡船夫看他一副病恹恹的模样,不忍心敲诈他,温客行带顾湘及时赶来,主动提出送周子舒去镜湖山庄,摆渡船夫不依不饶,拉起周子舒就上船离开了。周子舒远远就发现镜湖山庄犹如人间仙境,他迫不及待赶过去,下船就忘了付钱,遭到摆渡船夫劈头盖脸一顿臭骂,周子舒索性就坐一次霸王船。周子舒施展轻功,很快就进入镜湖山庄的桃林,他沉醉于花香之中,温客行突然从背后偷袭,周子舒只能迎战,两个人交手几个回合,温客行对周子舒心生佩服,请他喝酒小叙,周子舒断然拒绝。周子舒来到镜湖山庄,从管家口中得知镜湖派掌门张玉森久不闻江湖事,他有三个儿子张成峰,张成峦和张成岭,也不许他们掺和江湖门派之争,管家把周子舒安顿到柴房,子时的时候,三秋钉又准时开始发作,周子舒感觉浑身疼痛难忍,只能发动全部功力为自己疗伤,突然听到外面人声嘈杂。周子舒打开门发现镜湖山庄已经变成一片火海,他飞身上屋顶观察,发现带着鬼面具的人在镜湖山庄大肆烧杀抢掠,怀疑是鬼谷的人所为,他立刻下去救人,张玉森,张成峦和张成峰父子三人被抓走,镜湖山庄的人几乎全部被杀,尸横遍野。摆渡船夫保护着张成岭想逃走,被鬼谷的人追杀,周子舒出手相救,掩护着他们俩乘船离开,远远看到温客行坐在华亭伤看热闹。周子舒把摆渡船夫和张成岭带到一间破庙,摆渡船夫说明张玉森曾经救过他的命,他在镜湖山庄门前摆渡三年,就是想等有朝一日报恩,摆渡船夫让张成岭去太湖找三白大侠,张成岭坚决不走。外面阴风阵阵,一群带鬼面具的人冲进来,一个自称吊死鬼的人叫嚣着进来抓张成岭,周子舒因为体力耗尽要静养半个时辰,摆渡船夫和吊死鬼战在一处,他渐渐体力不支被打翻在地,吊死鬼要杀了周子舒,张成岭拼命保护他,顾湘及时赶来,她和黑白无常大打出手,吊死鬼想杀张成岭,摆渡船夫奋不顾身护住他,被打成重伤。顾湘被恶鬼们团团包围,周子舒挣扎着跳起来为顾湘解围,把恶鬼们全部打跑,他因体力不支差点晕倒,温客行赶来抱住周子舒。摆渡船夫因为失血过多奄奄一息,温客行用内力帮他维持,船夫拜托周子舒把张成岭交给五湖盟的赵敬,还让张成岭当场给周子舒跪下磕头,周子舒满口答应,摆渡船夫说完这些话就咽气了。周子舒帮张成岭把摆渡船夫埋葬,张成岭累得精疲力尽,周子舒打算休息一夜再上路,温客行让顾湘生火,把干粮烤了给周子舒和张成岭,周子舒借口不饿不想吃,顾湘对他冷嘲热讽,张成岭也不吃顾湘的干粮,遭到顾湘的训斥,谴责他不知道报恩,张成岭连连向她赔礼道歉。温客行发现张成岭身受重伤,主动提出帮他医治,周子舒坚决不同意,两个人一言不合就大打出手。'
parser = PlaintextParser.from_string(testchinese, Tokenizer("chinese"))
stemmer = Stemmer("chinese")
summarizer = Summarizer(stemmer)
summarizer.stop_words = get_stop_words("chinese")

for sentence in summarizer(parser.document, 6):
    print(sentence)
def get_update():

    try: 
        from googlesearch import search 
    except ImportError:  
        print("No module named 'google' found") 
      
    # to search 
    query = "covid-19 google scholar" #google scholer,  Czech Republic

    update ={}
      
    for url in search(query, tld="co.in", num=10, stop=2, pause=2): 
        
        print(url)        
        web_response = requests.get(url) 
  
        # building 
        element_tree = lxml.html.fromstring(web_response.text) 
          
        tree_title_element = element_tree.xpath('//title')[0] 
          
        #print("Tag title : ", tree_title_element.tag) 
        print("\nText title :", tree_title_element.text_content()) 

        print("\n")
        
        #print("\nhtml title :", lxml.html.tostring(tree_title_element)) 
        #print("\ntitle tag:", tree_title_element.tag) 
        #print("\nParent's tag title:", tree_title_element.getparent().tag) 



        
        #url = "https://academic.oup.com/clinchem/advance-article/doi/10.1093/clinchem/hvaa029/5719336"
        parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
        # or for plain text files
        # parser = PlaintextParser.from_file("document.txt", Tokenizer(LANGUAGE))
        stemmer = Stemmer(LANGUAGE)

        summarizer = Summarizer(stemmer)
        # print(summarizer._text)
        summarizer.stop_words = get_stop_words(LANGUAGE)


        sentence_list = []

        for sentence in summarizer(parser.document, SENTENCES_COUNT):
            #print(dir(sentence))
            # print(sentence._text)
            sentence_list.append(sentence._text)
        sentences = (" ".join(sentence_list))
        update[tree_title_element.text_content()] = sentences
        # print("\n")    
        # for i in range(0,len(sentence_list),1):    
        #   time.sleep(2)
        #   translations = translator.translate([sentence_list[i]], dest='bn')    
        #   for translation in translations:
        #     print(translation.text)
        #   #print(translation.origin, ' -> ', translation.text)

        # print("\n")

        # translations = []
        # for sentence in sentence_list:
        #   translations.append(translator.translate(,dest='bn'))
        
        # print(translations)
    return update
 def __init__(self):
   self.lsa_summarizer = LsaSummarizer(stemmer)
   self.lex_rank_summarizer = LexRankSummarizer(stemmer)
   self.lsa_summarizer.stop_words = get_stop_words(LANGUAGE)
   self.lex_rank_summarizer.stop_words = get_stop_words(LANGUAGE)
   self.email_text_parser = SbEmailTextParser()
Example #39
def mySumD():
	if request.form['action'] == 'LSA':
		para = ""
		url = request.form['url_link']
		parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
		stemmer = Stemmer(LANGUAGE)
		summarizer = Summarizer(stemmer)
		summarizer.stop_words = get_stop_words(LANGUAGE)

		for sentence in summarizer(parser.document, SENTENCES_COUNT):
			data = str(sentence)
			para += data 
		return render_template('dependent.html', para = para)
	
	elif request.form['action'] == 'Luhn':
		para = ""
		url = request.form['url_link']
		parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
		stemmer = Stemmer(LANGUAGE)
		summarizer = LuhnSummarizer(stemmer)
		summarizer.stop_words = get_stop_words(LANGUAGE)

		for sentence in summarizer(parser.document, SENTENCES_COUNT):
			data = str(sentence)
			para += data 
		return render_template('dependent.html', para = para)
	
	elif request.form['action'] == 'LexRank':
		para = ""
		url = request.form['url_link']
		parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
		stemmer = Stemmer(LANGUAGE)
		summarizer = LexSummarizer(stemmer)
		summarizer.stop_words = get_stop_words(LANGUAGE)

		for sentence in summarizer(parser.document, SENTENCES_COUNT):
			data = str(sentence)
			para += data 
		return render_template('dependent.html', para = para)
	
	elif request.form['action'] == 'TextRank':
		para = ""
		url = request.form['url_link']
		parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))

		stemmer = Stemmer(LANGUAGE)
		summarizer = TextSummarizer(stemmer)
		summarizer.stop_words = get_stop_words(LANGUAGE)

		for sentence in summarizer(parser.document, SENTENCES_COUNT):
			data = str(sentence)
			para += data 
		return render_template('dependent.html', para = para)
	
	elif request.form['action'] == 'SumBasic':
		para = ""
		url = request.form['url_link']
		parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))

		stemmer = Stemmer(LANGUAGE)
		summarizer = SumSummarizer(stemmer)
		summarizer.stop_words = get_stop_words(LANGUAGE)

		for sentence in summarizer(parser.document, SENTENCES_COUNT):
			data = str(sentence)
			para += data 
		return render_template('dependent.html', para = para)
	
	else:
		para = ""
		request.form['action'] == 'KL-Sum'
		url = request.form['url_link']
		parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))

		stemmer = Stemmer(LANGUAGE)
		summarizer = KLSummarizer(stemmer)
		summarizer.stop_words = get_stop_words(LANGUAGE)

		for sentence in summarizer(parser.document, SENTENCES_COUNT):
			data = str(sentence)
			para += data 
		return render_template('dependent.html', para = para)
Example #40
def test_empty_document():
    document = build_document()
    summarizer = LsaSummarizer()

    sentences = summarizer(document, 10)
    assert len(sentences) == 0
Example #41
    def summarize4(self, df):
        #http://ai.intelligentonlinetools.com/ml/text-summarization/
        LANGUAGE = "english"
        SENTENCES_COUNT = 10
        stopwords = nltk.corpus.stopwords.words('english')
        for row in df['conclusion']:
            if row == '0' or row == '':
                continue
            parser = PlaintextParser(row, Tokenizer(LANGUAGE))
            print("--LsaSummarizer--")
            summarizer = LsaSummarizer()
            summarizer = LsaSummarizer(Stemmer(LANGUAGE))
            summarizer.stop_words = get_stop_words(LANGUAGE)
            for sentence in summarizer(parser.document, SENTENCES_COUNT):
                print(sentence)

            print("--LuhnSummarizer--")
            summarizer = LuhnSummarizer()

            summarizer.stop_words = stopwords
            for sentence in summarizer(parser.document, SENTENCES_COUNT):
                print(sentence)

            print("--EdmundsonSummarizer--")
            summarizer = EdmundsonSummarizer()
            words = ("deep", "learning", "neural")
            summarizer.bonus_words = words

            words = (
                "another",
                "and",
                "some",
                "next",
            )
            summarizer.stigma_words = words

            words = (
                "another",
                "and",
                "some",
                "next",
            )
            summarizer.null_words = words
            for sentence in summarizer(parser.document, SENTENCES_COUNT):
                print(sentence)
Example #42
def summarize(url, number = 5):
  parser = HtmlParser.from_url(url, Tokenizer("english"))
  stemmer = Stemmer("english")
  summarizer = Summarizer(stemmer)
  summarizer.stop_words = get_stop_words("english")
  return " ".join(str(i) for i in summarizer(parser.document, number))
def summarize():
    response.content_type = "application/json"
    incoming = request.json
    if incoming.get('token', None) != SECRET_TOKEN:
        return

    channel = incoming.get('channel_id', None)
    if channel is None:
        return

    query = incoming.get('text', '').replace('!tldr', '').strip()
    count = 150
    if query != "":
        try:
            count = int(
                query) if int(query) > 10 and int(query) < 1000 else count
        except ValueError:
            pass

    r = requests.post(API_URL + "login",
                      data=json.dumps({
                          "username": USERNAME,
                          "password": PASSWORD
                      }),
                      headers={"Content-type": "application/json"})

    try:
        user = r.json()
    except Exception as e:
        print("BAILING OUT (login):\n{}".format(e))
        return

    userdata = user.get('data', None)
    if userdata is None:
        print("Login failed")
        return

    uid = userdata.get('userId', None)
    authToken = userdata.get('authToken', None)

    if uid is None or authToken is None:
        print("uid or token was invalid")
        return

    r = requests.get(API_URL + \
                     "channels.history?roomId={}&count={}".format(channel, count),
                    headers={"X-Auth-Token": authToken,
                             "X-User-Id": uid})

    try:
        history = r.json()
    except Exception as e:
        print("BAILING OUT (history):\n{}".format(e))
        return

    last = history['messages'][1]  # 0 is !tldr
    if last.get('urls', []) != []:
        summaries = []
        for url in last['urls']:
            parser = HtmlParser.from_url(url['url'], Tokenizer(LANGUAGE))
            stemmer = Stemmer(LANGUAGE)
            summarizer = Summarizer(stemmer)
            summarizer.stop_words = get_stop_words(LANGUAGE)
            summaries.append(
                "> {}".format(" ".join([str(sentence)\
                 for sentence in summarizer(parser.document,
                                            SENTENCES_COUNT)])))
        output = json.dumps({"text": "\n--\n".join(summaries)})
        return output

    messages = ". ".join(
        [m['msg'] for m in history['messages'][::-1] \
         if m['msg'] != "" and m.get('bot', None) is None])

    parser = PlaintextParser.from_string(messages, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    return json.dumps({
        "text":
        "\n--\n".join([
            "> {}".format(str(sentence))
            for sentence in summarizer(parser.document, SENTENCES_COUNT)
        ])
    })
Example #44
0
def lsaer(text, count):
	parser = PlaintextParser.from_string(text, Tokenizer("english"))
	summarizer_lsa = LsaSummarizer()
	summary_2 =summarizer_lsa(parser.document, count)

	return summary_2
Example #45
from sumy.summarizers.luhn import LuhnSummarizer
from sumy.summarizers.lsa import LsaSummarizer
from sumy.summarizers.lex_rank import LexRankSummarizer
from sumy.summarizers.text_rank import TextRankSummarizer
from sumy.summarizers.sum_basic import SumBasicSummarizer
from sumy.summarizers.kl import KLSummarizer
from sumy.nlp.tokenizers import Tokenizer
import sys


def leadSummariser(document, no_of_sents):
    for sent in document.sentences[:no_of_sents]:
        yield str(sent)


summarisers = {
    "lead": leadSummariser,
    "luhn": LuhnSummarizer(),
    "lsa": LsaSummarizer(),
    "lex_rank": LexRankSummarizer(),
    "text_rank": TextRankSummarizer(),
    "sum_basic": SumBasicSummarizer(),
    "kl": KLSummarizer()
}

tokenizer = Tokenizer("english")


def to_words(text):
    return text.split(" ")


def extractive(article, title=None):
    raw = article.replace(' <sb>', '').strip()
Example #46
def find_summary_lsa():
    p = PlaintextParser.from_file("testtext.txt", Tokenizer("english"))
    sumlsa = LsaSummarizer()
    su = sumlsa(p.document, 2)
    for s in su:
        print(s)
Example #47
from modules.sql import dBAdapter
from modules.pre import create_corpus as c
from nltk import sent_tokenize

n_documents = 4

#----------------------------------------------------------------------------
print("Getting body subtitles from the database started ...")
dbAdapter = dBAdapter.Database()
dbAdapter.open()
dic_subtitles = dict(dbAdapter.selectDic_subtitles_limit(n_documents))
dbAdapter.close()
print("finalizada consulta")

string = sent_tokenize(list(dic_subtitles.values())[0])

from sumy.parsers.plaintext import PlaintextParser
#for tokenization
from sumy.nlp.tokenizers import Tokenizer

parser = PlaintextParser.from_string(
    list(dic_subtitles.values())[0], Tokenizer("spanish"))

from sumy.summarizers.lsa import LsaSummarizer
summarizer_2 = LsaSummarizer()
summary_2 = summarizer_2(parser.document, 10)
summ_list = []
for sentence in summary_2:
    summ_list.append(sentence._text)
summ_text = " ".join(summ_list)
def sumySummarize(filename, language="english", num_sents=1):
    """
    Luhn's algorithm is the most basic:
    1. Ignore stopwords.
    2. Determine top words: count the most frequently occurring words in the document.
    3. Select top words: a small number of the top words are kept for scoring.
    4. Select top sentences: sentences are scored by how many of the top words they
       contain, and the top N sentences are selected for the summary.

    SumBasic uses a simple concept:
    1. Get word probabilities: p(wi) = ni/N (ni = count of word wi, N = total number of words).
    2. Score each sentence: sj = sum_{wi in sj} p(wi)/|wi| (|wi| = number of times wi occurs in sj).
    3. Choose the sentence sj with the highest score.
    4. Update pnew(wi) = pold(wi)^2 for the words in the chosen sentence (so the probability of
       picking the same words again goes down).
    5. Repeat until the desired number of sentences is reached.
    (A minimal standalone sketch of this scoring loop follows this example.)

    The KL summarizer solves arg min_{S} KL(PD || PS) s.t. len(S) <= # sentences, where
        KL = Kullback-Leibler divergence = sum_{w} PD(w) log(PD(w)/PS(w))
        PD = unigram word distribution of the entire document
        PS = unigram word distribution of the summary (the optimization variable)

    LexRank and TextRank use a PageRank-style algorithm:
    1. Treat each sentence as a node in a graph.
    2. Connect all sentences to get a complete graph (a clique, basically).
    3. Compute the similarity between si and sj to get the weight Mij of the edge connecting i and j.
    4. Solve the eigenvalue problem Mp = p for the similarity matrix M.
    5. L = 0.15 + 0.85*Mp gives the final score for each sentence; pick the top sentences.
    LexRank uses a tf-idf modified cosine similarity for M; TextRank uses a different similarity metric.

    LSA uses an SVD-based approach:
    1. Build the term-sentence matrix A (rows are terms, columns are sentences), normalized with
       term frequency (tf) only.
    2. Compute the SVD: A = USV' (A = m x n, U = m x n, S = n x n, V = n x n).
    The SVD derives the latent semantic structure of the sentences: the k-dimensional sub-space
    captures the key k topics of the text, i.e. a mapping from n dimensions down to k.
    If a word-combination pattern is salient and recurring in the document, it is captured and
    represented by one of the singular vectors, and the magnitude of the corresponding singular
    value indicates how important that pattern is within the document. Any sentence containing
    the pattern is projected along that singular vector, and the sentence that best represents
    the pattern has the largest index value along it. Since each word-combination pattern
    describes a topic/concept in the document, each singular vector can be read as a salient
    topic, with the magnitude of its singular value giving the topic's importance.
    Summarization can therefore be based on matrix V, which describes the importance of each
    topic in each sentence: the k'th chosen sentence is the one with the largest index value in
    the k'th right singular vector of V. An extension of this uses SV' as the per-sentence score.
    """
    from sumy.parsers.plaintext import PlaintextParser
    from sumy.nlp.tokenizers import Tokenizer
    from sumy.nlp.stemmers import Stemmer
    from sumy.utils import get_stop_words
    from sumy.summarizers.luhn import LuhnSummarizer
    from sumy.summarizers.lsa import LsaSummarizer
    from sumy.summarizers.text_rank import TextRankSummarizer
    from sumy.summarizers.lex_rank import LexRankSummarizer
    from sumy.summarizers.sum_basic import SumBasicSummarizer
    from sumy.summarizers.kl import KLSummarizer

    parser = PlaintextParser.from_file(filename, Tokenizer(language))

    def getSummary(sumyAlgorithm):
        sumyAlgorithm.stop_words = get_stop_words(language)
        summary = sumyAlgorithm(parser.document, num_sents)
        sents = " ".join([str(sentence) for sentence in summary])
        return sents

    stemmer = Stemmer(language)

    summaries = {}
    summaries['Luhn'] = getSummary(LuhnSummarizer(stemmer))
    summaries['LSA'] = getSummary(LsaSummarizer(stemmer))
    summaries['TextRank'] = getSummary(TextRankSummarizer(stemmer))
    summaries['LexRank'] = getSummary(LexRankSummarizer(stemmer))
    summaries['SumBasic'] = getSummary(SumBasicSummarizer(stemmer))
    summaries['KL'] = getSummary(KLSummarizer(stemmer))

    print("")
    print("####### From Sumy #######")
    print(summaries)
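The SumBasic sketch promised in the docstring above: a minimal, self-contained rendering of that scoring loop in plain Python, using naive regex sentence splitting and tokenization instead of sumy's Tokenizer. It only illustrates the algorithm and is not sumy's SumBasicSummarizer; the sample text is made up.

import re
from collections import Counter

def sumbasic(text, n_sentences=2):
    """Toy SumBasic: repeatedly pick the sentence with the highest average
    word probability, then square the probabilities of the words it used."""
    sentences = [s.strip() for s in re.split(r'(?<=[.!?])\s+', text) if s.strip()]
    tokenized = [re.findall(r'\w+', s.lower()) for s in sentences]
    words = [w for sent in tokenized for w in sent]
    prob = {w: c / len(words) for w, c in Counter(words).items()}

    chosen, candidates = [], list(range(len(sentences)))
    while candidates and len(chosen) < n_sentences:
        # sentence score = average probability of its words
        scores = {i: sum(prob[w] for w in tokenized[i]) / len(tokenized[i])
                  for i in candidates if tokenized[i]}
        if not scores:
            break
        best = max(scores, key=scores.get)
        chosen.append(best)
        candidates.remove(best)
        # down-weight words already covered by the summary
        for w in tokenized[best]:
            prob[w] = prob[w] ** 2
    return [sentences[i] for i in sorted(chosen)]

print(sumbasic("Cats sleep a lot. Dogs bark a lot. Cats and dogs are pets. Fish swim.", 2))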
Example #49
SENTENCES_COUNT = 4

parser = PlaintextParser.from_file("sampleText.txt", Tokenizer(LANGUAGE))
stemmer = Stemmer(LANGUAGE)

print("\n===== Luhn =====")
summarizerLuhn = LuhnSummarizer(stemmer)
summarizerLuhn.stop_words = get_stop_words(LANGUAGE)
for sentenceLuhn in summarizerLuhn(parser.document, SENTENCES_COUNT):
    print(sentenceLuhn, "\n")

print("\n===== TextRank =====")
summarizerTR = TextRankSummarizer(stemmer)
summarizerTR.stop_words = get_stop_words(LANGUAGE)
for sentenceTR in summarizerTR(parser.document, SENTENCES_COUNT):
    print(sentenceTR, "\n")

print("\n===== LSA =====")
summarizerLSA = LsaSummarizer(stemmer)
summarizerLSA.stop_words = get_stop_words(LANGUAGE)
for sentenceLSA in summarizerLSA(parser.document, SENTENCES_COUNT):
    print(sentenceLSA, "\n")

print("\n===== Edmonson =====")
summarizerEd = EdmundsonSummarizer(stemmer)
summarizerEd.bonus_words = ('focus', 'proposed', 'method', 'describes')
summarizerEd.stigma_words = ('example',)
summarizerEd.null_words = ('literature', 'however')
for sentenceEd in summarizerEd(parser.document, SENTENCES_COUNT):
    print(sentenceEd, "\n")
rouge_scores = list()
for file in os.listdir('datafiles'):
    with codecs.open('datafiles/' + file,
                     'r',
                     encoding='utf-8',
                     errors='ignore') as f:
        parser = PlaintextParser.from_string(f.read().replace('\n', ' '),
                                             UrduTokenizer)
        objectDocModel = parser.document
        print(objectDocModel.sentences)
        print(objectDocModel.paragraphs)
        print(objectDocModel.words)
        print(objectDocModel.headings)

        stemmer = Stemmer(LANGUAGE)
        summarizer = Summarizer(stemmer)
        summarizer.stop_words = get_stop_words("Urdu")
        summ = summarizer(parser.document, SENTENCES_COUNT)
        with open('dataresults/' + file.split('.')[0] + '.txt', 'w') as fw:
            for sentence in summ:
                #print sentence
                evaluated_sentences.append(sentence)
                fw.writelines(str(sentence))
        #list of rouge scores (bigrams)
        res = rouge_1(evaluated_sentences, objectDocModel.sentences)
        rouge_scores.append(res)
        evaluated_sentences.clear()

        fw.close()
    f.close()
Example #51
def func(file1, username, wc):
    Summary = ""
    packet = BytesIO()
    packet.seek(0)

    filename = "/" + file1
    #print(user)
    config = {
        "apiKey": "AIzaSyDvTZQo3KQIWvDmMwP16ItJ_DaJEylIGrc",
        "authDomain": "fir-android-c7a0d.firebaseapp.com",
        "databaseURL": "https://fir-android-c7a0d.firebaseio.com",
        "storageBucket": "fir-android-c7a0d.appspot.com"
    }
    firebase = pyrebase.initialize_app(config)
    stor = firebase.storage()
    #os.remove("T3.pdf")
    stor.child(filename).download("T3.pdf")
    pdf_document = "T3.pdf"
    doc = fitz.open(pdf_document)
    page_Count = doc.pageCount
    for v in range(0, int(page_Count)):
        page1 = doc.loadPage(v)
        pageText = page1.getText("text")
        # Get text from StringIO
        text = pageText
        text1 = ""
        text3 = ""
        count = 0
        r = len(text)
        for i in range(1, r - 1):
            if text[i] == " " and text[i + 1] == " ":
                text.replace(text[i], "")
                count += 1

            if text[i] == '\t' or text[i] == '\n':
                text.replace(text[i], " ")
                count += 1
            r = len(text)
        t = 0
        i = 0
        j = 0
        k = 0
        flag1 = 0
        for i in range(t, len(text)):
            if text[i] == '.':
                for j in range(i + 1, len(text)):
                    if text[j] == '.':
                        text1 = text[i:j]
                        for k in text1:
                            flag1 = 0
                            if k in {':', '!', '-', '(', ')'}:
                                flag1 = 1
                                break
                        if flag1 == 1:
                            break
                        break
                if flag1 == 1:
                    continue
                else:
                    text3 = text3 + text[i:j]
            t = j
        r = 0
        for i in range(0, r - 1):
            if text3[i] == '.' and text3[i + 1] != ' ':
                text3 = text3.replace(text3[i + 1], '')
            r = len(text3)
        w_count = int(wc)
        W_Count = 0
        if w_count == 0:
            w_count = 50
        else:
            W_Count = w_count
        counters = 0
        for p in text3:
            if p == " ":
                counters += 1
        if counters < 20:
            continue
        #out=summarize(text3,ratio=(W_Count*.01))
        LANGUAGE = "english"
        SENTENCES_COUNT = W_Count
        parser = PlaintextParser.from_string(text3, Tokenizer(LANGUAGE))
        stemmer = Stemmer(LANGUAGE)
        out = ""
        summarizer = Summarizer(stemmer)
        summarizer.stop_words = get_stop_words(LANGUAGE)
        for sentence in summarizer(parser.document, SENTENCES_COUNT):
            out += str(sentence)
        if out == "":
            out = "Not enough words in this page to summarize"
        Summary = Summary + "\n\n Page No : " + str(v + 1) + "\n\n"
        Summary = Summary + " " + out
    lengther = 0
    for i in Summary:
        lengther += 1
        if i == '.':
            break
    out1 = Summary[lengther:len(Summary)]
    out = "Summary\n\n\n Page No: 1\n\n" + out1

    outfile = 'final.txt'
    with open(outfile, "w+") as filer:
        filer.write(out)
    filer.close()
    bucket = storage.bucket()

    blob = bucket.blob(str(username) + '/' + 'final.txt')
    blob.upload_from_filename(outfile)
    #os.remove(outfile)

    return out
Example #52
from sumy.summarizers.luhn import LuhnSummarizer
from sumy.summarizers.edmundson import EdmundsonSummarizer
from sumy.summarizers.lex_rank import LexRankSummarizer

LANGUAGE = "english"
SENTENCES_COUNT = 10

url="https://en.wikipedia.org/wiki/Artificial_intelligence"

parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))

summary1 = ""

print("\n\n")
print ("--LsaSummarizer--")    
summarizer = LsaSummarizer()
summarizer = LsaSummarizer(Stemmer(LANGUAGE))
summarizer.stop_words = get_stop_words(LANGUAGE)

for sentence in summarizer(parser.document, SENTENCES_COUNT):
    summary1+=str(sentence)
    summary1+=" "

with open("summarised_text.txt", "w", encoding="utf8") as myfile:
    myfile.write("\n\nLSA:\n")
    myfile.write(summary1)

summary2 = ""
print("\n\n")
print ("--LuhnSummarizer--")     
summarizer = LuhnSummarizer() 
def summarize(SENTENCES_COUNT):
    try:
        #    url = "https://machinelearningmastery.com/prepare-text-data-machine-learning-scikit-learn/"
        speak.Speak("Please Enter the U r l for summarization")

        url = easygui.textbox(
            msg='Enter url for which you want summarization:',
            title='Summarization').split()[0]
        title = getTextFromURL(url)
        #    url="https://medium.com/@gurunathrajagopal/what-happens-when-machines-are-curious-to-learn-9aed6805bf36"
        parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
        # or for plain text files
        # parser = PlaintextParser.from_file("document.txt", Tokenizer(LANGUAGE))
        stemmer = Stemmer(LANGUAGE)

        summarizer = Summarizer(stemmer)
        summarizer.stop_words = get_stop_words(LANGUAGE)
        string_dict = {}
        for idx, sentence in enumerate(
                summarizer(parser.document, SENTENCES_COUNT)):
            #        f.write(str(sentence))
            string_dict[idx] = str(sentence)
    #        print(type(sentence))
    #    print(string_dict)


#        speak.Speak("Please Enter the filename to save to summarization")

#        file_name=easygui.textbox(msg='Enter filename to save the summarization:',title='Summarization').split()[0]
#        current_dir=os.getcwd()
#    f=open(current_dir+'\\'+str(file_name)+'.txt','w')
#    f.write('Summarization')
        document = Document()

        document.add_heading('Summarization of ' + str(title), 0)
        p = document.add_paragraph(
            'Summarizing your article in crisp {} points'.format(
                SENTENCES_COUNT))

        for idx, sent in zip(string_dict.keys(), string_dict.values()):
            adding_break = p.add_run()
            adding_break.add_break()
            p = document.add_paragraph(sent)
        adding_break = p.add_run()
        adding_break.add_break()
        document.save(sumydir + '\\' + 'summarization.docx')
        speak.Speak("Summarization was saved to the following path")

        #        f.write('\n')
        #        f.write(str(idx))
        #        f.write('.  ')
        #        f.write(sent)
        #    f.close()
        easygui.msgbox(msg='Sumarized file saved in this file ' + sumydir +
                       '\\' + 'summarization.docx',
                       title='Summarization')
    except Exception as e:
        speak.Speak(
            'Sorry My mistake please provide your feedback regarding this error'
        )
        easygui.exceptionbox(str(e))
Example #54
        try:
            news_response = urlopen(news_req)
        except urllib.error.HTTPError:
            pass
        try:
            news_soup = BeautifulSoup(news_response, features='html.parser')
        except http.client.IncompleteRead:
            pass
        article = news_soup.find_all('p')
        final_result = ''
        for i in range(len(article) - 1):
            final_result += article[i].text + " "
        file_name = ticker + '-' + str(index) + '.txt'
        parser = PlaintextParser.from_string(str(final_result), Tokenizer('english'))
        stemmer = Stemmer('english')
        summarizer = Summarizer(stemmer)
        summarizer.stop_words = get_stop_words('english')
        print('writing: ' + file_name)
        f = open(file_name, "a")
        for sentence in summarizer(parser.document, 2):
            f.write(str(sentence))
        f.close()








Example #55
def summarize(request):
    """Responds to any HTTP request.
    Args:
        request (flask.Request): HTTP request object.
    Returns:
        The response text or any set of values that can be turned into a
        Response object using
        `make_response <http://flask.pocoo.org/docs/1.0/api/#flask.Flask.make_response>`.
    """
    # request_json = request.get_json()
    # if request.args and 'message' in request.args:
    #     return request.args.get('message')
    # elif request_json and 'message' in request_json:
    #     return request_json['message']
    # else:
    #     return f'Hello World!'
    try:
        if request.method == 'OPTIONS':
            # Allows GET requests from any origin with the Content-Type
            # header and caches preflight response for an 3600s
            headers = {
                'Access-Control-Allow-Origin': '*',
                'Access-Control-Allow-Methods':
                'GET, POST, PUT, PATCH, DELETE, OPTIONS',
                'Access-Control-Allow-Headers':
                'DNT,User-Agent,X-Requested-With,If-Modified-Since,Cache-Control,Content-Type,Range,Authorization',
                'Access-Control-Expose-Headers':
                'Content-Length,Content-Range',
                'Access-Control-Max-Age': '3600'
            }
            return ('', 204, headers)

        headers = {
            'Access-Control-Allow-Origin': '*',
        }
        request_json = request.get_json()
        document = request_json['value']
    except Exception:  # for local testing, e.g. running "py main.py"
        headers = None
        document = request['value']
    finally:

        parser = PlaintextParser.from_string(document, Tokenizer("english"))

        summaries = {}
        number_pool = [0, 1, 2, 3]
        random.shuffle(number_pool)
        print(number_pool)

        for i in range(len(number_pool)):
            if number_pool[i] == 0:
                summarizer = LexRankSummarizer()
            if number_pool[i] == 1:
                summarizer = LuhnSummarizer()
            if number_pool[i] == 2:
                summarizer = LsaSummarizer(Stemmer("english"))
                summarizer.stop_words = get_stop_words("english")
            if number_pool[i] == 3:
                summarizer = PureNLTKSummarizer()

            summary = summarizer(parser.document, 3)
            sum_string = []
            for sentence in summary:
                sum_string.append(str(sentence))
            summaries[f'{i}'] = " ".join(sum_string)
        if headers is None:
            return summaries
        return (summaries, 200, headers)
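The except branch above accepts a plain dict for local runs, so the function can be exercised without Flask; a minimal sketch (the __main__ guard and the sample text are additions for illustration):

if __name__ == '__main__':
    # With a plain dict there is no Flask request object, so the except
    # branch runs, headers stays None, and the bare summaries dict is returned.
    sample = {'value': 'Paste or load a long article here to compare the four summarizers.'}
    print(summarize(sample))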
Example #56
0
    return summary
    

# %%
df = pd.read_pickle('cnn_dataset_10k.pkl')

# %%
df['summary_LexRank'] = ''
df['summary_Luhn'] = ''
df['summary_LSA'] = ''

# %%
lex_summarizer = LexRankSummarizer()
luhn_summarizer = LuhnSummarizer()
lsa_summarizer = LsaSummarizer()
rouge = Rouge()

for i, r in df.iterrows():
    # print(df['text'].iloc[i])
    parser = PlaintextParser.from_string(df['text'].iloc[i], Tokenizer("english"))
    sentence_amount = 5 

    sentences = lex_summarizer(parser.document, sentence_amount) 
    df.at[i, 'summary_LexRank'] = append_summaries(sentences)  # .at avoids chained assignment (SettingWithCopyWarning)
    # print(append_summaries(sentences))
    # print(sentences)

    sentences = luhn_summarizer(parser.document, sentence_amount) 
    df.at[i, 'summary_Luhn'] = append_summaries(sentences)
    # print(append_summaries(sentences))
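The Rouge object created above is presumably meant to score these summaries against the dataset's reference column; a minimal sketch of that step, assuming a 'highlights' reference column (the column name is an assumption) and the rouge package's get_scores API:

# Score each computed summary column against the reference highlights.
# 'highlights' is an assumed column name; adjust it to the dataset's actual field.
for method in ['summary_LexRank', 'summary_Luhn']:
    scores = rouge.get_scores(df[method].tolist(), df['highlights'].tolist(), avg=True)
    print(method, scores['rouge-1']['f'], scores['rouge-2']['f'], scores['rouge-l']['f'])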
Example #57
0
	def get(self,request):
		query = request.GET['query']
		query = query.lower()
		query = re.sub(r'[^\w\s]','',query)

		response_json = {}
		fact_check = requests.get('https://factchecktools.googleapis.com/v1alpha1/claims:search',params = {'query':query,'key':api_key,'languageCode':'en-US'})
		db = client["news"]
		if len(fact_check.json()) == 0:
			response_json['Common Myths'] = [{'source':'No Results Available for this query','check':'Not Available','claim':'Not Available'}]

		else:
			claims = fact_check.json()['claims']
			ratings = [claims[i]['claimReview'][0]['textualRating'] for i in range(0,len(claims))]
			factcheck = None
			for rating in ratings:
				if rating == 'False' or 'myth' in rating or 'no evidence' in rating:
					factcheck = False
					
			if factcheck == False:
				response_json['Common Myths'] = []
				for claim in claims:
					current_result = {}
					current_result['source'] = claim['claimReview'][0]['url']
					current_result['check'] = claim['claimReview'][0]['textualRating']
					current_result['claim'] = claim['text']
					response_json['Common Myths'].append(current_result)

			else:
				response_json['Common Myths'] = [{'source':'No Results Available for this query','check':'Not Available','claim':'Not Available'}]
		stored_queries = db["news"].find({'_id':query})
		stored_result = []
		for q in stored_queries:
			stored_result.append(q)
		is_stored = None
		if len(stored_result)==0:	
			is_stored = False
		else:
			is_stored = True
		if is_stored == True:
			if request.GET['update'] == 'True':
				update_db.after_response(stored_result,db,query)
			response_json["News"] = []
			query_json= stored_result[0]
			for news in query_json["News"]:
				latest_news = news[-1]
				current_dict = {}
				current_dict["source"] = latest_news["source"]
				current_dict["content"] = latest_news["content"]
				response_json["News"].append(current_dict)
			update_faq(query)
			response_json["similar_questions"] = related_questions(query)
			response_json["summary"] = query_json["summary"]
			response_json["hit_again"] = 'True'
			return Response(response_json)

		result = resource.list(q= query, cx = search_engine_id).execute()
		if len(result) == 0 or 'items' not in result:
			response_json['News'] = [{'source':'No Results Available for this query','content':'Not Available'}]
		else:
			url = None
			extractor = extractors.ArticleExtractor()
			response_json['News'] = []
			content_summary = ''
			if is_stored == False:
				for item in result['items']:
					try:
						url = item['link']
						if 'pdf' in url or 'xml.gz' in url:
							continue
						
						if url == 'https://www.cdc.gov/coronavirus/2019-ncov/faq.html' or url=='https://www.cdc.gov/coronavirus/2019-ncov/hcp/faq.html':
							page = requests.get("https://www.cdc.gov/coronavirus/2019-ncov/faq.html")
							soup = BeautifulSoup(page.content, 'html.parser')
							page_results= soup.find_all('div',attrs={'class': 'card bar'})
							for content in page_results:
								question = content.find('span',attrs = {'role':'heading'}).contents[0]
								question = question.lower()
								question = re.sub(r'[^\w\s]','',question)
								answer = content.find('div',attrs = {'class':'card-body'}).find('p').getText()
								
								if len(answer)!=0 and is_similar(query,question,0.5):
									current_result = {}
									current_result['source'] = url
									current_result['content'] = []
									#print(question,":",answer)
									current_result['content'].append(answer)
									response_json['News'].append(current_result)
									content_summary = content_summary + answer

						else:
							response = requests.get(url)
							stemmer = Stemmer(language=LANGUAGE)
							summarizer = Summarizer(stemmer)
							summarizer.stop_words = get_stop_words(LANGUAGE)
							parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
							summary = summarizer(parser.document, 5)
							summary = '\n'.join(str(line) for line in summary)  # str() yields the sentence text via the public interface
							current_result = {}
							current_result['source'] = url
							current_result['content'] = []
							current_result['content'].append(summary)
							content_summary = content_summary + summary
							if 'Last-Modified' in response.headers:
								current_result['last_modified'] = response.headers['Last-Modified']
							else:
								current_result['last_modified'] = time.strftime("%a, %d %b %Y %H:%M:%S %Z", time.gmtime())

							response_json['News'].append(current_result)
					

					except urllib.error.HTTPError as e:
						current_result['content'] = ["No results available"]
						continue

					except TypeError:
						current_result['content'] = ["No results available"]
						continue

					except AttributeError:
						current_result['content'] = ["No results available"]
						continue
					except requests.exceptions.SSLError as e:
						current_result['content'] = ["No results available"]
						continue

				response_json['summary'] = get_summary(content_summary)
				db_json = {}
				db_json['News'] = response_json['News']
				db_json['summary'] = response_json['summary']
				for i,news in enumerate(db_json['News']):
					url = news['source']
					response = requests.get(url)
					headers = response.headers
					last_modified = None
					if 'Last-Modified' in headers:
						last_modified = headers['Last-Modified']
					else:
						last_modified = time.strftime("%a, %d %b %Y %H:%M:%S %Z", time.gmtime()) 
					db_json['News'][i]['last_modified'] = last_modified
					
				db_json['News'] = [[json] for json in db_json['News']]
				db_json['_id'] = query
				db["news"].insert_one(db_json)
				update_faq(query)
				response_json["similar_questions"] = related_questions(query)
				response_json["hit_again"] = 'False'
				return Response(response_json)
Example #58
0
    def test_empty_document(self):
        document = build_document()
        summarizer = LsaSummarizer()

        sentences = summarizer(document, 10)
        self.assertEqual(len(sentences), 0)
Example #59
0
txtSummary.write("\n\n*** LEXRANK NEGATIVE ***\n")
print("*** LEXRANK NEGATIVE ***")
for sentence in summary:
    txtSummary.write(str(sentence))
    print(sentence)

LSummarizer = LuhnSummarizer()
summary = LSummarizer(parser.document, 1)
txtSummary.write("\n\n*** LUHN NEGATIVE ***\n")
print("")
print("*** LUHN NEGATIVE ***")
for sentence in summary:
    txtSummary.write(str(sentence))
    print(sentence)

LSASummarizer = LsaSummarizer()
summary = LSASummarizer(parser.document, 1)
txtSummary.write("\n\n*** LSA NEGATIVE ***\n")
print("")
print("*** LSA NEGATIVE ***")
for sentence in summary:
    txtSummary.write(str(sentence))
    print(sentence)

LSA2Summarizer = LsaSummarizer(Stemmer("english"))
LSA2Summarizer.stop_words = get_stop_words("english")
txtSummary.write("\n\n*** LSA W/ STOP WORDS NEGATIVE ***\n")
print("")
print("*** LSA W/ STOP WORDS NEGATIVE ***")
for sentence in LSA2Summarizer(parser.document, 1):
Example #60
0
def textteaser_test():

    summary = open("summary_list.txt", "a", encoding='utf-8-sig')
    sys.stdout = summary

    # obtain the input article from url
    #url = "http://www.nytimes.com/2016/11/17/us/politics/donald-trump-administration-twitter.html?ref=politics"
    #parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))

    # obtain the input article from plain text files
    parser = PlaintextParser.from_file("input_sample.txt", Tokenizer(LANGUAGE))

    # define the language; by default it is English
    stemmer = Stemmer(LANGUAGE)

    # SumBasic algorithm
    summarizer = SumBasicSummarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    print("SumBasic:")
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        print(sentence)
    print("\n")

    # LSA algorithm
    summarizer = LsaSummarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    print("Latent Semantic Analysis:")
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        print(sentence)
    print("\n")

    # TextRank algorithm
    summarizer = TextRankSummarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    print("TextRank:")
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        print(sentence)
    print("\n")

    # LexRank algorithm
    summarizer = LexRankSummarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    print("LexRank:")
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        print(sentence)
    print("\n")

    #Featured-LexRank algorithm
    with open('input_sample.txt', 'r', encoding='utf-8-sig') as f:
        first_line = f.readline()
    title = first_line
    with open('input_sample.txt', 'r', encoding='utf-8-sig') as f:
        text = f.read()
    tt = TextTeaser()

    sentences = tt.summarize(title, text)
    file = open("tt.txt", "w", encoding='utf-8-sig')
    print("Featured-LexRank:")
    for sentence in sentences:
        file.write("%s\n" % sentence)
    file.close()

    parser = PlaintextParser.from_file("tt.txt", Tokenizer(LANGUAGE))
    summarizer = LexRankSummarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        print(sentence)
    print("\n")

    summary.close()
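Because the function above reassigns sys.stdout and never restores it, any later prints in the same process keep going to summary_list.txt; a minimal sketch of the same idea with contextlib.redirect_stdout, which restores stdout automatically (it reuses the example's LANGUAGE and SENTENCES_COUNT constants and shows only the LSA step):

import contextlib

def textteaser_test_scoped():
    # Redirect prints only for the duration of the with-block.
    with open("summary_list.txt", "a", encoding='utf-8-sig') as summary_file, \
         contextlib.redirect_stdout(summary_file):
        parser = PlaintextParser.from_file("input_sample.txt", Tokenizer(LANGUAGE))
        summarizer = LsaSummarizer(Stemmer(LANGUAGE))
        summarizer.stop_words = get_stop_words(LANGUAGE)
        print("Latent Semantic Analysis:")
        for sentence in summarizer(parser.document, SENTENCES_COUNT):
            print(sentence)
    # sys.stdout is restored to the console here.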