Example #1
def main(url, num_sentences=10, language='english'):
	parser = HtmlParser.from_url(url, Tokenizer(language))
	stemmer = Stemmer(language)
	summarizer = Summarizer(stemmer)
	summarizer.stop_words = get_stop_words(language)
	for sentence in summarizer(parser.document, num_sentences):
		print(sentence)
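This snippet omits its imports; a minimal sketch of what it appears to rely on (the same sumy modules used by the later examples, with the LSA summarizer aliased to Summarizer):

# Assumed imports for the snippet above (sumy's LSA summarizer).
from sumy.parsers.html import HtmlParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.nlp.stemmers import Stemmer
from sumy.summarizers.lsa import LsaSummarizer as Summarizer
from sumy.utils import get_stop_words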
Example #2
File: iatv.py Project: mtpain/iatv
def summarize(text, n_sentences, sep='\n'):
    '''
    Args:
        text (str or file): text itself or file in memory of text
        n_sentences (int): number of sentences to include in summary

    Kwargs:
        sep (str): separator to join summary sentences

    Returns:
        (str) n_sentences-long, automatically-produced summary of text
    '''

    if isinstance(text, str):
        parser = PlaintextParser.from_string(text, Tokenizer(LANGUAGE))
    elif hasattr(text, 'read'):  # file-like object; Python 3 has no builtin 'file' type
        parser = PlaintextParser.from_file(text, Tokenizer(LANGUAGE))
    else:
        raise TypeError('text must be either str or file')

    stemmer = Stemmer(LANGUAGE)

    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    return sep.join(str(s) for s in summarizer(parser.document, n_sentences))
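A brief usage sketch, assuming the module-level LANGUAGE constant and sumy imports that the surrounding project provides ('article.txt' and the sentence count are illustrative):

# Hypothetical call: summarize a plain-text article into three sentences.
with open('article.txt') as fh:
    print(summarize(fh.read(), 3, sep=' '))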
Example #3
def summarizeFile(inputFile):
	summarizer = LsaSummarizer(stem_word)
	summarizer.stop_words = get_stop_words("english")
	url = findURLS(inputFile)
	if url is not None:
		if url[-1] == '.':
			url = url[0:-1]
		#print (url)
		#urlContent = 'Summary from URL ['+url+']: \n'
		urlContent = ''
		try:
			parser = HtmlParser.from_url(url, Tokenizer("english"))		
			for sentence in summarizer(parser.document, 3):
				urlContent = urlContent + str(sentence) + '\n'
		except Exception:
			#print (sys.exc_info()[0])
			urlContent = ''
	content = inputFile.read()
	parser = PlaintextParser.from_string(content, Tokenizer(LANGUAGE))
	#summarizer = LsaSummarizer(stem_word)
	#summarizer.stop_words = get_stop_words(LANGUAGE)
	#summary = 'Event Summary: \n'
	summary = ''
	try:
		for sentence in summarizer(parser.document, SENTENCES_COUNT_1):
			summary = summary + str(sentence) + '\n'
	except AssertionError:
		return None
	if url is not None:
		return summary + urlContent
	return summary
Example #4
def summarize(string, summary_length = 1, language = "english"):
    string = string.lower() if string.isupper() else string
    parser = PlaintextParser.from_string(string, Tokenizer(language))
    stemmer = Stemmer(language)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(language)

    return ". ".join([str(sentence) for sentence in summarizer(parser.document, summary_length)]) 
Example #5
def summarize(text):
    total = ""
    parser = PlaintextParser.from_string(text, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        total += str(sentence)
    return total
Example #6
def lsa(comment,parser,num):
	summarizer = LsaSummarizer(stemmer)
	summarizer.stop_words = get_stop_words(LANGUAGE)

	LSAstr = ''
	for sentence in summarizer(parser.document,num):
		LSAstr += str(sentence)

	return LSAstr
Example #7
 def summary(self, int1, int2):
     # int1, int2 are the places between which to look for
     # the summary to be taken (slicing the corpus as a string)
     parser = PlaintextParser(self.corpus[int1:int2], Tokenizer("english"))
     summarizer = LsaSummarizer(stem_word)
     summarizer.stop_words = get_stop_words("english")
     self.summary_text = " ".join(
         map(lambda x:x._text,
             summarizer(parser.document, 20)))
     return self.summary_text
Example #8
def summarize(filename, num_sentences):
    with open (filename, "r") as myfile:
        data=myfile.read()
    parser = PlaintextParser.from_string(data, Tokenizer('english')) 
    summarizer = LsaSummarizer(stem_word)
    summarizer.stop_words = get_stop_words("english")
    summary = ""
    for sentence in summarizer(parser.document, num_sentences):
        summary += str(sentence).encode('ascii', 'ignore').decode('ascii').replace('"', '').replace("'", '').strip() + " "
    return summary
Example #9
def retreive_sumy(url):
    # "http://en.wikipedia.org/wiki/Automatic_summarization"
    parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
    # or for plain text files
    # parser = PlaintextParser.from_file("document.txt", Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)

    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)	
    return summarizer(parser.document, SENTENCES_COUNT)
Example #10
def summarize(content):
    parser = PlaintextParser.from_string(content.body, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    text = '\n'.join(
        [str(sentence) for sentence in summarizer(parser.document, COUNT)]
    )
    summary = Summary(content=content, summary=text)
    summary.save()
Example #11
 def summarizeText(self, body, numSentences = 10):
     """Summarizes body of text to numSentences
     """
     parser = PlaintextParser.from_string(body, Tokenizer(self.LANG))
     stemmer = Stemmer(self.LANG)
     summarizer = SumySummarizer(stemmer)
     summarizer.stop_words = get_stop_words(self.LANG)
     summary = ' '.join([str(sentence) for sentence in summarizer(parser.document, numSentences)])
     return summary
Example #12
    def lsa(self,text_parser):
        assert isinstance(text_parser,plaintext.PlaintextParser)

        #process the text
        summarizer=LSA()
        #EnglishStemmer())
        #summarizer.stop_words=stopwords.words("english")

        #we have to specify stop words
        summarizer.stop_words=get_stop_words(settings.SUMMARIZER_LANGUAGE)
        return summarizer(text_parser.document,settings.SUMMARIZER_TOP_X_SENTENCES)
Example #13
    def test_real_example(self):
        """Source: http://www.prevko.cz/dite/skutecne-pribehy-deti"""
        parser = PlaintextParser.from_string(
            load_resource("snippets/prevko.txt"),
            Tokenizer("czech")
        )
        summarizer = LsaSummarizer(Stemmer("czech"))
        summarizer.stop_words = get_stop_words("czech")

        sentences = summarizer(parser.document, 2)
        self.assertEqual(len(sentences), 2)
Example #14
def test_article_example():
    """Source: http://www.prevko.cz/dite/skutecne-pribehy-deti"""
    parser = PlaintextParser.from_string(
        load_resource("articles/prevko_cz_1.txt"),
        Tokenizer("czech")
    )
    summarizer = LsaSummarizer(Stemmer("czech"))
    summarizer.stop_words = get_stop_words("czech")

    sentences = summarizer(parser.document, 20)
    assert len(sentences) == 20
Example #15
def summarize(text):
    parser = PlaintextParser.from_string(text, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    result = ""

    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        result += str(sentence) + " "

    return result
Example #16
def summary(text):

    stemmer = Stemmer(LANGUAGE)
    parser = PlaintextParser(text, Tokenizer(LANGUAGE))
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    short = ""
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        short = short + ">" + "* " + str(sentence).encode('ascii', 'ignore').decode('ascii') + "\n\n"
        #print(sentence)
    return short
Example #17
def summarize(parser, sentences_count):
    stemmer = Stemmer(LANGUAGE)

    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    sentences = ""
    for sentence in summarizer(parser.document, sentences_count):
        sentences += " " + str(sentence)

    return sentences
Example #18
    def test_issue_5_sigma_can_multiply_matrix_v(self):
        """Source: https://github.com/miso-belica/sumy/issues/5"""
        parser = PlaintextParser.from_string(
            load_resource("articles/sigma_can_multiply_matrix_v.txt"),
            Tokenizer("english")
        )
        summarizer = LsaSummarizer(english_stemmer)
        summarizer.stop_words = get_stop_words("english")

        sentences = summarizer(parser.document, 20)
        self.assertEqual(len(sentences), 20)
Example #19
 def summarizeUrl(self, url, numSentences = 10):
     """Summarizes text at a given url to numSentences
     """
     parser = HtmlParser.from_url(url, Tokenizer(self.LANG))
     stemmer = Stemmer(self.LANG)
     summarizer = SumySummarizer(stemmer)
     summarizer.stop_words = get_stop_words(self.LANG)
     summary = ' '.join([str(sentence) for sentence in summarizer(parser.document, numSentences)])
     return summary
     
Example #20
def get_summary(text, max_sentences=5):
	parser = PlaintextParser.from_string(text, Tokenizer("english"))
	stemmer = Stemmer("english")

	summarizer = Summarizer(stemmer)
	summarizer.stop_words = get_stop_words("english")

	summary = []
	for sentence in summarizer(parser.document, max_sentences):
		summary.append(sentence._text.encode('ascii', 'ignore').decode('ascii'))

	return summary
Example #21
def lsaReferenceSummary(path):	
	sentencesList=[]
	parser = PlaintextParser.from_file(path, Tokenizer(LANGUAGE))
	stemmer = Stemmer(LANGUAGE)
	summarizer = LsaSummarizer(stemmer)
	summarizer.stop_words = get_stop_words(LANGUAGE)
	

	for sentence in summarizer(parser.document, SENTENCES_COUNT):
		#print(sentence._text)
		sentencesList.append(sentence._text)

	return sentencesList
Example #22
def test_issue_5_svd_converges():
    """Source: https://github.com/miso-belica/sumy/issues/5"""
    pytest.skip("Can't reproduce the issue.")

    parser = PlaintextParser.from_string(
        load_resource("articles/svd_converges.txt"),
        Tokenizer("english")
    )
    summarizer = LsaSummarizer(Stemmer("english"))
    summarizer.stop_words = get_stop_words("english")

    sentences = summarizer(parser.document, 20)
    assert len(sentences) == 20
Example #23
    def test_dictionary_without_stop_words(self):
        summarizer = LsaSummarizer()
        summarizer.stop_words = ["stop", "Halt", "SHUT", "HmMm"]

        document = build_document(
            ("stop halt shut hmmm", "Stop Halt Shut Hmmm",),
            ("StOp HaLt ShUt HmMm", "STOP HALT SHUT HMMM",),
            ("Some relevant sentence", "Some moRe releVant sentEnce",),
        )

        expected = frozenset(["some", "more", "relevant", "sentence"])
        dictionary = summarizer._create_dictionary(document)
        self.assertEqual(expected, frozenset(dictionary.keys()))
Example #24
def sum_spark(doc):

    parser = PlaintextParser.from_string(doc,Tokenizer('english'))

    summarizer = Summarizer(Stemmer('english'))
    summarizer.stop_words = stop_books
    
    texts=[]

    for sentence in summarizer(parser.document, 2):
        texts.append(str(sentence))

    return texts
Example #25
def extract_titles (reviews):

    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    for item_id, review in reviews.items():
        print("Review: {}".format(review))
        print("\n")
        #sentences = re.split(r' *[\.\?!][\'"\)\]]* *', review)

        for sentence in summarizer(build_document_from_string(review), SENTENCES_COUNT):
            print(sentence)
        print("\n")
Example #26
    def getText(self, sentence_count=None):
        if sentence_count:
            self.SENTENCE_COUNT = sentence_count
        parser = HtmlParser.from_url(self.url, Tokenizer(self.LANGUAGE))
        # or for plain text files
        # parser = PlaintextParser.from_file("document.txt", Tokenizer(LANGUAGE))
        stemmer = Stemmer(self.LANGUAGE)
        summarizer = Summarizer(stemmer)
        summarizer.stop_words = get_stop_words(self.LANGUAGE)
        text_list = []

        for sentence  in summarizer(parser.document, self.SENTENCE_COUNT):
            text_list.append(str(sentence))
        return "\n".join(text_list)
Example #27
def summarize_text(textbody):
    parser = PlaintextParser.from_string(textbody, Tokenizer(LANG))
    stemmer = Stemmer(LANG)

    summarizer = LsaSummarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANG)

    summary = summarizer(parser.document, SENTENCE_COUNT)

    summarized_text = ''
    for sentence in summary:
        summarized_text += str(sentence) + ' '

    return summarized_text
Example #28
    def get_smry(self, input):
        smry_list = {}
        LANGUAGE = "english"
        SENTENCES_COUNT = 10
        parser = PlaintextParser.from_file(input, Tokenizer(LANGUAGE))
        stemmer = Stemmer(LANGUAGE)
    
        summarizer = Summarizer(stemmer)
        summarizer.stop_words = get_stop_words(LANGUAGE)

        i = 0
        for sentence in summarizer(parser.document, SENTENCES_COUNT):
            print(sentence)
            smry_list[str(i)] = str(sentence)
            i = i + 1
        return smry_list
Example #29
def summarize():
    rows = store.get_row_by_status(1)

    for row in rows:
        parser = PlaintextParser.from_string(row["content_origin"], Tokenizer(LANGUAGE))
        stemmer = Stemmer(LANGUAGE)
        summarizer = Summarizer(stemmer)
        summarizer.stop_words = get_stop_words(LANGUAGE)

        sentences = list()

        for sentence in summarizer(parser.document, SENTENCES_COUNT):
            sentences.append(str(sentence))

        summary = "\n".join(sentences)

        store.update_row(row["id"], {"summary_origin": summary, "status": 2})
Example #30
def summarize(text, language="english", count=5):
    """
    text (str):
    language (str):
    count (int):
    """
    summary = []
    text_file = text
    parser = PlaintextParser.from_string(text, Tokenizer(language))
    stemmer = Stemmer(language)
    # or for plain text files
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(language)

    for sentence in summarizer(parser.document, count):
        summary.append(sentence)
    return summary
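Note that this version returns sumy Sentence objects rather than strings; a small sketch of turning the result into plain text (the input text and count are illustrative):

# Join the returned Sentence objects into a single string.
sentences = summarize("Some long input text to be condensed ...", language="english", count=2)
summary_text = " ".join(str(s) for s in sentences)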
Example #31
def summarize(
        srt_file,
        n_sentences,
        language="english"):  #Summarizes Text file using LSA Summarization

    parser = PlaintextParser.from_string(srt_to_txt(srt_file),
                                         Tokenizer(language))
    stemmer = Stemmer(language)
    summarizer = LsaSummarizer(stemmer)

    summarizer.stop_words = get_stop_words(language)
    segment = []
    f = open("xd.txt", "w+")
    fo = open("dx.txt", "w+")

    for sentence in summarizer(parser.document, n_sentences):
        f.write(str(sentence))
        f.write('\n')
        index = int(re.findall(r"\(([0-9]+)\)", str(sentence))[0])
        item = srt_file[index]
        segment.append(srt_segment_to_range(item))
    return segment
Example #32
def sumySummarize(url, SENTENCES_COUNT):
    fullSummary = []
    LANGUAGE = 'english'
    # SENTENCES_COUNT = 8
    parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
    # parser = HtmlParser.from_string(text, "http://clarifai.com", Tokenizer(LANGUAGE))
    # or for plain text files
    # parser = PlaintextParser.from_file("document.txt", Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)

    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    # for sentence in summarizer(parser.document, SENTENCES_COUNT):
    #     summary.append(sentence)
    summary = summarizer(parser.document, SENTENCES_COUNT)
    summary = list(summary)
    # print summary.__str__()
    for s in summary:
        fullSummary.append(s.__str__())

    return " ".join(fullSummary)
Example #33
def convert_pdf_to_txt(path):
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    #codec = 'utf-8'
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, laparams=laparams)
    fp = open(path, 'rb')
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    password = ""
    maxpages = 0
    caching = True
    pagenos = set()

    for page in PDFPage.get_pages(fp,
                                  pagenos,
                                  maxpages=maxpages,
                                  password=password,
                                  caching=caching,
                                  check_extractable=True):
        interpreter.process_page(page)

    text = retstr.getvalue()
    fp.close()
    device.close()
    retstr.close()
    #return text
    f = open("demofile3.txt", "w", encoding='utf-8')
    f.write(text)
    f.close()

    parser = PlaintextParser.from_file('demofile3.txt', Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    summary = summarizer(parser.document, SENTENCES_COUNT)
    return ''.join(map(str, summary))
Example #34
def json_example():
    req_data = request.get_json()
    full_text = req_data['full_text']
    num_of_sentences = req_data['num_of_sentences']
    SENTENCES_COUNT = num_of_sentences
    sentiment_value = TextBlob(full_text).sentiment.polarity

    parser = PlaintextParser.from_string(full_text, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)

    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    summary = ''
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        summary += str(sentence) + '\n \n'

    return app.response_class(response=json.dumps({
        "sentiment_value": sentiment_value,
        "summary": summary
    }),
                              status=200,
                              mimetype='application/json')
Example #35
def get_summary(article, url=False, num_sentence=NUM_SUMMARY_SENTENCE):
    """
    get the summary of one article
    :param num_sentence: number of sentences to keep in the summary
    :param article: html string of the article or the url of the article
    :param url: True if article is a url
    :return: the summary of the article as string
    """
    if url:
        parser = HtmlParser.from_url(article, tokenizer=Tokenizer(LANGUAGE))
    else:
        parser = HtmlParser.from_string(article,
                                        tokenizer=Tokenizer(LANGUAGE),
                                        url=None)
    stemmer = Stemmer(LANGUAGE)

    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    summ_sents = summarizer(parser.document, num_sentence)
    summary = " ".join([str(s).strip() for s in summ_sents])

    return summary
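A short usage sketch, assuming LANGUAGE and NUM_SUMMARY_SENTENCE are defined at module level as the signature suggests (the URL is illustrative):

# Summarize an article directly from its URL.
print(get_summary("https://en.wikipedia.org/wiki/Automatic_summarization", url=True, num_sentence=3))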
Example #36
def test_dictionary_without_stop_words():
    summarizer = LsaSummarizer()
    summarizer.stop_words = ["stop", "Halt", "SHUT", "HmMm"]

    document = build_document(
        (
            "stop halt shut hmmm",
            "Stop Halt Shut Hmmm",
        ),
        (
            "StOp HaLt ShUt HmMm",
            "STOP HALT SHUT HMMM",
        ),
        (
            "Some relevant sentence",
            "Some moRe releVant sentEnce",
        ),
    )

    expected = frozenset(["some", "more", "relevant", "sentence"])
    dictionary = summarizer._create_dictionary(document)

    assert expected == frozenset(dictionary.keys())
Example #37
    def create_album_summaries(self):

        artists = Artist.objects.filter(artist_name="Immortal Technique")

        for artist_object in artists:
            all_albums_by_artist = Album.objects.filter(artist_object=artist_object.id)

            for album_object in all_albums_by_artist:
                songs_on_album = Song.objects.filter(album_object=album_object.id)

                for song_object in songs_on_album:
                    stemmer = Stemmer(LANGUAGE)
                    summarizer = Summarizer(stemmer)
                    summarizer.stop_words = get_stop_words(LANGUAGE)

                    if len(song_object.song_lyrics) > 0:

                        song_lyrics = song_object.song_lyrics[0]
                        parser = PlaintextParser.from_string(song_lyrics, Tokenizer(LANGUAGE))

                        for sentence in summarizer(parser.document, SENTENCES_COUNT):
                            print(sentence, song_object.song_title)
                            # raw_input("Press Enter to continue...")
Example #38
def summarize(srt_file, n_sentences, language="english"):
    """ Generate segmented summary

    Args:
        srt_file(str) : The name of the SRT FILE
        n_sentences(int): No of sentences
        language(str) : Language of subtitles (default to English)

    Returns:
        list: segment of subtitles

    """
    parser = PlaintextParser.from_string(srt_to_txt(srt_file),
                                         Tokenizer(language))
    stemmer = Stemmer(language)
    summarizer = LsaSummarizer(stemmer)
    summarizer.stop_words = get_stop_words(language)
    segment = []
    for sentence in summarizer(parser.document, n_sentences):
        index = int(re.findall(r"\(([0-9]+)\)", str(sentence))[0])
        item = srt_file[index]
        segment.append(srt_segment_to_range(item))
    return segment
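A hedged usage sketch: srt_to_txt and srt_segment_to_range come from the surrounding project and are not shown here, and loading the subtitles with pysrt is an assumption, not part of the original example.

# Illustrative only: pysrt and "talk.srt" are assumptions.
import pysrt

subs = pysrt.open("talk.srt")
segments = summarize(subs, n_sentences=5)  # segments corresponding to the selected subtitle items
print(segments)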
Example #39
 def summarize(self, summarizer_type, max_sentences, document = ""):
     if self.document == "":
         target_document = document
     else:
         target_document = self.document
     # Spacing
     _target_document = ""
     sentence_list = self.pro.sentence_splitter(target_document)
     for sentence in sentence_list:
         _target_document += sentence + " "
     _target_document = _target_document.strip()
     # TextRank
     if summarizer_type == "textrank":
         self.result_list = summarize(_target_document, ratio=0.3, word_count=None, split=True)[:max_sentences]
     # LSA (via sumy)
     elif summarizer_type == "lsa":
         parser = HtmlParser.from_string(_target_document, None,tokenizer=Tokenizer("english"))
         stemmer = Stemmer("english")
         summarizer = LsaSummarizer(stemmer)
         summarizer.stop_words = get_stop_words("english")
         summarized_sentence_list = summarizer(parser.document, max_sentences)
         self.result_list = [str(sentence) for sentence in summarized_sentence_list]
     return self.result_list
Example #40
def groupme_bot():
    sentenceList = []
    group = Group.list().first
    messages = group.messages()
    message = str(messages.newest)

    regex = r"(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9]\.[^\s]{2,})"
    #regex = r"(\s*(.+?)(?:\s+(\d+)(?:(?:\s+\(?of\s+|-)(\d+)\)?)?)?|(\w+)): (https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-ZA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9]\.[^\s]{2,})"

    bot = Bot.list().first

    LANGUAGE = "english"
    SENTENCES_COUNT = 3

    matches = re.finditer(regex, message)

    start = default_timer()
    bot.post("Beginning the TL;DR summary:")
    for matchNum, match in enumerate(matches):
        matchNum += 1
        url = str(match.group(1))
        parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
        stemmer = Stemmer(LANGUAGE)
        summarizer = Summarizer(stemmer)
        summarizer.stop_words = get_stop_words(LANGUAGE)

        for sentence in summarizer(parser.document, SENTENCES_COUNT):
            sentenceList.append(str(sentence))

    print(sentenceList)
    bot.post(
        str(sentenceList).replace("[", "").replace("]", "").replace(
            "'", "").replace("\\n", " ").replace(".,", "."))
    duration = default_timer() - start
    bot.post("Time to complete this TL;DR summary: " +
             '{:.2f}'.format(float(duration)) + " seconds")
    print("Successfully completed!")
Example #41
def process(path, filename):
    #print("Cleaning "+path)
    #print (path)
    filename = DATA_FOLDER + filename.strip()
    WRITE_HANDLER = open(filename, 'w')
    LANGUAGE = "english"
    file = path  #name of the plain-text file
    parser = PlaintextParser.from_file(file, Tokenizer(LANGUAGE))

    stemmer = Stemmer(LANGUAGE)

    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    #summarizer = LuhnSummarizer()

    #summary = summarizer(parser.document, 4) #Summarize the document with 5 sentences

    #new_line = ""
    #for line in open(path, 'r'):
    #	new_line += line;
    #summary = summarize(str(new_line), word_count=50) #Summarize the document with max 100 words
    for sentence in summarizer(parser.document, 5):
        WRITE_HANDLER.write(str(sentence) + '\n\n')
Example #42
def summarise(blob):

    ratio = math.ceil(len(blob.words)/50000)  # round up
    print("Ratio (words/50k):\t", ratio)

    LANGUAGE = "english"
    SENTENCES_COUNT = int(len(blob.sentences)/ratio)
    print("Number of sentences:\t", len(blob.sentences))
    print("Number to keep:\t\t", SENTENCES_COUNT)

    parser = PlaintextParser.from_string(str(blob), Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)

    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    new_sentences = []
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        new_sentences.append(str(sentence))
        # We've lost paragraph breaks, throw some back in
        if random.random() < 0.5:
            new_sentences.append("\n\n")

    return TextBlob(" ".join(new_sentences))
Example #43
def demo(request):
    f = 0
    form = demoform(request.POST or None)
    st = ""
    if form.is_valid():
        f = 1
        cd = form.cleaned_data
        text = cd.get('text')
        count = cd.get('count')
        print(text)
        print(count)
        import sys
        import os
        from sumy.parsers.html import HtmlParser
        from sumy.parsers.plaintext import PlaintextParser
        from sumy.nlp.tokenizers import Tokenizer
        from sumy.summarizers.lsa import LsaSummarizer as Summarizer
        from sumy.nlp.stemmers import Stemmer
        from sumy.utils import get_stop_words
        LANGUAGE = "english"
        print(sys.argv)
        # st = "dhasvfasvvfdkiqhafcb ahksfbasfjabdhkc aksjbfasj"
        parser = PlaintextParser.from_string(text, Tokenizer(LANGUAGE))
        stemmer = Stemmer(LANGUAGE)
        summarizer = Summarizer(stemmer)
        summarizer.stop_words = get_stop_words(LANGUAGE)
        for sentence in summarizer(parser.document, count):
            print(type(sentence))
            print(sentence)
            st += str(sentence) + "\n"

    return render(request, 'Summarizer/demo.html', {
        'form': form,
        'st': st,
        'f': f
    })
Example #44
  def _parse_url(self, url):
    '''
    Takes in a url, and returns its language, keywords, and summary
    '''
    try:
      text = extractText(url)
      language = languages.get(alpha_2=detect(text)).name
      parser = HtmlParser.from_url(url, Tokenizer(language))
      stemmer = Stemmer(language)
      summarizer = Summarizer(stemmer)
      summarizer.stop_words = get_stop_words(language)
      summary = ''
      for sentence in summarizer(parser.document, Article._SENTENCE_COUNT):
        summary += str(sentence)

      r = Rake(language, max_length=3)
      r.extract_keywords_from_text(text)
      keywords = r.get_ranked_phrases()[:10]

      return language, keywords, summary

    except Exception as e:
      logging.error(e)
      return False
Example #45
def get_summaries_from_list_of_abstracts(list_of_abstracts, summarizer_type):

    if summarizer_type == 'lsa':
        summarizer = LsaSummarizer(Stemmer("english"))
    elif summarizer_type == 'luhn':
        summarizer = LuhnSummarizer(Stemmer("english"))
    elif summarizer_type == 'lexrank':
        summarizer = LexRankSummarizer(Stemmer("english"))
    elif summarizer_type == 'textrank':
        summarizer = TextRankSummarizer(Stemmer("english"))

    summarizer.stop_words = get_stop_words("english")

    list_of_summaries = []

    for abstract in list_of_abstracts:
        parser = PlaintextParser(abstract, Tokenizer("english"))
        summary = summarizer(parser.document, 3)
        summary_string = " ".join(map(str, summary))
        list_of_summaries.append(summary_string)

    print(list_of_summaries)

    return list_of_summaries
Example #46
    def __summarize(self, content, parser):
        stemmer = Stemmer(self.lang)
        summarizer = Summarizer(stemmer)
        summarizer.stop_words = get_stop_words(self.lang)

        total = int(self.num_sentences * self.correction_ratio)
        counter = 0

        summary = []

        for sentence in summarizer(parser.document, self.num_sentences):
            text = str(sentence)
            text = text.replace('\n', ' ').replace('\r', '')
            text = re.sub(r'[^\x00-\x7f]', r' ', text)
            text = re.sub(r"\s+", " ", text)

            summary.append(text)

            counter = counter + 1

            if counter > total:
                break

        return summary
Example #47
 def lsa_summarize(self):
     summarizer = LsaSummarizer()
     summarizer.stop_words = self.stop_words
     summary_tuple = (summarizer(self.parser.document, 4))
     lsa_summary = " ".join(map(str, summary_tuple))
     return lsa_summary
Example #48
import nltk
import numpy
import requests
from bs4 import BeautifulSoup
from dateutil import parser
from newspaper import Article
from scipy.cluster.vq import kmeans2
from sumy.nlp.stemmers import Stemmer
from sumy.nlp.tokenizers import Tokenizer
from sumy.parsers.html import HtmlParser
from sumy.parsers.plaintext import PlaintextParser
from sumy.summarizers.lsa import LsaSummarizer as Summarizer
from sumy.utils import get_stop_words

SUMMARIZER = Summarizer(Stemmer('english'))
SUMMARIZER.stop_words = get_stop_words('english')

SENTENCE_COUNT = 4
SOURCES = dict()
SOURCES['entertainment'] = [
    'https://www.cnn.com/entertainment',
    'https://www.nytimes.com/section/t-magazine/entertainment',
    'https://www.foxnews.com/entertainment',
]
SOURCES['food'] = [
    'https://www.cnn.com/travel/food-and-drink',
    'https://www.nytimes.com/section/food',
    'https://www.wsj.com/news/life-arts/food-cooking-drink',
    'https://www.foxnews.com/food-drink'
]
SOURCES['local'] = [
Example #49
def listen():
    button_text.set("Listening...")
    commentary = ""

    text_box = tk.Text(root, height=10, width=50, padx=15, pady=15)
    text_box.tag_configure("left", justify="left")
    text_box.tag_add("left", 1.0, "end")
    text_box.grid(column=0, row=3)

    punctuated_text = tk.Text(root, height=10, width=50, padx=15, pady=15)
    punctuated_text.insert(1.0, "SOON TO BE PUNCTUATED TEXT")
    punctuated_text.tag_configure("left", justify="left")
    punctuated_text.tag_add("left", 1.0, "end")
    punctuated_text.grid(column=1, row=3)

    summarized_text = tk.Text(root, height=10, width=50, padx=15, pady=15)
    summarized_text.insert(1.0, "SOON TO BE SUMMARIZED TEXT")
    summarized_text.tag_configure("left", justify="left")
    summarized_text.tag_add("left", 1.0, "end")
    summarized_text.grid(column=2, row=3)

    while True:
        with sr.Microphone(1) as source:
            print("Say something!")
            audio = r.listen(source)

        try:
            speech = r.recognize_google(audio)
            if "Corpus" in speech:
                break

            print("You said: " + speech)
            commentary = commentary + " " + speech

            text_box.delete(1.0, tk.END)
            text_box.insert(1.0, commentary)

        except sr.UnknownValueError:
            print("Speech Recognition didn't catch that")
        except sr.RequestError as e:
            print(
                "Could not request results from Speech Recognition service; {0}"
                .format(e))

    button_text.set("ADDING PUNCTUATION!")

    print("PRE-PUNCTUATION:", commentary, "\n")
    commentary = (fastpunct.punct([commentary], batch_size=32))
    print("PUNCTUATED", commentary, "\n")

    punctuated_text.delete(1.0, tk.END)
    punctuated_text.insert(1.0, commentary)

    button_text.set("SUMMARIZING!")

    parser = PlaintextParser.from_string(commentary, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)

    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    print("SUMMARIZED TEXT:")
    fullSummary = []
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        print(sentence)
        fullSummary.append(sentence)

    print(fullSummary)
    # new = ' '.join(fullSummary)
    # print(type(new))
    summarized_text.delete(1.0, tk.END)
    summarized_text.insert(1.0, fullSummary)

    button_text.set("Start Recording")
Example #50
# In[13]:

import sumy
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer as Summarizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words

# In[14]:

parser = PlaintextParser.from_string(rawdata.news[1], Tokenizer("english"))
stemmer = Stemmer("english")
summarizer = Summarizer(stemmer)
summarizer.stop_words = get_stop_words("english")

for sentence in summarizer(parser.document, 6):
    print(sentence)

# In[16]:

testchinese = '温客行一眼就看出周子舒使用的是四季山庄的流云九宫步,狠狠教训了顾湘一顿,就带她离开了。张成岭看出周子舒有一身好武功,只是深藏不露,就主动过来和周子舒寒暄,还给他一块名帖,让他有事去镜湖山庄,张成岭着急给母亲买点心,就先行离开了。周子舒听到孩子们在唱那首五湖盟争夺武林盟主以及琉璃甲的歌谣,不禁感慨江湖的风云多变。周子舒叫醒岸边的摆渡船夫,他要乘船去镜湖山庄,摆渡船夫趁机狮子大开口,周子舒也不还价,摆渡船夫看他一副病恹恹的模样,不忍心敲诈他,温客行带顾湘及时赶来,主动提出送周子舒去镜湖山庄,摆渡船夫不依不饶,拉起周子舒就上船离开了。周子舒远远就发现镜湖山庄犹如人间仙境,他迫不及待赶过去,下船就忘了付钱,遭到摆渡船夫劈头盖脸一顿臭骂,周子舒索性就坐一次霸王船。周子舒施展轻功,很快就进入镜湖山庄的桃林,他沉醉于花香之中,温客行突然从背后偷袭,周子舒只能迎战,两个人交手几个回合,温客行对周子舒心生佩服,请他喝酒小叙,周子舒断然拒绝。周子舒来到镜湖山庄,从管家口中得知镜湖派掌门张玉森久不闻江湖事,他有三个儿子张成峰,张成峦和张成岭,也不许他们掺和江湖门派之争,管家把周子舒安顿到柴房,子时的时候,三秋钉又准时开始发作,周子舒感觉浑身疼痛难忍,只能发动全部功力为自己疗伤,突然听到外面人声嘈杂。周子舒打开门发现镜湖山庄已经变成一片火海,他飞身上屋顶观察,发现带着鬼面具的人在镜湖山庄大肆烧杀抢掠,怀疑是鬼谷的人所为,他立刻下去救人,张玉森,张成峦和张成峰父子三人被抓走,镜湖山庄的人几乎全部被杀,尸横遍野。摆渡船夫保护着张成岭想逃走,被鬼谷的人追杀,周子舒出手相救,掩护着他们俩乘船离开,远远看到温客行坐在华亭伤看热闹。周子舒把摆渡船夫和张成岭带到一间破庙,摆渡船夫说明张玉森曾经救过他的命,他在镜湖山庄门前摆渡三年,就是想等有朝一日报恩,摆渡船夫让张成岭去太湖找三白大侠,张成岭坚决不走。外面阴风阵阵,一群带鬼面具的人冲进来,一个自称吊死鬼的人叫嚣着进来抓张成岭,周子舒因为体力耗尽要静养半个时辰,摆渡船夫和吊死鬼战在一处,他渐渐体力不支被打翻在地,吊死鬼要杀了周子舒,张成岭拼命保护他,顾湘及时赶来,她和黑白无常大打出手,吊死鬼想杀张成岭,摆渡船夫奋不顾身护住他,被打成重伤。顾湘被恶鬼们团团包围,周子舒挣扎着跳起来为顾湘解围,把恶鬼们全部打跑,他因体力不支差点晕倒,温客行赶来抱住周子舒。摆渡船夫因为失血过多奄奄一息,温客行用内力帮他维持,船夫拜托周子舒把张成岭交给五湖盟的赵敬,还让张成岭当场给周子舒跪下磕头,周子舒满口答应,摆渡船夫说完这些话就咽气了。周子舒帮张成岭把摆渡船夫埋葬,张成岭累得精疲力尽,周子舒打算休息一夜再上路,温客行让顾湘生火,把干粮烤了给周子舒和张成岭,周子舒借口不饿不想吃,顾湘对他冷嘲热讽,张成岭也不吃顾湘的干粮,遭到顾湘的训斥,谴责他不知道报恩,张成岭连连向她赔礼道歉。温客行发现张成岭身受重伤,主动提出帮他医治,周子舒坚决不同意,两个人一言不合就大打出手。'
parser = PlaintextParser.from_string(testchinese, Tokenizer("chinese"))
stemmer = Stemmer("chinese")
summarizer = Summarizer(stemmer)
summarizer.stop_words = get_stop_words("chinese")

for sentence in summarizer(parser.document, 6):
    print(sentence)
Example #51
            news_response = urlopen(news_req)
        except urllib.error.HTTPError:
            pass
        try:
            news_soup = BeautifulSoup(news_response, features='html.parser')
        except http.client.IncompleteRead:
            pass
        article = news_soup.find_all('p')
        final_result = ''
        for i in range(len(article) - 1):
            final_result += article[i].text + " "
        file_name = ticker + '-' + str(index) + '.txt'
        parser = PlaintextParser.from_string(str(final_result), Tokenizer('english'))
        stemmer = Stemmer('english')
        summarizer = Summarizer(stemmer)
        summarizer.stop_words = get_stop_words('english')
        print('writing: ' + file_name)
        f = open(file_name, "a")
        for sentence in summarizer(parser.document, 2):
            f.write(str(sentence))
        f.close()









Example #52
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer as Summarizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words

import os
from google import google




def crossCheck(self, string):
url = "https://www.cbsnews.com/news/walmart-pulls-rope-tree-journalist-t-shirt-from-site/"
# or for plain text files
# parser = PlaintextParser.from_file("document.txt", Tokenizer(LANGUAGE))
stemmer = Stemmer("english")

summarizer = Summarizer(stemmer)
summarizer.stop_words = get_stop_words("english")


parser = PlaintextParser.from_file(file, Tokenizer("english"))

for sentence in summarizer(parser.document, 1):
    print(sentence)
    sentence = str(sentence)

num_page = 1

search_results = google.search(sentence, num_page)
Example #53
def run_LSA(stemmer, document):
    lsa = LsaSummarizer(stemmer)
    lsa.stop_words = get_stop_words(LANGUAGE)
    print("LSA")
    return [x for x in lsa(document, SENTENCES_COUNT)]
for file in os.listdir('datafiles'):
    with codecs.open('datafiles/' + file,
                     'r',
                     encoding='utf-8',
                     errors='ignore') as f:
        parser = PlaintextParser.from_string(f.read().replace('\n', ' '),
                                             UrduTokenizer)
        objectDocModel = parser.document
        print(objectDocModel.sentences)
        print(objectDocModel.paragraphs)
        print(objectDocModel.words)
        print(objectDocModel.headings)

        stemmer = Stemmer(LANGUAGE)
        summarizer = Summarizer(stemmer)
        summarizer.stop_words = get_stop_words("Urdu")
        summ = summarizer(parser.document, SENTENCES_COUNT)
        with open('dataresults/' + file.split('.')[0] + '.txt', 'w') as fw:
            for sentence in summ:
                #print sentence
                evaluated_sentences.append(sentence)
                fw.writelines(str(sentence))
        #list of rouge scores (bigrams)
        res = rouge_1(evaluated_sentences, objectDocModel.sentences)
        rouge_scores.append(res)
        evaluated_sentences.clear()

        fw.close()
    f.close()

# Evaluation scores
def summarize(SENTENCES_COUNT):
    try:
        #    url = "https://machinelearningmastery.com/prepare-text-data-machine-learning-scikit-learn/"
        speak.Speak("Please Enter the U r l for summarization")

        url = easygui.textbox(
            msg='Enter url for which you want summarization:',
            title='Summarization').split()[0]
        title = getTextFromURL(url)
        #    url="https://medium.com/@gurunathrajagopal/what-happens-when-machines-are-curious-to-learn-9aed6805bf36"
        parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
        # or for plain text files
        # parser = PlaintextParser.from_file("document.txt", Tokenizer(LANGUAGE))
        stemmer = Stemmer(LANGUAGE)

        summarizer = Summarizer(stemmer)
        summarizer.stop_words = get_stop_words(LANGUAGE)
        string_dict = {}
        for idx, sentence in enumerate(
                summarizer(parser.document, SENTENCES_COUNT)):
            #        f.write(str(sentence))
            string_dict[idx] = str(sentence)
    #        print(type(sentence))
    #    print(string_dict)


#        speak.Speak("Please Enter the filename to save to summarization")

#        file_name=easygui.textbox(msg='Enter filename to save the summarization:',title='Summarization').split()[0]
#        current_dir=os.getcwd()
#    f=open(current_dir+'\\'+str(file_name)+'.txt','w')
#    f.write('Summarization')
        document = Document()

        document.add_heading('Summarization of ' + str(title), 0)
        p = document.add_paragraph(
            'Summarizing your article in crisp {} points'.format(
                SENTENCES_COUNT))

        for idx, sent in zip(string_dict.keys(), string_dict.values()):
            adding_break = p.add_run()
            adding_break.add_break()
            p = document.add_paragraph(sent)
        adding_break = p.add_run()
        adding_break.add_break()
        document.save(sumydir + '\\' + 'summarization.docx')
        speak.Speak("Summarization was saved to the following path")

        #        f.write('\n')
        #        f.write(str(idx))
        #        f.write('.  ')
        #        f.write(sent)
        #    f.close()
        easygui.msgbox(msg='Summarized file saved in this file ' + sumydir +
                       '\\' + 'summarization.docx',
                       title='Summarization')
    except Exception as e:
        speak.Speak(
            'Sorry My mistake please provide your feedback regarding this error'
        )
        easygui.exceptionbox(str(e))
Example #56
def summarize():
    response.content_type = "application/json"
    incoming = request.json
    if incoming.get('token', None) != SECRET_TOKEN:
        return

    channel = incoming.get('channel_id', None)
    if channel is None:
        return

    query = incoming.get('text', '').replace('!tldr', '').strip()
    count = 150
    if query != "":
        try:
            count = int(
                query) if int(query) > 10 and int(query) < 1000 else count
        except ValueError:
            pass

    r = requests.post(API_URL + "login",
                      data=json.dumps({
                          "username": USERNAME,
                          "password": PASSWORD
                      }),
                      headers={"Content-type": "application/json"})

    try:
        user = r.json()
    except Exception as e:
        print("BAILING OUT (login):\n{}".format(e))
        return

    userdata = user.get('data', None)
    if userdata is None:
        print("Login failed")
        return

    uid = userdata.get('userId', None)
    authToken = userdata.get('authToken', None)

    if uid is None or authToken is None:
        print("uid or token was invalid")
        return

    r = requests.get(API_URL + \
                     "channels.history?roomId={}&count={}".format(channel, count),
                    headers={"X-Auth-Token": authToken,
                             "X-User-Id": uid})

    try:
        history = r.json()
    except Exception as e:
        print("BAILING OUT (history):\n{}".format(e))
        return

    last = history['messages'][1]  # 0 is !tldr
    if last.get('urls', []) != []:
        summaries = []
        for url in last['urls']:
            parser = HtmlParser.from_url(url['url'], Tokenizer(LANGUAGE))
            stemmer = Stemmer(LANGUAGE)
            summarizer = Summarizer(stemmer)
            summarizer.stop_words = get_stop_words(LANGUAGE)
            summaries.append(
                "> {}".format(" ".join([str(sentence)\
                 for sentence in summarizer(parser.document,
                                            SENTENCES_COUNT)])))
        output = json.dumps({"text": "\n--\n".join(summaries)})
        return output

    messages = ". ".join(
        [m['msg'] for m in history['messages'][::-1] \
         if m['msg'] != "" and m.get('bot', None) is None])

    parser = PlaintextParser.from_string(messages, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    return json.dumps({
        "text":
        "\n--\n".join([
            "> {}".format(str(sentence))
            for sentence in summarizer(parser.document, SENTENCES_COUNT)
        ])
    })
Example #57
def summarize(url, number = 5):
  parser = HtmlParser.from_url(url, Tokenizer("english"))
  stemmer = Stemmer("english")
  summarizer = Summarizer(stemmer)
  summarizer.stop_words = get_stop_words("english")
  return " ".join(str(i) for i in summarizer(parser.document, number))
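A one-line usage sketch (the URL and sentence count are illustrative):

# Fetch, parse and condense a page into three sentences.
print(summarize("https://en.wikipedia.org/wiki/Automatic_summarization", number=3))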
Example #58
def mySumD():
	if request.form['action'] == 'LSA':
		para = ""
		url = request.form['url_link']
		parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
		stemmer = Stemmer(LANGUAGE)
		summarizer = Summarizer(stemmer)
		summarizer.stop_words = get_stop_words(LANGUAGE)

		for sentence in summarizer(parser.document, SENTENCES_COUNT):
			data = str(sentence)
			para += data 
		return render_template('dependent.html', para = para)
	
	elif request.form['action'] == 'Luhn':
		para = ""
		url = request.form['url_link']
		parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
		stemmer = Stemmer(LANGUAGE)
		summarizer = LuhnSummarizer(stemmer)
		summarizer.stop_words = get_stop_words(LANGUAGE)

		for sentence in summarizer(parser.document, SENTENCES_COUNT):
			data = str(sentence)
			para += data 
		return render_template('dependent.html', para = para)
	
	elif request.form['action'] == 'LexRank':
		para = ""
		url = request.form['url_link']
		parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
		stemmer = Stemmer(LANGUAGE)
		summarizer = LexSummarizer(stemmer)
		summarizer.stop_words = get_stop_words(LANGUAGE)

		for sentence in summarizer(parser.document, SENTENCES_COUNT):
			data = str(sentence)
			para += data 
		return render_template('dependent.html', para = para)
	
	elif request.form['action'] == 'TextRank':
		para = ""
		url = request.form['url_link']
		parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))

		stemmer = Stemmer(LANGUAGE)
		summarizer = TextSummarizer(stemmer)
		summarizer.stop_words = get_stop_words(LANGUAGE)

		for sentence in summarizer(parser.document, SENTENCES_COUNT):
			data = str(sentence)
			para += data 
		return render_template('dependent.html', para = para)
	
	elif request.form['action'] == 'SumBasic':
		para = ""
		url = request.form['url_link']
		parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))

		stemmer = Stemmer(LANGUAGE)
		summarizer = SumSummarizer(stemmer)
		summarizer.stop_words = get_stop_words(LANGUAGE)

		for sentence in summarizer(parser.document, SENTENCES_COUNT):
			data = str(sentence)
			para += data 
		return render_template('dependent.html', para = para)
	
	else:
		para = ""
		request.form['action'] == 'KL-Sum'
		url = request.form['url_link']
		parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))

		stemmer = Stemmer(LANGUAGE)
		summarizer = KLSummarizer(stemmer)
		summarizer.stop_words = get_stop_words(LANGUAGE)

		for sentence in summarizer(parser.document, SENTENCES_COUNT):
			data = str(sentence)
			para += data 
		return render_template('dependent.html', para = para)
Example #59
# -*- coding: utf-8 -*-

from __future__ import absolute_import
from __future__ import division, print_function, unicode_literals

from sumy.parsers.html import HtmlParser
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer as Summarizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words


LANGUAGE = "english"
SENTENCES_COUNT = 5


if __name__ == "__main__":
    # url = "https://en.wikipedia.org/wiki/Automatic_summarization"
    
    # this is where you would put the file name with all the content you would want to summarize.
    parser = PlaintextParser.from_file("content.txt", Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)

    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        print(sentence)
Example #60
def get_update():

    try: 
        from googlesearch import search 
    except ImportError:  
        print("No module named 'google' found") 
      
    # to search 
    query = "covid-19 google scholar" #google scholer,  Czech Republic

    update ={}
      
    for url in search(query, tld="co.in", num=10, stop=2, pause=2): 
        
        print(url)        
        web_response = requests.get(url) 
  
        # building 
        element_tree = lxml.html.fromstring(web_response.text) 
          
        tree_title_element = element_tree.xpath('//title')[0] 
          
        #print("Tag title : ", tree_title_element.tag) 
        print("\nText title :", tree_title_element.text_content()) 

        print("\n")
        
        #print("\nhtml title :", lxml.html.tostring(tree_title_element)) 
        #print("\ntitle tag:", tree_title_element.tag) 
        #print("\nParent's tag title:", tree_title_element.getparent().tag) 



        
        #url = "https://academic.oup.com/clinchem/advance-article/doi/10.1093/clinchem/hvaa029/5719336"
        parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
        # or for plain text files
        # parser = PlaintextParser.from_file("document.txt", Tokenizer(LANGUAGE))
        stemmer = Stemmer(LANGUAGE)

        summarizer = Summarizer(stemmer)
        # print(summarizer._text)
        summarizer.stop_words = get_stop_words(LANGUAGE)


        sentence_list = []

        for sentence in summarizer(parser.document, SENTENCES_COUNT):
            #print(dir(sentence))
            # print(sentence._text)
            sentence_list.append(sentence._text)
        sentences = (" ".join(sentence_list))
        update[tree_title_element.text_content()] = sentences
        # print("\n")    
        # for i in range(0,len(sentence_list),1):    
        #   time.sleep(2)
        #   translations = translator.translate([sentence_list[i]], dest='bn')    
        #   for translation in translations:
        #     print(translation.text)
        #   #print(translation.origin, ' -> ', translation.text)

        # print("\n")

        # translations = []
        # for sentence in sentence_list:
        #   translations.append(translator.translate(,dest='bn'))
        
        # print(translations)
    return update