# Imports required by the summarize() methods below.
import time

import nltk
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.tokenize import RegexpTokenizer


def summarize(self, input, num_sentences):
    # TODO: allow the caller to specify the word tokenizer they want
    # TODO: allow the caller to specify the sentence tokenizer they want
    tokenizer = RegexpTokenizer(r'\w+')

    # get the frequency of each word in the input
    base_words = [word.lower() for word in tokenizer.tokenize(input)]
    words = [word for word in base_words if word not in stopwords.words()]
    word_frequencies = FreqDist(words)

    # now create a set of the most frequent words
    # (relies on the old NLTK FreqDist API, where items() is already sorted
    # by decreasing frequency; this list is not used further below)
    most_frequent_words = [pair[0] for pair in word_frequencies.items()[:100]]

    # make a list of the top 10 most frequently appearing words
    most_freq_w = word_frequencies.items()[:10]
    print "********************"
    print word_frequencies
    print "Most frequent words are......."
    print most_freq_w
    print "********************"
    #print most_freq_w.values()

    # build a dictionary from the most frequently appearing words list
    dict_most_freq_words = dict([(k, v) for k, v in most_freq_w])
    print dict_most_freq_words
    print "KEYS are :"
    print dict_most_freq_words.keys()

    # break the input up into sentences.  working_sentences is used
    # for the analysis, but actual_sentences is used in the results
    # so capitalization will be correct.
    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
    actual_sentences = sent_detector.tokenize(input)
    working_sentences = [sentence.lower() for sentence in actual_sentences]

    # iterate over the most frequent words, and add the first sentence
    # that includes each word to the result.
    output_sentences = []

    # weight the top most frequently appearing words by their IDF
    TFIDF = generateTfidf(dict_most_freq_words)
    print "GENERATED WORDS AFTER IDF ARE: "
    print TFIDF

    # now we need to multiply the IDF and TF to get the TF-IDF
    # (presumably handled inside generateTfidf(), so the explicit
    # multiplication stays commented out)
    #TFIDF = dict()
    #TFIDF = dict([(n, dict_most_freq_words.get(n, 0) * generated_words_IDF.get(n, 0))
    #              for n in set(dict_most_freq_words) | set(generated_words_IDF)])
    print "TFIDF is"
    print TFIDF

    TFIDF_after_SORT = sorter(TFIDF)
    print "TFIDF after sorting is"
    print TFIDF_after_SORT

    highest_ranked_words_dict = dict([(k, v) for k, v in TFIDF_after_SORT])
    #print "highest_ranked_words are: "
    #print highest_ranked_words_dict
    highest_ranked_words = highest_ranked_words_dict.keys()
    #print highest_ranked_words

    for word in highest_ranked_words:
        for i in range(0, len(working_sentences)):
            # note: substring match, not a whole-word match
            if (word in working_sentences[i]
                    and actual_sentences[i] not in output_sentences):
                output_sentences.append(actual_sentences[i])
                break
            if len(output_sentences) >= num_sentences:
                break
        if len(output_sentences) >= num_sentences:
            break

    # sort the output sentences back to their original order
    output_sentences = self.reorder_sentences(output_sentences, input)

    # concatenate the sentences into a single string
    return " ".join(output_sentences)
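# -----------------------------------------------------------------------------
# generateTfidf() and sorter() are called above but are not part of this
# listing.  The sketches below are a minimal guess at what they do, inferred
# only from how they are used: the IDF source (a hypothetical collection of
# background documents) and both bodies are assumptions, not the original
# implementations.
# -----------------------------------------------------------------------------
import math


def generateTfidf(term_frequencies, background_documents=()):
    # Assumed behaviour: weight each term's frequency by an inverse document
    # frequency computed over some background corpus (hypothetical parameter).
    num_docs = len(background_documents)
    tfidf = {}
    for term, tf in term_frequencies.items():
        # count background documents containing the term
        df = sum(1 for doc in background_documents if term in doc)
        # smoothed IDF so an empty or small corpus never divides by zero
        idf = math.log(float(num_docs + 1) / (df + 1)) + 1.0
        tfidf[term] = tf * idf
    return tfidf


def sorter(tfidf_scores):
    # Assumed behaviour: return (term, score) pairs ordered by score, highest
    # first, so the caller can rebuild a dict and read the keys in rank order.
    return sorted(tfidf_scores.items(), key=lambda pair: pair[1], reverse=True)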
def summarize(self, input, num_sentences):
    # tokenize the words
    tokenizer = RegexpTokenizer(r'\w+')

    # tokenize the input and time how long it takes
    base_words = []  # (unused)
    time1 = time.time()
    tokenized_words = tokenizer.tokenize(input.lower())
    time2 = time.time()
    print "Total time taken for word tokenization is: " + str(time2 - time1)
    print "no of tokenized words are: " + str(len(tokenized_words))

    # get the frequency of each word in the input
    time1 = time.time()
    word_frequencies = FreqDist(tokenized_words)
    time2 = time.time()
    print "Total time taken for calculating word frequencies is: " + str(time2 - time1)
    #baseWords = word_frequencies.keys()

    # take the 10 most frequent words, excluding stop words
    # (relies on the old NLTK FreqDist API, where items() is sorted by
    # decreasing frequency; the > test below actually lets an 11th word through)
    time1 = time.time()
    word_lst = []
    countValidWords = 0
    for word in word_frequencies.items():
        if word[0].lower() not in stopwords.words() and word[0].lower() != 'would':
            word_lst.append(word)
            countValidWords = countValidWords + 1
        if countValidWords > 10:
            break
    time2 = time.time()
    print "Total time taken for stop words is: " + str(time2 - time1)

    # build a dictionary from the most frequently appearing words list
    dict_most_freq_words = dict([(k, v) for k, v in word_lst])
    #print dict_most_freq_words
    print "KEYS are :"
    print dict_most_freq_words.keys()

    # break the input up into sentences.  working_sentences is used
    # for the analysis, but actual_sentences is used in the results
    # so capitalization will be correct.
    time1 = time.time()
    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
    actual_sentences = sent_detector.tokenize(input)
    working_sentences = [sentence.lower() for sentence in actual_sentences]
    time2 = time.time()
    print "Time taken to tokenize sentences is: " + str(time2 - time1)

    # iterate over the most frequent words, and add the first sentence
    # that includes each word to the result.
    output_sentences = []

    # weight the most frequently appearing words by their IDF
    time1 = time.time()
    TFIDF = generateTfidf(dict_most_freq_words)
    time2 = time.time()
    print "Time taken in IDF function is: " + str(time2 - time1)
    print "TFIDF is"
    print TFIDF

    # sort the TF-IDF weighted words
    time1 = time.time()
    TFIDF_after_SORT = sorter(TFIDF)
    time2 = time.time()
    print "Time taken in SORT function is: " + str(time2 - time1)
    print "TFIDF after sorting is"
    print TFIDF_after_SORT

    # create a dictionary of the highest ranked words
    highest_ranked_words_dict = dict([(k, v) for k, v in TFIDF_after_SORT])
    highest_ranked_words = highest_ranked_words_dict.keys()

    # collect the sentences from the input which contain the TF-IDF words
    time1 = time.time()
    for word in highest_ranked_words:
        for i in range(0, len(working_sentences)):
            # note: substring match, not a whole-word match
            if (word in working_sentences[i]
                    and actual_sentences[i] not in output_sentences):
                output_sentences.append(actual_sentences[i])
                break
            if len(output_sentences) >= num_sentences:
                break
        if len(output_sentences) >= num_sentences:
            break
    time2 = time.time()
    print "Time taken to generate OP sentences is: " + str(time2 - time1)

    # sort the output sentences back to their original order
    time1 = time.time()
    output_sentences = self.reorder_sentences(output_sentences, input)
    time2 = time.time()
    print "Time taken to reorder OP sentences is: " + str(time2 - time1)

    # concatenate the sentences into a single string
    return " ".join(output_sentences)
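# -----------------------------------------------------------------------------
# reorder_sentences() is also referenced above but not shown; it would live on
# the same class as summarize().  The sketch below and the driver underneath
# it are assumptions: "SimpleSummarizer" and 'article.txt' are placeholders,
# not names from the original code.
# -----------------------------------------------------------------------------
def reorder_sentences(self, output_sentences, input):
    # Assumed behaviour: put the selected sentences back into the order in
    # which they first appear in the original input text.
    output_sentences.sort(key=lambda sentence: input.find(sentence))
    return output_sentences


if __name__ == '__main__':
    # Hypothetical usage of the summarizer above.
    summarizer = SimpleSummarizer()        # assumed enclosing class
    text = open('article.txt').read()      # placeholder input document
    print summarizer.summarize(text, num_sentences=3)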