def sum_long(long_input, num_sents=4):
    """Summarize *long_input* with SimpleSummarizer.

    num_sents defaults to 4, preserving the originally hard-coded sentence
    count while letting callers request a different summary length.
    Returns whatever SimpleSummarizer.summarize returns (the summary text).
    """
    summarizer = SimpleSummarizer()
    return summarizer.summarize(long_input, num_sents)
def main(): #navFlag is the flag set if 1 iteration is complete ttsEngine.speakTextSync("Hello!!! Welcome to speech recognition for the visually impaired") ttsEngine.speakTextSync("In this system, you can obtain information about your desired topic using a speech query") ttsEngine.speakTextSync("Control the tool, by closely following the various instructions") navFlag = 0 navLink = "" title = "" baseUrl = "" while 1: try: selectedLink = "" if navFlag == 0: #Operations is the speech recognition engine speechRecog = SpeechRecog() query = speechRecog.inputSpeech() print query ttsEngine.speakTextSync("Returned query is " + query) #Iterate through the links and get the selected link selectedLink = speechRecog.searchQuery(query) elif navFlag == 1 and navLink != "": ttsEngine.speakTextSync("Navigating to the requested link. Please wait.") navFlag = 0 selectedLink = navLink navLink = "" baseUrl = getBaseUrl(selectedLink) print baseUrl time1 = time.time() #-------------------------------------------------------- #Create an instance of the FetchHtml Class #-------------------------------------------------------- fetchCntnts = FetchHTMLContent(selectedLink) #-------------------------------------------------------- #Fetch the html contents from the selected page #-------------------------------------------------------- htmlCntnt = fetchCntnts.fetchUrlContents() #-------------------------------------------------------- #Fetch all the content embedded in the <p> tag #-------------------------------------------------------- html = fetchCntnts.fetchParaContent(htmlCntnt) #-------------------------------------------------------- #Remove all the invalid tags in the text #-------------------------------------------------------- removeTags=removeHTMLtags() html = removeTags.StripTags(html) #-------------------------------------------------------- #Fetch the valid required pure content and summarize #-------------------------------------------------------- txt = html time2 = 
time.time() time3=time.time() obj = SimpleSummarizer() newTxt = obj.summarize(txt,40) time4=time.time() timeTaken = time4 - time3 print "time taken for parsing raw html file is totally: " + str(time2-time1) print "Total time taken for summarization is: " + str(timeTaken) #-------------------------------------------------------- #Clean up the summarized text. Remove the javascript contents and write it to a file #-------------------------------------------------------- regExp=RegexpTokenizer('\w*;$') txtCleanUp=LineTokenizer().tokenize(newTxt) s="" a = "" for i in txtCleanUp: a=regExp.tokenize(i) if a: print a else: s = "" + i print s file1=open("summary.txt","w") file1.write(s) file1.close() #--------------------------------------------------------- #Block to control the reading of summarized contents #--------------------------------------------------------- ttsEngine.speakText("The summarized contents are: ") spkAndListen = speak_listen_summary() navFlag, navLink, title = spkAndListen.readSummary(baseUrl) if navLink == "": navFlag = 0 print "RETURNED" print navFlag print navLink print title #--------------------------------------------------------- except: print "Some error happened" time.sleep(5) ttsEngine.speakText("Odd!! Thr was sm error processing this page We are trying to restart your browser Do you want to continue?") print "Odd!! Thr was sm error processing this page We are trying to restart your browser Do you want to continue?" query = speech.input() print query if(query == "yes" or query == "YES" or query == "Yes"): continue else: sys.exit()
def main(): navFlag = 0 navLink = "" title = "" baseUrl = "" while 1: selectedLink = "" if navFlag == 0: #Operations is the speech recognition engine speechRecog = SpeechRecog() query = speechRecog.inputSpeech() print query ttsEngine.speakText("Returned query is " + query) #Iterate through the links and get the selected link selectedLink = speechRecog.searchQuery(query) elif navFlag == 1 and navLink != "": ttsEngine.speakTextSync("Navigating to the requested link. Please wait.") navFlag = 0 selectedLink = navLink navLink = "" baseUrl = getBaseUrl(selectedLink) print baseUrl fetchCntnts = FetchHTMLContent(selectedLink) htmlCntnt = fetchCntnts.fetchUrlContents() fetchCntnts.fetchParaContent(htmlCntnt) #Remove all invalid tags and write back to Bill.txt #-------------------------------------------------------- removeTags=removeHTMLtags() file=open("Bill.txt") html=file.read() file.close() html = removeTags.StripTags(html) file=open("Bill.txt","w") file.write(html) file.close() #-------------------------------------------------------- #Fetch the valid required pure content and summarize #-------------------------------------------------------- file=open('Bill.txt') txt=file.read() file.close() obj = SimpleSummarizer() newTxt = obj.summarize(txt,40) #-------------------------------------------------------- #Clean up the summarized text. Remove the javascript contents and write it to a file #-------------------------------------------------------- regExp=RegexpTokenizer('\w*;$') txtCleanUp=LineTokenizer().tokenize(newTxt) s="" a = "" for i in txtCleanUp: a=regExp.tokenize(i) if a: print a else: s = "" + i print s file1=open("summary.txt","w") file1.write(s) file1.close() #--------------------------------------------------------- ttsEngine.speakText("The summarized contents are: ") spkAndListen = speak_listen_summary() navFlag, navLink, title = spkAndListen.readSummary(baseUrl) if navLink == "": navFlag = 0 print "RETURNED" print navFlag print navLink print title
from reviewInfo.models import ReviewInfo from re import sub #import re from summarize import SimpleSummarizer def get_all_reviews(product_sku): all_reviews = ReviewInfo.objects.all().filter(sku=product_sku) review_list = [ sub('-', "", ((e.comment.lower()).encode('utf-8'))) for e in all_reviews ] return review_list samsung_reviews = get_all_reviews(5717547) #print len(samsung_reviews) for each_review in samsung_reviews: #print len(each_review) if len(each_review) > 500: print "----Original----" print each_review print "----summarized----" Simple = SimpleSummarizer() print Simple.summarize(each_review,2)
"-w", "--wiki", dest="wiki", action="store", default=None, help="Specifies the wiki to perform calculations against" ) parser.add_option( "-n", "--sents", dest="num_sents", action="store", default=5, help="Specifies the number of sentences to write" ) (options, args) = parser.parse_args() if options.id: query = "id:%s" % (options.id) elif options.wiki: query = "host:'%s' AND ns:0" % (options.wiki) else: raise Exception("A wiki or ID is required, passed as host name") conn = SolrConnection("http://search-s10.prod.wikia.net:8983/solr") response = conn.query(query, fields=["html_en", "nolang_txt", "html", "title", "title_en", "id"]) paginator = SolrPaginator(response) summarizer = SimpleSummarizer() for page in paginator.page_range: for doc in paginator.page(page).object_list: text = doc.get("html_en", doc.get("nolang_txt", doc.get("html"))) title = doc.get("title_en", doc.get("title", doc["id"])) summed = summarizer.get_summarized(text, options.num_sents) print "\t\t=======", title, "=======" print "\t" + "\n\t".join([sent for sent in summed if not sent.startswith("Contents")]) print "\t\t====================================="
action="store", default=5, help="Specifies the number of sentences to write") (options, args) = parser.parse_args() if options.id: query = 'id:%s' % (options.id) elif options.wiki: query = "host:'%s' AND ns:0" % (options.wiki) else: raise Exception('A wiki or ID is required, passed as host name') conn = SolrConnection('http://search-s10.prod.wikia.net:8983/solr') response = conn.query( query, fields=['html_en', 'nolang_txt', 'html', 'title', 'title_en', 'id']) paginator = SolrPaginator(response) summarizer = SimpleSummarizer() for page in paginator.page_range: for doc in paginator.page(page).object_list: text = doc.get('html_en', doc.get('nolang_txt', doc.get('html'))) title = doc.get('title_en', doc.get('title', doc['id'])) summed = summarizer.get_summarized(text, options.num_sents) print "\t\t=======", title, "=======" print "\t" + "\n\t".join( [sent for sent in summed if not sent.startswith('Contents')]) print "\t\t====================================="
def run(path): global fp # load article text article = data.Article(path) utils.load_data(article.text) fp = file("results.txt", "w") # show article text print_to_screen_and_file("-"*80) print_to_screen_and_file("Original article:\n") print_to_screen_and_file(article.text) print_to_screen_and_file("-"*80) print_to_screen_and_file("Categories:\n") top5 = pickle.load(open(config.TOP5_CATEGORIES, "r")); # list of: [catname, count, tag] print_to_screen_and_file("In article: " + str(article.cats)) print_to_screen_and_file("Top5: " + str(top5)) ground_truth = [tag for cat, count, tag in top5 if cat in article.cats] print_to_screen_and_file("Present from Top5: " + str(ground_truth)) print_to_screen_and_file("-"*80) # make the summary & show in console print_to_screen_and_file("I Summary:\n") instance = SimpleSummarizer() # shorten the original article by one third print_to_screen_and_file(instance.summarize(article.text, len(utils.sentences) / 3)) print_to_screen_and_file("-"*80) print_to_screen_and_file("II Summary:\n") print_to_screen_and_file(" ".join(ph_reduction.PhraseReductor().find(utils.tagged_sentences))) print_to_screen_and_file("-"*80) # classification print_to_screen_and_file("Multiclass classification:\n") stemmer = nltk.stem.WordNetLemmatizer() words = nltk.tokenize.wordpunct_tokenize(article.text) feats = utils.bag_of_words(words, article.text, stemmer) classifier = pickle.load(file(config.BAYES_CLASSIFIER_FILE, 'r')) b_class = classifier.classify(feats) print_to_screen_and_file("BayesClassifier class: " + b_class + ", is correct? " + str(b_class in ground_truth)) classifier = pickle.load(file(config.MAXENT_CLASSIFIER_FILE, 'r')) m_class = classifier.classify(feats) print_to_screen_and_file("MaxEntClassifier class: " + m_class + ", is correct? 
" + str(m_class in ground_truth)) classifier = pickle.load(file(config.DTREE_CLASSIFIER_FILE, 'r')) d_class = classifier.classify(feats) print_to_screen_and_file("DecisionTreeClassifier class: " + d_class + ", is correct? " + str(d_class in ground_truth)) print_to_screen_and_file("-"*80) print_to_screen_and_file("Binary classification:\n") title = ["BayesClassifier: ", "MaxEntClassifier: ", "DecisionTreeClassifier: "] classifiers = [config.BAYES_CLASSIFIER_FILE_PATTERN, config.MAXENT_CLASSIFIER_FILE_PATTERN, config.DTREE_CLASSIFIER_FILE_PATTERN] tags = ["A", "B", "C", "D", "E", "OTHER"] for index, typename in enumerate(classifiers): results = {} accuracy = 0 for tag in tags: fname = typename%(tag) classifier = pickle.load(file(fname, 'r')) results[tag] = classifier.classify(feats) if results[tag] == "yes": if (tag in ground_truth): accuracy += 1 elif results[tag] == "no": if (tag not in ground_truth): accuracy += 1 print_to_screen_and_file(title[index] + str(results)+", accuracy: " + str(accuracy*100/len(tags)) + "%") print_to_screen_and_file("-"*80) # people actions print_to_screen_and_file("People and their actions:\n") work = action.Actions().find(utils.tagged_words, utils.tagged_sentences, utils.people) # print the updated info with people actions for i, (key, value) in enumerate(work.items()): print_to_screen_and_file("[%d] - %s = %s"%(i+1, key, value)) print_to_screen_and_file("-"*80) # anaphora print_to_screen_and_file("Anaphoras:\n") refs = references.References().find(utils.people, utils.sentences, utils.tagged_sentences) for ref, fullname, index in refs: print_to_screen_and_file("Sentence["+str(index+1)+"]: " + ref + " - "+ fullname) print_to_screen_and_file("-"*80) # interactions print_to_screen_and_file("People interactions:\n") inter = interactions.Interactor().find(refs, utils.tagged_sentences) for index, item in enumerate(inter): who, prp, what = item['who'], item['prp'], item['what'] s = "["+str(index+1)+"]:" for i in xrange(len(who)): if prp[i] and 
who[i]: s += " " + who[i] + "(" + prp[i] + "), " elif prp[i]: s += prp[i] + ", " elif who[i]: s += " " + who[i] + ", " s += " - " + ", ".join(what) print_to_screen_and_file(s) print_to_screen_and_file("-"*80) print "Finished." fp.close()
def main(): #Operations is the speech recognition engine speechRecog = SpeechRecog() query = speechRecog.inputSpeech() print query ttsEngine.speakText("Returned query is " + query) speechRecog.searchQuery(query) ttsEngine.speakText("returned from second function!!!!") dict_hrefs = {} #To add the href functionality dict_hrefs = speechRecog.getHrefs() #print dict_hrefs #To remove the additional CSS, HTML and XML tags removeTags=removeHTMLtags() file=open("Bill.txt") html=file.read() file.close() print "***************************************" #print html #invalid_tags = ['p', 'i', 'u'] #removeTags.strip_tags(html, invalid_tags) html = removeTags.StripTags(html) file=open("Bill.txt","w") file.write(html) file.close() print "***************************************" file=open('Bill.txt') txt=file.read() file.close() #print txt obj = SimpleSummarizer() newTxt = obj.summarize(txt,40) print "*******************************************" regExp=RegexpTokenizer('\w*;$') txtCleanUp=LineTokenizer().tokenize(newTxt) s="" a = "" for i in txtCleanUp: a=regExp.tokenize(i) if a: print a else: s = "" + i print s file1=open("newTxt.txt","w") file1.write(s) file1.close() ttsEngine.speakText("The summarized contents are: ") k = KeyEvents() k.speakAndListen(s) print 'done'