def test_suite():
    """Run the checks for the word-processing helpers exposed by ``wdt``.

    Exercises ``cleanword``, ``has_dashdash``, ``extract_words``,
    ``wordcount``, ``wordset`` and ``longestword``. Every check hands a
    boolean to ``test``, which is defined elsewhere and decides how the
    outcome is reported.
    """
    # cleanword: the expected values have all punctuation removed.
    test(wdt.cleanword("what?") == "what")
    test(wdt.cleanword("'now!'") == "now")
    test(wdt.cleanword("?+='w-o-r-d!,@$()'") == "word")

    # has_dashdash: only a double dash counts; single dashes must not match.
    test(wdt.has_dashdash("distance--but"))
    test(not wdt.has_dashdash("several"))
    test(wdt.has_dashdash("spoke--"))
    test(not wdt.has_dashdash("-yo-yo-"))

    # extract_words: expected output is lower-cased, punctuation-free words,
    # with "--" acting as a word separator.
    test(wdt.extract_words("Now is the time! 'Now', is the time? Yes, now.") ==
         ['now', 'is', 'the', 'time', 'now', 'is', 'the', 'time', 'yes', 'now'])
    test(wdt.extract_words("she tried to curtsey as she spoke--fancy") ==
         ['she', 'tried', 'to', 'curtsey', 'as', 'she', 'spoke', 'fancy'])

    # wordcount: occurrences of one word in a list, including zero matches.
    test(wdt.wordcount("now", ["now", "is", "time", "is", "now", "is", "is"]) == 2)
    test(wdt.wordcount("is", ["now", "is", "time", "is", "now", "the", "is"]) == 3)
    test(wdt.wordcount("time", ["now", "is", "time", "is", "now", "is", "is"]) == 1)
    test(wdt.wordcount("frog", ["now", "is", "time", "is", "now", "is", "is"]) == 0)

    # wordset: expected output is the distinct words in sorted order
    # (upper-case "I" sorts before the lower-case words).
    test(wdt.wordset(["now", "is", "time", "is", "now", "is", "is"]) ==
         ["is", "now", "time"])
    test(wdt.wordset(["I", "a", "a", "is", "a", "is", "I", "am"]) ==
         ["I", "a", "am", "is"])
    test(wdt.wordset(["or", "a", "am", "is", "are", "be", "but", "am"]) ==
         ["a", "am", "are", "be", "but", "is", "or"])

    # longestword: length of the longest word, 0 for an empty list.
    test(wdt.longestword(["a", "apple", "pear", "grape"]) == 5)
    test(wdt.longestword(["a", "am", "I", "be"]) == 2)
    test(wdt.longestword(["this", "supercalifragilisticexpialidocious"]) == 34)
    test(wdt.longestword([]) == 0)
# Labelled checks for the wordtools helpers. Each check prints the
# expression it mirrors and then calls wordtools.test with the computed
# value first and the expected value second.
# NOTE(review): the (actual, expected) argument order is inferred from the
# calls below -- confirm against wordtools.test.

# The label for this first check is presumably printed just before this
# fragment.
wordtools.test(wordtools.has_dashdash("distance--but"), 1)

# The "several" label used to be followed directly by the "spoke--" check;
# the missing check and the missing label are restored so that every label
# describes the check that follows it.
print("test(not has_dashdash(\"several\")")
wordtools.test(wordtools.has_dashdash("several"), 0)

print("test(has_dashdash(\"spoke--\")")
wordtools.test(wordtools.has_dashdash("spoke--"), 1)

print("test(has_dashdash(\"distance--but\")")
wordtools.test(wordtools.has_dashdash("distance--but"), 1)

print("test(not has_dashdash(\"-yo-yo-\")")
wordtools.test(wordtools.has_dashdash("-yo-yo-"), 0)

# The labels below used to contain "\[" and "\]", which are invalid escape
# sequences in a normal string literal and printed stray backslashes.
print(
    "test(extract_words(\"Now is the time! 'Now', is the time? Yes, now.\") == "
    "['now','is','the','time','now','is','the','time','yes','now']"
)
wordtools.test(
    wordtools.extract_words("Now is the time! 'Now', is the time? Yes, now."),
    ['now', 'is', 'the', 'time', 'now', 'is', 'the', 'time', 'yes', 'now'])

print(
    "test(extract_words(\"she tried to curtsey as she spoke--fancy\") == "
    "['she','tried','to','curtsey','as','she','spoke','fancy']"
)
wordtools.test(
    wordtools.extract_words("she tried to curtsey as she spoke--fancy"),
    ['she', 'tried', 'to', 'curtsey', 'as', 'she', 'spoke', 'fancy'])

print(
    "test(wordcount(\"now\", [\"now\",\"is\",\"time\",\"is\",\"now\",\"is\",\"is\"]) == 2)"
)
a = wordtools.wordcount("now", ["now", "is", "time", "is", "now", "is", "is"])
# Compare the computed count; the previous test(2, 2) could never fail.
wordtools.test(a, 2)
# Section 1: cleanword -- the expected values have punctuation removed.
print('clean word:')
test(cleanword("what?") == "what")
test(cleanword("'now!'") == "now")
test(cleanword("?+='w-o-r-d!,@$()'") == "word")

# Section 2: has_dashdash -- only a double dash should be detected.
print()
print('has dash dash:')
test(has_dashdash("distance--but"))
test(not has_dashdash("several"))
test(has_dashdash("spoke--"))
test(has_dashdash("distance--but"))
test(not has_dashdash("-yo-yo-"))

# Section 3: extract_words -- expected output is lower-case words only.
print()
print('extract words:')
test(
    extract_words("Now is the time! 'Now', is the time? Yes, now.")
    == ['now', 'is', 'the', 'time', 'now', 'is', 'the', 'time', 'yes', 'now']
)
test(
    extract_words("she tried to curtsey as she spoke--fancy")
    == ['she', 'tried', 'to', 'curtsey', 'as', 'she', 'spoke', 'fancy']
)

# Section 4: wordcount -- occurrences of a word, including a zero-match case.
print()
print('wordcount:')
test(wordcount("now", ["now", "is", "time", "is", "now", "is", "is"]) == 2)
test(wordcount("is", ["now", "is", "time", "is", "now", "the", "is"]) == 3)
test(wordcount("time", ["now", "is", "time", "is", "now", "is", "is"]) == 1)
test(wordcount("frog", ["now", "is", "time", "is", "now", "is", "is"]) == 0)

# Section 5: wordset -- expected output is the distinct words, sorted.
print()
print('wordset:')
test(
    wordset(["now", "is", "time", "is", "now", "is", "is"])
    == ["is", "now", "time"]
)
# exercise 20.3
import urllib.request
from collections import Counter

import wordtools


def retrieve_page(url):
    """Fetch ``url`` and return the response body as a string.

    The raw bytes are passed through ``str()``, so the result is the repr of
    the bytes object: it starts with ``b'`` and every line ending appears as
    the four literal characters backslash-r-backslash-n. That form is kept on
    purpose because the slice offsets and the split below are written
    against it.
    """
    # The context manager closes the connection even if read() raises.
    with urllib.request.urlopen(url) as my_socket:
        return str(my_socket.read())


alice_book = retrieve_page("http://www.gutenberg.org/cache/epub/11/pg11.txt")

# Hard-coded offsets into the repr text.
# NOTE(review): presumably these trim the Project Gutenberg header and
# licence; they break if the hosted file changes -- confirm.
alice_book = alice_book[801:158020]

# Turn the textual "\r\n" markers left by str(bytes) into word separators.
rn_removed = ' '.join(alice_book.split("\\r\\n"))
words = wordtools.extract_words(rn_removed)

# Count every non-empty word. A Counter reports 0 for a missing word, so the
# summary line below cannot raise KeyError.
word_occurences = Counter(word for word in words if word != "")

# The with-block guarantees the report is flushed and closed.
with open("alice_words.txt", "w") as alice_words:
    alice_words.write("Word\t\t\tCount\n")
    alice_words.write("==============================\n")
    for (u, v) in sorted(word_occurences.items()):
        alice_words.write("{0:25}{1}\n".format(u, v))

print("The word 'alice' occurs {0} times in the book.".format(word_occurences["alice"]))
import urllib.request

import wordtools


def retrieve_page(url):
    """
    Retrieve the contents of a web page.

    The response body is decoded to text before it is returned, using the
    charset announced in the response headers and falling back to UTF-8.
    Bytes that cannot be decoded are replaced rather than raising.
    """
    # The context manager closes the connection even if read() raises.
    with urllib.request.urlopen(url) as my_socket:
        raw = my_socket.read()
        charset = my_socket.headers.get_content_charset() or "utf-8"
    # str(raw) would return the repr of the bytes ("b'...'" with textual
    # \r\n escapes), which pollutes the extracted words; decode instead.
    return raw.decode(charset, errors="replace")


#get text from a internet site
the_text = retrieve_page(
    "http://www.gutenberg.org/cache/epub/28885/pg28885.txt")

#count every word
db_word = {}
for word in wordtools.extract_words(the_text):
    db_word[word] = db_word.get(word, 0) + 1

# Open the report only once the counts exist, so a failed download or
# extraction does not leave a truncated file behind. The with-block closes
# the file on every path; UTF-8 keeps any non-ASCII words writable.
with open("alice_words.txt", "w", encoding="utf-8") as output_file:
    #prepare the output file
    output_file.writelines(
        ["Word Count\n", "=======================\n"])
    for word in sorted(db_word):
        output_file.write("{0:<18}{1}\n".format(word, db_word[word]))