Python extract_words Exemples, wordtools.extract_words Python Exemples

Exemple #1

0

Afficher le fichier

def test_suite():  # Checks for the word helpers reached through ``wdt``
    """Run the checks for the word-handling helpers accessed via ``wdt``.

    Every check evaluates a boolean expression and hands it to ``test``.
    Both ``wdt`` and ``test`` are defined outside this block.
    """
    # cleanword: the expected results have all punctuation removed.
    test(wdt.cleanword("what?") == "what")
    test(wdt.cleanword("'now!'") == "now")
    test(wdt.cleanword("?+='w-o-r-d!,@$()'") ==  "word")

    # has_dashdash: expected to be truthy only when the text contains "--";
    # single dashes ("-yo-yo-") must not count.
    test(wdt.has_dashdash("distance--but"))
    test(not wdt.has_dashdash("several"))
    test(wdt.has_dashdash("spoke--"))
    # NOTE(review): same input as the first has_dashdash check above.
    test(wdt.has_dashdash("distance--but"))
    test(not wdt.has_dashdash("-yo-yo-"))

    # extract_words: expected output is lower-cased words without punctuation,
    # with "--" acting as a separator between two words.
    test(wdt.extract_words("Now is the time!  'Now', is the time? Yes, now.") == ['now','is','the','time','now','is','the','time','yes','now'])
    test(wdt.extract_words("she tried to curtsey as she spoke--fancy") == ['she','tried','to','curtsey','as','she','spoke','fancy'])

    # wordcount: expected number of times the word appears in the list.
    test(wdt.wordcount("now", ["now","is","time","is","now","is","is"]) == 2)
    test(wdt.wordcount("is", ["now","is","time","is","now","the","is"]) == 3)
    test(wdt.wordcount("time", ["now","is","time","is","now","is","is"]) == 1)
    test(wdt.wordcount("frog", ["now","is","time","is","now","is","is"]) == 0)

    # wordset: expected result is a sorted list of the distinct words.
    test(wdt.wordset(["now", "is", "time", "is", "now", "is", "is"]) == ["is", "now", "time"])
    test(wdt.wordset(["I", "a", "a", "is", "a", "is", "I", "am"]) == ["I", "a", "am", "is"])
    test(wdt.wordset(["or", "a", "am", "is", "are", "be", "but", "am"]) == ["a", "am", "are", "be", "but", "is", "or"])

    # longestword: expected result is the length of the longest word,
    # and 0 for an empty list.
    test(wdt.longestword(["a", "apple", "pear", "grape"]) == 5)
    test(wdt.longestword(["a", "am", "I", "be"]) == 2)
    test(wdt.longestword(["this","supercalifragilisticexpialidocious"]) == 34)
    test(wdt.longestword([ ]) == 0)

Exemple #2

0

Afficher le fichier

# Each check prints a label describing the test and then calls
# wordtools.test with two arguments -- presumably (actual, expected);
# confirm against the wordtools module.
wordtools.test(wordtools.has_dashdash("distance--but"), 1)

# The label and the value under test must refer to the same input.
print("test(not has_dashdash(\"several\")")
wordtools.test(wordtools.has_dashdash("several"), 0)

print("test(has_dashdash(\"spoke--\")")
wordtools.test(wordtools.has_dashdash("spoke--"), 1)

print("test(has_dashdash(\"distance--but\")")
wordtools.test(wordtools.has_dashdash("distance--but"), 1)

print("test(not has_dashdash(\"-yo-yo-\")")
wordtools.test(wordtools.has_dashdash("-yo-yo-"), 0)

# Square brackets need no escaping in a string literal; "\[" is an invalid
# escape sequence and would also print a stray backslash.
print(
    "test(extract_words(\"Now is the time!  'Now', is the time? Yes, now.\") == ['now','is','the','time','now','is','the','time','yes','now']"
)
wordtools.test(
    wordtools.extract_words("Now is the time!  'Now', is the time? Yes, now."),
    ['now', 'is', 'the', 'time', 'now', 'is', 'the', 'time', 'yes', 'now'])

print(
    "test(extract_words(\"she tried to curtsey as she spoke--fancy\") == ['she','tried','to','curtsey','as','she','spoke','fancy']"
)
wordtools.test(
    wordtools.extract_words("she tried to curtsey as she spoke--fancy"),
    ['she', 'tried', 'to', 'curtsey', 'as', 'she', 'spoke', 'fancy'])

print(
    "test(wordcount(\"now\", [\"now\",\"is\",\"time\",\"is\",\"now\",\"is\",\"is\"]) == 2)"
)
a = wordtools.wordcount("now", ["now", "is", "time", "is", "now", "is", "is"])
# Check the computed count, not a constant compared with itself.
wordtools.test(a, 2)

Exemple #3

0

Afficher le fichier

# Flat test script: each section prints a heading and then passes one boolean
# per line to ``test``. ``test`` and the helpers under test (cleanword,
# has_dashdash, extract_words, wordcount, wordset) are defined outside this
# block.

# cleanword: the expected results have all punctuation removed.
print('clean word:')
test(cleanword("what?") == "what")
test(cleanword("'now!'") == "now")
test(cleanword("?+='w-o-r-d!,@$()'") == "word")
print()
# has_dashdash: expected to be truthy only when the text contains "--".
print('has dash dash:')
test(has_dashdash("distance--but"))
test(not has_dashdash("several"))
test(has_dashdash("spoke--"))
# NOTE(review): same input as the first has_dashdash check above.
test(has_dashdash("distance--but"))
test(not has_dashdash("-yo-yo-"))
print()

# extract_words: expected output is lower-cased words without punctuation,
# with "--" acting as a separator between two words.
print('extract words:')
test(extract_words("Now is the time! 'Now', is the time? Yes, now.") ==\
     ['now','is','the','time','now','is','the','time','yes','now'])
test(extract_words("she tried to curtsey as she spoke--fancy") ==\
    ['she','tried','to','curtsey','as','she','spoke','fancy'])
print()

# wordcount: expected number of times the word appears in the list.
print('wordcount:')
test(wordcount("now", ["now", "is", "time", "is", "now", "is", "is"]) == 2)
test(wordcount("is", ["now", "is", "time", "is", "now", "the", "is"]) == 3)
test(wordcount("time", ["now", "is", "time", "is", "now", "is", "is"]) == 1)
test(wordcount("frog", ["now", "is", "time", "is", "now", "is", "is"]) == 0)
print()

# wordset: expected result is a sorted list of the distinct words.
print('wordset:')
test(wordset(["now", "is", "time", "is", "now", "is","is"]) ==\
    ["is", "now", "time"])

Exemple #4

0

Afficher le fichier

# exercise 20.3

import urllib.request
import wordtools

def retrieve_page(url):
    r"""Fetch ``url`` and return the response body as a string.

    The raw bytes are passed through ``str()`` instead of being decoded, so
    the result is the *repr* of the bytes object: it is wrapped in ``b'...'``
    and line breaks appear as the literal characters ``\r\n``.  That form is
    kept deliberately, because the script below splits the text on exactly
    those literal characters.

    Args:
        url: Address to open with ``urllib.request.urlopen``.

    Returns:
        ``str(body_bytes)`` for the bytes read from the response.
    """
    # The context manager closes the connection even when read() raises.
    with urllib.request.urlopen(url) as my_socket:
        return str(my_socket.read())


# Download the book text, count how often each word occurs, and write the
# counts to "alice_words.txt" as an alphabetically sorted table.
alice_book = retrieve_page("http://www.gutenberg.org/cache/epub/11/pg11.txt")
# Keep a fixed character range -- presumably this trims the Project Gutenberg
# header and footer; the offsets depend on the exact file served (TODO confirm).
alice_book = alice_book[801:158020]
# retrieve_page returns the repr of the raw bytes, so line breaks show up as
# the four literal characters backslash-r-backslash-n; replace them by spaces.
rn_removed = ' '.join(alice_book.split("\\r\\n"))
words = wordtools.extract_words(rn_removed)
word_occurences = {}
for word in words:
    if word:  # skip empty strings
        word_occurences[word] = word_occurences.get(word, 0) + 1


# "with" guarantees the table is flushed and the file closed, even on error.
with open("alice_words.txt", "w") as alice_words:
    alice_words.write("Word\t\t\tCount\n")
    alice_words.write("==============================\n")
    for u, v in sorted(word_occurences.items()):
        alice_words.write("{0:25}{1}\n".format(u, v))


# Report 0 instead of raising KeyError when the word never occurs.
print("The word 'alice' occurs {0} times in the book.".format(word_occurences.get("alice", 0)))

Exemple #5

0

Afficher le fichier

import wordtools


def retrieve_page(url):
    r"""Retrieve the contents of a web page as a string.

    The raw bytes are passed through ``str()`` instead of being decoded, so
    the result is the *repr* of the bytes object: it is wrapped in ``b'...'``
    and line breaks appear as the literal characters ``\r\n``.

    Args:
        url: Address to open with ``urllib.request.urlopen``.

    Returns:
        ``str(body_bytes)`` for the bytes read from the response.
    """
    # Imported here because this snippet does not import urllib.request
    # at module level.
    import urllib.request

    # The context manager closes the connection even when read() raises.
    with urllib.request.urlopen(url) as my_socket:
        return str(my_socket.read())


# Get the text from a web page.
the_text = retrieve_page(
    "http://www.gutenberg.org/cache/epub/28885/pg28885.txt")

# Count every word before touching the output file, so a failure while
# counting does not leave a truncated "alice_words.txt" behind.
db_word = {}
for word in wordtools.extract_words(the_text):
    db_word[word] = db_word.get(word, 0) + 1

# "with" closes the file even if writing fails part-way through.
with open("alice_words.txt", "w") as output_file:
    # Table header.
    output_file.writelines(
        ["Word              Count\n", "=======================\n"])
    # One row per distinct word, in sorted order; the word is left-aligned
    # in an 18-character column.
    for word in sorted(db_word):
        output_file.write("{0:<18}{1}\n".format(word, db_word[word]))