def top_words(source, number):
    """Return the `number` most frequent words found in a plain text file.

    Words are counted case-insensitively: every token matched by `rgx` is
    lower-cased before it is tallied in the hash map.

    Args:
        source: the file name containing the text
        number: the number of top results to return (e.g. 5 would return
            the 5 most common words)

    Returns:
        A list of tuples of the form (word, count), sorted by most common
        word (e.g. [("a", 23), ("the", 20), ("it", 10)]). Words with the
        same count are ordered alphabetically so the result is reproducible
        from run to run.
    """
    # NOTE: a second definition of top_words appears later in this file and
    # rebinds the name; keep the two in sync or remove one of them.

    # Distinct words are tracked in a set next to the hash map so they can be
    # walked once counting is finished; the map is only ever queried by key.
    keys = set()
    ht = HashMap(2500, hash_function_2)

    # This block of code will read a file one word at a time and
    # put the word in `w`. It should be left as starter code.
    with open(source) as f:
        for line in f:
            words = rgx.findall(line)
            for w in words:
                lw = w.lower()
                keys.add(lw)
                if ht.contains_key(lw):
                    ht.put(lw, ht.get(lw) + 1)
                else:
                    ht.put(lw, 1)

    # Read the counts back through the public `get` instead of recomputing
    # the bucket index and reaching into the map's private attributes.
    keys_list = [(word, ht.get(word)) for word in keys]

    # Highest count first; the word itself breaks ties, because set iteration
    # order for strings is not stable between interpreter runs.
    keys_list.sort(key=lambda pair: (-pair[1], pair[0]))
    return keys_list[0:number]
def top_words(source, number):
    """Return the `number` most frequent words found in a plain text file.

    Words are counted case-insensitively: every token matched by `rgx` is
    lower-cased before it is tallied in the hash map.

    Args:
        source: the file name containing the text
        number: the number of top results to return (e.g. 5 would return
            the 5 most common words)

    Returns:
        A list of tuples of the form (word, count), sorted by most common
        word (e.g. [("a", 23), ("the", 20), ("it", 10)]). Words with the
        same count are ordered alphabetically so the result is reproducible
        from run to run.
    """
    # NOTE: this redefines top_words, which is already defined earlier in
    # this file; this later definition is the one the name ends up bound to.

    # Distinct words are tracked in a set next to the hash map so they can be
    # walked once counting is finished; the map is only ever queried by key.
    keys = set()
    ht = HashMap(2500, hash_function_2)

    # This block of code will read a file one word at a time and
    # put the word in `w`. It should be left as starter code.
    with open(source) as f:
        for line in f:
            words = rgx.findall(line)
            for w in words:
                # Lower-case so that "The" and "the" share one counter.
                lower_case = w.lower()
                keys.add(lower_case)
                if ht.contains_key(lower_case):
                    # Seen before: store the incremented count.
                    ht.put(lower_case, ht.get(lower_case) + 1)
                else:
                    # First occurrence: start the count at one.
                    ht.put(lower_case, 1)

    # Read the counts back through the public `get` instead of recomputing
    # the bucket index and reaching into the map's private attributes.
    word_list = [(word, ht.get(word)) for word in keys]

    # Highest count first; the word itself breaks ties, because set iteration
    # order for strings is not stable between interpreter runs.
    word_list.sort(key=lambda pair: (-pair[1], pair[0]))
    return word_list[0:number]


# print(top_words("alice.txt",10))  # COMMENT THIS OUT WHEN SUBMITTING TO GRADESCOPE