def top_words(source, number): """ Takes a plain text file and counts the number of occurrences of case insensitive words. Returns the top `number` of words in a list of tuples of the form (word, count). Args: source: the file name containing the text number: the number of top results to return (e.g. 5 would return the 5 most common words) Returns: A list of tuples of the form (word, count), sorted by most common word. (e.g. [("a", 23), ("the", 20), ("it", 10)]) """ keys = set() ht = HashMap(2500, hash_function_2) # This block of code will read a file one word at a time and # put the word in `w`. It should be left as starter code. with open(source) as f: for line in f: words = rgx.findall(line) for w in words: # Convert all words to lowercase prior to insertion w = w.lower() # If the word is already in the hash map, pass the value with a new updated count if ht.contains_key(w): count = ht.get(w) + 1 ht.put(w, count) else: # Otherwise, create a new entry in the hashmap ht.put(w, 1) # Add all of the words to the keys set for bucket in ht.get_buckets(): # Iterate through each bucket/linked list curr = bucket.head while curr is not None: # Add the keys as a tuple keys.add((curr.key, curr.value)) curr = curr.next # Cast the set as a list all_words = list(keys) # Sort the words according to their value in the tuple all_words.sort(key=lambda word: word[1]) slice_val = (number * -1 - 1) top_wds = all_words[:slice_val:-1] return top_wds
def top_words(source, number): """ Takes a plain text file and counts the number of occurrences of case insensitive words. Returns the top `number` of words in a list of tuples of the form (word, count). Args: source: the file name containing the text number: the number of top results to return (e.g. 5 would return the 5 most common words) Returns: A list of tuples of the form (word, count), sorted by most common word. (e.g. [("a", 23), ("the", 20), ("it", 10)]) """ ht = HashMap(2500, hash_function_2) # This block of code will read a file one word as a time and # put the word in `w`. It should be left as starter code. with open(source) as f: for line in f: words = rgx.findall(line) for w in words: w = w.lower() count = ht.get(w) if ht.contains_key(w): ht.put(w, count + 1) else: ht.put(w, 1) # Add all items in hash table to list sorted_list = [] for list in ht.get_buckets(): current = list.head for tuple in range(list.size): sorted_list.append((current.key, current.value)) current = current.next # Sort list by value in descending order. Return given number of 'top-words'. sorted_list = sorted(sorted_list, key=get_second, reverse=True) return_list = [] for i in range(number): return_list.append(sorted_list[i]) return return_list