def get_common_words(text_storage: shelve.DbfilenameShelf, amount_of_common_words: int) -> [(str, int)]:
    """Return the most frequent valid, non-stop English words in the stored text.

    Every value in ``text_storage`` is split into tokens on whitespace,
    hyphens and en dashes, and common punctuation is removed from each token.
    A token is counted (under its lowercase form) only when it consists solely
    of ASCII letters and apostrophes, its lowercase form is not listed in
    ``stop_words.txt``, and the ``en_US`` enchant dictionary accepts it.

    Args:
        text_storage: Shelf whose values are the text of each webpage. Only
            the values are read; the keys are not used.
        amount_of_common_words: Upper bound on the number of entries
            returned. It is applied as a plain slice bound, so a negative
            value drops entries from the end instead of limiting the count.

    Returns:
        A list of ``(word, frequency)`` tuples ordered by descending
        frequency. Words with equal frequency keep the order in which they
        were first counted.

    Raises:
        OSError: If ``stop_words.txt`` cannot be opened (it is looked up
            relative to the current working directory).
    """
    # Compile the patterns once instead of resolving them for every token.
    token_separator = re.compile(r"[\s\-–]")  # Whitespace, hyphens, en dashes
    punctuation = re.compile(r"[.,?:!;()\[\]{}\"]")
    valid_word = re.compile(r"^[a-zA-Z']+$")

    # Enchant setup adapted from their tutorial: https://pyenchant.github.io/pyenchant/tutorial.html
    dictionary = enchant.Dict("en_US")  # To validate words

    # Tokens are compared in lowercase below, so the stop words are normalized
    # the same way; otherwise a capitalized or indented entry in the file
    # could never match and would silently not be filtered.
    with open("stop_words.txt", "r") as file_input_stream:
        stop_words = {line.strip().lower() for line in file_input_stream}
    stop_words.discard("")  # Blank lines in the file are not stop words

    word_frequencies = defaultdict(int)  # Occurrences of each counted word (lowercase)

    # Remembers the dictionary verdict per distinct token so the spell checker
    # is consulted once per spelling rather than once per occurrence. Keyed on
    # the original-case token because that is what is passed to check().
    spelling_cache = {}

    # Loop through the text for each webpage
    for next_webpage_text in text_storage.values():
        for raw_token in token_separator.split(next_webpage_text):
            token = punctuation.sub("", raw_token)

            # Rejecting non-alphabetic tokens first also guarantees that an
            # empty string never reaches the dictionary.
            if valid_word.match(token) is None:
                continue

            # Cheap set lookup before the comparatively costly spell check.
            lowered = token.lower()
            if lowered in stop_words:
                continue

            is_english = spelling_cache.get(token)
            if is_english is None:
                is_english = spelling_cache[token] = dictionary.check(token)

            if is_english:
                word_frequencies[lowered] += 1

    # sorted() is stable even with reverse=True, so ties stay in the order the
    # words were first counted.
    words_in_descending_frequency = sorted(
        word_frequencies.items(), key=lambda item: item[1], reverse=True
    )
    return words_in_descending_frequency[:amount_of_common_words]