コード例 #1
0
def get_common_words(text_storage: shelve.DbfilenameShelf,
                     amount_of_common_words: int) -> [(str, int)]:
    stop_words = set()  # To hold the set of stop words
    word_frequencies = defaultdict(
        int)  # To hold the amount of occurrences for non-stop words

    # Enchant setup adapted from their tutorial: https://pyenchant.github.io/pyenchant/tutorial.html
    dictionary = enchant.Dict("en_US")  # To validate words

    # Open up the stop_words file and read in the set of stop words
    with open("stop_words.txt", "r") as file_input_stream:
        for next_word in file_input_stream:
            stop_words.add(next_word.rstrip())

    # Loop through the text for each webpage
    for next_webpage_text in text_storage.values():

        # Split the webpage according to all whitespace, dashes, and hyphens
        for next_word in re.split(r"[\s\-–]", next_webpage_text):

            # Remove special characters from the words (if any)
            next_word = re.sub(r"[.,?:!;()\[\]{}\"]", "", next_word)

            # If the next word contains only alphabetical characters (and some special characters),
            # is a recognizable English word, and is not a stop word, increment its frequency
            if (re.match(r"^[a-zA-Z']+$", next_word) is not None) and \
               dictionary.check(next_word) and \
               (next_word.lower() not in stop_words):
                word_frequencies[next_word.lower()] += 1

    # Sort the words according to their frequency in descending order and return them
    words_in_descending_frequency = \
        [(next_word, frequency) for next_word, frequency in sorted(word_frequencies.items(), key=lambda x: (-x[1]))]

    return words_in_descending_frequency[:amount_of_common_words]