def top_words(source, number):
    """
    Takes a plain text file and counts the number of occurrences of each word, case insensitively.
    Returns the top `number` words as a list of tuples of the form (word, count).

    Args:
        source: the file name containing the text
        number: the number of top results to return (e.g. 5 would return the 5 most common words)
    Returns:
        A list of tuples of the form (word, count), sorted by most common word. (e.g. [("a", 23), ("the", 20), ("it", 10)])
    """

    keys = set()  # Empty set that will hold the unique lowercase words.
    ht = HashMap(2500, hash_function_2)  # Hash map built with the hash function above.

    # This block of code will read a file one word at a time and
    # put the word in `w`. It should be left as starter code.
    with open(source) as f:  # Open the file and bind it to the variable `f`.
        for line in f:  # Loop through each line in the file.
            words = rgx.findall(line)  # Words found on the current line.
            for w in words:  # Loop through each word in the line.
                lw = w.lower()  # Lowercase the word to remove case sensitivity.
                keys.add(lw)  # Add the lowercase word to the set `keys`.
                if ht.contains_key(lw):  # Word already present in the hash map.
                    new_value = ht.get(lw) + 1  # Word count increased by one.
                    ht.put(lw, new_value)  # Re-insert the word with the updated count.
                else:
                    ht.put(lw, 1)  # Insert the word with an initial count of one.

    keys_list = []  # List that will hold (word, count) tuples.
    for values in keys:  # Loop through the words stored in the set `keys`.
        ind = ht._hash_function(values) % ht.capacity  # Bucket index from the hash function and map capacity.
        temp = ht._buckets[ind]  # Linked list stored at that position in the hash map.
        node = temp.contains(values)  # Node holding the key and its count.
        keys_list.append((node.key, node.value))  # Add the (word, word count) tuple to the list.
    keys_list.sort(key=lambda tup: tup[1], reverse=True)  # Sort in descending order by word count.
    return keys_list[0:number]  # Return the top `number` words from the sorted list.
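This first version (and the one that follows) relies on module-level setup that is not shown in the excerpt: a compiled regular expression named `rgx` used to split each line into words, plus the provided `HashMap` class and `hash_function_2`. A minimal sketch of what that setup might look like is below; the regex pattern and the hash function body are assumptions for illustration, not the original starter code.

import re

# Assumed word-matching pattern; the original starter code may use a different one.
rgx = re.compile(r"[a-zA-Z']+")


# Hypothetical stand-in for the provided hash function; the real one may differ.
def hash_function_2(key):
    hash_value = 0
    for index, char in enumerate(key):
        hash_value += (index + 1) * ord(char)
    return hash_value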
Example #2
def top_words(source, number):
    """
    Takes a plain text file and counts the number of occurrences of each word, case insensitively.
    Returns the top `number` words as a list of tuples of the form (word, count).

    Args:
        source: the file name containing the text
        number: the number of top results to return (e.g. 5 would return the 5 most common words)
    Returns:
        A list of tuples of the form (word, count), sorted by most common word. (e.g. [("a", 23), ("the", 20), ("it", 10)])
    """

    keys = set()

    ht = HashMap(2500, hash_function_2)

    # This block of code will read a file one word at a time and
    # put the word in `w`. It should be left as starter code.
    with open(source) as f:
        for line in f:
            words = rgx.findall(line)
            for w in words:
                #convert each word to lowercase so counting is case insensitive,
                #then add it to the set of unique words
                lower_case = w.lower()
                keys.add(lower_case)
                #check if word is already in hashmap
                if ht.contains_key(lower_case):
                    #increase word count and insert into hashmap to update count
                    val = (ht.get(lower_case) + 1)
                    ht.put(lower_case, val)
                else:
                    #insert into hashmap with initial count of 1 if not in hashmap already
                    ht.put(lower_case, 1)
    #create a new list of (word, count) tuples
    word_list = []
    #loop through the set of unique words
    for k in keys:
        index = ht._hash_function(k) % ht.capacity
        temp = ht._buckets[index]
        #add tuples to list containing word and count
        linked_node = temp.contains(k)
        word_list.append((linked_node.key, linked_node.value))
    #sort list in descending order
    word_list.sort(key=lambda tup: tup[1], reverse=True)
    #return list of top words
    return word_list[0:number]


# print(top_words("alice.txt",10))  # COMMENT THIS OUT WHEN SUBMITTING TO GRADESCOPE
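As a quick sanity check of either version, the standard library's `collections.Counter` produces the same (word, count) ranking without the custom hash map. This is only a reference sketch for comparison, not part of the assignment; the regex pattern here is an assumption and should match whatever `rgx` uses above.

from collections import Counter
import re


def top_words_counter(source, number):
    """Reference version of top_words using collections.Counter, for comparison only."""
    pattern = re.compile(r"[a-zA-Z']+")  # assumed word pattern; adjust to match `rgx`
    counts = Counter()
    with open(source) as f:
        for line in f:
            counts.update(w.lower() for w in pattern.findall(line))
    return counts.most_common(number)

# e.g. top_words_counter("alice.txt", 10) should agree with top_words("alice.txt", 10),
# up to ordering of words that have equal counts.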