def test_get(self):
        """Check that HashMap.get returns the stored value for every inserted key.

        Nine key/value pairs are put into a 10-bucket map built with
        hash_function_1 and then read back.  The keys in ``chained`` are
        presumably the ones that share a bucket under hash_function_1 --
        TODO confirm against the hash function.
        """
        pairs = [("test_5", 5), ("test_-5", -5), ("test_5_", 5),
                 ("diff_word", 15), ("another_word", 20), ("set", 10),
                 ("anotha_one", -7), ("completely_different", 5),
                 ("getting_there", -1)]

        chained = [("completely_different", 5), ("anotha_one", -7),
                   ("set", 10), ("another_word", 20)]
        first_key, first_val = chained[0]
        last_key, last_val = chained[3]
        table = HashMap(10, hash_function_1)

        # Populate the table with every pair before any lookup is attempted.
        for word, number in pairs:
            table.put(word, number)

        # First entry of the chained group (expected linked-list head).
        self.assertEqual(table.get(first_key), first_val)

        # Last entry of the chained group (expected linked-list tail).
        self.assertEqual(table.get(last_key), last_val)

        # Every entry of the chained group, in order.
        for word, number in chained:
            self.assertEqual(table.get(word), number)

        # A key that is not part of the chained group.
        self.assertEqual(table.get("getting_there"), -1)

        # Finally, every inserted pair must be retrievable.
        for word, number in pairs:
            self.assertEqual(table.get(word), number)
Ejemplo n.º 2
0
def top_words(source, number):
    """
    Tally case-insensitive word occurrences in a plain text file and report the most frequent ones.

    Args:
        source: the file name containing the text
        number: how many of the most common words to return (e.g. 5 returns the 5 most common words)
    Returns:
        A list of (word, count) tuples ordered from highest count to lowest.
        Words with equal counts are ordered by descending word, because the
        sort is performed on (count, word) pairs.
    """
    keys = set()
    ht = HashMap(2500, hash_function_2)

    # Read the file line by line; rgx (defined elsewhere) extracts the words.
    with open(source) as f:
        for line in f:
            for match in rgx.findall(line):
                word = match.lower()  # fold case so comparisons ignore it
                if not ht.contains_key(word):
                    # First sighting: start the counter and remember the key.
                    ht.put(word, 1)
                    keys.add(word)
                else:
                    # Seen before: bump the stored counter by one.
                    ht.put(word, ht.get(word) + 1)

    # Sorting (count, word) pairs in reverse ranks by count, then by word.
    ranked = sorted(((ht.get(word), word) for word in keys), reverse=True)

    # Flip each pair back to (word, count) and keep only the requested amount.
    return [(word, count) for count, word in ranked][:number]
Ejemplo n.º 3
0
	def test_change_val(self):
		"""Setting an existing key a second time must replace the value that get returns."""
		table = HashMap(3)
		table.set('1', SampleObject('A'))
		original = SampleObject('B')
		table.set('2', original)
		# The first value stored under '2' is readable before it is replaced.
		self.assertEqual(table.get('2'), original)
		table.set('3', SampleObject('C'))
		replacement = SampleObject('B2')
		table.set('2', replacement)
		# After the second set, '2' must map to the replacement object.
		self.assertEqual(table.get('2'), replacement)
Ejemplo n.º 4
0
 def test_resize_table_1(self):
     """
     Exercise resize_table() using Example #1 from the guidelines.

     Prints size, capacity, the value stored under 'key1' and whether the
     key is present, once before and once after resizing from 20 to 30.
     Nothing is asserted; the output is meant to be inspected.
     """
     def report(table):
         # One status line: size, capacity, stored value, key presence.
         print(table.size, table.capacity, table.get('key1'), table.contains_key('key1'))

     print("--- EXAMPLE 1 ---")
     hash_map = HashMap(20, hash_function_1)
     hash_map.put('key1', 10)
     report(hash_map)
     hash_map.resize_table(30)
     report(hash_map)
Ejemplo n.º 5
0
 def test_remove_1(self):
     """
     Exercise remove() using Example #1 from the guidelines.

     Prints the lookup of 'key1' before insertion, after insertion and
     after removal, then removes 'key4', which was never inserted.
     Nothing is asserted; the output is meant to be inspected.
     """
     print("--- EXAMPLE 1 ---")
     hash_map = HashMap(50, hash_function_1)
     target = 'key1'
     print(hash_map.get(target))  # lookup before the key exists
     hash_map.put(target, 10)
     print(hash_map.get(target))  # lookup after insertion
     hash_map.remove(target)
     print(hash_map.get(target))  # lookup after removal
     hash_map.remove('key4')  # removing a key that was never added
Ejemplo n.º 6
0
def top_words(source, number):
    """
    Count case-insensitive word occurrences in a plain text file and return the most common words.

    The hash map is doubled in capacity whenever its table load rises above 8.

    Args:
        source: the file name containing the text
        number: how many of the most common words to return (e.g. 5 returns the 5 most common words)
    Returns:
        A list of (word, count) tuples sorted by count, highest first
        (e.g. [("a", 23), ("the", 20), ("it", 10)]).
    """
    keys = set()
    ht = HashMap(2500, hash_function_2)

    # Read the file line by line; rgx (defined elsewhere) extracts the words.
    with open(source) as f:
        for line in f:
            for raw in rgx.findall(line):
                word = raw.lower()  # case-insensitive counting
                if not ht.contains_key(word):
                    # New word: remember it and start its counter at 1.
                    keys.add(word)
                    ht.put(word, 1)
                else:
                    # Known word: store the incremented counter.
                    ht.put(word, ht.get(word) + 1)
                # Grow the table before the next word once it is overloaded.
                if ht.table_load() > 8:
                    ht.resize_table(2 * ht.capacity)

    # Pair every word with its final count and rank by count, descending.
    ranked = sorted(((word, ht.get(word)) for word in keys),
                    key=lambda pair: pair[1], reverse=True)
    return ranked[:number]
def top_words(source, number):
    """
    Count case-insensitive word occurrences in a plain text file and return the most common words.

    Args:
        source: the file name containing the text
        number: how many of the most common words to return (e.g. 5 returns the 5 most common words)
    Returns:
        A list of (word, count) tuples sorted by count, highest first
        (e.g. [("a", 23), ("the", 20), ("it", 10)]).
    """
    keys = set()
    ht = HashMap(2500, hash_function_2)

    # Read the file line by line; rgx (defined elsewhere) extracts the words.
    with open(source) as f:
        for line in f:
            for w in rgx.findall(line):
                lowered = w.lower()  # counting ignores case
                if ht.contains_key(lowered):
                    # Already counted at least once: store count + 1.
                    ht.put(lowered, ht.get(lowered) + 1)
                else:
                    # First occurrence: start at 1 and record the key.
                    ht.put(lowered, 1)
                    keys.add(lowered)

    # Collect (word, count) pairs and order them by count, descending.
    occurrences = [(key, ht.get(key)) for key in keys]
    occurrences.sort(key=lambda pair: pair[1], reverse=True)

    # The slice keeps at most `number` leading entries.
    return occurrences[:number]


# print(top_words("alice.txt",10))  # COMMENT THIS OUT WHEN SUBMITTING TO GRADESCOPE
Ejemplo n.º 8
0
def top_words(source, number):
    """
    Count case-insensitive word occurrences in a plain text file and return the most common words.

    Membership is tracked with the local `keys` set rather than by querying
    the hash map.

    Args:
        source: the file name containing the text
        number: how many of the most common words to return (e.g. 5 returns the 5 most common words)
    Returns:
        A list of (word, count) tuples sorted by count, highest first
        (e.g. [("a", 23), ("the", 20), ("it", 10)]).
    """
    keys = set()
    ht = HashMap(2500, hash_function_2)

    # Read the file line by line; rgx (defined elsewhere) extracts the words.
    with open(source) as f:
        for line in f:
            for raw in rgx.findall(line):
                word = raw.lower()
                if word not in keys:
                    # Unseen word: create its counter and remember it.
                    ht.put(word, 1)
                    keys.add(word)
                    continue
                # Seen word: write back the incremented counter.
                ht.put(word, ht.get(word) + 1)

    # Build (word, count) pairs, then rank them by count in descending order.
    ranked = [(word, ht.get(word)) for word in keys]
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked[:number]
Ejemplo n.º 9
0
def top_words(source, number):
    """
    Count case-insensitive word occurrences in a plain text file and return the most common words.

    Args:
        source: the file name containing the text
        number: how many of the most common words to return (e.g. 5 returns the 5 most common words)
    Returns:
        A list of (word, count) tuples sorted by count, highest first
        (e.g. [("a", 23), ("the", 20), ("it", 10)]).  The list is shorter
        than `number` when the text has fewer distinct words, and empty when
        `number` is zero or negative.
    """
    ht = HashMap(2500, hash_function_2)

    # Read the file line by line; rgx (defined elsewhere) extracts the words.
    with open(source) as f:
        for line in f:
            for w in rgx.findall(line):
                # Lowercase first so "The" and "the" share one counter, as the
                # case-insensitive contract requires.
                word = w.lower()
                # A None result is treated as "not stored yet", exactly as the
                # original check did; one lookup now serves both branches.
                count = ht.get(word)
                ht.put(word, 1 if count is None else count + 1)

    # Walk every bucket's linked list to collect all (word, count) pairs.
    # NOTE(review): this reaches into HashMap's private _buckets attribute.
    tuples = []
    for i in range(ht.capacity):
        node = ht._buckets[i].head
        while node is not None:
            tuples.append((node.key, node.value))
            node = node.next

    tuples.sort(key=lambda pair: pair[1], reverse=True)

    # Slicing never raises when fewer than `number` words exist; clamping at
    # zero keeps the previous empty result for negative requests.
    return tuples[:max(number, 0)]
Ejemplo n.º 10
0
def test_all():
    """Store EL_COUNT stringified integers as both key and value, then verify each reads back."""
    table = HashMap()
    total = EL_COUNT
    # Insert phase: every integer below `total`, as text, maps to itself.
    for number in range(total):
        text = str(number)
        table.put(text, text)
    # Verification phase: each key must return the matching text.
    for number in range(total):
        text = str(number)
        assert table.get(text) == text
Ejemplo n.º 11
0
def top_words(source, number):
    """
    Count case-insensitive word occurrences in a plain text file and return the most common words.

    The ranking itself is delegated to the hash map's sorted_tup() method.
    Presumably it returns (word, count) tuples sorted by descending count --
    TODO confirm against HashMap.

    Args:
        source: the file name containing the text
        number: how many of the most common words to return (e.g. 5 returns the 5 most common words)
    Returns:
        The first `number` entries of ht.sorted_tup().
    """
    ht = HashMap(2500, hash_function_2)

    # Read the file line by line; rgx (defined elsewhere) extracts the words.
    with open(source) as f:
        for line in f:
            for w in rgx.findall(line):
                word = w.lower()  # counting ignores case
                if ht.contains_key(word):
                    ht.put(word, ht.get(word) + 1)
                else:
                    ht.put(word, 1)

    return ht.sorted_tup()[:number]
def top_words(source, number):
    """
    Count case-insensitive word occurrences in a plain text file and return the most common words.

    Final counts are read straight from the hash map's buckets instead of
    through ht.get().

    Args:
        source: the file name containing the text
        number: how many of the most common words to return (e.g. 5 returns the 5 most common words)
    Returns:
        A list of (word, count) tuples sorted by count, highest first
        (e.g. [("a", 23), ("the", 20), ("it", 10)]).
    """
    keys = set()
    ht = HashMap(2500, hash_function_2)

    # Read the file line by line; rgx (defined elsewhere) extracts the words.
    with open(source) as f:
        for line in f:
            for w in rgx.findall(line):
                lw = w.lower()  # remove case sensitivity
                keys.add(lw)
                if not ht.contains_key(lw):
                    ht.put(lw, 1)  # first occurrence
                else:
                    ht.put(lw, ht.get(lw) + 1)  # bump the existing count

    def lookup(word):
        # Locate the bucket by hashing the word, then fetch its node.
        # NOTE(review): relies on HashMap's private _hash_function/_buckets
        # and on the bucket's contains() returning the node.
        bucket = ht._buckets[ht._hash_function(word) % ht.capacity]
        node = bucket.contains(word)
        return (node.key, node.value)

    ranked = sorted((lookup(word) for word in keys),
                    key=lambda pair: pair[1], reverse=True)
    return ranked[0:number]
def top_words(source, number):
    """
    Count case-insensitive word occurrences in a plain text file and return the most common words.

    Args:
        source: the file name containing the text
        number: how many of the most common words to return (e.g. 5 returns the 5 most common words)
    Returns:
        A list of (word, count) tuples sorted by count, highest first
        (e.g. [("a", 23), ("the", 20), ("it", 10)]).  The list is shorter
        than `number` when the text has fewer distinct words, and empty when
        `number` is zero or negative.
    """
    keys = set()
    ht = HashMap(2500, hash_function_2)

    # Read the file line by line; rgx (defined elsewhere) extracts the words.
    with open(source) as f:
        for line in f:
            for w in rgx.findall(line):
                # Lowercase so differently-cased spellings share one counter.
                lowercase_w = w.lower()
                keys.add(lowercase_w)
                # A None result is treated as "word not stored yet".
                word_count = ht.get(lowercase_w)
                if word_count is None:
                    ht.put(lowercase_w, 1)
                else:
                    ht.put(lowercase_w, word_count + 1)

    # One sort replaces the repeated "scan for max, then remove" passes.
    # Those passes failed once `number` exceeded the distinct word count:
    # with no candidates left the max key was unbound or stale.
    counted = [(word, ht.get(word)) for word in keys]
    counted.sort(key=lambda pair: pair[1], reverse=True)

    # Clamping at zero keeps the previous empty result for negative requests.
    return counted[:max(number, 0)]
Ejemplo n.º 14
0
def top_words(source, number):
    """
    Count case-insensitive word occurrences in a plain text file and return the most common words.

    Args:
        source: the file name containing the text
        number: how many of the most common words to return (e.g. 5 returns the 5 most common words)
    Returns:
        A list of (word, count) tuples sorted by count, highest first
        (e.g. [("a", 23), ("the", 20), ("it", 10)]).
    """
    keys = set()
    ht = HashMap(2500, hash_function_2)

    # Read the file line by line; rgx (defined elsewhere) extracts the words.
    with open(source) as f:
        for line in f:
            for w in rgx.findall(line):
                # Place the word in the hash map or raise its count by one.
                w = w.lower()
                if not ht.contains_key(w):
                    keys.add(w)
                    ht.put(w, 1)
                else:
                    ht.put(w, ht.get(w) + 1)

    # Rank with the built-in sort.  The earlier hand-rolled insertion only
    # placed a word when its count was >= some entry already in the list, so
    # a word with a smaller count than every entry was silently dropped.
    sorted_words = sorted(((word, ht.get(word)) for word in keys),
                          key=lambda pair: pair[1], reverse=True)

    return sorted_words[:number]


# print(top_words("alice.txt",10))  # COMMENT THIS OUT WHEN SUBMITTING TO GRADESCOPE
Ejemplo n.º 15
0
	def test_get(self):
		"""get must return the stored object for a present key and None for an absent one."""
		table = HashMap(100)
		table.set('1', SampleObject('A'))
		expected = SampleObject('B')
		table.set('2', expected)
		table.set('3', SampleObject('C'))
		# Present key: the very object stored under '2' comes back.
		self.assertEqual(table.get('2'), expected)
		# Absent key: '4' was never set.
		self.assertEqual(table.get('4'), None)
Ejemplo n.º 16
0
def top_words(source, number):
    """
    Count case-insensitive word occurrences in a plain text file and return the most common words.

    Membership is tracked with the local `keys` set; ordering uses the
    module's sort_by_value key function (defined elsewhere).

    Args:
        source: the file name containing the text
        number: how many of the most common words to return (e.g. 5 returns the 5 most common words)
    Returns:
        A list of (word, count) tuples ordered by sort_by_value, descending
        (e.g. [("a", 23), ("the", 20), ("it", 10)]).
    """
    keys = set()
    ht = HashMap(2500, hash_function_2)

    # Read the file line by line; rgx (defined elsewhere) extracts the words.
    with open(source) as f:
        for line in f:
            for w in rgx.findall(line):
                word = w.lower()
                if word not in keys:
                    # New word: record it and store an initial count of one.
                    keys.add(word)
                    ht.put(word, 1)
                else:
                    # Known word: add one to the count held in the table.
                    ht.put(word, ht.get(word) + 1)

    # Gather every word with its count, then rank in descending order.
    words_count = []
    for key in keys:
        words_count.append((key, ht.get(key)))
    ranked = sorted(words_count, key=sort_by_value, reverse=True)

    return ranked[:number]
Ejemplo n.º 17
0
def top_words(source, number):
    """
    Count case-insensitive word occurrences in a plain text file and return the most common words.

    Args:
        source: the file name containing the text
        number: how many of the most common words to return (e.g. 5 returns the 5 most common words)
    Returns:
        A list of (word, count) tuples sorted by count, highest first
        (e.g. [("a", 23), ("the", 20), ("it", 10)]).  The list is shorter
        than `number` when the text has fewer distinct words, and empty when
        `number` is zero or negative.
    """
    keys = set()
    ht = HashMap(2500, hash_function_2)

    # Read the file line by line; rgx (defined elsewhere) extracts the words.
    with open(source) as f:
        for line in f:
            for w in rgx.findall(line):
                w = w.lower()  # counting ignores case
                if ht.contains_key(w):
                    ht.put(w, ht.get(w) + 1)
                else:
                    ht.put(w, 1)
                    keys.add(w)

    key_value_arr = [(word, ht.get(word)) for word in keys]
    key_value_arr.sort(key=lambda x: x[1], reverse=True)

    # A slice replaces indexing over range(number), which raised IndexError
    # whenever more words were requested than the text contains.  Clamping at
    # zero keeps the previous empty result for negative requests.
    return key_value_arr[:max(number, 0)]
Ejemplo n.º 18
0
def top_words(source, number):
    """
    Count case-insensitive word occurrences in a plain text file and return the most common words.

    Args:
        source: the file name containing the text
        number: how many of the most common words to return (e.g. 5 returns the 5 most common words)
    Returns:
        A list of (word, count) tuples sorted by count, highest first
        (e.g. [("a", 23), ("the", 20), ("it", 10)]).
    """
    keys = set()
    ht = HashMap(2500, hash_function_2)

    # Read the file line by line; rgx (defined elsewhere) extracts the words.
    with open(source) as f:
        for line in f:
            for w in rgx.findall(line):
                # Lowercase BEFORE touching the hash map.  Previously the map
                # was keyed by the original spelling while `keys` held the
                # lowercased form, so counts were split by case and the later
                # lookup of a lowercased key could miss entirely.
                word = w.lower()
                if not ht.contains_key(word):
                    ht.put(word, 1)  # first occurrence
                else:
                    ht.put(word, ht.get(word) + 1)  # increment existing count
                keys.add(word)

    # Pair each word with its count and rank by count, descending.
    pairs = [(word, ht.get(word)) for word in keys]
    pairs = sorted(pairs, key=lambda x: x[1], reverse=True)

    return pairs[:number]
Ejemplo n.º 19
0
class Account(object):
    """A cash balance plus a map from stock name to number of shares held."""

    def __init__(self):
        """Start with zero cash and an empty HashMap of stocks."""
        self.cash = 0
        self.stocks = HashMap()

    def compare(self, other_acct):
        """Describe how another account's cash and stocks differ from this one.

        Used in TransactionParser's reconcile() method.  The result is a
        newline-joined report whose first line is "Cash <difference>",
        followed by one "<stock> <difference>" line per stock whose
        difference is non-zero.
        """
        declared = other_acct.stocks
        # Union of the stock names known to either account; keys() is
        # concatenated with +, so it must return a list-like value.
        names = set(self.stocks.keys() + declared.keys())

        stock_lines = []
        for name in names:
            delta = self.stock_diff(name, declared)
            # None and 0 both mean "nothing to report" for this stock.
            if delta:
                stock_lines.append(name + " " + str(int(delta)))

        cash_line = "Cash " + str(self.cash_diff(other_acct.cash))
        return "\n".join([cash_line] + stock_lines)

    def cash_diff(self, other_cash):
        """Return the other cash amount (converted with int) minus this account's cash."""
        return int(other_cash) - self.cash

    def stock_diff(self, key, declared_results):
        """Return the share difference for one stock between two holdings maps.

        Falsy entries (e.g. 0) are treated as "not held".  Returns the
        declared amount minus ours when both hold the stock, the negated own
        amount when only this account holds it, the declared amount when
        only the other side holds it (never for the "Cash" key), and None
        otherwise.
        """
        mine = self.stocks[key]
        theirs = declared_results[key]
        if mine:
            if theirs:
                return float(theirs) - mine
            return -1 * mine
        if theirs and key != "Cash":
            return float(theirs)
        return None

    def set_stock(self, name, value):
        """Store `value` under `name` in self.stocks."""
        self.stocks[name] = value

    def get_stock(self, stock):
        """Return the entry stored for `stock`, passing 0 as the default to self.stocks.get."""
        return self.stocks.get(stock, 0)
Ejemplo n.º 20
0
def top_words(source, number):
    """
    Count case-insensitive word occurrences in a plain text file and return the most common words.

    Final counts are read from the hash map's bucket nodes rather than
    through ht.get().

    Args:
        source: the file name containing the text
        number: how many of the most common words to return (e.g. 5 returns the 5 most common words)
    Returns:
        A list of (word, count) tuples sorted by count, highest first
        (e.g. [("a", 23), ("the", 20), ("it", 10)]).
    """
    keys = set()
    ht = HashMap(2500, hash_function_2)

    # Read the file line by line; rgx (defined elsewhere) extracts the words.
    with open(source) as f:
        for line in f:
            for w in rgx.findall(line):
                word = w.lower()  # counting ignores case
                keys.add(word)
                if not ht.contains_key(word):
                    # Not in the map yet: the initial count is one.
                    ht.put(word, 1)
                else:
                    # Already in the map: store the count raised by one.
                    ht.put(word, ht.get(word) + 1)

    # For each distinct word, find its bucket and read key/value off the node.
    # NOTE(review): relies on HashMap's private _hash_function/_buckets and on
    # the bucket's contains() returning the node.
    word_list = []
    for word in keys:
        slot = ht._hash_function(word) % ht.capacity
        node = ht._buckets[slot].contains(word)
        word_list.append((node.key, node.value))

    # Rank by count in descending order and keep the leading `number` entries.
    ranked = sorted(word_list, key=lambda pair: pair[1], reverse=True)
    return ranked[0:number]


# print(top_words("alice.txt",10))  # COMMENT THIS OUT WHEN SUBMITTING TO GRADESCOPE
Ejemplo n.º 21
0
def top_words(source, number):
    """
    Count case-insensitive word occurrences in a plain text file and return the most common words.

    Args:
        source: the file name containing the text
        number: how many of the most common words to return (e.g. 5 returns the 5 most common words)
    Returns:
        A list of (word, count) tuples sorted by count, highest first
        (e.g. [("a", 23), ("the", 20), ("it", 10)]).
    """
    keys = set()
    ht = HashMap(2500, hash_function_2)

    # Read the file line by line; rgx (defined elsewhere) extracts the words.
    with open(source) as f:
        for line in f:
            for raw in rgx.findall(line):
                word = raw.lower()  # lowercase prior to insertion
                if not ht.contains_key(word):
                    ht.put(word, 1)  # new entry
                else:
                    ht.put(word, ht.get(word) + 1)  # updated count

    # Walk each bucket's linked list, collecting (key, value) pairs in the set.
    for bucket in ht.get_buckets():
        node = bucket.head
        while node is not None:
            keys.add((node.key, node.value))
            node = node.next

    # Sort ascending by count, then read from the back: reversing and taking
    # a prefix yields the highest counts first.
    ascending = sorted(keys, key=lambda pair: pair[1])
    return ascending[::-1][:number]
Ejemplo n.º 22
0
def top_words(source, number):
    """
    Count case-insensitive word occurrences in a plain text file and return the most common words.

    Ordering is delegated to the project's Heap class.  Its sort() is
    presumably descending by count -- TODO confirm against Heap.

    Args:
        source: the file name containing the text
        number: how many of the most common words to return (e.g. 5 returns the 5 most common words)
    Returns:
        The first `number` (word, count) tuples of the sorted heap list, or
        the heap's whole list when more words are requested than it holds.
    """
    ht = HashMap(2500, hash_function_2)

    # Read the file line by line; rgx (defined elsewhere) extracts the words.
    with open(source) as f:
        for line in f:
            for raw in rgx.findall(line):
                word = raw.lower()
                seen = ht.get(word)
                # None means the word is not stored yet, so counting starts at 1.
                ht.put(word, 1 if seen is None else seen + 1)

    # Feed every (word, count) pair found in the buckets into the heap.
    heap = Heap()
    for index in range(ht.capacity):
        node = ht._buckets[index].head
        while node is not None:
            heap.insert((node.key, node.value))
            node = node.next

    heap.sort()

    # Asking for more words than exist hands back the heap's own list.
    if number > len(heap.heap):
        return heap.heap
    return heap.heap[0:number]
Ejemplo n.º 23
0
def top_words(source, number):
    """
    Count case-insensitive word occurrences in a plain text file.

    :param source: the file name containing the text
    :param number: how many of the most common words to return (e.g. 5 gives the 5 most common)
    :return: a list of (word, count) tuples ordered from most to least common,
        truncated to `number` entries (e.g. [("a", 23), ("the", 20), ("it", 10)])
    """
    ht = HashMap(2500, hash_function_2)

    with open(source) as f:
        for line in f:
            for token in rgx.findall(line):
                # Lowercase first so that counting ignores case.
                lowered = token.lower()

                if not ht.contains_key(lowered):
                    # First sighting of this word: start its count at 1.
                    ht.put(lowered, 1)
                    continue

                # Word already present: store its count incremented by one.
                ht.put(lowered, ht.get(lowered) + 1)

    # All key-value pairs of the table, ordered by `get_count`, largest first.
    pairs = ht.get_tuples()
    pairs.sort(key=get_count, reverse=True)

    return pairs[0:number]
Ejemplo n.º 24
0
def top_words(source, number):
    """
    Takes a plain text file and counts the number of occurrences of case insensitive words.
    Returns the top `number` of words in a list of tuples of the form (word, count).

    Args:
        source: the file name containing the text
        number: the number of top results to return (e.g. 5 would return the 5 most common words)
    Returns:
        A list of tuples of the form (word, count), sorted by most common word
        (e.g. [("a", 23), ("the", 20), ("it", 10)]). If fewer than `number`
        distinct words exist, all of them are returned; a non-positive
        `number` yields an empty list.
    """
    ht = HashMap(2500, hash_function_2)

    # Read the file one line at a time and tally each lowercased word.
    with open(source) as f:
        for line in f:
            for w in rgx.findall(line):
                w = w.lower()
                if ht.contains_key(w):
                    ht.put(w, ht.get(w) + 1)
                else:
                    ht.put(w, 1)

    temp_list = ht.word_count_list()

    # The built-in sort is stable, and reverse=True keeps equal-count entries
    # in their original relative order, so this yields the same ordering as a
    # strict-comparison bubble sort in O(n log n) instead of O(n^2).
    temp_list.sort(key=lambda pair: pair[1], reverse=True)

    # Slicing clamps to the list length, so requesting more words than exist
    # no longer raises IndexError. max() keeps a negative request empty.
    return temp_list[:max(number, 0)]
def top_words(source, number):
    """
    Takes a plain text file and counts the number of occurrences of case insensitive words.
    Returns the top `number` of words in a list of tuples of the form (word, count).

    Args:
        source: the file name containing the text
        number: the number of top results to return (e.g. 5 would return the 5 most common words)
    Returns:
        A list of tuples of the form (word, count), sorted by most common word. (e.g. [("a", 23), ("the", 20), ("it", 10)])
    """
    ht = HashMap(2500, hash_function_2)

    # Read the file one line at a time and tally each lowercased word.
    with open(source) as f:
        for line in f:
            for w in rgx.findall(line):
                lw = w.lower()
                if ht.contains_key(lw):
                    # Known word: bump its stored count by one.
                    ht.put(lw, ht.get(lw) + 1)
                else:
                    # New word: start its count at 1.
                    ht.put(lw, 1)

    # Walk every bucket's chain and collect the (word, count) pairs.
    result = []
    for bucket in ht._buckets:
        cur = bucket.head
        while cur is not None:
            result.append((cur.key, cur.value))
            cur = cur.next

    # No diagnostic printing here: this function only returns data, so table
    # statistics on stdout would leak into every caller's output.

    # NOTE(review): sort_words is defined elsewhere; presumably it sorts the
    # list in place by count, descending - confirm against its definition.
    sort_words(result)
    return result[:number]
Ejemplo n.º 26
0
def top_words(source, number):
    """
    Count case-insensitive word occurrences in a text file and return the most frequent.

    Args:
        source: the file name containing the text
        number: how many of the most common words to return
    Returns:
        A list of (word, count) tuples ordered from highest to lowest count,
        limited to `number` entries (all entries when fewer are available).
    """
    ht = HashMap(2500, hash_function_2)

    # Tally each lowercased word; a None lookup result means "not stored yet".
    with open(source) as f:
        for line in f:
            for token in rgx.findall(line):
                word = token.lower()
                previous = ht.get(word)
                ht.put(word, 1 if previous is None else previous + 1)

    # Flatten every bucket chain into one list of (key, value) pairs.
    pairs = []
    for index in range(ht.capacity):
        node = ht._buckets[index].head
        while node is not None:
            pairs.append((node.key, node.value))
            node = node.next

    # Highest counts first; the slice clamps when `number` exceeds the total.
    ranked = sorted(pairs, key=lambda pair: pair[1], reverse=True)
    return ranked[:number]
Ejemplo n.º 27
0
def top_words(source, number):
    """
    Takes a plain text file and counts the number of occurrences of case insensitive words.
    Returns the top `number` of words in a list of tuples of the form (word, count).

    Args:
        source: the file name containing the text
        number: the number of top results to return (e.g. 5 would return the 5 most common words)
    Returns:
        A list of tuples of the form (word, count), sorted by most common word
        (e.g. [("a", 23), ("the", 20), ("it", 10)]). If fewer than `number`
        distinct words exist, all of them are returned; a non-positive
        `number` yields an empty list.
    """
    ht = HashMap(2500, hash_function_2)

    # Distinct words in first-seen order. A dict keeps insertion order and
    # avoids buffering every single occurrence just to deduplicate later.
    first_seen = {}

    with open(source) as f:
        for line in f:
            for w in rgx.findall(line):
                w = w.lower()
                # NOTE(review): put_count is a HashMap helper defined elsewhere;
                # per the original comment it adds the key and keeps track of
                # its running count - confirm against its definition.
                ht.put_count(w, 1)
                first_seen[w] = None

    # Pair each distinct word with the count stored in the table.
    word_tuple = [(word, ht.get(word)) for word in first_seen]

    # Greatest count first; the stable sort keeps first-seen order among ties.
    word_tuple.sort(key=lambda pair: pair[1], reverse=True)

    # Slicing clamps to the list length, so asking for more words than exist
    # no longer raises IndexError.
    return word_tuple[:max(number, 0)]
Ejemplo n.º 28
0
def top_words(source, number):
    """
    Takes a plain text file and counts the number of occurrences of case insensitive words.
    Returns the top `number` of words in a list of tuples of the form (word, count).

    Args:
        source: the file name containing the text
        number: the number of top results to return (e.g. 5 would return the 5 most common words)
    Returns:
        A list of tuples of the form (word, count), sorted by most common word
        (e.g. [("a", 23), ("the", 20), ("it", 10)]). If fewer than `number`
        distinct words exist, all of them are returned; a non-positive
        `number` yields an empty list.
    """
    ht = HashMap(2500, hash_function_2)

    # Read the file one line at a time and tally each lowercased word.
    with open(source) as f:
        for line in f:
            for w in rgx.findall(line):
                w = w.lower()
                if ht.contains_key(w):
                    # Only look the count up when the word is actually present.
                    ht.put(w, ht.get(w) + 1)
                else:
                    ht.put(w, 1)

    # Add all items in the hash table to a list. Each bucket exposes `head`
    # and `size`; the chain is walked `size` links deep.
    sorted_list = []
    for bucket in ht.get_buckets():
        current = bucket.head
        for _ in range(bucket.size):
            sorted_list.append((current.key, current.value))
            current = current.next

    # Order by `get_second` (defined elsewhere), largest first.
    sorted_list.sort(key=get_second, reverse=True)

    # Slicing clamps to the list length, so asking for more words than exist
    # no longer raises IndexError.
    return sorted_list[:max(number, 0)]
Ejemplo n.º 29
0
def top_words(source, number):
    """
    Count case-insensitive word occurrences in a text file and return the most frequent.

    Args:
        source: the file name containing the text
        number: how many of the most common words to return
    Returns:
        The first `number` (word, count) tuples of the list after it has been
        passed through `sort_tuples`.
    """
    ht = HashMap(25, hash_function_2)

    # Tally each lowercased word found by the module-level `rgx` pattern.
    with open(source) as f:
        for line in f:
            for token in rgx.findall(line):
                word = token.lower()
                # Increment an existing count, or start a new word at 1.
                updated = ht.get(word) + 1 if ht.contains_key(word) else 1
                ht.put(word, updated)

    # Flatten every bucket chain into one list of (key, value) pairs.
    pairs = []
    for bucket in ht._buckets:
        node = bucket.head
        while node is not None:
            pairs.append((node.key, node.value))
            node = node.next

    # NOTE(review): sort_tuples is defined elsewhere; presumably it sorts the
    # list in place by count, descending - confirm against its definition.
    sort_tuples(pairs)
    return pairs[:number]
def top_words(source, number):
    """
    Takes a plain text file and counts the number of occurrences of case insensitive words.
    Returns the top `number` of words.

    Args:
        source: the file name containing the text
        number: the number of top results to return (e.g. 5 would return the 5 most common words)
    Returns:
        A tuple holding, for each of the top `number` nodes, whatever that
        node's `returnNode()` yields (presumably a (word, count) pair -
        confirm against the node class). If fewer than `number` distinct
        words exist, all of them are returned; a non-positive `number` yields
        an empty tuple.
    """
    ht = HashMap(2500, hash_function_2)

    # Read the file one line at a time and tally each lowercased word.
    with open(source) as f:
        for line in f:
            for w in rgx.findall(line):
                word = w.lower()
                if not ht.contains_key(word):
                    ht.put(word, 1)
                else:
                    ht.put(word, ht.get(word) + 1)

    # Per the original comment, bucket_keys() fills a list with all nodes in
    # the bucket list. They are then ordered by `get_value`, largest first.
    listWords = ht.bucket_keys()
    listWords.sort(key=get_value, reverse=True)

    # Slicing clamps to the list length, so asking for more words than exist
    # no longer raises IndexError. The result stays a tuple, as before.
    return tuple(node.returnNode() for node in listWords[:max(number, 0)])
class HashMapTests(unittest.TestCase, DictTestCases):
    """Unit tests for HashMap: capacity, growth, collisions and deletion."""

    def setUp(self):
        # Every test starts from a freshly constructed, empty map.
        self.uut = HashMap()

    def mock_hashes_to(self, index=0):
        """Return a new object whose hash is always `index`.

        The mock defines no `__eq__`, so two mocks with the same hash are
        still distinct keys. That is what lets the tests force a collision.
        """
        class Mock(object):
            def __hash__(self):
                return index

            def __str__(self):
                return "mock(%s)" % index

            __repr__ = __str__

        return Mock()

    def test_initial_current_capacity_is_16(self):
        self.assertEqual(16, self.uut.capacity())

    def test_initial_doubling_size_is_12(self):
        self.assertEqual(12, self.uut.doubling_size())

    def test_when_inialized_with_one_half_then_doubling_size_is_8(self):
        # NOTE(review): 0.5 is presumably the load factor - confirm against
        # the HashMap constructor.
        uut = HashMap(0.5)
        self.assertEqual(8, uut.doubling_size())

    def test_initial_len_is_0(self):
        self.assertEqual(0, len(self.uut))

    def test_insertion_increses_size_to_1(self):
        self.uut.insert(self.mock_hashes_to(), 42)
        # Assert the size the test name promises; without this check the
        # test could never fail on a wrong length.
        self.assertEqual(1, len(self.uut))

    def test_collisions_are_handled(self):
        # Two distinct keys with the same hash must both stay retrievable.
        first = self.mock_hashes_to(1)
        second = self.mock_hashes_to(1)
        self.uut.insert(first, "spam")
        self.uut.insert(second, "eggs")
        self.assertEqual("spam", self.uut.get(first))
        self.assertEqual("eggs", self.uut.get(second))

    def test_inserting_items_with_a_higher_value_works(self):
        # A hash of 99 is larger than the initial capacity of 16.
        item = self.mock_hashes_to(99)
        self.uut.insert(item, 42)

        self.assertEqual(42, self.uut.get(item))

    def test_when_at_doubling_size_then_the_capacity_doubles(self):
        # `range` instead of `xrange`: it runs on Python 2 and Python 3 alike.
        # Eleven items (keys 0..10) leave the map one short of the doubling size.
        for i in range(11):
            self.uut.insert(i, "_")

        self.assertEqual(16, self.uut.capacity())

        # The twelfth item reaches the doubling size of 12.
        self.uut.insert(12, "_")

        self.assertEqual(32, self.uut.capacity())
        self.assertEqual(12, len(self.uut))

    def test_len_is_0_after_delete_of_empty(self):
        self.uut.delete("foo")
        self.assertEqual(0, len(self.uut))

    def test_len_is_0_after_delete_of_only_item(self):
        self.uut.insert("foo", "_")
        self.uut.delete("foo")
        self.assertEqual(0, len(self.uut))

    def test_len_is_0_after_delete_of_only_item_twice(self):
        # Deleting an already removed key must not push the length below 0.
        self.uut.insert("foo", "_")
        self.uut.delete("foo")
        self.uut.delete("foo")
        self.assertEqual(0, len(self.uut))
Ejemplo n.º 32
0
    def test_hash_map(self):
        """Exercise put/get/remove/contains/size on a map with mixed key types.

        Even numbers are stored under a one-element tuple key with the int as
        value; odd numbers are stored under their string form with a
        one-element list as value. For the second batch (1000..9999) the
        parity-to-key-type mapping is swapped.
        """
        test_map = HashMap()
        for i in range(0, 1000):
            # Compare with ==, not `is`: identity of int literals is an
            # implementation detail and raises SyntaxWarning on Python 3.8+.
            if i % 2 == 0:
                test_map.put((i,), i)
            else:  # i % 2 == 1
                test_map.put(str(i), [i])

        # the map now holds 1000 kv pairs
        self.assertEqual(test_map.size(), 1000)
        for i in range(0, 1000):
            if i % 2 == 0:
                self.assertEqual(test_map.get((i,)), i)
            else:  # i % 2 == 1
                self.assertEqual(test_map.get(str(i)), [i])

        # Even numbers were never stored under a string key ...
        with self.assertRaises(KeyNotFound):
            test_map.get("8")

        # ... and odd numbers were never stored under a tuple key.
        with self.assertRaises(KeyNotFound):
            test_map.get((991,))

        with self.assertRaises(KeyNotFound):
            test_map.get("test_key")

        with self.assertRaises(KeyNotFound):
            test_map.get((1002,))

        # remove 700 elements
        for i in range(100, 800):
            if i % 2 == 0:
                test_map.remove((i,))
            else:  # i % 2 == 1
                test_map.remove(str(i))

        self.assertEqual(test_map.size(), 300)

        self.assertTrue(test_map.contains((80,)))
        self.assertFalse(test_map.contains((120,)))

        # insert 9000 more values
        for i in range(1000, 10000):
            if i % 2 == 1:
                test_map.put((i,), i)
            else:  # i % 2 == 0
                test_map.put(str(i), [i])

        self.assertEqual(test_map.size(), 9300)

        for i in range(1000, 10000):
            if i % 2 == 1:
                self.assertEqual(test_map.get((i,)), i)
            else:  # i % 2 == 0
                self.assertEqual(test_map.get(str(i)), [i])