Beispiel #1
0
 def __init__(self, objs=[], hash_type=HashType.QUADRATIC, size=10, c1=1, c2=1):
     """Create the bucket array and insert the items of ``objs``.

     Args:
         objs: Iterable of items to insert. It is only read, never mutated,
             so the shared default list is harmless. Items whose ``str()``
             is empty are skipped. If any item maps to a negative value via
             ``HashTable.to_value`` a message is printed and nothing is
             inserted.
         hash_type: ``HashType`` member. ``QUADRATIC`` rounds ``size`` up to
             a power of two, ``CHAIN`` stores a list in every bucket, any
             other member gets one slot per bucket plus an ``AVL`` index.
         size: Requested number of buckets.
         c1: Stored as ``self.c1`` (presumably the linear coefficient of the
             quadratic probe; the probing code is outside this block).
         c2: Stored as ``self.c2`` (presumably the quadratic coefficient).

     Raises:
         ValueError: If ``hash_type`` is ``QUADRATIC`` and ``size <= 0``.
     """
     if hash_type is HashType.QUADRATIC:
         from math import ceil
         if size <= 0:
             # The previous log-based formula also raised ValueError here.
             raise ValueError("size must be positive for quadratic probing")
         # Round up to the next power of two with integer arithmetic.
         # ceil(log(size) / log(2)) is inexact: for 2**29 the quotient is
         # 29.000000000000004, which would needlessly double the table.
         self.size = 1 << (int(ceil(size)) - 1).bit_length()
     else:
         self.size = size

     # Set every attribute before validating, so that an instance whose
     # input was rejected is still a usable, empty table.
     self.hash_type = hash_type
     self.c1 = c1
     self.c2 = c2
     self.item_count = 0
     if hash_type is HashType.CHAIN:
         # Chain hash tables have a list at each index.
         self.table = [[] for _ in range(self.size)]
     else:
         # The index holds the bucket numbers that contain items; it is
         # only needed for open-addressed hash tables.
         self.index = AVL()
         self.table = [EmptyType.EMPTY_SINCE_START for _ in range(self.size)]

     # Materialise once so generators survive the two passes below.
     items = [] if objs is None else list(objs)

     # Check that all values are positive before inserting anything.
     for obj in items:
         if HashTable.to_value(obj) < 0:
             print("HashTable only takes positive values.")
             return

     for obj in items:
         if len(str(obj)) > 0:
             self.insert(obj)
Beispiel #2
0
def plot_times(filename="English.txt", start=500, stop=5500, step=500):
    """Plot build and search times of a LinkedList, a BST and an AVL tree.

    For every n in ``start, start + step, ...`` up to and including ``stop``
    (when reachable), take the first n words returned by
    ``create_word_list(filename)`` and:

    * time (separately) how long it takes to load a LinkedList, a BST and
      an AVL with those words;
    * choose 5 random words from that sample, time how long it takes to find
      all of them in each object and record the average time per search.

    One figure with two subplots is shown: words vs. build time on the left,
    words vs. average search time on the right.

    Inputs:
        filename (str): the file to use in creating the data sets.
        start (int): the lower bound on the sample interval.
        stop (int): the upper bound on the sample interval.
        step (int): the space between points in the sample interval.

    Returns:
        Show the plot, but do not return any values.
    """
    searches_per_size = 5

    # Plain Python ints: np.linspace needs an integer sample count (true
    # division yields a float on Python 3) and np.int16 overflows for sizes
    # above 32767.
    n_list = list(range(start, stop + 1, step))

    word_list = create_word_list(filename)

    load_list, load_BST, load_AVL = [], [], []
    find_list, find_BST, find_AVL = [], [], []

    for n in n_list:
        sample = word_list[:n]
        # Draw indices from the sample actually obtained, which may be
        # shorter than n when the file holds fewer words.
        picks = np.random.randint(0, len(sample), size=searches_per_size)
        words_to_find = [sample[i] for i in picks]

        L = LinkedList()
        B = BST()
        A = AVL()

        # Build times. The timer variable is named 'began' so that it does
        # not shadow the 'start' parameter.
        began = time()
        for word in sample:
            L.add(word)
        load_list.append(time() - began)

        began = time()
        for word in sample:
            B.insert(word)
        load_BST.append(time() - began)

        began = time()
        for word in sample:
            A.insert(word)
        load_AVL.append(time() - began)

        # Search times, stored as the average per searched word.
        began = time()
        for word in words_to_find:
            iterative_search(L, word)
        find_list.append((time() - began) / searches_per_size)

        began = time()
        for word in words_to_find:
            B.find(word)
        find_BST.append((time() - began) / searches_per_size)

        began = time()
        for word in words_to_find:
            A.find(word)
        find_AVL.append((time() - began) / searches_per_size)

    plt.subplot(121)
    plt.plot(n_list, load_list, label='Singly-Linked List')
    plt.plot(n_list, load_BST, label='Binary Search Tree')
    plt.plot(n_list, load_AVL, label='AVL Tree')
    plt.legend()
    plt.xlabel('Data Points')
    plt.ylabel('Seconds')
    plt.title('Build Times')

    plt.subplot(122)
    plt.plot(n_list, find_list, label='Singly-Linked List')
    plt.plot(n_list, find_BST, label='Binary Search Tree')
    plt.plot(n_list, find_AVL, label='AVL Tree')
    plt.legend()
    plt.xlabel('Data Points')
    plt.ylabel('Seconds')
    plt.title('Search Times')

    plt.show()
Beispiel #3
0
class HashTable:
    """Hash table with separate chaining, linear or quadratic probing.

    Chained tables keep a list in every bucket. Open-addressed tables keep
    one item per bucket, mark unused buckets with ``EmptyType`` members and
    record the occupied bucket numbers in ``self.index`` (an ``AVL`` tree).
    """

    def __init__(self, objs=[], hash_type=HashType.QUADRATIC, size=10, c1=1, c2=1):
        """Create the bucket array and insert the items of ``objs``.

        Args:
            objs: Iterable of items to insert. It is only read, never
                mutated, so the shared default list is harmless. Items whose
                ``str()`` is empty are skipped. If any item maps to a
                negative value a message is printed and nothing is inserted.
            hash_type: ``HashType`` member selecting the collision strategy.
                ``QUADRATIC`` rounds ``size`` up to a power of two.
            size: Requested number of buckets.
            c1: Coefficient of ``i`` in the quadratic probe.
            c2: Coefficient of ``i ** 2`` in the quadratic probe.

        Raises:
            ValueError: If ``hash_type`` is ``QUADRATIC`` and ``size <= 0``.
        """
        if hash_type is HashType.QUADRATIC:
            from math import ceil
            if size <= 0:
                # The previous log-based formula also raised ValueError here.
                raise ValueError("size must be positive for quadratic probing")
            # Round up to the next power of two with integer arithmetic.
            # ceil(log(size) / log(2)) is inexact: for 2**29 the quotient is
            # 29.000000000000004, which would needlessly double the table.
            self.size = 1 << (int(ceil(size)) - 1).bit_length()
        else:
            self.size = size

        # Set every attribute before validating, so that an instance whose
        # input was rejected is still a usable, empty table.
        self.hash_type = hash_type
        self.c1 = c1
        self.c2 = c2
        self.item_count = 0
        if hash_type is HashType.CHAIN:
            # Chain hash tables have a list at each index.
            self.table = [[] for _ in range(self.size)]
        else:
            # The index holds the bucket numbers that contain items; it is
            # only needed for open-addressed hash tables.
            self.index = AVL()
            self.table = [EmptyType.EMPTY_SINCE_START for _ in range(self.size)]

        # Materialise once so generators survive the two passes below.
        items = [] if objs is None else list(objs)

        # Check that all values are positive before inserting anything.
        for obj in items:
            if HashTable.to_value(obj) < 0:
                print("HashTable only takes positive values.")
                return

        for obj in items:
            if len(str(obj)) > 0:
                self.insert(obj)

    @staticmethod
    def to_value(obj):
        """Map ``obj`` to the integer that is hashed.

        Integers are returned unchanged. Anything else is converted with
        ``str()``; digits contribute their numeric value, letters their code
        point, and all other characters are ignored.
        """
        if isinstance(obj, int):
            return obj
        total = 0
        for ch in str(obj):
            if ch.isdigit():
                total += int(ch)
            elif ch.isalpha():
                total += ord(ch)
        return total

    def _probe(self, value, attempt):
        """Return the bucket visited on probe number ``attempt`` for ``value``."""
        if self.hash_type == HashType.LINEAR:
            return (value + attempt) % self.size
        return (value + self.c1 * attempt + self.c2 * attempt ** 2) % self.size

    def _locate(self, obj, value):
        """Return the bucket of an open-addressed table holding ``obj``.

        Follows the probe sequence of ``value``, skipping buckets emptied by
        a removal and stopping at the first never-used bucket. Returns
        ``None`` when ``obj`` is not found within ``self.size`` probes.
        """
        for attempt in range(self.size):
            bucket = self._probe(value, attempt)
            slot = self.table[bucket]
            if slot is EmptyType.EMPTY_SINCE_START:
                return None
            if slot is not EmptyType.EMPTY_AFTER_REMOVAL and slot == obj:
                return bucket
        return None

    def _rehash_value(self, item, old_bucket):
        """Pick the hash value used to re-insert ``item`` after growing.

        Prefers ``item.number`` when present, then ``to_value(item)``. The
        old bucket number is only a last resort for items whose value is
        negative: re-inserting under the old bucket number would leave the
        item where a later lookup by value no longer probes.
        """
        number = getattr(item, "number", None)
        if number is not None:
            return number
        value = self.to_value(item)
        return value if value >= 0 else old_bucket

    def calculate_hash(self, value, mode="insert"):
        """Return the bucket number for ``value``.

        Chained tables always get ``value % self.size``. For open-addressed
        tables the result depends on ``mode``:

        * ``"insert"``: the first bucket on the probe sequence that is free
          (never used, or emptied by a removal).
        * ``"search"``: the first bucket on the probe sequence that holds an
          item, or ``None`` when a never-used bucket is reached first or
          ``self.size`` probes found nothing. Both probing strategies now
          treat never-used buckets the same way; linear probing used to
          return such a bucket.

        Any other mode or hash type yields ``None``.

        Raises:
            RuntimeError: In insert mode, if no free bucket is reachable
                within ``self.size`` probes (this used to loop forever).
        """
        if self.hash_type == HashType.CHAIN:
            return value % self.size
        if self.hash_type != HashType.LINEAR and self.hash_type != HashType.QUADRATIC:
            return None

        if mode == "insert":
            # With integer coefficients the probe sequence repeats after
            # self.size steps, so more probes cannot reach new buckets.
            for attempt in range(self.size):
                bucket = self._probe(value, attempt)
                slot = self.table[bucket]
                if slot is EmptyType.EMPTY_SINCE_START or \
                        slot is EmptyType.EMPTY_AFTER_REMOVAL:
                    return bucket
            raise RuntimeError("No free bucket reachable on the probe sequence.")

        if mode == "search":
            for attempt in range(self.size):
                bucket = self._probe(value, attempt)
                slot = self.table[bucket]
                if slot is EmptyType.EMPTY_SINCE_START:
                    return None
                if slot is not EmptyType.EMPTY_AFTER_REMOVAL:
                    return bucket
            return None

        return None

    def insert(self, obj, value=None):
        """Insert ``obj``, hashing ``value`` (default: ``to_value(obj)``).

        When ``value`` is omitted and ``obj`` maps to a negative value, a
        message is printed and nothing is inserted. An explicitly supplied
        ``value`` is used as given. Open-addressed tables grow before they
        become too full.
        """
        if value is None:
            value = self.to_value(obj)
            # Don't insert if negative value.
            if value < 0:
                print("Hash table only accepts positive values.")
                return

        # Quadratic tables can only guarantee a spot if the table is half
        # full or less. Other open-addressed tables should not get too full
        # in order to stay efficient. Chained tables never need to grow.
        if self.hash_type is HashType.QUADRATIC and self.item_count * 2 >= self.size:
            self.expand()
        elif (self.hash_type is not HashType.CHAIN) and self.item_count * 1.5 >= self.size:
            self.expand()

        bucket = self.calculate_hash(value)
        if self.hash_type is HashType.CHAIN:
            self.table[bucket].append(obj)
        else:
            self.table[bucket] = obj
            self.index.add(bucket)
        self.item_count += 1

    def expand(self, multiplication_factor=2):
        """Grow the table by ``multiplication_factor`` and re-insert all items."""
        old_table = self.table.copy()

        # Collect the occupied bucket numbers before the index is cleared.
        # Chained tables have no index at all.
        occupied = []
        if self.hash_type is not HashType.CHAIN:
            self.index.inorder()
            occupied = [node.value for node in self.index.ordered_nodes
                        if node is not None]
            self.index.clear()

        # Reset the table in place; item_count grows again on re-insertion.
        self.table.clear()
        self.item_count = 0
        self.size *= multiplication_factor

        if self.hash_type is HashType.CHAIN:
            for _ in range(self.size):
                self.table.append([])
            for old_bucket in old_table:
                for item in old_bucket:
                    self.insert(item)
        else:
            for _ in range(self.size):
                self.table.append(EmptyType.EMPTY_SINCE_START)
            for num in occupied:
                item = old_table[num]
                self.insert(item, self._rehash_value(item, num))

    def remove(self, obj):
        """Remove ``obj`` from the table, printing a message if it is absent.

        Chained tables drop every entry equal to ``obj`` from its bucket.
        Open-addressed tables mark the bucket that holds ``obj`` as
        ``EMPTY_AFTER_REMOVAL`` and delete it from the index.
        """
        value = self.to_value(obj)
        if value < 0:
            print("Hash table values cannot be negative.")
            return

        if self.hash_type == HashType.CHAIN:
            chain = self.table[value % self.size]
            # Build the surviving list instead of popping while iterating,
            # which skipped elements and ran past the end of the list.
            survivors = [item for item in chain if not item == obj]
            removed = len(chain) - len(survivors)
            if removed:
                chain[:] = survivors
                self.item_count -= removed
            else:
                print("Object not found, cannot remove.")
            return

        # Compare against obj so that a colliding item is never removed.
        bucket = self._locate(obj, value)
        if bucket is None:
            print("Object not found, cannot remove.")
            return
        self.table[bucket] = EmptyType.EMPTY_AFTER_REMOVAL
        node = self.index.search(bucket)
        if node is not None:
            self.index.remove(node)
        self.item_count -= 1

    def search(self, value):
        """Return the content of the bucket ``value`` hashes to.

        Chained tables return the whole bucket list. Open-addressed tables
        return the first stored item on the probe sequence, or ``None``.
        """
        bucket = self.calculate_hash(value, "search")
        if bucket is None:
            return None
        return self.table[bucket]

    def is_in_table(self, obj):
        """Return True if an item equal to ``obj`` is stored in the table."""
        value = HashTable.to_value(obj)
        if self.hash_type == HashType.CHAIN:
            return obj in self.table[value % self.size]
        return self._locate(obj, value) is not None

    def return_entire_table(self):
        """Return all stored items as a flat list.

        Chained tables are read bucket by bucket; open-addressed tables in
        the order produced by the index's in-order traversal.
        """
        if self.hash_type == HashType.CHAIN:
            return [item for bucket in self.table for item in bucket]
        self.index.inorder()
        return [self.table[node.value] for node in self.index.ordered_nodes
                if node is not None]

    def get_index(self):
        """Return the occupied bucket numbers of an open-addressed table.

        Chained tables have no index: a message is printed and ``None`` is
        returned.
        """
        if self.hash_type != HashType.CHAIN:
            self.index.inorder()
            return [node.value for node in self.index.ordered_nodes
                    if node is not None]
        print("Chained hash tables don't have indexes.")
        return
Beispiel #4
0
def plot_times(filename="English.txt", start=500, stop=5500, step=500):
    """Plot build and search times of a LinkedList, a BST and an AVL tree.

    The words from ``WordList.create_word_list()`` are shuffled once. For
    every n in ``range(start, stop, step)`` the first n shuffled words are
    loaded into each structure (timed with ``timeit``), the average time of
    5 searches for randomly chosen words of that sample is measured, and the
    structure is cleared again.

    One figure with two subplots is shown: words vs. build time on the left,
    words vs. average search time on the right.

    Inputs:
        filename (str): the file to use in creating the data sets.
            NOTE(review): currently not forwarded to create_word_list();
            confirm that function's signature before wiring it through.
        start (int): the lower bound on the sample interval.
        stop (int): the upper bound on the sample interval (exclusive).
        step (int): the space between points in the sample interval.

    Returns:
        Show the plot, but do not return any values.
    """

    def add_all(container, words):
        # Load a container that exposes add().
        for word in words:
            container.add(word)

    def add_all_tree(tree, words):
        # Load a tree that exposes insert().
        for word in words:
            tree.insert(word)

    def find_average(container, words):
        # Mean duration of 5 single searches for randomly chosen words.
        durations = []
        for _ in range(5):
            target = random.choice(words)
            durations.append(
                timeit.timeit(lambda: container.find(target), number=1))
        return np.mean(durations)

    word_list = WordList.create_word_list()
    word_list = np.random.permutation(word_list)
    x_values = list(range(start, stop, step))

    list_times, bst_times, avl_times = [], [], []
    find_list, find_bst, find_avl = [], [], []

    # One benchmark description per structure:
    # (instance, loader, build-time list, search-time list).
    benchmarks = (
        (LinkedList(), add_all, list_times, find_list),
        (BST(), add_all_tree, bst_times, find_bst),
        (AVL(), add_all_tree, avl_times, find_avl),
    )
    for structure, loader, build_times, search_times in benchmarks:
        for x in x_values:
            sample = word_list[:int(x)]
            build_times.append(
                timeit.timeit(lambda: loader(structure, sample), number=1))
            search_times.append(find_average(structure, sample))
            # Start the next, larger sample from an empty structure.
            structure.clear()

    plt.subplot(121)
    plt.plot(x_values, list_times, label='Linked List')
    plt.plot(x_values, bst_times, label='BST')
    plt.plot(x_values, avl_times, label='AVL')
    plt.legend(loc='upper left')
    plt.xlabel('data points')
    plt.ylabel('seconds')

    plt.subplot(122)
    plt.plot(x_values, find_list, label='Linked List')
    plt.plot(x_values, find_bst, label='BST')
    plt.plot(x_values, find_avl, label='AVL')
    plt.legend(loc='upper left')
    plt.xlabel('data points')
    plt.ylabel('seconds')

    plt.show()
def plot_times(filename="English.txt", start=500, stop=5500, step=500):
    """Plot build and search times of a LinkedList, a BST and an AVL tree.

    For every n in ``start, start + step, ...`` below ``stop``, the first n
    words returned by ``create_word_list(filename)`` are loaded into a fresh
    LinkedList, BST and AVL, timing each build separately. Then 5 random
    words of that sample are searched in each structure and the average time
    per search is recorded.

    One figure with two subplots is shown: set size vs. build time on top,
    set size vs. average search time below.

    Inputs:
        filename (str): the file to use in creating the data sets.
        start (int): the lower bound on the sample interval.
        stop (int): the upper bound on the sample interval (exclusive).
        step (int): the space between points in the sample interval.

    Returns:
        Show the plot, but do not return any values.

    Raises:
        ValueError: if ``step`` is not positive, or if ``stop - start`` is
            not a multiple of ``step``.
    """
    searches_per_size = 5

    def average_search_time(search, words):
        # Mean wall-clock time of search(word) over 'words'; 0.0 if empty.
        if len(words) == 0:
            return 0.0
        elapsed = 0
        for word in words:
            began = time.time()
            search(word)
            elapsed += time.time() - began
        return elapsed / len(words)

    # Validate before reading the file. A zero step used to raise
    # ZeroDivisionError and a negative one could loop forever.
    if step <= 0:
        raise ValueError("step must be positive")
    if (stop - start) % step != 0:
        raise ValueError("Your steps won't get you from start to stop")

    word_list = create_word_list(filename)

    time_linked_list = []
    time_BST_list = []
    time_AVL_list = []

    time_linked_list_search = []
    time_BST_list_search = []
    time_AVL_list_search = []

    set_size = []

    current = start
    while current < stop:
        current_linked_list = LinkedList()
        current_BST = BST()
        current_AVL = AVL()
        current_list = word_list[:current]
        # Search only a handful of random words, as documented; searching
        # every word made the linked-list measurement quadratic in n.
        to_search = np.random.permutation(current_list)[:searches_per_size]

        began = time.time()
        for word in current_list:
            current_linked_list.add(word)
        time_linked_list.append(time.time() - began)

        began = time.time()
        for word in current_list:
            current_BST.insert(word)
        time_BST_list.append(time.time() - began)

        began = time.time()
        for word in current_list:
            current_AVL.insert(word)
        time_AVL_list.append(time.time() - began)

        # Every search goes through a lambda so that all three structures
        # carry the same call overhead.
        time_linked_list_search.append(average_search_time(
            lambda word: iterative_search(current_linked_list, word), to_search))
        time_BST_list_search.append(average_search_time(
            lambda word: current_BST.find(word), to_search))
        time_AVL_list_search.append(average_search_time(
            lambda word: current_AVL.find(word), to_search))

        set_size.append(current)
        current += step

    plt.subplot(2, 1, 1)
    plt.title('Building Data Structures')
    plt.plot(set_size, time_linked_list, label='Linked List', linewidth=3)
    plt.plot(set_size, time_BST_list, label="BST", linewidth=3)
    plt.plot(set_size, time_AVL_list, label="AVL", linewidth=3)
    plt.legend(loc=2)

    plt.subplot(2, 1, 2)
    plt.title("Searching Data Structures")
    plt.plot(set_size, time_linked_list_search, label='Linked list', linewidth=3)
    plt.plot(set_size, time_BST_list_search, label='BST', linewidth=3)
    plt.plot(set_size, time_AVL_list_search, label='AVL', linewidth=3)
    plt.legend(loc=2)
    plt.show()
Beispiel #6
0
def timings():
    """Time building and searching a LinkedList, a BST and an AVL tree.

    For each size in ``range(500, 5500, 500)`` a word list is requested with
    ``create_word_list(size)``, loaded into a fresh instance of every
    structure, and 5 randomly chosen words are searched in each. Build times
    and mean search times are plotted against the size in two subplots.

    Returns:
        tuple: ``(ll_add, ll_search, bst_add, bst_search, avl_add,
        avl_search)``, six lists with one entry per size, in seconds.
    """

    def build_seconds(add, words):
        # Wall-clock time needed to feed every word to add().
        before = time.time()
        for word in words:
            add(word)
        return time.time() - before

    def mean_search_seconds(search, targets):
        # Mean wall-clock time of search(target) over all targets.
        durations = []
        for target in targets:
            before = time.time()
            search(target)
            durations.append(time.time() - before)
        return sum(durations) / len(durations)

    sizes = list(range(500, 5500, 500))

    ll_add, bst_add, avl_add = [], [], []
    ll_search, bst_search, avl_search = [], [], []

    for items in sizes:
        wordlist = create_word_list(items)
        words = wordlist[:items]

        # randint excludes its upper bound. random_integers(0, items, 5)
        # could return 'items' itself, one past the last valid index.
        random_indices = np.random.randint(0, len(words), 5)
        targets = [words[i] for i in random_indices]

        # Every search goes through a lambda so that all three structures
        # carry the same call overhead.
        ll = LinkedList()
        ll_add.append(build_seconds(ll.add_node, words))
        ll_search.append(mean_search_seconds(
            lambda word: iterative_search(ll, word), targets))

        bst = BST()
        bst_add.append(build_seconds(bst.insert, words))
        bst_search.append(mean_search_seconds(
            lambda word: bst.find(word), targets))

        avl = AVL()
        avl_add.append(build_seconds(avl.insert, words))
        avl_search.append(mean_search_seconds(
            lambda word: avl.find(word), targets))

    # Plot against the sizes so the x-axis shows the number of words.
    plt.subplot(1, 2, 1)
    plt.plot(sizes, ll_add, "r")
    plt.plot(sizes, bst_add, "g")
    plt.plot(sizes, avl_add, "b")
    plt.subplot(1, 2, 2)
    plt.plot(sizes, ll_search, "r")
    plt.plot(sizes, bst_search, "g")
    plt.plot(sizes, avl_search, "b")
    plt.show()
    plt.close()

    return ll_add, ll_search, bst_add, bst_search, avl_add, avl_search
Beispiel #7
0
def plot_times(filename="English.txt", start=500, stop=5500, step=500):
    """Plot build and search times of a LinkedList, a BST and an AVL tree.

    For every n in ``range(start, stop, step)`` the first n words returned
    by a fresh call to ``create_word_list()`` are loaded into a new
    LinkedList, BST and AVL, timing each build separately. Then 5 random
    words of that sample are searched in each structure and the average
    time per search is recorded.

    One figure with two subplots is shown: number of words vs. build time
    on the left, number of words vs. average search time on the right.

    Inputs:
        filename (str): the file to use in creating the data sets.
            NOTE(review): currently not forwarded to create_word_list();
            confirm that function's signature before wiring it through.
        start (int): the lower bound on the sample interval.
        stop (int): the upper bound on the sample interval (exclusive).
        step (int): the space between points in the sample interval.

    Returns:
        tuple: ``(ll_add, ll_search, bst_add, bst_search, avl_add,
        avl_search)``, six lists with one entry per sample size, in
        seconds. The plot is shown before returning.
    """

    def build_seconds(add, words):
        # Wall-clock time needed to feed every word to add().
        before = time.time()
        for word in words:
            add(word)
        return time.time() - before

    def mean_search_seconds(search, targets):
        # Mean wall-clock time of search(target) over all targets.
        durations = []
        for target in targets:
            before = time.time()
            search(target)
            durations.append(time.time() - before)
        return sum(durations) / len(durations)

    sizes = list(range(start, stop, step))

    ll_add, bst_add, avl_add = [], [], []
    ll_search, bst_search, avl_search = [], [], []

    for items in sizes:
        wordlist = create_word_list()[:items]

        # randint excludes its upper bound. random_integers(0, items, 5)
        # could return 'items' itself, one past the last valid index.
        random_indices = np.random.randint(0, len(wordlist), 5)
        targets = [wordlist[i] for i in random_indices]

        # Every search goes through a lambda so that all three structures
        # carry the same call overhead.
        ll = LinkedList()
        ll_add.append(build_seconds(ll.add, wordlist))
        ll_search.append(mean_search_seconds(
            lambda word: iterative_search(ll, word), targets))

        bst = BST()
        bst_add.append(build_seconds(bst.insert, wordlist))
        bst_search.append(mean_search_seconds(
            lambda word: bst.find(word), targets))

        avl = AVL()
        avl_add.append(build_seconds(avl.insert, wordlist))
        avl_search.append(mean_search_seconds(
            lambda word: avl.find(word), targets))

    # Plot against the sample sizes so the x-axis is the number of words
    # rather than the position in the result lists.
    plt.subplot(1, 2, 1)
    plt.title("Build Times")
    plt.plot(sizes, ll_add, "b", label="Single-Linked List")
    plt.plot(sizes, bst_add, "g", label="Binary Search Tree")
    plt.plot(sizes, avl_add, "r", label="AVL Tree")
    plt.legend(loc="upper left")
    plt.subplot(1, 2, 2)
    plt.title("Search Times")
    plt.plot(sizes, ll_search, "b", label="Single-Linked List")
    plt.plot(sizes, bst_search, "g", label="Binary Search Tree")
    plt.plot(sizes, avl_search, "r", label="AVL Tree")
    plt.legend(loc="upper left")
    plt.show()
    plt.close()

    return ll_add, ll_search, bst_add, bst_search, avl_add, avl_search