def main(data_file, table_size, hash_alg, collision_strategy, num_new_keys,
         out_file):
    """
    Build a hash table from the command line arguments, insert the keys
    read from a data file, time a search for each inserted key, and graph
    the search time against the load factor as a scatter plot

    Parameters:
    - data_file(str): A data file of keys to hash (one key per line)
    - table_size(int): The size of the hash table
    - hash_alg(str): h_ascii, h_rolling, or h_mult
    - collision_strategy(str): linear_probing or chaining
    - num_new_keys(int): Insertion stops once ht.m equals this value
    - out_file(str): The final scatterplot file

    Returns:
    - N/A; a scatter plot is created

    Raises:
    - ValueError: if hash_alg or collision_strategy is not a supported name

    """
    hash_functions = {
        'h_ascii': h_ascii,
        'h_rolling': h_rolling,
        'h_mult': h_mult,
    }
    table_types = {
        'linear_probing': LinearProbe,
        'chaining': ChainedHash,
    }

    # Fail early with a clear message instead of hitting an unbound `ht`
    # later on.
    if hash_alg not in hash_functions:
        raise ValueError("Unknown hash algorithm: " + str(hash_alg))
    if collision_strategy not in table_types:
        raise ValueError("Unknown collision strategy: "
                         + str(collision_strategy))

    ht = table_types[collision_strategy](table_size,
                                         hash_functions[hash_alg])

    x_data = []
    y_data = []

    with open(data_file) as key_file:
        for line in key_file:
            # Strip before inserting so the key that is searched for below
            # is exactly the key that was stored.
            key = line.strip()
            ht.add(key, "Value")
            if ht.m == num_new_keys:
                break
            # perf_counter is the clock intended for measuring short
            # intervals.
            t0 = time.perf_counter()
            ht.search(key)
            t1 = time.perf_counter()
            # NOTE(review): ht.m / ht.n is presumably stored keys over table
            # size (the load factor) -- confirm against the table classes.
            x_data.append(ht.m / ht.n)
            y_data.append(t1 - t0)

    scatter_plot(x_data, y_data, "Load factor", "Time to search",
                 "Search Performance: " + collision_strategy + " " + hash_alg,
                 out_file)
    def test_keys_not_exist(self):
        """
        A freshly constructed ChainedHash must expose an empty keys list

        """
        self.n = 10
        self.inst_table = ChainedHash(self.n, h_ascii)
        tracked_keys = self.inst_table.keys
        self.assertEqual([], tracked_keys)
    def test_hash_chain_search_empty(self):
        """
        Searching an empty ChainedHash table for a key must return None

        """
        self.n = 10
        self.inst_table = ChainedHash(self.n, h_ascii)
        lookup = self.inst_table.search(chr(5))
        self.assertEqual(None, lookup)
    def test_hash_chain_empty(self):
        """
        Adding a random key to an empty ChainedHash table must succeed

        """
        self.n = 10
        self.value = "Hello"
        self.inst_table = ChainedHash(self.n, h_ascii)
        generated = self.make_rand_ascii_hash(self.n)
        self.key, self.hash_value = generated
        was_added = self.inst_table.add(self.key, self.value)
        self.assertTrue(was_added)
    def test_keys_exist(self):
        """
        After one add, the key must be the first entry of the keys list

        """
        self.n = 10
        self.key = "TEST"
        self.value = 1
        self.inst_table = ChainedHash(self.n, h_ascii)
        self.inst_table.add(self.key, self.value)
        first_tracked = self.inst_table.keys[0]
        self.assertEqual(self.key, first_tracked)
    def test_hash_chain_search_full(self):
        """
        Search a ChainedHash table whose every slot holds a two-element
        chain and check the first value returned for an existing key
        """
        self.n = 10
        self.inst_table = ChainedHash(self.n, h_ascii)

        # Each slot x holds the pairs for chr(x) and chr(x + n).
        buckets = []
        for slot in range(self.n):
            shifted = slot + self.n
            buckets.append([(chr(slot), slot), (chr(shifted), shifted)])
        self.inst_table.table = buckets

        found = self.inst_table.search(chr(5))
        self.assertEqual(5, found[0])
    def test_hash_chain_full(self):
        """
        Adding a random key must succeed when every ChainedHash slot
        already holds a one-element chain

        """
        self.n = 10
        self.value = "Hello"
        self.inst_table = ChainedHash(self.n, h_ascii)
        self.inst_table.table = [[(chr(slot), slot)]
                                 for slot in range(self.n)]
        generated = self.make_rand_ascii_hash(self.n)
        self.key, self.hash_value = generated
        was_added = self.inst_table.add(self.key, self.value)
        self.assertTrue(was_added)
    def test_hash_lp_search_full(self):
        """
        This test checks LinearProbe search function on a full table
        where the key exists

        """
        self.n = 10
        self.inst_table = LinearProbe(self.n, h_ascii)
        self.inst_table.table = [(chr(x), x) for x in range(self.n)]
        # assertTrue(1, result) always passed because the search result was
        # taken as the failure message; assert on the result itself instead.
        # NOTE(review): the exact return shape of LinearProbe.search is not
        # visible here, so only "a hit is not None" is asserted.
        self.assertIsNotNone(self.inst_table.search(chr(1)))
    def test_hash_lp_search_empty(self):
        """
        Searching an empty LinearProbe table for a random key must
        return None

        """
        self.n = 10
        generated = self.make_rand_ascii_hash(self.n)
        self.key, self.hash_value = generated
        self.inst_table = LinearProbe(self.n, h_ascii)
        lookup = self.inst_table.search(self.key)
        self.assertEqual(None, lookup)
    def test_hash_lp_full(self):
        """
        Adding to a LinearProbe table with every slot occupied must be
        reported as falsy

        """
        self.n = 10
        self.value = "Hello"
        generated = self.make_rand_ascii_hash(self.n)
        self.key, self.hash_value = generated
        self.inst_table = LinearProbe(self.n, h_ascii)
        # Occupy every slot with a placeholder entry.
        self.inst_table.table = ["Full"] * self.n
        was_added = self.inst_table.add(self.key, self.value)
        self.assertFalse(was_added)
    def test_hash_lp_search_partial(self):
        """
        Search a partially filled LinearProbe table for a key that is
        absent: slot 5 holds a different key and slot 6 is empty

        """
        self.n = 10
        self.inst_table = LinearProbe(self.n, h_ascii)
        slots = [(chr(code), code) for code in range(self.n)]
        self.inst_table.table = slots
        # Overwrite the slot of chr(5) with another key and empty the next.
        self.inst_table.table[5] = (chr(15), 15)
        self.inst_table.table[6] = None
        lookup = self.inst_table.search(chr(5))
        self.assertEqual(None, lookup)
Exemple #12
0
 def testChainedHashSearchChain(self):
     """Search must return the value stored under "k", not the one for "a"."""
     expected = "return this, not 1"
     table = ChainedHash(10, hf.h_ascii)
     table.add("a", 1)
     table.add("k", expected)
     self.assertEqual(expected, table.search("k"))
Exemple #13
0
 def testChainedHash(self):
     """Adding "a0" again after inserting "a0".."a9" must return True."""
     table = ChainedHash(10, hf.h_ascii)
     for offset in range(10):
         table.add("a" + str(offset), offset + 1)
     outcome = table.add("a0", 1)
     self.assertEqual(True, outcome)
class TestHash(unittest.TestCase):
    """
    Unit tests for the hash functions (h_ascii, h_mult, h_rolling), the
    LinearProbe and ChainedHash tables, and the scatter_plot helper.

    """

    def make_rand_ascii_hash(self, n):
        """
        This helper function creates a random string of length 10 and
        calculates its expected hash value using the h_ascii strategy
        (sum of the character codes modulo n).

        Parameters:
        - n(int): The table size

        Returns:
        - (str, int): The random string and its expected hash value

        """
        rand_string = ''.join(random.choices(string.ascii_letters, k=10))
        total = sum(ord(char) for char in rand_string)
        return rand_string, total % n

    def make_rand_mult_hash(self, n):
        """
        This helper function creates a random string of length 10 and
        calculates its expected hash value using the multiplicative hash
        strategy.

        Parameters:
        - n(int): The table size

        Returns:
        - (str, int): The random string and its expected hash value

        """
        golden = (sqrt(5) - 1) / 2
        rand_string = ''.join(random.choices(string.ascii_letters, k=10))
        total = sum(ord(char) for char in rand_string)
        # Use the n argument rather than self.n so the helper does not
        # depend on the calling test having set an attribute first.
        return rand_string, floor(n * ((total * golden) % 1))

    def make_rand_rolling_hash(self, n):
        """
        This helper function creates a random string of length 10 and
        calculates its expected hash value using the poly. rolling hash
        strategy (base 53, modulus 2**64, then modulo n).

        Parameters:
        - n(int): The table size

        Returns:
        - (str, int): The random string and its expected hash value

        """
        modulus = 2**64
        base = 53
        rand_string = ''.join(random.choices(string.ascii_letters, k=10))
        total = sum(ord(char) * base**position
                    for position, char in enumerate(rand_string))
        # Use the n argument rather than self.n (see make_rand_mult_hash).
        return rand_string, (total % modulus) % n

    def test_ascii_hash_empty(self):
        """
        This test checks h_ascii function on an empty string

        """
        n = 10
        self.assertEqual(0, h_ascii("", n))

    def test_ascii_hash(self):
        """
        This test checks h_ascii function on a random string of length 10

        """
        n = 10
        rand_string, hash_value = self.make_rand_ascii_hash(n)
        self.assertEqual(hash_value, h_ascii(rand_string, n))

    def test_asci_hash_no_table(self):
        """
        This test checks h_ascii function on a random string of length 10
        when n = 0

        """
        n = 0
        rand_string = ''.join(random.choices(string.ascii_letters, k=10))
        # `A and B` evaluates to B alone; a tuple accepts either exception.
        self.assertRaises((ZeroDivisionError, SystemExit), h_ascii,
                          rand_string, n)

    def test_mult_hash_empty(self):
        """
        This test checks h_mult function on an empty string

        """
        n = 10
        self.assertEqual(0, h_mult("", n))

    def test_mult_hash(self):
        """
        This test checks h_mult function on a random string of length 10

        """
        n = 10
        rand_string, hash_value = self.make_rand_mult_hash(n)
        self.assertEqual(hash_value, h_mult(rand_string, n))

    def test_rolling_hash_empty(self):
        """
        This test checks h_rolling function on an empty string

        """
        n = 10
        self.assertEqual(0, h_rolling("", n))

    def test_rolling_hash(self):
        """
        This test checks h_rolling function on a random string of length 10

        """
        n = 10
        rand_string, hash_value = self.make_rand_rolling_hash(n)
        self.assertEqual(hash_value, h_rolling(rand_string, n))

    def test_rolling_hash_no_table(self):
        """
        This test checks h_rolling function on a random string of length 10
        when n = 0

        """
        n = 0
        rand_string = ''.join(random.choices(string.ascii_letters, k=10))
        # `A and B` evaluates to B alone; a tuple accepts either exception.
        self.assertRaises((ZeroDivisionError, SystemExit), h_rolling,
                          rand_string, n)

    def test_hash_lp_empty(self):
        """
        This test checks LinearProbe add function on an empty table

        """
        n = 10
        inst_table = LinearProbe(n, h_ascii)
        key, _ = self.make_rand_ascii_hash(n)
        self.assertTrue(inst_table.add(key, "Hello"))

    def test_hash_lp_full(self):
        """
        This test checks LinearProbe add function on a full table

        """
        n = 10
        key, _ = self.make_rand_ascii_hash(n)
        inst_table = LinearProbe(n, h_ascii)
        inst_table.table = ["Full" for _ in range(n)]
        self.assertFalse(inst_table.add(key, "Hello"))

    def test_hash_lp_search_empty(self):
        """
        This test checks LinearProbe search function on an empty table
        where the key does not exist

        """
        n = 10
        key, _ = self.make_rand_ascii_hash(n)
        inst_table = LinearProbe(n, h_ascii)
        self.assertEqual(None, inst_table.search(key))

    def test_hash_lp_search_full(self):
        """
        This test checks LinearProbe search function on a full table
        where the key exists

        """
        n = 10
        inst_table = LinearProbe(n, h_ascii)
        inst_table.table = [(chr(x), x) for x in range(n)]
        # assertTrue(1, result) always passed because the search result was
        # taken as the failure message; assert on the result itself instead.
        # NOTE(review): the exact return shape of LinearProbe.search is not
        # visible here, so only "a hit is not None" is asserted.
        self.assertIsNotNone(inst_table.search(chr(1)))

    def test_hash_lp_search_partial(self):
        """
        This test checks LinearProbe search function on a partially full
        table where the key does not exist

        """
        n = 10
        inst_table = LinearProbe(n, h_ascii)
        inst_table.table = [(chr(x), x) for x in range(n)]
        # Replace the entry for chr(5) and empty the following slot.
        inst_table.table[5] = (chr(15), 15)
        inst_table.table[6] = None
        self.assertEqual(None, inst_table.search(chr(5)))

    def test_hash_chain_empty(self):
        """
        This test checks ChainedHash add function on an empty table

        """
        n = 10
        inst_table = ChainedHash(n, h_ascii)
        key, _ = self.make_rand_ascii_hash(n)
        self.assertTrue(inst_table.add(key, "Hello"))

    def test_hash_chain_full(self):
        """
        This test checks ChainedHash add function on a table with
        one entry in each linked list

        """
        n = 10
        inst_table = ChainedHash(n, h_ascii)
        inst_table.table = [[(chr(x), x)] for x in range(n)]
        key, _ = self.make_rand_ascii_hash(n)
        self.assertTrue(inst_table.add(key, "Hello"))

    def test_hash_chain_search_full(self):
        """
        This test checks ChainedHash search function on a full table
        with multiple elements in each linked list

        """
        n = 10
        inst_table = ChainedHash(n, h_ascii)
        inst_table.table = [[(chr(x), x), (chr(x + n), x + n)]
                            for x in range(n)]
        self.assertEqual(5, inst_table.search(chr(5))[0])

    def test_hash_chain_search_empty(self):
        """
        This test checks ChainedHash search function on an empty table
        where the key does not exist

        """
        n = 10
        inst_table = ChainedHash(n, h_ascii)
        self.assertEqual(None, inst_table.search(chr(5)))

    def test_boxplot_exist(self):
        """
        This test checks to see if file exists after scatterplot is created

        """
        x_data = list(range(10))
        y_data = x_data
        outfile = "test.png"
        # Remove a stale file from an earlier run so the assertion proves
        # that this call to scatter_plot produced the output.
        if os.path.exists(outfile):
            os.remove(outfile)
        scatter_plot(x_data, y_data, "X", "Y", "Title", outfile)
        self.assertEqual(True, os.path.exists(outfile))

    def test_keys_exist(self):
        """
        This test checks to see if keys are tracked in the keys list

        """
        n = 10
        key = "TEST"
        inst_table = ChainedHash(n, h_ascii)
        inst_table.add(key, 1)
        self.assertEqual(key, inst_table.keys[0])

    def test_keys_not_exist(self):
        """
        This test checks to see if keys list is empty when initialized

        """
        n = 10
        inst_table = ChainedHash(n, h_ascii)
        self.assertEqual([], inst_table.keys)
Exemple #15
0
def hash_process(gene_reads, sample_attributes, gene, group_types,
                 output_file):
    """
    This function calculates the gene expression distribution across either
    tissue groups (SMTS) or tissue type (SMTSD) for a target gene. It uses
    hash tables for the group-to-sample and sample-to-count lookups. A
    series of box plots is generated.

    Parameters:
    - gene_reads: (see next line)
    GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_reads.acmg_59.gct.gz
    - sample_attributes: GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt
    - gene: The target gene
    - group_types: Tissue group or type
    - output_file: File for saving the box plot (.png)

    Returns:
    - N/A; a box plot is created

    Raises:
    - ValueError: if the target gene is not present in gene_reads

    """
    sample_id_col_name = 'SAMPID'
    sample_info_header = None

    # Initialize tissue-sample hash table
    h_table_samples = ChainedHash(1000, h_rolling)

    # Read and preprocess the data lines
    with open(sample_attributes) as attributes:
        for line in attributes:
            fields = line.rstrip().split('\t')
            if sample_info_header is None:
                sample_info_header = fields

                # Find the proper columns containing the group types and
                # sample id's using linear search
                group_col_idx = linear_search(group_types,
                                              sample_info_header)
                sample_id_col_idx = linear_search(sample_id_col_name,
                                                  sample_info_header)
            else:
                # Add samples to hash table, keyed by their group
                group = fields[group_col_idx]
                sample = fields[sample_id_col_idx]
                h_table_samples.add(key=group, value=sample)

    # The first three lines of the reads file are a version line, a
    # dimension line and the column header; these flags track which of
    # them have been consumed.
    version = None
    dim = None
    data_header = None
    gene_name_col = 1
    gene_row = None

    # Initialize sample-count hash table
    h_table_counts = ChainedHash(100000, h_rolling)

    # Read and preprocess the gene reads data lines
    with gzip.open(gene_reads, 'rt') as reads:
        for line in reads:
            if version is None:
                version = line
                continue

            if dim is None:
                # Parsing doubles as a sanity check on the dimension line.
                dim = [int(x) for x in line.rstrip().split()]
                continue

            if data_header is None:
                data_header = line.rstrip().split('\t')
                continue

            fields = line.rstrip().split('\t')
            if fields[gene_name_col] == gene:
                # No early exit: if several rows match, the last one wins.
                gene_row = fields

    # Report a missing gene explicitly instead of failing on an unbound
    # variable in the loop below.
    if gene_row is None:
        raise ValueError("Gene " + str(gene) + " not found in "
                         + str(gene_reads))

    for sample, count in zip(data_header, gene_row):
        h_table_counts.add(sample, count)

    # Get the counts for each sample of each tissue type
    group_counts = []
    for tissue in h_table_samples.keys:
        counts = []
        for sample in h_table_samples.search(tissue):
            count = h_table_counts.search(sample)
            # Samples without an entry in the reads file are skipped.
            if count is not None:
                counts.append(int(count[0]))
        group_counts.append(counts)

    # Generate box plot
    boxplot(group_counts, h_table_samples.keys, gene, group_types,
            "Gene read counts", output_file)