def main(data_file, table_size, hash_alg, collision_strategy, num_new_keys,
         out_file):
    """
    Build a hash table from command line style arguments, insert keys read
    from a file, time a search for each inserted key, and save a scatter
    plot of search time against load factor.

    Parameters:
    - data_file(str): A data file of keys to hash, one key per line
    - table_size(int): The size of the hash table
    - hash_alg(str): h_ascii, h_rolling, or h_mult
    - collision_strategy(str): linear_probing or chaining
    - num_new_keys(int): Stop once the table reports this many keys
    - out_file(str): The final scatterplot file

    Returns:
    - N/A; a scatter plot is created

    Raises:
    - ValueError: If hash_alg or collision_strategy is not a supported name
    """
    valid_hash_algs = ('h_ascii', 'h_rolling', 'h_mult')
    valid_strategies = ('linear_probing', 'chaining')

    # Validate the names before doing any work so a typo fails with a clear
    # message instead of an unbound `ht` later on.
    if hash_alg not in valid_hash_algs:
        raise ValueError("Unknown hash_alg " + repr(hash_alg)
                         + "; expected one of " + ", ".join(valid_hash_algs))
    if collision_strategy not in valid_strategies:
        raise ValueError("Unknown collision_strategy "
                         + repr(collision_strategy)
                         + "; expected one of " + ", ".join(valid_strategies))

    hash_functions = {
        'h_ascii': h_ascii,
        'h_rolling': h_rolling,
        'h_mult': h_mult,
    }
    table_classes = {
        'linear_probing': LinearProbe,
        'chaining': ChainedHash,
    }
    ht = table_classes[collision_strategy](table_size,
                                           hash_functions[hash_alg])

    x_data = []
    y_data = []

    with open(data_file) as keys_file:
        for line in keys_file:
            # Strip before inserting so the timed search below looks up the
            # very key that was just added.
            key = line.strip()
            ht.add(key, "Value")

            if ht.m == num_new_keys:
                break

            t0 = time.perf_counter()
            ht.search(key)
            t1 = time.perf_counter()

            # ht.m / ht.n is plotted as the load factor
            x_data.append(ht.m / ht.n)
            y_data.append(t1 - t0)

    scatter_plot(x_data,
                 y_data,
                 "Load factor",
                 "Time to search",
                 "Search Performance: " + collision_strategy + " " + hash_alg,
                 out_file)
def test_keys_not_exist(self):
    """
    A newly constructed ChainedHash should expose an empty keys list
    """
    size = 10
    self.n = size

    table = ChainedHash(size, h_ascii)
    self.inst_table = table

    self.assertEqual([], table.keys)
def test_hash_chain_search_empty(self):
    """
    Searching a ChainedHash that has had nothing added to it should
    give back None
    """
    size = 10
    self.n = size

    table = ChainedHash(size, h_ascii)
    self.inst_table = table

    missing_key = chr(5)
    found = table.search(missing_key)
    self.assertEqual(None, found)
def test_hash_chain_empty(self):
    """
    Adding a random key to a ChainedHash with no prior entries should
    report success
    """
    size = 10
    self.n = size
    self.value = "Hello"

    table = ChainedHash(size, h_ascii)
    self.inst_table = table

    rand_key, rand_hash = self.make_rand_ascii_hash(size)
    self.key = rand_key
    self.hash_value = rand_hash

    was_added = table.add(self.key, self.value)
    self.assertTrue(was_added)
def test_keys_exist(self):
    """
    After one add, the first entry of the ChainedHash keys list should
    be the key that was added
    """
    size, key, value = 10, "TEST", 1
    self.n = size
    self.key = key
    self.value = value

    table = ChainedHash(size, h_ascii)
    self.inst_table = table
    table.add(key, value)

    first_tracked = table.keys[0]
    self.assertEqual(key, first_tracked)
def test_hash_chain_search_full(self):
    """
    Searching a ChainedHash whose every bucket was pre-filled with two
    (key, value) pairs: the first element of the result for chr(5)
    should be 5
    """
    size = 10
    self.n = size

    table = ChainedHash(size, h_ascii)
    self.inst_table = table

    # Bucket i holds the pairs for chr(i) and chr(i + size)
    buckets = []
    for slot in range(size):
        shifted = slot + size
        buckets.append([(chr(slot), slot), (chr(shifted), shifted)])
    table.table = buckets

    hits = table.search(chr(5))
    self.assertEqual(5, hits[0])
def test_hash_chain_full(self):
    """
    Adding a random key to a ChainedHash whose every bucket already
    holds one (key, value) pair should still report success
    """
    size = 10
    self.n = size
    self.value = "Hello"

    table = ChainedHash(size, h_ascii)
    self.inst_table = table

    # One pre-existing entry per bucket
    occupied = []
    for slot in range(size):
        occupied.append([(chr(slot), slot)])
    table.table = occupied

    rand_key, rand_hash = self.make_rand_ascii_hash(size)
    self.key = rand_key
    self.hash_value = rand_hash

    was_added = table.add(self.key, self.value)
    self.assertTrue(was_added)
def test_hash_lp_search_full(self):
    """
    This test checks LinearProbe search function on a full table where
    the key exists
    """
    self.n = 10
    self.inst_table = LinearProbe(self.n, h_ascii)
    # Slot x holds the pair (chr(x), x), so every slot is occupied
    self.inst_table.table = [(chr(x), x) for x in range(self.n)]

    # assertTrue(1, result) used the search result as the failure message
    # and could never fail; compare the result with the expected value.
    # NOTE(review): assumes LinearProbe.search returns the stored value for
    # a found key - confirm against the LinearProbe implementation.
    self.assertEqual(1, self.inst_table.search(chr(1)))
def test_hash_lp_search_empty(self):
    """
    Searching a LinearProbe table that has had nothing added to it for
    a random key should give back None
    """
    size = 10
    self.n = size

    rand_key, rand_hash = self.make_rand_ascii_hash(size)
    self.key = rand_key
    self.hash_value = rand_hash

    table = LinearProbe(size, h_ascii)
    self.inst_table = table

    found = table.search(self.key)
    self.assertEqual(None, found)
def test_hash_lp_full(self):
    """
    Adding a random key to a LinearProbe table whose every slot is
    already occupied should report failure
    """
    size = 10
    self.n = size
    self.value = "Hello"

    rand_key, rand_hash = self.make_rand_ascii_hash(size)
    self.key = rand_key
    self.hash_value = rand_hash

    table = LinearProbe(size, h_ascii)
    self.inst_table = table
    # Occupy every slot with a placeholder entry
    table.table = ["Full"] * size

    was_added = table.add(self.key, self.value)
    self.assertFalse(was_added)
def test_hash_lp_search_partial(self):
    """
    Searching a partially filled LinearProbe table for chr(5) when slot
    5 holds a different key and slot 6 is empty should give back None
    """
    size = 10
    self.n = size

    table = LinearProbe(size, h_ascii)
    self.inst_table = table

    slots = []
    for index in range(size):
        slots.append((chr(index), index))
    table.table = slots

    # Replace the entry for chr(5) with another key and empty the next slot
    table.table[5] = (chr(15), 15)
    table.table[6] = None

    found = table.search(chr(5))
    self.assertEqual(None, found)
def testChainedHashSearchChain(self):
    """
    After adding "a" and then "k" to a ChainedHash of size 10, searching
    for "k" should give back the value stored with "k", not the one
    stored with "a"
    """
    # presumably "a" and "k" share a bucket under hf.h_ascii - TODO confirm
    expected = "return this, not 1"

    chained = ChainedHash(10, hf.h_ascii)
    chained.add("a", 1)
    chained.add("k", expected)

    found = chained.search("k")
    self.assertEqual("return this, not 1", found)
def testChainedHash(self):
    """
    After adding the keys "a0" through "a9" to a ChainedHash of size 10,
    adding "a0" once more should return True
    """
    chained = ChainedHash(10, hf.h_ascii)

    for suffix, value in enumerate(range(1, 11)):
        chained.add("a" + str(suffix), value)

    outcome = chained.add("a0", 1)
    self.assertEqual(True, outcome)
class TestHash(unittest.TestCase):
    """
    Unit tests for the hash functions (h_ascii, h_mult, h_rolling), the
    LinearProbe and ChainedHash tables, and the scatter_plot helper.
    """

    def make_rand_ascii_hash(self, n):
        """
        This helper function creates a random string of length 10 and
        calculates its hash value using the h_ascii strategy.

        Parameters:
        - n(int): The table size used for the modulo

        Returns:
        - (str, int): The random string and its expected hash value
        """
        rand_string = ''.join(random.choices(string.ascii_letters, k=10))
        total = sum(ord(char) for char in rand_string)
        return rand_string, total % n

    def make_rand_mult_hash(self, n):
        """
        This helper function creates a random string of length 10 and
        calculates its hash value using the multiplicative hash strategy.

        Parameters:
        - n(int): The table size the fractional part is scaled by

        Returns:
        - (str, int): The random string and its expected hash value
        """
        multiplier = (sqrt(5) - 1) / 2
        rand_string = ''.join(random.choices(string.ascii_letters, k=10))
        total = sum(ord(char) for char in rand_string)
        # Use the n argument rather than whatever self.n happens to hold
        return rand_string, floor(n * ((total * multiplier) % 1))

    def make_rand_rolling_hash(self, n):
        """
        This helper function creates a random string of length 10 and
        calculates its hash value using the poly. rolling hash strategy.

        Parameters:
        - n(int): The table size used for the final modulo

        Returns:
        - (str, int): The random string and its expected hash value
        """
        modulus = 2**64
        base = 53
        rand_string = ''.join(random.choices(string.ascii_letters, k=10))
        total = sum(ord(char) * base**power
                    for power, char in enumerate(rand_string))
        # Use the n argument rather than whatever self.n happens to hold
        return rand_string, (total % modulus) % n

    def test_ascii_hash_empty(self):
        """
        This test checks h_ascii function on an empty string
        """
        self.assertEqual(0, h_ascii("", 10))

    def test_ascii_hash(self):
        """
        This test checks h_ascii function on a random string of length 10
        """
        n = 10
        rand_string, hash_value = self.make_rand_ascii_hash(n)
        self.assertEqual(hash_value, h_ascii(rand_string, n))

    def test_asci_hash_no_table(self):
        """
        This test checks h_ascii function on a random string of length 10
        when n = 0
        """
        rand_string = ''.join(random.choices(string.ascii_letters, k=10))
        # `ZeroDivisionError and SystemExit` evaluates to SystemExit alone;
        # a tuple accepts either exception.
        self.assertRaises((ZeroDivisionError, SystemExit),
                          h_ascii, rand_string, 0)

    def test_mult_hash_empty(self):
        """
        This test checks h_mult function on an empty string
        """
        self.assertEqual(0, h_mult("", 10))

    def test_mult_hash(self):
        """
        This test checks h_mult function on a random string of length 10
        """
        n = 10
        rand_string, hash_value = self.make_rand_mult_hash(n)
        self.assertEqual(hash_value, h_mult(rand_string, n))

    def test_rolling_hash_empty(self):
        """
        This test checks h_rolling function on an empty string
        """
        self.assertEqual(0, h_rolling("", 10))

    def test_rolling_hash(self):
        """
        This test checks h_rolling function on a random string of length 10
        """
        n = 10
        rand_string, hash_value = self.make_rand_rolling_hash(n)
        self.assertEqual(hash_value, h_rolling(rand_string, n))

    def test_rolling_hash_no_table(self):
        """
        This test checks h_rolling function on a random string of length 10
        when n = 0
        """
        rand_string = ''.join(random.choices(string.ascii_letters, k=10))
        # `ZeroDivisionError and SystemExit` evaluates to SystemExit alone;
        # a tuple accepts either exception.
        self.assertRaises((ZeroDivisionError, SystemExit),
                          h_rolling, rand_string, 0)

    def test_hash_lp_empty(self):
        """
        This test checks LinearProbe add function on a empty table
        """
        n = 10
        table = LinearProbe(n, h_ascii)
        key, _ = self.make_rand_ascii_hash(n)
        self.assertTrue(table.add(key, "Hello"))

    def test_hash_lp_full(self):
        """
        This test checks LinearProbe add function on a full table
        """
        n = 10
        key, _ = self.make_rand_ascii_hash(n)
        table = LinearProbe(n, h_ascii)
        table.table = ["Full" for _ in range(n)]
        self.assertFalse(table.add(key, "Hello"))

    def test_hash_lp_search_empty(self):
        """
        This test checks LinearProbe search function on empty table where
        the key does not exist
        """
        n = 10
        key, _ = self.make_rand_ascii_hash(n)
        table = LinearProbe(n, h_ascii)
        self.assertIsNone(table.search(key))

    def test_hash_lp_search_full(self):
        """
        This test checks LinearProbe search function on a full table where
        the key exists
        """
        n = 10
        table = LinearProbe(n, h_ascii)
        table.table = [(chr(x), x) for x in range(n)]
        # assertTrue(1, result) used the search result as the failure
        # message and could never fail; compare with the expected value.
        # NOTE(review): assumes LinearProbe.search returns the stored value
        # for a found key - confirm against the LinearProbe implementation.
        self.assertEqual(1, table.search(chr(1)))

    def test_hash_lp_search_partial(self):
        """
        This test checks LinearProbe search function on a partially full
        table where the key does not exist
        """
        n = 10
        table = LinearProbe(n, h_ascii)
        table.table = [(chr(x), x) for x in range(n)]
        # Replace the entry for chr(5) and empty the following slot
        table.table[5] = (chr(15), 15)
        table.table[6] = None
        self.assertIsNone(table.search(chr(5)))

    def test_hash_chain_empty(self):
        """
        This test checks ChainedHash add function on a empty table
        """
        n = 10
        table = ChainedHash(n, h_ascii)
        key, _ = self.make_rand_ascii_hash(n)
        self.assertTrue(table.add(key, "Hello"))

    def test_hash_chain_full(self):
        """
        This test checks ChainedHash add function on a table with one
        entry in each linked list
        """
        n = 10
        table = ChainedHash(n, h_ascii)
        table.table = [[(chr(x), x)] for x in range(n)]
        key, _ = self.make_rand_ascii_hash(n)
        self.assertTrue(table.add(key, "Hello"))

    def test_hash_chain_search_full(self):
        """
        This test checks ChainedHash search function on a full table with
        multiple elements in each linked list
        """
        n = 10
        table = ChainedHash(n, h_ascii)
        table.table = [[(chr(x), x), (chr(x + n), x + n)] for x in range(n)]
        self.assertEqual(5, table.search(chr(5))[0])

    def test_hash_chain_search_empty(self):
        """
        This test checks ChainedHash search function on an empty table
        where the key does not exist
        """
        table = ChainedHash(10, h_ascii)
        self.assertIsNone(table.search(chr(5)))

    def test_boxplot_exist(self):
        """
        This test checks to see if file exists after scatterplot is created
        """
        outfile = "test.png"

        def remove_outfile():
            if os.path.exists(outfile):
                os.remove(outfile)

        # Start from a clean slate so a stale file cannot satisfy the
        # assertion, and do not leave the image behind afterwards.
        remove_outfile()
        self.addCleanup(remove_outfile)

        x_data = [x for x in range(10)]
        y_data = x_data
        scatter_plot(x_data, y_data, "X", "Y", "Title", outfile)
        self.assertTrue(os.path.exists(outfile))

    def test_keys_exist(self):
        """
        This test checks to see if keys are tracked in the keys list
        """
        key = "TEST"
        table = ChainedHash(10, h_ascii)
        table.add(key, 1)
        self.assertEqual(key, table.keys[0])

    def test_keys_not_exist(self):
        """
        This test checks to see if keys list is empty when initialized
        """
        table = ChainedHash(10, h_ascii)
        self.assertEqual([], table.keys)
def hash_process(gene_reads, sample_attributes, gene, group_types,
                 output_file):
    """
    This function calculates the gene expression distribution across
    either tissue groups (SMTS) or tissue type (SMTSD) for a target gene.
    Samples and read counts are stored in ChainedHash tables. A series of
    box plots is generated.

    Parameters:
    - gene_reads: (see next line)
        GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_reads.acmg_59.gct.gz
    - sample_attributes: GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt
    - gene: The target gene
    - group_types: Tissue group or type
    - output_file: File for saving the box plot (.png)

    Returns:
    - N/A; a box plot is created
    """
    sample_id_col_name = 'SAMPID'

    # Tissue-sample hash table: key = group, value = sample id
    h_table_samples = ChainedHash(1000, h_rolling)

    sample_info_header = None
    with open(sample_attributes) as attributes_file:
        for line in attributes_file:
            fields = line.rstrip().split('\t')

            if sample_info_header is None:
                # First line is the header: locate the group and sample id
                # columns using linear search
                sample_info_header = fields
                group_col_idx = linear_search(group_types,
                                              sample_info_header)
                sample_id_col_idx = linear_search(sample_id_col_name,
                                                  sample_info_header)
                continue

            h_table_samples.add(key=fields[group_col_idx],
                                value=fields[sample_id_col_idx])

    gene_name_col = 1

    # Sample-count hash table: key = sample id, value = read count
    h_table_counts = ChainedHash(100000, h_rolling)

    data_header = None
    with gzip.open(gene_reads, 'rt') as reads_file:
        for line_number, line in enumerate(reads_file):
            # The first two lines (version and dimensions) carry no data
            if line_number < 2:
                continue

            fields = line.rstrip().split('\t')

            if data_header is None:
                data_header = fields
                continue

            if fields[gene_name_col] == gene:
                # Pair each header column with this gene's value
                for sample, count in zip(data_header, fields):
                    h_table_counts.add(sample, count)

    # Get the counts for each sample of each tissue type
    group_counts = []
    for tissue in h_table_samples.keys:
        counts = []
        for sample in h_table_samples.search(tissue):
            count = h_table_counts.search(sample)
            # Samples without a recorded count are left out
            if count is not None:
                counts.append(int(count[0]))
        group_counts.append(counts)

    # Generate box plot
    boxplot(group_counts, h_table_samples.keys, gene, group_types,
            "Gene read counts", output_file)