class TestHash(unittest.TestCase): def make_rand_ascii_hash(self, n): """ This helper function creates a random string of length 10 and calculates its hash value using the h_ascii strategy. """ self.asciiDict = {chr(i): i for i in range(129)} self.rand_string = ''.join(random.choices(string.ascii_letters, k=10)) self.sum = 0 for char in self.rand_string: self.sum += self.asciiDict[char] return self.rand_string, self.sum % n def make_rand_mult_hash(self, n): """ This helper function creates a random string of length 1- and calculates its hash value using the multiplicative hash strategy. """ self.A = (sqrt(5) - 1) / 2 self.asciiDict = {chr(i): i for i in range(129)} self.rand_string = ''.join(random.choices(string.ascii_letters, k=10)) self.sum = 0 for char in self.rand_string: self.sum += self.asciiDict[char] return self.rand_string, floor(self.n * ((self.sum * self.A) % 1)) def make_rand_rolling_hash(self, n): """ This helper function creates a random string of length 10 and calculates its hash value using the poly. rolling hash strategy. 
""" self.m = 2**64 self.p = 53 self.asciiDict = {chr(i): i for i in range(129)} self.rand_string = ''.join(random.choices(string.ascii_letters, k=10)) self.sum = 0 for c, char in enumerate(self.rand_string): self.sum += self.asciiDict[char] * self.p**c return self.rand_string, (self.sum % self.m) % self.n def test_ascii_hash_empty(self): """ This test checks h_ascii function on an empty string """ self.n = 10 self.assertEqual(0, h_ascii("", self.n)) def test_ascii_hash(self): """ This test checks h_ascii function on a random string of length 10 """ self.n = 10 self.rand_string, self.hash_value = self.make_rand_ascii_hash(self.n) self.assertEqual(self.hash_value, h_ascii(self.rand_string, self.n)) def test_asci_hash_no_table(self): """ This test checks h_ascii function on a random string of length 10 when n = 0 """ self.n = 0 self.rand_string = ''.join(random.choices(string.ascii_letters, k=10)) self.assertRaises(ZeroDivisionError and SystemExit, h_ascii, self.rand_string, self.n) def test_mult_hash_empty(self): """ This test checks h_ascii function on an empty string """ self.n = 10 self.assertEqual(0, h_mult("", self.n)) def test_mult_hash(self): """ This test checks h_ascii function on a random string of length 10 """ self.n = 10 self.rand_string, self.hash_value = self.make_rand_mult_hash(self.n) self.assertEqual(self.hash_value, h_mult(self.rand_string, self.n)) def test_rolling_hash_empty(self): """ This test checks h_rolling function on an empty string """ self.n = 10 self.assertEqual(0, h_rolling("", self.n)) def test_rolling_hash(self): """ This test checks h_rolling function on a random string of length 10 """ self.n = 10 self.rand_string, self.hash_value = self.make_rand_rolling_hash(self.n) self.assertEqual(self.hash_value, h_rolling(self.rand_string, self.n)) def test_rolling_hash_no_table(self): """ This test checks h_rolling function on a random string of length 10 when n = 0 """ self.n = 0 self.rand_string = 
''.join(random.choices(string.ascii_letters, k=10)) self.assertRaises(ZeroDivisionError and SystemExit, h_rolling, self.rand_string, self.n) def test_hash_lp_empty(self): """ This test checks LinearProbe add function on a empty table """ self.n = 10 self.value = "Hello" self.inst_table = LinearProbe(self.n, h_ascii) self.key, self.hash_value = self.make_rand_ascii_hash(self.n) self.assertTrue(self.inst_table.add(self.key, self.value)) def test_hash_lp_full(self): """ This test checks LinearProbe add function on a full table """ self.n = 10 self.value = "Hello" self.key, self.hash_value = self.make_rand_ascii_hash(self.n) self.inst_table = LinearProbe(self.n, h_ascii) self.inst_table.table = ["Full" for i in range(self.n)] self.assertFalse(self.inst_table.add(self.key, self.value)) def test_hash_lp_search_empty(self): """ This test checks LinearProbe search function on empty table where the key does not exist """ self.n = 10 self.key, self.hash_value = self.make_rand_ascii_hash(self.n) self.inst_table = LinearProbe(self.n, h_ascii) self.assertEqual(None, self.inst_table.search(self.key)) def test_hash_lp_search_full(self): """ This test checks LinearProbe search function on a full table where the key exists """ self.n = 10 self.inst_table = LinearProbe(self.n, h_ascii) self.inst_table.table = [(chr(x), x) for x in range(self.n)] self.assertTrue(1, self.inst_table.search(chr(1))) def test_hash_lp_search_partial(self): """ This test checks LinearProbe search function on a partially full table where the key does not exist """ self.n = 10 self.inst_table = LinearProbe(self.n, h_ascii) self.inst_table.table = [(chr(x), x) for x in range(self.n)] self.inst_table.table[5] = (chr(15), 15) self.inst_table.table[6] = None self.assertEqual(None, self.inst_table.search(chr(5))) def test_hash_chain_empty(self): """ This test checks ChainedHash add function on a empty table """ self.n = 10 self.value = "Hello" self.inst_table = ChainedHash(self.n, h_ascii) self.key, 
self.hash_value = self.make_rand_ascii_hash(self.n) self.assertTrue(self.inst_table.add(self.key, self.value)) def test_hash_chain_full(self): """ This test checks ChainedHash add function on a table with one entry in each linked list """ self.n = 10 self.value = "Hello" self.inst_table = ChainedHash(self.n, h_ascii) self.inst_table.table = [[(chr(x), x)] for x in range(self.n)] self.key, self.hash_value = self.make_rand_ascii_hash(self.n) self.assertTrue(self.inst_table.add(self.key, self.value)) def test_hash_chain_search_full(self): """ This test checks LinearProbe search function on a full table with multiple elementes in each linked list """ self.n = 10 self.inst_table = ChainedHash(self.n, h_ascii) self.inst_table.table = [[(chr(x), x), (chr(x + self.n), x + self.n)] for x in range(self.n)] self.assertEqual(5, self.inst_table.search(chr(5))[0]) def test_hash_chain_search_empty(self): """ This test checks LinearProbe search function on an empty table where the key exists """ self.n = 10 self.inst_table = ChainedHash(self.n, h_ascii) self.assertEqual(None, self.inst_table.search(chr(5))) def test_boxplot_exist(self): """ This test checks to see if file exists after scatterplot is created """ self.x_data = [x for x in range(10)] self.y_data = self.x_data self.x_label = "X" self.y_label = "Y" self.title = "Title" self.outfile = "test.png" scatter_plot(self.x_data, self.y_data, self.x_label, self.y_label, self.title, self.outfile) self.assertEqual(True, os.path.exists(self.outfile)) def test_keys_exist(self): """ This test checks to see if keys are tracked in the keys list """ self.n = 10 self.key = "TEST" self.value = 1 self.inst_table = ChainedHash(self.n, h_ascii) self.inst_table.add(self.key, self.value) self.assertEqual(self.key, self.inst_table.keys[0]) def test_keys_not_exist(self): """ This test checks to see if keys list is empty when initialized """ self.n = 10 self.inst_table = ChainedHash(self.n, h_ascii) self.assertEqual([], self.inst_table.keys)
def testChainedHashSearchChain(self):
    """
    Check that ChainedHash.search returns the value stored under the
    requested key when a second key has also been added to the table.

    NOTE(review): "a" and "k" presumably land in the same bucket
    (ord("a") % 10 == ord("k") % 10) if hf.h_ascii sums character codes
    modulo the table size -- confirm against hf.
    """
    chained = ChainedHash(10, hf.h_ascii)
    entries = (("a", 1), ("k", "return this, not 1"))
    for key, value in entries:
        chained.add(key, value)
    found = chained.search("k")
    self.assertEqual("return this, not 1", found)
def hash_process(gene_reads, sample_attributes, gene, group_types, output_file):
    """
    This function calculates the gene expression distribution
    across either tissue groups (SMTS) or tissue type (SMTSD)
    for a target gene. Hash tables are used for the sample lookups.
    A series of box plots is generated.

    Parameters:
    - gene_reads: gzip-compressed gene reads file, e.g.
      GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_reads.acmg_59.gct.gz
    - sample_attributes: tab-separated sample annotation file, e.g.
      GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt
    - gene: The target gene
    - group_types: Tissue group or type (name of the grouping column)
    - output_file: File for saving the box plot (.png)

    Raises:
    - ValueError: if no row for the target gene is found in gene_reads
    """
    sample_id_col_name = 'SAMPID'
    gene_name_col = 1

    # Initialize tissue-sample hash table
    h_table_samples = ChainedHash(1000, h_rolling)

    # Read and preprocess the sample attribute lines; the context manager
    # guarantees the file is closed even if parsing fails.
    with open(sample_attributes) as attributes_file:
        sample_info_header = None
        for line in attributes_file:
            fields = line.rstrip().split('\t')
            if sample_info_header is None:
                sample_info_header = fields
                # Find the proper columns containing the group types and
                # sample id's using linear search
                # NOTE(review): what linear_search returns for a missing
                # column is not visible here -- confirm before relying on it.
                group_col_idx = linear_search(group_types,
                                              sample_info_header)
                sample_id_col_idx = linear_search(sample_id_col_name,
                                                  sample_info_header)
                continue
            # Add samples to hash table
            group = fields[group_col_idx]
            sample = fields[sample_id_col_idx]
            h_table_samples.add(key=group, value=sample)

    # Read and preprocess the gene reads data lines
    data_header = None
    gene_row = None
    with gzip.open(gene_reads, 'rt') as reads_file:
        for line_number, line in enumerate(reads_file):
            # The first two lines (version and dimensions) carry no data.
            if line_number < 2:
                continue
            fields = line.rstrip().split('\t')
            if data_header is None:
                data_header = fields
                continue
            # Skip blank/short lines instead of failing on the index.
            if len(fields) > gene_name_col and fields[gene_name_col] == gene:
                # No early exit: if several rows match, the last one wins.
                gene_row = fields

    if gene_row is None:
        raise ValueError(f"Gene {gene!r} not found in {gene_reads}")

    # Initialize and fill the sample-count hash table
    h_table_counts = ChainedHash(100000, h_rolling)
    for sample, count in zip(data_header, gene_row):
        h_table_counts.add(sample, count)

    # Get the counts for each sample of each tissue type
    group_counts = []
    for tissue in h_table_samples.keys:
        counts = []
        for sample in h_table_samples.search(tissue):
            count = h_table_counts.search(sample)
            # Samples without a read count in the gene file are skipped.
            if count is not None:
                counts.append(int(count[0]))
        group_counts.append(counts)

    # Generate box plot
    boxplot(group_counts, h_table_samples.keys, gene, group_types,
            "Gene read counts", output_file)