def test_chained_hash_rolling_collision(self):
    """Two values under one key chain in order; search returns the first."""
    key = 'teststring'
    for _ in range(100):
        size = rdm.randint(2, 1000)
        first = rdm.randint(1, 1000)
        second = rdm.randint(1, 1000)
        table = ht.ChainedHash(size, hf.h_rolling)
        table.add(key, first)
        table.add(key, second)
        # search() resolves to the value that was stored first
        self.assertEqual(first, table.search(key))
        # the second pair occupies slot 1 of the same bucket
        bucket = table.T[hf.h_rolling(key, size)]
        self.assertEqual((key, second), bucket[1])
def test_chained_hash_ascii_basic(self):
    """Randomized round-trip: every added key is found with its value."""
    capacity = 10000
    table = hash_tables.ChainedHash(capacity, hf.h_ascii)
    alphabet = s.ascii_uppercase + s.digits
    expected = {}
    for _ in range(capacity // 2):
        key = ''.join(r.choices(alphabet, k=100))
        value = ''.join(r.choices(alphabet, k=100))
        expected[key] = value
        self.assertTrue(table.add(key, value))
    for key, value in expected.items():
        self.assertEqual(table.search(key), value)
def test_chained_hash_nonexistent_key(self):
    """Searching a key that was never inserted yields None."""
    capacity = 100
    table = hash_tables.ChainedHash(capacity, hf.h_ascii)
    alphabet = s.ascii_uppercase + s.digits
    stored = {}
    for _ in range(capacity // 2):
        key = ''.join(r.choices(alphabet, k=100))
        value = ''.join(r.choices(alphabet, k=100))
        stored[key] = value
        self.assertTrue(table.add(key, value))
    missing = "This is a key that is very unlikely to be generated"
    self.assertEqual(table.search(missing), None)
def test_chained_hash_rolling_variable_key_store(self):
    """Random-length keys are recorded, in insertion order, in table.keys."""
    letters = string.ascii_lowercase + string.ascii_uppercase
    for i in range(10):
        keys = []
        test_length = rdm.randint(1, 100)
        test_table = ht.ChainedHash(test_length, hf.h_rolling)
        for k in range(50):
            # FIX: rdm.randint was referenced without being called, which
            # stored the bound method itself rather than a random integer.
            test_value = rdm.randint(1, 1000)
            test_key = ''
            for j in range(rdm.randint(1, 100)):
                letter = rdm.choice(letters)
                test_key += letter
            keys.append(test_key)
            test_table.add(test_key, test_value)
        # table.keys must mirror the order the keys were inserted
        self.assertEqual(keys, test_table.keys)
def test_chained_hash_rolling_variable_add_search(self):
    """A random key/value pair lands in the right bucket and is searchable."""
    for i in range(100):
        test_length = rdm.randint(1, 100)
        letters = string.ascii_lowercase + string.ascii_uppercase
        # FIX: rdm.randint was referenced without being called, which stored
        # the bound method itself rather than a random integer.
        test_value = rdm.randint(1, 1000)
        test_key = ''
        for j in range(rdm.randint(1, 100)):
            letter = rdm.choice(letters)
            test_key += letter
        test_table = ht.ChainedHash(test_length, hf.h_rolling)
        test_table.add(test_key, test_value)
        # the pair is the first entry of its hash bucket
        self.assertEqual((test_key, test_value),
                         test_table.T[hf.h_rolling(test_key, test_length)][0])
        self.assertEqual(test_value, test_table.search(test_key))
def test_chainedhash_h_ascii_multiple_elements(self):
    """Many random keys round-trip through a ChainedHash with h_ascii."""
    tablesize = 1000
    table = ht.ChainedHash(tablesize, ht.h_ascii)
    tabledict = {}
    for i in range(0, 500):
        randkey = ""
        randomval = random.randint(0, 100)
        # FIX: the inner loop reused the outer loop variable 'i'.
        for _ in range(0, random.randint(1, 50)):
            randkey += chr(random.randint(32, 126))
        if randkey in tabledict:
            continue
        if table.add(randkey, randomval) == -1:
            break
        tabledict[randkey] = randomval
        # FIX: removed a second table.add(randkey, randomval) here that
        # stored every pair twice in the chain.
    for key in tabledict:
        self.assertEqual(tabledict[key], table.search(key))
def test_add_function(self):
    """'text' hashes to bucket 3 of a 50-slot ASCII table; value lands there."""
    table = ht.ChainedHash(50, hf.h_ascii)
    table.add('text', 'value')
    bucket = table.T[3]
    self.assertEqual(bucket[0][1], 'value')
def test_chained_hash_search_key_none(self):
    """Searching for a None key on an empty table returns None."""
    table = ht.ChainedHash(5, hf.h_ascii)
    result = table.search(None)
    self.assertEqual(None, result)
def main():
    """Box-plot per-group read counts for the gene given on the command line.

    Builds two chained-hash maps: group name -> list of sample ids (from the
    sample-info file) and sample id -> read count (from the requested gene's
    row of the gzipped counts file), then plots counts grouped by sample type.
    """
    arguments = parse_arguments()
    data_file_name = arguments.data_file
    sample_info_file_name = arguments.sample_file
    group_col_name = arguments.sample_type
    sample_id_col_name = 'SAMPID'
    gene_name = arguments.gene
    sample_info_header, samples = parse_sample_file(sample_info_file_name)
    key = linear_search(group_col_name, sample_info_header)
    # group name -> list of sample ids belonging to that group
    table = hash_tables.ChainedHash(50, hash_functions.h_rolling)
    keys = []
    for i in samples:
        result = table.search(i[key])
        if result is None:
            table.add(i[key], [i[0]])
            keys.append(i[key])
        else:
            loc = table.search_loc(i[key])
            table.T[loc][0][1].append(i[0])
    version = None
    dim = None
    data_header = None
    gene_name_col = 1
    # sample id -> read count for the target gene
    table_2 = hash_tables.ChainedHash(10000, hash_functions.h_rolling)
    for l in gzip.open(data_file_name, 'rt'):
        # FIX: sentinel checks use 'is None' identity, not '== None'.
        if version is None:
            version = l
            continue
        if dim is None:
            dim = [int(x) for x in l.rstrip().split()]
            continue
        if data_header is None:
            data_header = []
            i = 0
            for field in l.rstrip().split('\t'):
                data_header.append([field, i])
                i += 1
            data_header.sort(key=lambda tup: tup[0])
            continue
        A = l.rstrip().split('\t')
        # FIX: compare against the requested gene instead of the hard-coded
        # literal 'BRCA2', which silently ignored --gene.
        if A[gene_name_col] == gene_name:
            # FIX: data_header is sorted by field name while A stays in file
            # order, so zipping header[2:] against A[2:] mispaired values.
            # Use the original column index stored alongside each header.
            for header in data_header:
                if header[1] >= 2:
                    table_2.add(header[0], A[header[1]])
    group_counts = [[] for _ in range(len(keys))]
    for i in range(len(keys)):
        for val in table.search(keys[i]):
            result = table_2.search(val)
            if result is not None:
                group_counts[i].append(int(result))
    dv.boxplot(group_counts, keys,
               ylabel='Gene Read Counts',
               xlabel=arguments.sample_type,
               title=arguments.gene,
               out_file_name=arguments.output_filename)
def testChainedHash_search_not_in_table_ascii(self):
    """Searching an empty ASCII-hashed table comes back falsy."""
    empty_table = hash_tables.ChainedHash(10, hash_functions.h_ascii)
    result = empty_table.search('key')
    self.assertFalse(result)
def test_chained_hash_key_not_in_table(self):
    """A key absent from the table searches to the -1 sentinel."""
    table = hash_tables.ChainedHash(hash_functions.h_ascii, 30)
    result = table.search('not in table')
    assert result == -1
def test_chained_hash_add_empty(self):
    """insert() on a fresh table succeeds and records the key."""
    table = hash_tables.ChainedHash(hash_functions.h_ascii, 100)
    inserted = table.insert('woah!', 1)
    assert(inserted is True)
    assert('woah!' in table.keys)
def test_search_bad_value(self):
    """A key that was never added searches to None."""
    table = ht.ChainedHash(50, hf.h_ascii)
    table.add('text', 'value')
    missing = table.search('nothere')
    self.assertEqual(missing, None)
def main():
    """Find per-tissue read counts for one gene and box-plot them.

    Reads a GTEx gene-reads file, locates the row for --gene, maps each
    sample column to its count with a chained hash, then groups the counts
    per tissue for plotting.
    """
    parser = argparse.ArgumentParser(
        description='find tissue counts for specific gene', prog='bay')
    parser.add_argument('--gene_reads', type=str, help='GTEX gene counts',
                        required=True)
    parser.add_argument('--sample', type=str, help='GTEX samples file',
                        required=True)
    parser.add_argument('--group_type', type=str,
                        help='group: either SMTS or SMTSD', required=True)
    parser.add_argument('--gene', type=str, help='gene name', required=True)
    parser.add_argument('--output_file', type=str,
                        help='desired output file name', required=True)
    args = parser.parse_args()
    version = None
    dim = None
    count_headers = None
    # FIX: cch was appended to without ever being defined (NameError).
    cch = []
    for l in open(args.gene_reads, 'rt'):
        if version is None:
            # FIX: the original assigned 'v = l', so version stayed None and
            # this branch consumed every line of the file.
            version = l
            continue
        if dim is None:
            dim = l
            continue
        if count_headers is None:
            # FIX: rstrip was referenced, not called ('l.rstrip.split(...)'),
            # which raised AttributeError on the returned builtin method.
            count_headers = l.rstrip().split('\t')
            for i in range(len(count_headers)):
                cch.append([count_headers[i], i])
            continue
        counts = l.rstrip().split('\t')
        desc = linear_search('Description', count_headers)
        if counts[desc] == args.gene:
            to_return = []
            chainedhash = ht.ChainedHash(1000000, hf.h_rolling)
            # sample id -> read count for the target gene
            for i in range(desc + 1, len(count_headers)):
                chainedhash.add(count_headers[i], int(counts[i]))
            # NOTE(review): 'group' and 'table' are not defined in this
            # function; presumably module-level structures built elsewhere
            # from the --sample file — confirm before relying on this path.
            for t in group:
                list_counts = []
                location = table.search(t)
                if location is None:
                    continue
                for s in location:
                    count = chainedhash.search(s)
                    if count is None:
                        continue
                    list_counts.append(count)
                to_return.append(list_counts)
            dv.boxplot(to_return, args.output_file, 'x', 'y', 'title',
                       groups=group)
def main():
    """Plot read counts of one gene, grouped by tissue, from GTEx files.

    Reads the sample-attributes file into a ChainedHash mapping tissue ->
    list of sample ids, then scans the gzipped counts file for the target
    gene row and collects per-group counts for a box plot. Exits with
    status 1 and a message on any input problem.
    """
    # data_file_name='GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_reads.acmg_59.gct.gz'
    data_file_name = args.gene_reads
    # sample_info_file_name='GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt'
    sample_info_file_name = args.sample_attributes
    # samples_to_count_map = get_samples_to_count_map
    # group_col_name = 'SMTS'
    group_col_name = args.group_type
    if (group_col_name != 'SMTS') and (group_col_name != 'SMTSD'):
        print('--group_type must be either SMTS or SMTSD')
        sys.exit(1)
    sample_id_col_name = 'SAMPID'
    # gene_name = 'ACTA2'
    gene_name = args.gene
    samples = []
    sample_info_header = None
    # first line of the attributes file is the header; the rest are samples
    try:
        for l in open(sample_info_file_name):
            if sample_info_header is None:
                sample_info_header = l.rstrip().split('\t')
            else:
                samples.append(l.rstrip().split('\t'))
    except FileNotFoundError:
        print('--sample_attributes could not be found')
        sys.exit(1)
    # linear_search raises TypeError when the header is missing/empty
    try:
        group_col_idx = linear_search(group_col_name, sample_info_header)
        sample_id_col_idx = linear_search(sample_id_col_name,
                                          sample_info_header)
    except TypeError:
        print('--sample_attributes is not formatted properly,' +
              ' check that it is not empty')
        sys.exit(1)
    # tissue/group name -> list of sample ids in that group
    tissue_to_samples_map = ht.ChainedHash(1000000, hf.h_rolling)
    groups = []
    for s in samples:
        key = s[group_col_idx]
        value = s[sample_id_col_idx]
        in_group = linear_search(key, groups)
        if in_group == -1:
            groups.append(key)
        hit = tissue_to_samples_map.search(key)
        if hit is None:
            # seed an empty list, then re-fetch it so appends land in-table
            tissue_to_samples_map.add(key, [])
            hit = tissue_to_samples_map.search(key)
        hit.append(value)
    version = None
    dim = None
    data_header = None
    gene_name_col = 1
    group_counts = [[] for i in range(len(groups))]
    gene_hits = 0
    # sample id -> column index in the counts row
    samples_to_count_map = ht.ChainedHash(1000000, hf.h_rolling)
    try:
        for l in gzip.open(data_file_name, 'rt'):
            # .gct layout: version line, dimensions line, header, data rows
            if version is None:
                version = l
                continue
            if dim is None:
                dim = [int(x) for x in l.rstrip().split()]
                continue
            if data_header is None:
                i = 0
                for field in l.rstrip().split('\t'):
                    samples_to_count_map.add(field, i)
                    i += 1
                data_header = 1
                continue
            A = l.rstrip().split('\t')
            if A[gene_name_col] == gene_name:
                gene_hits += 1
                for group_idx in range(len(groups)):
                    members = tissue_to_samples_map.search(groups[group_idx])
                    for member in members:
                        member_idx = samples_to_count_map.search(member)
                        if member_idx is not None:
                            group_counts[group_idx].append(int(A[member_idx]))
                # gene row found and processed — stop scanning the file
                break
    except OSError:
        print('--gene_reads must be a gzipped file')
        sys.exit(1)
    except Exception:
        print('There was a problem with --gene_reads')
        sys.exit(1)
    if gene_hits == 0:
        print('Gene could not be found in given data')
        sys.exit(1)
    try:
        data_viz.boxplot(group_counts, args.output_file, gene_name,
                         group_col_name, 'Gene read counts', groups)
    except SystemExit:
        print('--output_file already exists, please choose a different name')
        sys.exit(1)
    except ValueError:
        print('--output_file is of unsupported type, try a .png')
        sys.exit(1)
def main():
    """Benchmark insert/search over a chosen key-value data structure.

    Reads up to --number_keys tab-separated pairs from --dataset into the
    structure named by --datastructure ('hash', 'binary_tree' or 'avl_tree')
    and prints wall-clock insert and search times.
    """
    # NOTE(review): several help strings below look truncated by adjacent
    # string concatenation (e.g. 'Store key' 'data structures') — confirm
    # against the intended wording.
    parser = argparse.ArgumentParser(description='Store key'
                                     'data structures',
                                     prog='insert_key_value_pairs')
    parser.add_argument('--datastructure', type=str, help='Name of '
                        "datastructure to use. Choose from 'hash', "
                        "'binary_tree', or 'avl_tree'", required=True)
    parser.add_argument('--dataset', type=str, help='Name of txt file'
                        ', value pairs', required=True)
    parser.add_argument('--number_keys', type=int, help='Number of keys from'
                        'dataset to read in', required=True)
    args = parser.parse_args()
    datastructure = args.datastructure
    filename = args.dataset
    N = args.number_keys
    if datastructure == 'hash':
        print('initializing')
        hashtable = ht.ChainedHash(10000000, ht.hash_functions.h_rolling)
        # time insertion of the first N pairs
        insert_t0 = time.time()
        counter = 0
        for line in open(filename, 'r'):
            data = line.rstrip().split('\t')
            hashtable.add(data[0], data[1])
            counter += 1
            if counter == N:
                break
        insert_t1 = time.time()
        # time searching for the same N keys
        search_t0 = time.time()
        counter = 0
        for line in open(filename, 'r'):
            data = line.rstrip().split('\t')
            hashtable.search(data[0])
            counter += 1
            if counter == N:
                break
        search_t1 = time.time()
        print('time to insert: ' + str(insert_t1 - insert_t0))
        print('time to search: ' + str(search_t1 - search_t0))
    elif datastructure == 'binary_tree':
        print('initialize binary tree')
        # create_tree handles its own file reading; time the whole build
        insert_t0 = time.time()
        datatree = binary_tree.create_tree(filename, N)
        insert_t1 = time.time()
        search_t0 = time.time()
        counter = 0
        for line in open(filename, 'r'):
            data = line.rstrip().split('\t')
            binary_tree.search(datatree, data[0])
            counter += 1
            if counter == N:
                break
        search_t1 = time.time()
        print('time to insert: ' + str(insert_t1 - insert_t0))
        print('time to search: ' + str(search_t1 - search_t0))
    elif datastructure == 'avl_tree':
        print('initialize AVL tree')
        insert_t0 = time.time()
        datatree = avl_tree.create_AVLtree(filename, N)
        insert_t1 = time.time()
        search_t0 = time.time()
        counter = 0
        for line in open(filename, 'r'):
            data = line.rstrip().split('\t')
            avl_tree.search(datatree, data[0])
            counter += 1
            if counter == N:
                break
        search_t1 = time.time()
        print('time to insert: ' + str(insert_t1 - insert_t0))
        print('time to search: ' + str(search_t1 - search_t0))
    else:
        print('does not recognize ')
def main():
    """Create box plots of gene expression data from GTEx analysis.

    Command-line arguments
    ----------------------
    --gene_reads_file : GTEx_Analysis file ending in '.gct.gz' containing
        measured gene expression levels per sample.
    --sample_info_file : txt file with sample identification info
        corresponding to the data in the .gz file.
    --group_data_by : 'SMTS' (tissue groups) or 'SMTSD' (tissue types).
    --target_gene : gene of interest to plot.
    --output_file_name : name (with .png extension) for the saved plot.

    Returns
    -------
    Saves a box plot of expression data to output_file_name.
    """
    parser = argparse.ArgumentParser(description='plot gene expression data '
                                     'from gtex files', prog='plot_gtex.py')
    parser.add_argument('--gene_reads_file', type=str, help='Name of gene'
                        'count input file', required=True)
    parser.add_argument('--sample_info_file', type=str, help='Name of sample'
                        'info input file', required=True)
    parser.add_argument('--group_data_by', type=str, help='Select either'
                        'tissue groups (SMTS) or tissue types (SMTSD)',
                        required=True)
    parser.add_argument('--target_gene', type=str, help='Gene of interest to '
                        'plot', required=True)
    parser.add_argument('--output_file_name', type=str, help='Name for saved'
                        'output graph', required=True)
    args = parser.parse_args()
    try:
        # file with gene read counts for each sample
        data_file_name = args.gene_reads_file
        # file with informational headers for each sample
        sample_info_file_name = args.sample_info_file
    except FileNotFoundError:
        print('Could not find input data file')
        sys.exit(1)
    except PermissionError:
        print('Could not open input data file')
        sys.exit(1)
    # plot by tissue groups (SMTS) or tissue types (SMTSD)
    group_col_name = args.group_data_by
    # gene of interest to plot
    gene_name = args.target_gene
    sample_to_count_map = ht.ChainedHash(1000000, ht.hash_functions.h_rolling)
    version = None
    dim = None
    data_header = None
    sample_ids = None
    gene_name_col = 1
    for l in gzip.open(data_file_name, 'rt'):
        # FIX: sentinel checks use 'is None' identity, not '== None'.
        if version is None:
            version = l
            continue
        if dim is None:
            dim = [int(x) for x in l.rstrip().split()]
            continue
        if data_header is None:
            data_header = l.rstrip().split('\t')
            # FIX: drop the two non-sample columns (Name, Description) ONCE,
            # inside the header branch. The original popped two entries on
            # every data line, shrinking the header as the file was read.
            data_header.pop(0)
            data_header.pop(0)
            sample_ids = data_header
            continue
        A = l.rstrip().split('\t')
        # (debug print of every gene name removed)
        if A[gene_name_col] == gene_name:
            for sample_i in range(len(sample_ids)):
                # FIX: counts start at column 2 of the data row, so the
                # count for sample_ids[i] is A[i + 2] (was A[i], off by two).
                sample_to_count_map.add(sample_ids[sample_i],
                                        A[sample_i + 2])
    samples_to_tissues_map = ht.ChainedHash(1000000,
                                            ht.hash_functions.h_rolling)
    # in the sample info table, SAMPID is column 0, SMTS column 5,
    # SMTSD column 6
    tissues_list = []
    for l in open(sample_info_file_name):
        line_split = l.rstrip().split('\t')
        if group_col_name == 'SMTS':
            if line_split[5] not in tissues_list:
                tissues_list.append(line_split[5])
            samples_to_tissues_map.add(line_split[0], line_split[5])
        if group_col_name == 'SMTSD':
            # FIX: collect SMTSD names from column 6. The original appended
            # column 5 (SMTS) names to tissues_list while mapping samples to
            # column 6 values, so no tissue ever matched at plot time.
            if line_split[6] not in tissues_list:
                tissues_list.append(line_split[6])
            samples_to_tissues_map.add(line_split[0], line_split[6])
    tissues_list.pop(0)  # remove SMTS or SMTSD header entry from list
    group_counts = []
    for tissue in tissues_list:
        counts = []
        for sample in sample_ids:
            if samples_to_tissues_map.search(sample) == tissue:
                counts.append(int(sample_to_count_map.search(sample)))
        group_counts.append(counts)
    # plotting with data_viz.py module: a list of lists of gene data per
    # tissue, plus the tissue names labelling each list
    saved_plot_name = args.output_file_name
    title = str(gene_name)
    x_label = group_col_name
    y_label = "Gene read counts"
    data = group_counts
    x_ticks = tissues_list
    data_viz.boxplot(saved_plot_name, title, x_label, y_label, data, x_ticks)
def main():
    """ test data structures for storing key, value pairs

    Arguments
    ---------
    --datastructure: the datastructure to build storing desired key, value
        pairs. Choose from 'hash', 'binary_tree', or 'avl_tree'.
    --dataset: a tab-separated txt file containing lines of key, value pairs
        to store
    --number_keys: the number of keys from dataset to read in

    Returns
    -------
    The specified data structure containing all key, value pairs. Also
    prints the elapsed time to insert all keys and elapsed time to search
    for all keys.
    """
    parser = argparse.ArgumentParser(description='Store key, value pairs in '
                                     'data structures',
                                     prog='insert_key_value_pairs')
    parser.add_argument('--datastructure', type=str, help='Name of '
                        "datastructure to use. Choose from 'hash', "
                        "'binary_tree', or 'avl_tree'", required=True)
    parser.add_argument('--dataset', type=str, help='Name of txt file with key'
                        ', value pairs', required=True)
    parser.add_argument('--number_keys', type=int, help='Number of keys from'
                        'dataset to read in', required=True)
    args = parser.parse_args()
    datastructure = args.datastructure
    filename = args.dataset
    N = args.number_keys
    if datastructure == 'hash':
        # call hash tables submodule
        print('initialize hash table')
        hashtable = ht.ChainedHash(10000000, ht.hash_functions.h_rolling)
        # measure time to insert all keys in file
        insert_t0 = time.time()
        counter = 0
        for line in open(filename, 'r'):
            data = line.rstrip().split('\t')
            hashtable.add(data[0], data[1])
            counter += 1
            if counter == N:
                break
        insert_t1 = time.time()
        # measure time to search for all keys
        search_t0 = time.time()
        counter = 0
        for line in open(filename, 'r'):
            data = line.rstrip().split('\t')
            hashtable.search(data[0])
            counter += 1
            if counter == N:
                break
        search_t1 = time.time()
        print('time to insert: ' + str(insert_t1 - insert_t0))
        print('time to search: ' + str(search_t1 - search_t0))
    elif datastructure == 'binary_tree':
        # call binary_tree tree function
        print('initialize binary tree')
        # measure time to insert all keys in file
        insert_t0 = time.time()
        datatree = binary_tree.create_tree(filename, N)
        insert_t1 = time.time()
        # measure time to search for keys
        search_t0 = time.time()
        counter = 0
        for line in open(filename, 'r'):
            data = line.rstrip().split('\t')
            binary_tree.search(datatree, data[0])
            counter += 1
            if counter == N:
                break
        search_t1 = time.time()
        print('time to insert: ' + str(insert_t1 - insert_t0))
        print('time to search: ' + str(search_t1 - search_t0))
    elif datastructure == 'avl_tree':
        # call avl_tree tree function
        print('initialize AVL tree')
        # measure time to insert all keys in file
        insert_t0 = time.time()
        datatree = avl_tree.create_AVLtree(filename, N)
        insert_t1 = time.time()
        # measure time to search for keys
        search_t0 = time.time()
        counter = 0
        for line in open(filename, 'r'):
            data = line.rstrip().split('\t')
            avl_tree.search(datatree, data[0])
            counter += 1
            if counter == N:
                break
        search_t1 = time.time()
        print('time to insert: ' + str(insert_t1 - insert_t0))
        print('time to search: ' + str(search_t1 - search_t0))
    else:
        print('does not recognize datastructure name')
def main():
    """Group GTEx samples by tissue, time lookups, and box-plot gene counts.

    Builds group membership lists from the sample-info file, scans the
    gzipped counts file for the target gene row, records one count per
    group in a ChainedHash while timing binary vs. hash lookups, and plots
    the result.
    """
    data_file_name = args.gzfile
    sample_info_file_name = args.txtfile
    group_col_name = args.group_type
    gene_name = args.gene
    sample_id_col_name = 'SAMPID'
    samples = []
    sample_info_header = None
    # first line is the header; remaining lines are sample rows
    for l in open(sample_info_file_name):
        if sample_info_header is None:
            sample_info_header = l.rstrip().split('\t')
        else:
            samples.append(l.rstrip().split('\t'))
    group_col_idx = linear_search(group_col_name, sample_info_header)
    sample_id_col_idx = linear_search(sample_id_col_name, sample_info_header)
    groups = []   # distinct group names, in first-seen order
    members = []  # members[i] = sample ids belonging to groups[i]
    names = []    # group name of every sample row (with repeats)
    MemTable = ht.ChainedHash(35, hf.h_rolling)
    for row_idx in range(len(samples)):
        sample = samples[row_idx]
        sample_name = sample[sample_id_col_idx]
        curr_group = sample[group_col_idx]
        names = names + [curr_group]
        curr_group_idx = linear_search(curr_group, groups)
        if curr_group_idx == -1:
            # first time this group is seen: open a new member list
            curr_group_idx = len(groups)
            groups.append(curr_group)
            members.append([])
        members[curr_group_idx].append(sample_name)
        MemTable.add(curr_group, sample_name)
    # de-duplicate while preserving first-seen order
    nameset = list(dict.fromkeys(names).keys())
    version = None
    dim = None
    data_header = None
    gene_name_col = 1
    Table1 = ht.ChainedHash(len(nameset), hf.h_rolling)
    group_counts = [[] for i in range(len(groups))]
    for l in gzip.open(data_file_name, 'rt'):
        # .gct layout: version line, dimensions line, header, data rows
        if version is None:
            version = l
            continue
        if dim is None:
            dim = [int(x) for x in l.rstrip().split()]
            continue
        if data_header is None:
            data_header = []
            i = 0
            for field in l.rstrip().split('\t'):
                data_header.append([field, i])
                i += 1
            # sorted by field name so binary_search below can be used
            data_header.sort(key=lambda tup: tup[0])
            continue
        A = l.rstrip().split('\t')
        if A[gene_name_col] == gene_name:
            for group_idx in range(len(groups)):
                ii = 0.0  # members examined
                jj = 0.0  # members found in the header
                for member in members[group_idx]:
                    t00_binary = time.time()
                    member_idx = binary_search(member, data_header)
                    t01_binary = time.time()
                    ii = ii + 1
                    if member_idx != -1:
                        jj = jj + 1
                        t0_hash = time.time()
                        Table1.add(groups[group_idx], int(A[member_idx]))
                        t1_hash = time.time()
                        # NOTE(review): this break records only the FIRST
                        # found member's count per group — confirm that a
                        # single count per group is intended.
                        break
                # NOTE(review): t0_hash/t1_hash (and the timing products
                # below) are stale or unbound if no member matched for this
                # group — verify against expected inputs.
                binarytime = t01_binary - t00_binary
                print("Binary Time")
                print(binarytime * ii)
                hashtime = t1_hash - t0_hash
                print("Hash Time")
                print(hashtime * jj)
    # rebuild group_counts from the hash: one list per distinct group key
    group_counts = [[] for i in range(len(groups))]
    i = 0
    for key in np.unique(Table1.keys):
        group_counts[i].append(Table1.search(key))
        i = i + 1
    g = data_viz.boxplot(group_counts, sorted(nameset), group_col_name,
                         gene_name, args.outfile)
def test_search_function(self):
    """A stored key is retrievable through search()."""
    table = ht.ChainedHash(50, hf.h_ascii)
    table.add('text', 'value')
    found = table.search('text')
    self.assertEqual(found, 'value')
def test_no_overwrite(self):
    """Re-adding a key appends to its chain instead of overwriting."""
    table = ht.ChainedHash(50, hf.h_ascii)
    for stored in ('value', 'newvalue'):
        table.add('text', stored)
    bucket = table.T[3]  # 'text' hashes to slot 3 in a 50-slot table
    self.assertEqual(bucket[0][1], 'value')
    self.assertEqual(bucket[1][1], 'newvalue')
def testChainedHash_search_in_table_python(self):
    """search() finds the doubled value stored under key '3'."""
    table = hash_tables.ChainedHash(10, hash_functions.h_python)
    for n in range(5):
        table.add(str(n), n + n)
    self.assertEqual(table.search('3'), 6)
def test_search_two_values_one_key(self):
    """Two distinct keys each keep their own value."""
    table = ht.ChainedHash(5, hf.h_ascii)
    pairs = {'text': 'value', 'blet': 'newvalue'}
    for key, value in pairs.items():
        table.add(key, value)
    for key, value in pairs.items():
        self.assertEqual(table.search(key), value)
def testChainedHash_add_to_empty_ascii(self):
    """add() to a fresh table of random size succeeds."""
    # FIX: randint(0, 100) could produce a zero-slot table, making the
    # hash's modulo raise ZeroDivisionError and the test flaky.
    x = random.randint(1, 100)
    y = hash_functions.h_ascii
    test = hash_tables.ChainedHash(x, y)
    self.assertTrue(test.add('key', 10))
def test_chained_hash_search_1(self):
    """insert() then search() round-trips a single pair."""
    table = hash_tables.ChainedHash(hash_functions.h_ascii, 100)
    table.insert('woah!', 1)
    result = table.search('woah!')
    assert(result == 1)
def test_chained_hash_bad_fxn(self):
    """Non-callable hash functions are rejected with TypeError."""
    bad_fxns = (None, 'string', int(5), float(420.69))
    for bad in bad_fxns:
        self.assertRaises(TypeError, lambda bad=bad: ht.ChainedHash(5, bad))
def test_chained_hash_replace_key(self):
    """Re-inserting a key overwrites its value without growing the table."""
    table = hash_tables.ChainedHash(hash_functions.h_ascii, 30)
    for val in (10, 100):
        table.insert('ayo', val)
    assert table.capacity == 1
    assert table.search('ayo') == 100
def test_chained_hash_add_key_none(self):
    """add() refuses a None key and returns None."""
    table = ht.ChainedHash(5, hf.h_ascii)
    result = table.add(None, 420)
    self.assertEqual(None, result)
def main():
    """Plot per-tissue counts of one gene using parallel arrays or hashes.

    --data_structure selects the lookup strategy: 'parallel' keeps paired
    sample/group lists and linear-searches them; 'hash' keeps two
    ChainedHash maps (group -> sample ids, sample id -> count). Either way
    the target gene's row is located in the gzipped counts file and counts
    grouped per tissue are box-plotted. Exits non-zero on any input error.
    """
    args = initialize()
    # check if the input files exist
    if (not os.path.exists(args.sample_attributes)):
        print('Metadata file not found')
        sys.exit(1)
    if (not os.path.exists(args.gene_reads)):
        print('Gene data file not found')
        sys.exit(1)
    target_gene_name = args.gene
    metadata_header = None
    if args.data_structure == 'parallel':
        samples, target_group = [], []  # only for parallel array
    elif args.data_structure == 'hash':
        target_group = []
        ht_meta = hash_tables.ChainedHash(100000, hash_functions.h_rolling)
    else:
        print('Please input data structures available.')
        print('Options available include "parallel" and "hash".')
        sys.exit(1)
    for l in open(args.sample_attributes):
        sample_info = l.rstrip().split('\t')
        if metadata_header is None:
            metadata_header = sample_info
            continue
        sample_idx = linear_search('SAMPID', metadata_header)
        target_idx = linear_search(args.group_type, metadata_header)
        if (target_idx == -1):
            break  # no such group
        if args.data_structure == 'parallel':
            samples.append(sample_info[sample_idx])  # ID
            target_group.append(sample_info[target_idx])  # group type
        elif args.data_structure == 'hash':
            key = sample_info[target_idx]  # group type
            value = sample_info[sample_idx]  # ID
            search = ht_meta.search(key)
            if search is None:
                ht_meta.add(key, [value])  # map ID and group
                target_group.append(key)
            else:
                # list is shared with the table entry; append in place
                search.append(value)
    if len(target_group) == 0:
        print('Group type not found')
        sys.exit(1)
    version, dim, rna_header = None, None, None
    for l in gzip.open(args.gene_reads, 'rt'):
        # .gct layout: version line, dimensions line, header, data rows
        if version is None:
            version = l
            continue
        if dim is None:
            dim = l
            continue
        if rna_header is None:
            rna_header = l.rstrip().split('\t')
            # (header, column index) pairs, sorted for binary search
            rna_header_plus_index = []
            for i in range(len(rna_header)):
                rna_header_plus_index.append([rna_header[i], i])
            rna_header_plus_index.sort()
            continue
        rna_counts = l.rstrip().split('\t')
        description_idx = linear_search('Description', rna_header)
        if description_idx == -1:
            print('No genes found in the header')
            sys.exit(1)
        if rna_counts[description_idx] == target_gene_name:
            if args.data_structure == 'parallel':
                # distinct group names, alphabetical
                attrs = list(set(target_group))
                attrs.sort()
                par_array = []
                # search_start = time.time()
                for attr in attrs:
                    attr_idxs = linear_search_all_hits(attr, target_group)
                    attr_counts = []
                    for attr_idx in attr_idxs:
                        rna_header_idx = linear_search(samples[attr_idx],
                                                       rna_header)
                        # rna_header_idx = binary_search(samples[attr_idx],
                        #                                rna_header_plus_index)
                        if rna_header_idx == -1:
                            continue
                        count = rna_counts[rna_header_idx]
                        attr_counts.append(int(count))
                    par_array.append(attr_counts)
                data_viz.boxplot(par_array, target_group, args.group_type,
                                 'Gene read counts', target_gene_name,
                                 args.output_file)
                # search_end = time.time()
                # print(search_end - search_start)
                sys.exit(0)
            elif args.data_structure == 'hash':
                counts_list = []
                ht_rna = hash_tables.ChainedHash(
                    100000, hash_functions.h_rolling)
                for i in range(description_idx + 1, len(rna_header)):
                    # map ID and counts
                    ht_rna.add(rna_header[i], int(rna_counts[i]))
                target_group.sort()
                for attr in target_group:
                    attr_counts = []
                    sampID = ht_meta.search(attr)
                    if sampID is None:
                        continue
                    for ID in sampID:
                        count = ht_rna.search(ID)
                        if count is None:
                            continue
                        attr_counts.append(count)
                    counts_list.append(attr_counts)
                data_viz.boxplot(counts_list, target_group, args.group_type,
                                 'Gene read counts', target_gene_name,
                                 args.output_file)
                sys.exit(0)
    sys.exit(0)
def main():
    """Benchmark hash table, AVL tree, and binary tree on key-value pairs.

    Reads --number_pairs space-separated pairs from --dataset, times
    insertion, successful search, and unsuccessful search for the structure
    chosen by --data_structure ('hash', 'AVL', 'tree', or 'all'), and for
    'all' also saves a grouped bar chart of the timings.
    """
    args = initialize()
    if args.number_pairs <= 1 or args.number_pairs > 10000:
        print('The number of key/value pairs should be in the range of 2 to \
10000.')
        sys.exit(1)
    if not os.path.exists(args.dataset):
        print('Input dataset not found.')
        sys.exit(1)
    else:
        f = open(args.dataset, 'r')
        lines = f.readlines()
        f.close()
    t_insert, t_search, t_search_non = [], [], []  # just for plotting
    if args.data_structure == 'hash' or args.data_structure == 'all':
        print('\nResults of the hash table')
        print('=========================')
        # key insertion
        table = hash_tables.ChainedHash(10 * int(args.number_pairs),
                                        hash_functions.h_rolling)
        i = 0  # number of pairs taken in / line number
        key_list = []
        start = time.time()
        for l in lines:
            key = l.split(' ')[0]
            value = l.split(' ')[1]
            key_list.append(key)
            if i < args.number_pairs:
                table.add(key, value)
                i += 1
            else:
                break
        end = time.time()
        t_insert.append(end - start)
        print(
            'It requires %8.7f seconds to insert %s keys to the hash table.'
            % ((end - start), args.number_pairs))
        # searching existing keys
        start = time.time()
        for key in key_list:
            table.search(key)
        end = time.time()
        t_search.append(end - start)
        print('It requires %8.7f seconds to search for all the %s keys inerted\
 just now in the hash table.' % ((end - start), args.number_pairs))
        # searching non-existing keys
        start = time.time()
        for key in key_list:
            table.search(key + '_non')
        end = time.time()
        t_search_non.append(end - start)
        print('It requires %8.7f seconds to search for %s non-existing keys in\
 the hash table.\n' % ((end - start), args.number_pairs))
    if args.data_structure == 'AVL' or args.data_structure == 'all':
        print('Results of the AVL tree')
        print('=======================')
        # key insertion
        avl_tree = avl.AVLTree()
        i = 0  # number of pairs taken in / line number
        key_list = []
        start = time.time()
        # NOTE(review): unlike the hash/tree branches, this loop has no
        # 'else: break', so it walks every remaining line after the first
        # number_pairs, inflating the measured insert time — confirm intent.
        for l in lines:
            key = l.split(' ')[0]
            value = l.split(' ')[1]
            key_list.append(key)
            if i < args.number_pairs:
                avl_tree.insert(key, value)
                i += 1
        end = time.time()
        t_insert.append(end - start)
        print('It requires %8.7f seconds to insert %s keys to the AVL tree.'
              % ((end - start), args.number_pairs))
        # searching existing keys
        start = time.time()
        for key in key_list:
            avl_tree.search(key)
        end = time.time()
        t_search.append(end - start)
        print('It requires %8.7f seconds to search for all the %s keys inerted\
 just now in the AVL tree.' % ((end - start), args.number_pairs))
        # searching non-existing keys
        start = time.time()
        for key in key_list:
            avl_tree.search(key + '_non')
        end = time.time()
        t_search_non.append(end - start)
        print('It requires %8.7f seconds to search for %s non-existing keys in\
 the AVL tree.\n' % ((end - start), args.number_pairs))
    if args.data_structure == 'tree' or args.data_structure == 'all':
        print('Results of the binary tree')
        print('==========================')
        # key insertion
        i = 0  # number of pairs taken in / line number
        key_list = []
        start = time.time()
        for l in lines:
            key = l.split(' ')[0]
            value = l.split(' ')[1]
            key_list.append(key)
            if i < args.number_pairs:
                if i == 0:
                    # first pair becomes the root node
                    root = bt.Node(key, value)
                    i += 1
                else:
                    bt.insert(root, key, value)
                    i += 1
            else:
                break
        end = time.time()
        t_insert.append(end - start)
        print(
            'It requires %8.7f seconds to insert %s keys to the binary tree.'
            % ((end - start), args.number_pairs))
        # searching existing keys
        start = time.time()
        for key in key_list:
            bt.search(root, key)
        end = time.time()
        t_search.append(end - start)
        print('It requires %8.7f seconds to search for all the %s keys inerted\
 just now in the binary tree.' % ((end - start), args.number_pairs))
        # searching non-existing keys
        start = time.time()
        for key in key_list:
            bt.search(root, key + '_non')
        end = time.time()
        t_search_non.append(end - start)
        print('It requires %8.7f seconds to search for %s non-existing keys in\
 the binary tree.\n' % ((end - start), args.number_pairs))
    # Plot a bar chart if "all" is selected
    if args.data_structure == 'all':
        rc(
            'font', **{
                'family': 'sans-serif',
                'sans-serif': ['DejaVu Sans'],
                'size': 10
            })
        # Set the font used for MathJax - more on this later
        rc('mathtext', **{'default': 'regular'})
        plt.rc('font', family='serif')
        n_groups = 3  # 3 different data structures
        fig, ax = plt.subplots()
        index = np.arange(n_groups)
        bar_width = 0.25
        data1 = plt.bar(index, t_insert, bar_width, alpha=0.8,
                        label='Insertion')
        data2 = plt.bar(index + bar_width, t_search, bar_width, alpha=0.8,
                        label='Searching\n existing keys')
        data3 = plt.bar(index + 2 * bar_width, t_search_non, bar_width,
                        alpha=0.8, label='Searching\n non-existing keys')
        # datasets named like 'rand.txt' are labelled 'random' in the title
        if 'rand' in args.dataset:
            keyword = args.dataset.split('.')[0] + 'om'
        else:
            keyword = args.dataset.split('.')[0]
        plt.title('Manipulation of %s %s key-value pairs'
                  % (args.number_pairs, keyword), weight='semibold')
        plt.xlabel('Data structures', weight='semibold')
        plt.ylabel('Time required (s)', weight='semibold')
        plt.xticks(index + bar_width,
                   ('Hash table', 'AVL tree', 'Binary tree'))
        plt.legend()
        plt.tight_layout()
        plt.grid(True)
        plt.savefig('Benchmark_%s_%s.png' % (keyword, args.number_pairs))
        plt.show()