def measure_indexing_times():
    """Time index creation for every benchmarked index type.

    Each index is built ``test_runs`` times (module-level setting) and the
    value returned by its builder is recorded for every run.

    Returns:
        A dict keyed by ``'vcfc_binned_index'``,
        ``'vcfc_sparse_external_index'``, ``'bgzip_index'`` and
        ``'bcf_index'``. Each value is a dict holding the raw ``'times'``
        list together with its ``'mean'`` (``np.average``) and ``'stddev'``
        (``np.std``).
    """
    bench_config = Config(tabix_cmd, vcfc_dir, bgzip_cmd)
    binned_bin_size = 150

    def summarize(samples):
        # Package the raw samples with their aggregate statistics.
        return {
            'times': samples,
            'mean': np.average(samples),
            'stddev': np.std(samples),
        }

    results = {}

    # VCFC binned index, fixed bin size.
    results['vcfc_binned_index'] = summarize([
        create_binned_index(bench_config, vcfc_filename, binned_bin_size)
        for _ in range(test_runs)
    ])

    # VCFC sparse external index.
    results['vcfc_sparse_external_index'] = summarize([
        create_vcfc_sparse_external_index(bench_config, vcfc_filename)
        for _ in range(test_runs)
    ])

    # Tabix index over the bgzip file.
    results['bgzip_index'] = summarize([
        create_tabix_index(bench_config, bgzip_filename)
        for _ in range(test_runs)
    ])

    # Tabix index over the BCF file.
    results['bcf_index'] = summarize([
        create_tabix_index(bench_config, bcf_filename)
        for _ in range(test_runs)
    ])

    return results
def measure_binned_index_creation_time():
    """Measure how long binned-index creation takes for a sweep of bin sizes.

    Every bin size is built ``test_runs`` times (module-level setting); the
    arithmetic mean and ``np.std`` of the reported creation times are kept.

    Returns:
        A dict with the series under ``'data'`` (a single entry,
        ``'vcfc_binned_index_creation_time'``, whose ``'data'`` is a list of
        ``{'bin_size', 'time', 'stddev'}`` rows) plus the ``'title'``,
        ``'name'``, ``'xlabel'`` and ``'ylabel'`` strings.
    """
    bench_config = Config(tabix_cmd, vcfc_dir, bgzip_cmd)

    # 5..45 in steps of 5, 50..175 in steps of 25, 200..2000 in steps of 100.
    sweep = (list(range(5, 50, 5))
             + list(range(50, 200, 25))
             + list(range(200, 2000 + 1, 100)))

    def timed_build(size):
        # Build one index, logging before and after, and hand back its time.
        print('Creating binned index with bin size %d' % size)
        elapsed = create_binned_index(bench_config, vcfc_filename, size)
        print('Finished creating binned index, took %f seconds' % (elapsed))
        return elapsed

    rows = []
    for size in sweep:
        samples = [timed_build(size) for _ in range(test_runs)]
        rows.append({
            'bin_size': size,
            'time': sum(samples) / len(samples),
            'stddev': np.std(samples),
        })

    return {
        'data': {
            'vcfc_binned_index_creation_time': {
                'data': rows,
                'label': 'VCFC Binned Index Creation Time',
            },
        },
        'title': 'VCFC Binned Index Creation Time by Bin Size',
        'name': 'binned-index-creation-time',
        'xlabel': 'Bin Size',
        'ylabel': 'Time (seconds)',
    }
def measure_binned_index_time_profile_range(query_range: int = 5000,
                                            queries: int = 500):
    """Profile binned-index range queries across a sweep of bin sizes.

    For every bin size the binned index is rebuilt, then the range
    ``[pos, pos + query_range]`` is queried at evenly spaced start positions
    beginning at the module-level ``min_pos``. Each query is repeated twice;
    the per-label profile values are averaged over the repeats and then over
    all start positions.

    Args:
        query_range: Offset added to each start position to obtain the end
            position of the query.
        queries: Number of steps the start positions are spread over. Note
            that ``queries + 1`` start positions are actually queried (both
            ends of the stepped interval are included).

    Returns:
        A dict with ``'data'`` mapping ``'vcfc_binned_index_<bin size>'`` to
        ``{'data': <averaged profile dict>, 'label': 'Bin Size <n>'}``, plus
        the ``'title'``, ``'name'``, ``'xlabel'`` and ``'ylabel'`` strings.

    Raises:
        AssertionError: If ``queries`` is not positive.
        ValueError: If the requested queries do not fit between ``min_pos``
            and ``max_pos`` (the computed step is smaller than 1).
    """
    data = {}
    assert queries > 0, 'queries > 0'

    # Spread the start positions so that the last query still ends at or
    # before max_pos.
    step = int((max_pos - query_range - min_pos) / queries)
    if step < 1:
        # A zero step makes range() fail with an unhelpful message and a
        # negative step would silently query positions below min_pos.
        raise ValueError(
            'query_range=%d with queries=%d does not fit between min_pos '
            'and max_pos (computed step %d)' % (query_range, queries, step))

    config = Config(tabix_cmd, vcfc_dir, bgzip_cmd)

    # Local override of the module-level test_runs: the high number of
    # queries per test already amortizes runtime noise.
    test_runs = 2

    bin_sizes = [
        *range(5, 50, 5),
        *range(50, 200, 10),
        *range(200, 1000 + 1, 50),
    ]

    # min_pos, min_pos + step, ..., min_pos + step * queries (inclusive).
    start_positions = range(min_pos, (min_pos + step * queries) + 1, step)
    test_count = len(start_positions)

    for bin_size in bin_sizes:
        bin_profile = {}

        print('Creating binned index with bin size %d' % bin_size)
        bin_index_creation_time = create_binned_index(config, vcfc_filename,
                                                      bin_size)
        print('Finished creating binned index, took %f seconds' %
              (bin_index_creation_time))

        # Run the range queries, accumulating the profile after each one.
        print('Running %d exhaustive binned queries of size %d' %
              (test_count, query_range))
        for pos in start_positions:
            endpos = pos + query_range
            print('vcfc_binned_timing_profile_range: %d-%d' % (pos, endpos))

            profiles = [
                run_vcfc_binned_index_timing_profile(
                    config, vcfc_filename, reference_name, pos, endpos)
                for _ in range(test_runs)
            ]

            # Merge the repeats: a label missing from one run is averaged
            # only over the runs that reported it.
            time_labels = set()
            for profile in profiles:
                time_labels.update(profile)
            for label in time_labels:
                label_values = [p[label] for p in profiles if label in p]
                bin_profile[label] = (bin_profile.get(label, 0)
                                      + sum(label_values) / len(label_values))

        # Turn the per-label sums into a mean over all queried positions.
        for label in bin_profile:
            bin_profile[label] /= test_count

        data['vcfc_binned_index_%d' % bin_size] = {
            'data': bin_profile,
            'label': 'Bin Size %d' % bin_size,
        }

    return {
        'data': data,
        'title': 'VCFC Binned Index Interval, Query Range %d' % query_range,
        'name': 'binned-timing-profile-range',
        'xlabel': 'Bin Size (not linear scale)',
        'ylabel': 'Time (seconds)',
    }
def measure_binned_index_time_profile(queries: int = 500):
    """Profile single-position binned-index lookups across bin sizes.

    For every bin size the binned index is rebuilt, then a query with
    ``start == end == pos`` is issued at evenly spaced positions between the
    module-level ``min_pos`` and ``max_pos``. Each query is repeated twice;
    the per-label profile values are averaged over the repeats and then over
    all queried positions.

    Args:
        queries: Number of steps the positions are spread over; determines
            the spacing ``int((max_pos - min_pos) / queries)``.

    Returns:
        A dict with ``'data'`` mapping ``'vcfc_binned_index_<bin size>'`` to
        ``{'data': <averaged profile dict>, 'label': 'Bin Size <n>'}``, plus
        the ``'title'``, ``'name'``, ``'xlabel'`` and ``'ylabel'`` strings.

    Raises:
        ZeroDivisionError: If ``queries`` is 0.
        ValueError: If the computed spacing between positions is smaller
            than 1.
    """
    import shlex  # local import: only needed for quoting the cat commands

    data = {}
    step = int((max_pos - min_pos) / queries)
    if step < 1:
        # range() rejects a zero step with an unhelpful message and a
        # negative step would yield no positions at all.
        raise ValueError(
            'queries=%d does not fit between min_pos and max_pos '
            '(computed step %d)' % (queries, step))

    config = Config(tabix_cmd, vcfc_dir, bgzip_cmd)

    # Local override of the module-level test_runs: the high number of
    # queries per test already amortizes runtime noise.
    test_runs = 2

    bin_sizes = [
        *range(5, 50, 5),
        *range(50, 200, 10),
        *range(200, 1000 + 1, 50),
    ]

    # Load file into host cache. The path is quoted so that spaces or shell
    # metacharacters in it cannot break the command.
    os.system('cat %s > /dev/null' % shlex.quote(vcfc_filename))

    # The original built this path with os.path.join(vcfc_filename,
    # '.vcfci'), i.e. '<file>/.vcfci', which cannot exist while
    # vcfc_filename is a regular file.
    # NOTE(review): assumes the index lives next to the data file as
    # '<vcfc_filename>.vcfci' - confirm against create_binned_index.
    index_filename = vcfc_filename + '.vcfci'

    positions = range(min_pos, max_pos + 1, step)
    test_count = len(positions)

    for bin_size in bin_sizes:
        bin_profile = {}

        print('Creating binned index with bin size %d' % bin_size)
        bin_index_creation_time = create_binned_index(config, vcfc_filename,
                                                      bin_size)
        print('Finished creating binned index, took %f seconds' %
              (bin_index_creation_time))

        # Load index into host cache
        os.system('cat %s > /dev/null' % shlex.quote(index_filename))

        # Run the queries, accumulating the profile after each one.
        print('Running exhaustive binned queries')
        for pos in positions:
            print('vcfc_binned_timing_profile: %d' % pos)

            profiles = [
                run_vcfc_binned_index_timing_profile(
                    config, vcfc_filename, reference_name, pos, pos)
                for _ in range(test_runs)
            ]

            # Merge the repeats: a label missing from one run is averaged
            # only over the runs that reported it.
            time_labels = set()
            for profile in profiles:
                time_labels.update(profile)
            for label in time_labels:
                label_values = [p[label] for p in profiles if label in p]
                bin_profile[label] = (bin_profile.get(label, 0)
                                      + sum(label_values) / len(label_values))

        # Turn the per-label sums into a mean over all queried positions.
        for label in bin_profile:
            bin_profile[label] /= test_count

        data['vcfc_binned_index_%d' % bin_size] = {
            'data': bin_profile,
            'label': 'Bin Size %d' % bin_size,
        }

    return {
        'data': data,
        'title': 'VCFC Binned Index Query Phase Time Profile, Single Variant Lookup',
        'name': 'binned-timing-profile-single',
        'xlabel': 'Bin Size (not linear scale)',
        'ylabel': 'Time (seconds)',
    }
def measure_all_single_variant(queries: int = 200):
    """Run the single-position query benchmark against every index type.

    Positions are spaced ``int((max_pos - min_pos) / queries)`` apart
    between the module-level ``min_pos`` and ``max_pos``; every query uses
    the same value for its start and end position.

    Args:
        queries: Number of steps the positions are spread over.

    Returns:
        A dict with ``'data'`` holding one ``{'data': durations, 'label':
        ...}`` entry per benchmarked index, plus the ``'title'``,
        ``'name'``, ``'xlabel'`` and ``'ylabel'`` strings.
    """
    stride = int((max_pos - min_pos) / queries)
    bench_config = Config(tabix_cmd, vcfc_dir, bgzip_cmd)

    print('Creating binned index for binned index queries')
    build_seconds = create_binned_index(bench_config, vcfc_filename,
                                        default_binsize)
    print('Finished creating binned index, took %f seconds' % (build_seconds))

    # Start and end positions are the very same list object.
    positions = list(range(min_pos, max_pos + 1, stride))

    def exhaustive(tag, query_fn, target_file):
        # One exhaustive pass over all positions for a single index type.
        return _run_variant_query_exhaustive(
            positions, positions, tag, bench_config, query_fn, target_file)

    results = {}

    # VCFC Sparse External Index
    results['vcfc_sparse_external_exhaustive'] = {
        'data': exhaustive('vcfc-sparse-external-exhaustive',
                           run_vcfc_sparse_external_index_query,
                           vcfc_filename),
        'label': 'VCFC Sparse External Index',
    }

    # VCFC Sparse Offset-as-Index
    results['vcfc_sparse_exhaustive'] = {
        'data': exhaustive('vcfc-sparse-exhaustive',
                           run_vcfc_sparse_query,
                           sparse_filename),
        'label': 'VCFC Sparse Offset-as-Index',
    }

    # VCFC Binned External Index (the index is rebuilt right before use)
    create_binned_index(bench_config, vcfc_filename, default_binsize)
    results['vcfc_binned_external_exhaustive'] = {
        'data': exhaustive('vcfc-binned-external-exhaustive',
                           run_vcfc_binned_index_query,
                           vcfc_filename),
        'label': 'VCFC Binned External Index (Bin Size %d)' % default_binsize,
    }

    # BGZIP with Tabix Index
    results['bgzip_tabix_exhaustive'] = {
        'data': exhaustive('bgzip-tabix-external-exhaustive',
                           run_tabix,
                           bgzip_filename),
        'label': 'BGZIP + Tabix Index',
    }

    # BCF with Tabix Index
    results['bcf_tabix_exhaustive'] = {
        'data': exhaustive('bcf-tabix-external-exhaustive',
                           run_tabix,
                           bcf_filename),
        'label': 'BCF + Tabix Index',
    }

    return {
        'data': results,
        'title': 'Single Variant Query Time',
        'name': 'all-exhaustive-single',
        'xlabel': 'Variant Position',
        'ylabel': 'Time (seconds)',
    }
def measure_all_range_variant(queries: int = 200, query_range: int = 5000):
    """Run the range query benchmark against every index type.

    Start positions begin at the module-level ``min_pos`` and are spaced so
    that the last range still ends at or before ``max_pos``; each query
    covers ``[start, start + query_range]``.

    Args:
        queries: Number of steps the start positions are spread over. Note
            that ``queries + 1`` start positions are generated (both ends of
            the stepped interval are included).
        query_range: Offset added to each start position to obtain the end
            position of the query.

    Returns:
        A dict with ``'data'`` holding one ``{'data': durations, 'label':
        ...}`` entry per benchmarked index, plus the ``'title'``,
        ``'name'``, ``'xlabel'`` and ``'ylabel'`` strings.

    Raises:
        ZeroDivisionError: If ``queries`` is 0.
        ValueError: If the requested queries do not fit between ``min_pos``
            and ``max_pos`` (the computed step is smaller than 1).
    """
    data = {}
    # Built once; the original constructed an identical Config twice.
    config = Config(tabix_cmd, vcfc_dir, bgzip_cmd)

    # Set step to fit `queries` steps into the range(min_pos, max_pos)
    step = int((max_pos - query_range - min_pos) / queries)
    print('Step: %s' % step)
    if step < 1:
        # A zero step makes range() fail with an unhelpful message and a
        # negative step would silently query positions below min_pos.
        raise ValueError(
            'query_range=%d with queries=%d does not fit between min_pos '
            'and max_pos (computed step %d)' % (query_range, queries, step))

    print('Creating binned index for binned index queries')
    bin_index_creation_time = create_binned_index(config, vcfc_filename,
                                                  default_binsize)
    print('Finished creating binned index, took %f seconds' %
          (bin_index_creation_time))

    start_positions = list(
        range(min_pos, (min_pos + step * queries) + 1, step))
    end_positions = [start + query_range for start in start_positions]

    def exhaustive(tag, query_fn, target_file):
        # One exhaustive pass over all ranges for a single index type.
        return _run_variant_query_exhaustive(
            start_positions, end_positions, tag, config, query_fn,
            target_file)

    # VCFC Sparse External Index
    data['vcfc_sparse_external_exhaustive'] = {
        'data': exhaustive('vcfc-sparse-external-exhaustive',
                           run_vcfc_sparse_external_index_query,
                           vcfc_filename),
        'label': 'VCFC Sparse External Index',
    }

    # VCFC Sparse Offset-as-Index
    data['vcfc_sparse_exhaustive'] = {
        'data': exhaustive('vcfc-sparse-exhaustive',
                           run_vcfc_sparse_query,
                           sparse_filename),
        'label': 'VCFC Sparse Offset-as-Index',
    }

    # VCFC Binned External Index. The index is rebuilt right before use.
    # NOTE(review): possibly redundant with the build above - confirm
    # whether the intervening benchmarks can touch the binned index.
    create_binned_index(config, vcfc_filename, default_binsize)
    data['vcfc_binned_external_exhaustive'] = {
        'data': exhaustive('vcfc-binned-external-exhaustive',
                           run_vcfc_binned_index_query,
                           vcfc_filename),
        'label': 'VCFC Binned External Index (Bin Size %d)' % default_binsize,
    }

    # BGZIP with Tabix Index
    data['bgzip_tabix_exhaustive'] = {
        'data': exhaustive('bgzip-tabix-external-exhaustive',
                           run_tabix,
                           bgzip_filename),
        'label': 'BGZIP + Tabix Index',
    }

    # BCF with Tabix Index
    data['bcf_tabix_exhaustive'] = {
        'data': exhaustive('bcf-tabix-external-exhaustive',
                           run_tabix,
                           bcf_filename),
        'label': 'BCF + Tabix Index',
    }

    return {
        'data': data,
        'title': 'Variant Range Query Time (Query Range %d)' % query_range,
        'name': 'all-exhaustive-range',
        'xlabel': 'Range Start Position',
        'ylabel': 'Time (seconds)',
    }
def __init__(self, model_configuration, exp_path):
    """Initialise the instance from a model configuration and a path.

    Args:
        model_configuration: Passed unchanged to ``Config``; the resulting
            object is stored as ``self.model_config``.
        exp_path: Stored unchanged as ``self.exp_path``.
    """
    wrapped_config = Config(model_configuration)
    self.model_config = wrapped_config
    self.exp_path = exp_path