def measure_indexing_times():
    """Time the creation of each supported index type.

    Every index builder is invoked ``test_runs`` times (module-level setting)
    and the value it returns is recorded as one time sample.

    Returns:
        dict mapping ``'vcfc_binned_index'``, ``'vcfc_sparse_external_index'``,
        ``'bgzip_index'`` and ``'bcf_index'`` to a dict with the raw samples
        (``'times'``), their ``'mean'`` and their ``'stddev'``.
    """
    config = Config(tabix_cmd, vcfc_dir, bgzip_cmd)
    binned_bin_size = 150

    # (result key, zero-argument callable performing one index build).
    # Lambdas keep the name lookups lazy, and the tuple order fixes both the
    # execution order and the key order of the returned dict.
    benchmarks = (
        ('vcfc_binned_index',
         lambda: create_binned_index(config, vcfc_filename, binned_bin_size)),
        ('vcfc_sparse_external_index',
         lambda: create_vcfc_sparse_external_index(config, vcfc_filename)),
        ('bgzip_index',
         lambda: create_tabix_index(config, bgzip_filename)),
        ('bcf_index',
         lambda: create_tabix_index(config, bcf_filename)),
    )

    results = {}
    for key, build_once in benchmarks:
        samples = [build_once() for _ in range(test_runs)]
        results[key] = {
            'times': samples,
            'mean': np.average(samples),
            'stddev': np.std(samples)
        }
    return results
def measure_binned_index_creation_time():
    """Measure how binned-index creation time varies with the bin size.

    For every bin size in the sweep the index is built ``test_runs`` times
    (module-level setting); the mean and standard deviation of the reported
    build times are recorded.

    Returns:
        dict with the measured series under ``'data'`` plus the ``'title'``,
        ``'name'``, ``'xlabel'`` and ``'ylabel'`` metadata entries.
    """
    config = Config(tabix_cmd, vcfc_dir, bgzip_cmd)

    # Fine steps for small bins, coarser steps as the bin size grows.
    sweep = list(range(5, 50, 5))
    sweep.extend(range(50, 200, 25))
    sweep.extend(range(200, 2000 + 1, 100))

    def timed_build(size):
        # One build of the index, with progress output before and after.
        print('Creating binned index with bin size %d' % size)
        elapsed = create_binned_index(config, vcfc_filename, size)
        print('Finished creating binned index, took %f seconds' % (elapsed))
        return elapsed

    per_bin_stats = []
    for size in sweep:
        samples = [timed_build(size) for _ in range(test_runs)]
        per_bin_stats.append({
            'bin_size': size,
            'time': sum(samples) / len(samples),
            'stddev': np.std(samples)
        })

    series = {
        'vcfc_binned_index_creation_time': {
            'data': per_bin_stats,
            'label': 'VCFC Binned Index Creation Time'
        }
    }

    return {
        'data': series,
        'title': 'VCFC Binned Index Creation Time by Bin Size',
        'name': 'binned-index-creation-time',
        'xlabel': 'Bin Size',
        'ylabel': 'Time (seconds)'
    }
def measure_binned_index_time_profile_range(query_range: int = 5000,
                                            queries: int = 500):
    """Profile binned-index range queries across a sweep of bin sizes.

    For each bin size the binned index is rebuilt, then range queries of
    width ``query_range`` are issued at evenly spaced start positions. Every
    query is repeated twice; the timing profiles of the repetitions are
    averaged per label, accumulated over all positions and finally divided by
    the number of positions queried.

    Args:
        query_range: Width of each query; the end position is
            ``start + query_range``.
        queries: Number of strides to fit between ``min_pos`` and
            ``max_pos - query_range``. Must be positive.

    Returns:
        dict whose ``'data'`` entry maps ``'vcfc_binned_index_<bin size>'`` to
        ``{'data': <label -> mean value>, 'label': ...}``, plus the
        ``'title'``, ``'name'``, ``'xlabel'`` and ``'ylabel'`` metadata.

    Raises:
        AssertionError: If ``queries`` is not positive.
        ValueError: If the computed stride is zero (raised by ``range``).
    """
    data = {}
    assert queries > 0, 'queries > 0'
    # Stride chosen so the last window (start + query_range) does not extend
    # past max_pos.
    step = int((max_pos - query_range - min_pos) / queries)
    config = Config(tabix_cmd, vcfc_dir, bgzip_cmd)

    # Override test runs for this, since high # queries per test amortizes runtime
    test_runs = 2

    bin_sizes = [
        *range(5, 50, 5), *range(50, 200, 10), *range(200, 1000 + 1, 50)
    ]

    # The start positions do not depend on the bin size, so build them once.
    # The inclusive upper bound yields queries + 1 positions for a positive
    # stride. Building the range here also surfaces a zero stride before any
    # index is created.
    start_positions = range(min_pos, (min_pos + step * queries) + 1, step)
    query_count = len(start_positions)

    for bin_size in bin_sizes:
        bin_profile = {}
        print('Creating binned index with bin size %d' % bin_size)
        bin_index_creation_time = create_binned_index(config, vcfc_filename,
                                                      bin_size)
        print('Finished creating binned index, took %f seconds' %
              (bin_index_creation_time))

        # Run regular queries, aggregate profile after each
        print('Running %d exhaustive binned queries of size %d' %
              (query_count, query_range))
        for pos in start_positions:
            endpos = pos + query_range
            print('vcfc_binned_timing_profile_range: %d-%d' % (pos, endpos))
            profiles = [
                run_vcfc_binned_index_timing_profile(
                    config, vcfc_filename, reference_name, pos, endpos)
                for _ in range(test_runs)
            ]

            # Merge timing profiles: a label may be missing from some
            # repetitions, so average only over the runs that reported it.
            time_labels = set()
            for profile in profiles:
                time_labels.update(profile)

            for label in time_labels:
                label_values = [p[label] for p in profiles if label in p]
                bin_profile[label] = (bin_profile.get(label, 0) +
                                      sum(label_values) / len(label_values))

        # Turn the per-label sums into per-query means.
        for label in bin_profile:
            bin_profile[label] /= query_count

        data['vcfc_binned_index_%d' % bin_size] = {
            'data': bin_profile,
            'label': 'Bin Size %d' % bin_size
        }

    return {
        'data': data,
        'title': 'VCFC Binned Index Interval, Query Range %d' % query_range,
        'name': 'binned-timing-profile-range',
        'xlabel': 'Bin Size (not linear scale)',
        'ylabel': 'Time (seconds)'
    }
def measure_binned_index_time_profile(queries: int = 500):
    """Profile binned-index single-position queries across bin sizes.

    For each bin size the binned index is rebuilt, then a query with identical
    start and end position is issued at evenly spaced positions between
    ``min_pos`` and ``max_pos``. Every query is repeated twice; the timing
    profiles of the repetitions are averaged per label, accumulated over all
    positions and finally divided by the number of positions queried.

    Args:
        queries: Number of strides to fit between ``min_pos`` and ``max_pos``.

    Returns:
        dict whose ``'data'`` entry maps ``'vcfc_binned_index_<bin size>'`` to
        ``{'data': <label -> mean value>, 'label': ...}``, plus the
        ``'title'``, ``'name'``, ``'xlabel'`` and ``'ylabel'`` metadata.

    Raises:
        ZeroDivisionError: If ``queries`` is zero.
        ValueError: If the computed stride is zero (raised by ``range``).
    """
    data = {}
    step = int((max_pos - min_pos) / queries)
    config = Config(tabix_cmd, vcfc_dir, bgzip_cmd)

    # Override test runs for this, since high # queries per test amortizes runtime
    test_runs = 2

    bin_sizes = [
        *range(5, 50, 5), *range(50, 200, 10), *range(200, 1000 + 1, 50)
    ]

    # The query positions do not depend on the bin size, so build them once.
    # Building the range here also surfaces a zero stride before any index is
    # created.
    positions = range(min_pos, max_pos + 1, step)
    query_count = len(positions)

    # Load file into host cache
    os.system('cat %s > /dev/null' % vcfc_filename)

    # The index path is formed by appending the suffix to the data file name.
    # os.path.join(vcfc_filename, '.vcfci') would yield '<file>/.vcfci', which
    # cannot exist because vcfc_filename is a regular file.
    # NOTE(review): assumes the index is written as '<vcfc_filename>.vcfci' --
    # confirm against create_binned_index.
    index_filename = vcfc_filename + '.vcfci'

    for bin_size in bin_sizes:
        bin_profile = {}
        print('Creating binned index with bin size %d' % bin_size)
        bin_index_creation_time = create_binned_index(config, vcfc_filename,
                                                      bin_size)
        print('Finished creating binned index, took %f seconds' %
              (bin_index_creation_time))

        # Load index into host cache
        os.system('cat %s > /dev/null' % index_filename)

        # Run regular queries, aggregate profile after each
        print('Running exhaustive binned queries')
        for pos in positions:
            print('vcfc_binned_timing_profile: %d' % pos)

            profiles = [
                run_vcfc_binned_index_timing_profile(
                    config, vcfc_filename, reference_name, pos, pos)
                for _ in range(test_runs)
            ]

            # Merge timing profiles: a label may be missing from some
            # repetitions, so average only over the runs that reported it.
            time_labels = set()
            for profile in profiles:
                time_labels.update(profile)

            for label in time_labels:
                label_values = [p[label] for p in profiles if label in p]
                bin_profile[label] = (bin_profile.get(label, 0) +
                                      sum(label_values) / len(label_values))

        # Turn the per-label sums into per-query means.
        for label in bin_profile:
            bin_profile[label] /= query_count

        data['vcfc_binned_index_%d' % bin_size] = {
            'data': bin_profile,
            'label': 'Bin Size %d' % bin_size
        }

    return {
        'data': data,
        'title':
        'VCFC Binned Index Query Phase Time Profile, Single Variant Lookup',
        'name': 'binned-timing-profile-single',
        'xlabel': 'Bin Size (not linear scale)',
        'ylabel': 'Time (seconds)'
    }
def measure_all_single_variant(queries: int = 200):
    """Run single-position queries against every storage/index combination.

    Query positions are spread evenly between ``min_pos`` and ``max_pos``;
    each position is used as both the start and the end of the query. The
    same positions are passed to ``_run_variant_query_exhaustive`` for the
    VCFC sparse external index, the VCFC sparse offset-as-index file, the
    VCFC binned index, and tabix over the BGZIP and BCF files.

    Args:
        queries: Number of strides to fit between ``min_pos`` and ``max_pos``.

    Returns:
        dict whose ``'data'`` entry maps a series key to
        ``{'data': <result of _run_variant_query_exhaustive>, 'label': ...}``,
        plus the ``'title'``, ``'name'``, ``'xlabel'`` and ``'ylabel'``
        metadata.
    """
    stride = int((max_pos - min_pos) / queries)
    config = Config(tabix_cmd, vcfc_dir, bgzip_cmd)

    print('Creating binned index for binned index queries')
    build_seconds = create_binned_index(config, vcfc_filename,
                                        default_binsize)
    print('Finished creating binned index, took %f seconds' %
          (build_seconds))

    # Single-position lookups: one list serves as both starts and ends.
    positions = list(range(min_pos, max_pos + 1, stride))

    def time_queries(tag, runner, filename):
        # Same positions and config for every backend; only these three vary.
        return _run_variant_query_exhaustive(positions, positions, tag,
                                             config, runner, filename)

    series = {}

    # VCFC Sparse External Index
    series['vcfc_sparse_external_exhaustive'] = {
        'data': time_queries('vcfc-sparse-external-exhaustive',
                             run_vcfc_sparse_external_index_query,
                             vcfc_filename),
        'label': 'VCFC Sparse External Index'
    }

    # VCFC Sparse Offset-as-Index
    series['vcfc_sparse_exhaustive'] = {
        'data': time_queries('vcfc-sparse-exhaustive', run_vcfc_sparse_query,
                             sparse_filename),
        'label': 'VCFC Sparse Offset-as-Index'
    }

    # VCFC Binned External Index (index is rebuilt right before its queries)
    create_binned_index(config, vcfc_filename, default_binsize)
    series['vcfc_binned_external_exhaustive'] = {
        'data': time_queries('vcfc-binned-external-exhaustive',
                             run_vcfc_binned_index_query, vcfc_filename),
        'label': 'VCFC Binned External Index (Bin Size %d)' % default_binsize
    }

    # BGZIP with Tabix Index
    series['bgzip_tabix_exhaustive'] = {
        'data': time_queries('bgzip-tabix-external-exhaustive', run_tabix,
                             bgzip_filename),
        'label': 'BGZIP + Tabix Index'
    }

    # BCF with Tabix Index
    series['bcf_tabix_exhaustive'] = {
        'data': time_queries('bcf-tabix-external-exhaustive', run_tabix,
                             bcf_filename),
        'label': 'BCF + Tabix Index'
    }

    return {
        'data': series,
        'title': 'Single Variant Query Time',
        'name': 'all-exhaustive-single',
        'xlabel': 'Variant Position',
        'ylabel': 'Time (seconds)'
    }
def measure_all_range_variant(queries: int = 200, query_range: int = 5000):
    """Run range queries against every storage/index combination.

    Start positions are spread evenly so that every query window
    ``[start, start + query_range]`` stays within ``min_pos``..``max_pos``.
    The same start/end positions are passed to
    ``_run_variant_query_exhaustive`` for the VCFC sparse external index, the
    VCFC sparse offset-as-index file, the VCFC binned index, and tabix over
    the BGZIP and BCF files.

    Args:
        queries: Number of strides to fit between ``min_pos`` and
            ``max_pos - query_range``.
        query_range: Width of each query window.

    Returns:
        dict whose ``'data'`` entry maps a series key to
        ``{'data': <result of _run_variant_query_exhaustive>, 'label': ...}``,
        plus the ``'title'``, ``'name'``, ``'xlabel'`` and ``'ylabel'``
        metadata.

    Raises:
        ZeroDivisionError: If ``queries`` is zero.
        ValueError: If the computed stride is zero (raised by ``range``).
    """
    data = {}

    # Set step to fit `queries` number of positions into the range(min_pos, max_pos)
    step = int((max_pos - query_range - min_pos) / queries)
    print('Step: %s' % step)
    # Built once; the original constructed an identical Config twice.
    config = Config(tabix_cmd, vcfc_dir, bgzip_cmd)

    print('Creating binned index for binned index queries')
    bin_index_creation_time = create_binned_index(config, vcfc_filename,
                                                  default_binsize)
    print('Finished creating binned index, took %f seconds' %
          (bin_index_creation_time))

    # The inclusive upper bound yields queries + 1 start positions for a
    # positive step.
    start_positions = list(range(min_pos, (min_pos + step * queries) + 1,
                                 step))
    end_positions = [s + query_range for s in start_positions]

    # VCFC Sparse External Index
    sparse_external_durations = _run_variant_query_exhaustive(
        start_positions, end_positions, 'vcfc-sparse-external-exhaustive',
        config, run_vcfc_sparse_external_index_query, vcfc_filename)
    data['vcfc_sparse_external_exhaustive'] = {
        'data': sparse_external_durations,
        'label': 'VCFC Sparse External Index'
    }

    # VCFC Sparse Offset-as-Index
    sparse_durations = _run_variant_query_exhaustive(
        start_positions, end_positions, 'vcfc-sparse-exhaustive', config,
        run_vcfc_sparse_query, sparse_filename)
    data['vcfc_sparse_exhaustive'] = {
        'data': sparse_durations,
        'label': 'VCFC Sparse Offset-as-Index'
    }

    # VCFC Binned External Index
    # NOTE(review): the index is rebuilt here although it was already built
    # above; kept in case the earlier queries replace it -- confirm.
    create_binned_index(config, vcfc_filename, default_binsize)
    binned_durations = _run_variant_query_exhaustive(
        start_positions, end_positions, 'vcfc-binned-external-exhaustive',
        config, run_vcfc_binned_index_query, vcfc_filename)
    data['vcfc_binned_external_exhaustive'] = {
        'data': binned_durations,
        'label': 'VCFC Binned External Index (Bin Size %d)' % default_binsize
    }

    # BGZIP with Tabix Index
    bgzip_durations = _run_variant_query_exhaustive(
        start_positions, end_positions, 'bgzip-tabix-external-exhaustive',
        config, run_tabix, bgzip_filename)
    data['bgzip_tabix_exhaustive'] = {
        'data': bgzip_durations,
        'label': 'BGZIP + Tabix Index'
    }

    # BCF with Tabix Index
    bcf_durations = _run_variant_query_exhaustive(
        start_positions, end_positions, 'bcf-tabix-external-exhaustive',
        config, run_tabix, bcf_filename)
    data['bcf_tabix_exhaustive'] = {
        'data': bcf_durations,
        'label': 'BCF + Tabix Index'
    }

    return {
        'data': data,
        'title': 'Variant Range Query Time (Query Range %d)' % query_range,
        'name': 'all-exhaustive-range',
        'xlabel': 'Range Start Position',
        'ylabel': 'Time (seconds)'
    }
Example #7
0
 def __init__(self, model_configuration, exp_path):
     """Wrap ``model_configuration`` in a ``Config`` and remember ``exp_path``.

     Args:
         model_configuration: Passed unchanged to ``Config``.
         exp_path: Stored as-is on the instance (presumably an experiment
             directory -- confirm against callers).
     """
     self.model_config = Config(model_configuration)
     self.exp_path = exp_path