Exemple #1
0
def run_vcfc_binned_index_timing_profile(config: Config, filename: str,
                                         ref: str, start: int,
                                         end: int) -> dict:
    """Run a binned-index query with the timing build and parse its profile.

    Invokes ``<timing cmd> query-binned-index <filename> <ref>:<start>-<end>``
    and passes the captured stdout to ``construct_timing_profile``.

    Args:
        config: Provides ``get_vcfc_timing_cmd()``, the executable to run.
        filename: Path of the file to query; ``filename + '.vcfci'`` must
            exist as well.
        ref: Reference name used in the region string.
        start: Region start position.
        end: Region end position.

    Returns:
        Whatever ``construct_timing_profile`` builds from the command's
        stdout (annotated as ``dict``).

    Raises:
        RuntimeError: If ``filename`` or its ``.vcfci`` companion is missing,
            or if the command exits non-zero or writes anything to stderr.
        OSError: If the command cannot be started at all.
    """
    if not os.path.exists(filename):
        raise RuntimeError('%s does not exist' % filename)
    index_filename = filename + '.vcfci'
    if not os.path.exists(index_filename):
        raise RuntimeError('%s does not exist' % index_filename)

    cmd_args = [
        config.get_vcfc_timing_cmd(), 'query-binned-index', filename,
        '%s:%d-%d' % (ref, start, end)
    ]

    # The child writes straight into temp files rather than subprocess.PIPE,
    # so waiting for it to exit cannot stall on a full pipe buffer.
    temp_paths = []
    try:
        # delete=False keeps the files on disk after the handles close so
        # they can be reopened in text mode; the finally clause removes them.
        with tempfile.NamedTemporaryFile(delete=False) as out_file:
            temp_paths.append(out_file.name)
            with tempfile.NamedTemporaryFile(delete=False) as err_file:
                temp_paths.append(err_file.name)
                proc = subprocess.run(cmd_args,
                                      stdin=None,
                                      stdout=out_file,
                                      stderr=err_file)

        with open(temp_paths[0], 'r') as captured_out:
            stdout = captured_out.read()
        with open(temp_paths[1], 'r') as captured_err:
            stderr = captured_err.read()
    finally:
        # Runs on every path, including a failure to launch the command.
        for path in temp_paths:
            os.unlink(path)

    # Any stderr output counts as failure, even with a zero exit status.
    if proc.returncode != 0 or len(stderr) > 0:
        raise RuntimeError('cmd: %s failed with status %d:\n%s' %
                           (cmd_args, proc.returncode, stderr))

    return construct_timing_profile(stdout)
Exemple #2
0
def run_vcfc_sparse_query(config: Config, filename: str, ref: str, start: int,
                          end: int) -> float:
    """Time a ``sparse-query`` of the release build over one region.

    Args:
        config: Provides ``get_vcfc_release_cmd()``, the executable to run.
        filename: Path of the file to query; must exist.
        ref: Reference name used in the region string.
        start: Region start position.
        end: Region end position.

    Returns:
        The value returned by ``time_cmd`` for the command (annotated as
        ``float``; presumably the elapsed time).

    Raises:
        RuntimeError: If ``filename`` does not exist.
    """
    if os.path.exists(filename):
        vcfc_binary = config.get_vcfc_release_cmd()
        region = '%s:%d-%d' % (ref, start, end)
        return time_cmd([vcfc_binary, 'sparse-query', filename, region])
    raise RuntimeError('%s does not exist' % filename)
Exemple #3
0
def create_vcfc_sparse_external_index(config: Config, filename: str) -> float:
    """Time the release build's ``create-sparse-index`` command on a file.

    Args:
        config: Provides ``get_vcfc_release_cmd()``, the executable to run.
        filename: Path of the file to index; must exist.

    Returns:
        The value returned by ``time_cmd`` for the command (annotated as
        ``float``; presumably the elapsed time).

    Raises:
        RuntimeError: If ``filename`` does not exist.
    """
    if os.path.exists(filename):
        vcfc_binary = config.get_vcfc_release_cmd()
        return time_cmd([vcfc_binary, 'create-sparse-index', filename])
    raise RuntimeError('%s does not exist' % filename)
Exemple #4
0
def run_vcfc_binned_index_query(config: Config, filename: str, ref: str,
                                start: int, end: int) -> float:
    """Time a ``query-binned-index`` of the release build over one region.

    Args:
        config: Provides ``get_vcfc_release_cmd()``, the executable to run.
        filename: Path of the file to query; ``filename + '.vcfci'`` must
            exist as well.
        ref: Reference name used in the region string.
        start: Region start position.
        end: Region end position.

    Returns:
        The value returned by ``time_cmd`` for the command (annotated as
        ``float``; presumably the elapsed time).

    Raises:
        RuntimeError: If ``filename`` or ``filename + '.vcfci'`` is missing.
    """
    if not os.path.exists(filename):
        raise RuntimeError('%s does not exist' % filename)
    index_filename = filename + '.vcfci'
    if not os.path.exists(index_filename):
        raise RuntimeError('%s does not exist' % index_filename)

    vcfc_binary = config.get_vcfc_release_cmd()
    region = '%s:%d-%d' % (ref, start, end)
    return time_cmd([vcfc_binary, 'query-binned-index', filename, region])
def measure_indexing_times():
    """Time index creation for the VCFC, BGZIP and BCF inputs.

    Each index build is repeated ``test_runs`` times; the binned VCFC index
    uses a fixed bin size of 150.

    Returns:
        A dict keyed by ``'vcfc_binned_index'``,
        ``'vcfc_sparse_external_index'``, ``'bgzip_index'`` and
        ``'bcf_index'``. Each value holds the raw ``'times'`` list plus its
        NumPy ``'mean'`` and ``'stddev'``.
    """
    config = Config(tabix_cmd, vcfc_dir, bgzip_cmd)
    bin_size = 150

    def summarize(samples):
        # Raw samples together with their average and standard deviation.
        return {
            'times': samples,
            'mean': np.average(samples),
            'stddev': np.std(samples)
        }

    data = {}

    data['vcfc_binned_index'] = summarize([
        create_binned_index(config, vcfc_filename, bin_size)
        for _ in range(test_runs)
    ])

    data['vcfc_sparse_external_index'] = summarize([
        create_vcfc_sparse_external_index(config, vcfc_filename)
        for _ in range(test_runs)
    ])

    # The BGZIP and BCF inputs are both indexed through create_tabix_index.
    data['bgzip_index'] = summarize([
        create_tabix_index(config, bgzip_filename) for _ in range(test_runs)
    ])

    data['bcf_index'] = summarize([
        create_tabix_index(config, bcf_filename) for _ in range(test_runs)
    ])

    return data
def measure_binned_index_creation_time():
    """Measure how long binned-index creation takes across bin sizes.

    Bin sizes cover 5-45 in steps of 5, 50-175 in steps of 25 and 200-2000
    in steps of 100. Every size is built ``test_runs`` times on
    ``vcfc_filename``.

    Returns:
        A plot description dict with keys ``data``, ``title``, ``name``,
        ``xlabel`` and ``ylabel``. ``data`` holds one series,
        ``'vcfc_binned_index_creation_time'``, whose points are dicts with
        ``'bin_size'``, the mean ``'time'`` and its ``'stddev'``.
    """
    config = Config(tabix_cmd, vcfc_dir, bgzip_cmd)

    bin_sizes = (list(range(5, 50, 5)) + list(range(50, 200, 25)) +
                 list(range(200, 2001, 100)))

    def timed_build(size):
        # Build the index once, logging before and after, and return the time.
        print('Creating binned index with bin size %d' % size)
        elapsed = create_binned_index(config, vcfc_filename, size)
        print('Finished creating binned index, took %f seconds' % (elapsed))
        return elapsed

    creation_times = []
    for bin_size in bin_sizes:
        samples = [timed_build(bin_size) for _ in range(test_runs)]
        creation_times.append({
            'bin_size': bin_size,
            'time': sum(samples) / len(samples),
            'stddev': np.std(samples)
        })

    return {
        'data': {
            'vcfc_binned_index_creation_time': {
                'data': creation_times,
                'label': 'VCFC Binned Index Creation Time'
            }
        },
        'title': 'VCFC Binned Index Creation Time by Bin Size',
        'name': 'binned-index-creation-time',
        'xlabel': 'Bin Size',
        'ylabel': 'Time (seconds)'
    }
Exemple #7
0
def create_binned_index(config: Config, filename: str, bin_size: int) -> float:
    """Time the release build's ``create-binned-index`` command.

    Args:
        config: Provides ``get_vcfc_release_cmd()``, the executable to run.
        filename: Path of the file to index (not checked for existence here).
        bin_size: Bin size, passed to the command as a string argument.

    Returns:
        The value returned by ``time_cmd`` for the command (annotated as
        ``float``; presumably the elapsed time).
    """
    vcfc_binary = config.get_vcfc_release_cmd()
    return time_cmd(
        [vcfc_binary, 'create-binned-index',
         str(bin_size), filename])
def measure_binned_index_time_profile_range(query_range: int = 5000,
                                            queries: int = 500):
    """Profile range queries against binned indexes of many bin sizes.

    For every bin size the binned index of ``vcfc_filename`` is rebuilt, a
    sweep of range queries ``[pos, pos + query_range]`` is run with the
    timing build, and each timing label is averaged over the sweep.

    Args:
        query_range: Width added to each start position to get the end
            position of the query.
        queries: Number of steps the sweep is divided into; must be > 0.

    Returns:
        A plot description dict with keys ``data``, ``title``, ``name``,
        ``xlabel`` and ``ylabel``. ``data`` maps
        ``'vcfc_binned_index_<bin size>'`` to
        ``{'data': <averaged profile>, 'label': 'Bin Size <bin size>'}``.
    """
    data = {}
    assert queries > 0, 'queries > 0'
    # Spread the sweep over [min_pos, max_pos - query_range] so that, for a
    # positive span, no query end goes past max_pos.
    step = int((max_pos - query_range - min_pos) / queries)
    config = Config(tabix_cmd, vcfc_dir, bgzip_cmd)

    # Override test runs for this, since the high number of queries per test
    # amortizes runtime.
    test_runs = 2

    bin_sizes = [
        *range(5, 50, 5), *range(50, 200, 10), *range(200, 1000 + 1, 50)
    ]

    # Both endpoints are included, so a positive step gives queries + 1 start
    # positions; the averages below divide by the real count. Building the
    # range up front also rejects a zero step before any index is created.
    start_positions = range(min_pos, (min_pos + step * queries) + 1, step)

    for bin_size in bin_sizes:
        print('Creating binned index with bin size %d' % bin_size)
        bin_index_creation_time = create_binned_index(config, vcfc_filename,
                                                      bin_size)
        print('Finished creating binned index, took %f seconds' %
              (bin_index_creation_time))

        print('Running %d exhaustive binned queries of size %d' %
              (len(start_positions), query_range))

        bin_profile = {}
        for pos in start_positions:
            endpos = pos + query_range
            print('vcfc_binned_timing_profile_range: %d-%d' % (pos, endpos))
            profiles = [
                run_vcfc_binned_index_timing_profile(config, vcfc_filename,
                                                     reference_name, pos,
                                                     endpos)
                for _ in range(test_runs)
            ]

            # Average each label over the runs that reported it, then add
            # that mean to the running total for this bin size.
            values_by_label = {}
            for profile in profiles:
                for label, value in profile.items():
                    values_by_label.setdefault(label, []).append(value)
            for label, values in values_by_label.items():
                bin_profile[label] = (bin_profile.get(label, 0) +
                                      sum(values) / len(values))

        # Turn the per-label totals into a mean per query position.
        for label in bin_profile:
            bin_profile[label] /= len(start_positions)

        data['vcfc_binned_index_%d' % bin_size] = {
            'data': bin_profile,
            'label': 'Bin Size %d' % bin_size
        }

    return {
        'data': data,
        'title': 'VCFC Binned Index Interval, Query Range %d' % query_range,
        'name': 'binned-timing-profile-range',
        'xlabel': 'Bin Size (not linear scale)',
        'ylabel': 'Time (seconds)'
    }
def measure_binned_index_time_profile(queries: int = 500):
    """Profile single-position queries against binned indexes of many sizes.

    For every bin size the binned index of ``vcfc_filename`` is rebuilt, a
    sweep of single-position queries (start == end) is run with the timing
    build, and each timing label is averaged over the sweep.

    Args:
        queries: Number of steps the ``[min_pos, max_pos]`` span is divided
            into.

    Returns:
        A plot description dict with keys ``data``, ``title``, ``name``,
        ``xlabel`` and ``ylabel``. ``data`` maps
        ``'vcfc_binned_index_<bin size>'`` to
        ``{'data': <averaged profile>, 'label': 'Bin Size <bin size>'}``.
    """
    data = {}
    step = int((max_pos - min_pos) / queries)
    config = Config(tabix_cmd, vcfc_dir, bgzip_cmd)

    # Override test runs for this, since the high number of queries per test
    # amortizes runtime.
    test_runs = 2

    bin_sizes = [
        *range(5, 50, 5), *range(50, 200, 10), *range(200, 1000 + 1, 50)
    ]

    # Inclusive of max_pos when the step lands on it. Building the range up
    # front also rejects a zero step before any index is created.
    positions = range(min_pos, max_pos + 1, step)

    # Load file into host cache
    os.system('cat %s > /dev/null' % vcfc_filename)

    for bin_size in bin_sizes:
        print('Creating binned index with bin size %d' % bin_size)
        bin_index_creation_time = create_binned_index(config, vcfc_filename,
                                                      bin_size)
        print('Finished creating binned index, took %f seconds' %
              (bin_index_creation_time))

        # Load index into host cache. The index path is the data file name
        # with '.vcfci' appended, not a '.vcfci' entry inside it.
        os.system('cat %s > /dev/null' % (vcfc_filename + '.vcfci'))

        print('Running exhaustive binned queries')
        bin_profile = {}
        for pos in positions:
            print('vcfc_binned_timing_profile: %d' % pos)
            profiles = [
                run_vcfc_binned_index_timing_profile(config, vcfc_filename,
                                                     reference_name, pos, pos)
                for _ in range(test_runs)
            ]

            # Average each label over the runs that reported it, then add
            # that mean to the running total for this bin size.
            values_by_label = {}
            for profile in profiles:
                for label, value in profile.items():
                    values_by_label.setdefault(label, []).append(value)
            for label, values in values_by_label.items():
                bin_profile[label] = (bin_profile.get(label, 0) +
                                      sum(values) / len(values))

        # Turn the per-label totals into a mean per query position.
        for label in bin_profile:
            bin_profile[label] /= len(positions)

        data['vcfc_binned_index_%d' % bin_size] = {
            'data': bin_profile,
            'label': 'Bin Size %d' % bin_size
        }

    return {
        'data': data,
        'title':
        'VCFC Binned Index Query Phase Time Profile, Single Variant Lookup',
        'name': 'binned-timing-profile-single',
        'xlabel': 'Bin Size (not linear scale)',
        'ylabel': 'Time (seconds)'
    }
def measure_all_single_variant(queries: int = 200):
    """Compare single-position query times across all index strategies.

    Runs the same sweep of positions (start == end) through the VCFC sparse
    external index, the VCFC sparse offset-as-index file, the VCFC binned
    index, and tabix on the BGZIP and BCF inputs.

    Args:
        queries: Number of steps the ``[min_pos, max_pos]`` span is divided
            into.

    Returns:
        A plot description dict with keys ``data``, ``title``, ``name``,
        ``xlabel`` and ``ylabel``. ``data`` maps one key per strategy to
        ``{'data': <durations>, 'label': <display label>}``.
    """
    data = {}
    step = int((max_pos - min_pos) / queries)
    config = Config(tabix_cmd, vcfc_dir, bgzip_cmd)

    print('Creating binned index for binned index queries')
    bin_index_creation_time = create_binned_index(config, vcfc_filename,
                                                  default_binsize)
    print('Finished creating binned index, took %f seconds' %
          (bin_index_creation_time))

    start_positions = list(range(min_pos, max_pos + 1, step))
    # Single-variant lookups: every query ends where it starts.
    end_positions = start_positions

    def record(key, label, tag, query_fn, target_file):
        # Run one strategy over the whole sweep and store its series.
        durations = _run_variant_query_exhaustive(start_positions,
                                                  end_positions, tag, config,
                                                  query_fn, target_file)
        data[key] = {'data': durations, 'label': label}

    record('vcfc_sparse_external_exhaustive', 'VCFC Sparse External Index',
           'vcfc-sparse-external-exhaustive',
           run_vcfc_sparse_external_index_query, vcfc_filename)

    record('vcfc_sparse_exhaustive', 'VCFC Sparse Offset-as-Index',
           'vcfc-sparse-exhaustive', run_vcfc_sparse_query, sparse_filename)

    # The binned index is built a second time right before its queries.
    # NOTE(review): presumably a safeguard after the other runs; confirm
    # whether it is still needed.
    create_binned_index(config, vcfc_filename, default_binsize)
    record('vcfc_binned_external_exhaustive',
           'VCFC Binned External Index (Bin Size %d)' % default_binsize,
           'vcfc-binned-external-exhaustive', run_vcfc_binned_index_query,
           vcfc_filename)

    record('bgzip_tabix_exhaustive', 'BGZIP + Tabix Index',
           'bgzip-tabix-external-exhaustive', run_tabix, bgzip_filename)

    record('bcf_tabix_exhaustive', 'BCF + Tabix Index',
           'bcf-tabix-external-exhaustive', run_tabix, bcf_filename)

    return {
        'data': data,
        'title': 'Single Variant Query Time',
        'name': 'all-exhaustive-single',
        'xlabel': 'Variant Position',
        'ylabel': 'Time (seconds)'
    }
def measure_all_range_variant(queries: int = 200, query_range: int = 5000):
    """Compare range query times across all index strategies.

    Runs the same sweep of ranges ``[pos, pos + query_range]`` through the
    VCFC sparse external index, the VCFC sparse offset-as-index file, the
    VCFC binned index, and tabix on the BGZIP and BCF inputs.

    Args:
        queries: Number of steps the sweep is divided into.
        query_range: Width added to each start position to get the end
            position of the query.

    Returns:
        A plot description dict with keys ``data``, ``title``, ``name``,
        ``xlabel`` and ``ylabel``. ``data`` maps one key per strategy to
        ``{'data': <durations>, 'label': <display label>}``.
    """
    data = {}

    # Spread the sweep over [min_pos, max_pos - query_range] so that, for a
    # positive span, no query end goes past max_pos.
    step = int((max_pos - query_range - min_pos) / queries)
    print('Step: %s' % step)
    config = Config(tabix_cmd, vcfc_dir, bgzip_cmd)

    print('Creating binned index for binned index queries')
    bin_index_creation_time = create_binned_index(config, vcfc_filename,
                                                  default_binsize)
    print('Finished creating binned index, took %f seconds' %
          (bin_index_creation_time))

    # Both endpoints are included, so a positive step gives queries + 1
    # start positions.
    start_positions = list(range(min_pos, (min_pos + step * queries) + 1,
                                 step))
    end_positions = [s + query_range for s in start_positions]

    def record(key, label, tag, query_fn, target_file):
        # Run one strategy over the whole sweep and store its series.
        durations = _run_variant_query_exhaustive(start_positions,
                                                  end_positions, tag, config,
                                                  query_fn, target_file)
        data[key] = {'data': durations, 'label': label}

    record('vcfc_sparse_external_exhaustive', 'VCFC Sparse External Index',
           'vcfc-sparse-external-exhaustive',
           run_vcfc_sparse_external_index_query, vcfc_filename)

    record('vcfc_sparse_exhaustive', 'VCFC Sparse Offset-as-Index',
           'vcfc-sparse-exhaustive', run_vcfc_sparse_query, sparse_filename)

    # The binned index is built a second time right before its queries.
    # NOTE(review): presumably a safeguard after the other runs; confirm
    # whether it is still needed.
    create_binned_index(config, vcfc_filename, default_binsize)
    record('vcfc_binned_external_exhaustive',
           'VCFC Binned External Index (Bin Size %d)' % default_binsize,
           'vcfc-binned-external-exhaustive', run_vcfc_binned_index_query,
           vcfc_filename)

    record('bgzip_tabix_exhaustive', 'BGZIP + Tabix Index',
           'bgzip-tabix-external-exhaustive', run_tabix, bgzip_filename)

    record('bcf_tabix_exhaustive', 'BCF + Tabix Index',
           'bcf-tabix-external-exhaustive', run_tabix, bcf_filename)

    return {
        'data': data,
        'title': 'Variant Range Query Time (Query Range %d)' % query_range,
        'name': 'all-exhaustive-range',
        'xlabel': 'Range Start Position',
        'ylabel': 'Time (seconds)'
    }
Exemple #12
0
 def __init__(self, model_configuration, exp_path):
     """Wrap the model configuration in a ``Config`` and keep the path.

     Args:
         model_configuration: Passed unchanged to ``Config``.
         exp_path: Stored as ``self.exp_path``.
     """
     wrapped_config = Config(model_configuration)
     self.model_config = wrapped_config
     self.exp_path = exp_path