Example #1
0
def test_path_with_affine():
    """Check the threading path computed on test instance 1, window by window."""
    readset, var_pos, clustering, genotypes = create_testinstance1()
    ploidy = 3

    # Auxiliary structures that compute_threading_path takes as input.
    index, rev_index = get_position_map(readset)
    num_vars = len(rev_index)
    positions = get_cluster_start_end_positions(readset, clustering, index)
    coverage = get_coverage(readset, clustering, index)
    cov_map = get_pos_to_clusters_map(coverage, ploidy)
    consensus = get_local_cluster_consensus(readset, clustering, cov_map, positions)

    path = compute_threading_path(
        readset, clustering, num_vars, coverage, cov_map, consensus, ploidy, genotypes
    )

    # Transpose the path: one string per path column, one character per path row.
    cluster_paths = ["".join(str(row[slot]) for row in path) for slot in range(3)]

    # Each window is compared as a set, so the order of the three columns
    # does not matter.
    expectations = [
        (slice(None, 9), {"000000000", "111111111", "044444444"}),
        (slice(9, 20), {"33333333333", "22222222222", "44444555555"}),
        (slice(20, None), {"66", "77", "55"}),
    ]

    print(cluster_paths)

    for window, truth in expectations:
        assert {column[window] for column in cluster_paths} == truth
Example #2
0
def find_inconsistencies(readset, clustering, ploidy):
    """
    Find cluster positions whose allele mix deviates significantly from the consensus.

    For every (position, cluster) pair that has a consensus entry, a one-sided
    binomial test (expected error rate 0.05, alternative "greater") is run on the
    number of reads deviating from the consensus. Pairs with a p-value below
    0.02 are counted as inconsistent.

    Arguments:
    readset -- reads, indexable by the read ids stored in clustering
    clustering -- list of clusters, each an iterable of read ids
    ploidy -- forwarded to get_pos_to_clusters_map

    Returns a tuple (num_inconsistent_positions, separated_pairs):
    - num_inconsistent_positions counts a position once for every cluster that
      is inconsistent there
    - separated_pairs lists pairs (r0, r1) of reads from the same cluster, where
      r0 carries allele 0 and r1 another allele at an inconsistent position;
      these pairs need to be separated
    """
    num_inconsistent_positions = 0
    separated_pairs = []
    exp_error = 0.05
    p_val_threshold = 0.02

    # Compute consensus and coverage
    index, rev_index = get_position_map(readset)
    num_vars = len(rev_index)

    coverage = get_coverage(readset, clustering, index)
    cov_map = get_pos_to_clusters_map(coverage, ploidy)
    positions = get_cluster_start_end_positions(readset, clustering, index)
    abs_coverage = get_coverage_absolute(readset, clustering, index)
    consensus = get_local_cluster_consensus_withfrac(readset, clustering,
                                                     cov_map, positions)

    # Search for positions in clusters with ambivalent consensus
    for pos in range(num_vars):
        for c_id in coverage[pos]:
            if c_id not in consensus[pos]:
                continue

            # Binomial hypothesis test: are the deviations from the majority
            # allele significant enough for splitting?
            abs_count = abs_coverage[pos][c_id]
            # NOTE(review): consensus[pos][c_id][1] is presumably the fraction of
            # reads supporting the consensus allele - confirm against
            # get_local_cluster_consensus_withfrac. Under that assumption the
            # product is a whole number up to floating point error, so it must
            # be rounded: int() alone truncates e.g. 10 * (1 - 0.9) ==
            # 0.9999999999999998 down to 0 and undercounts the deviations.
            abs_deviations = int(round(abs_count * (1 - consensus[pos][c_id][1])))
            p_val = binom_test(abs_deviations,
                               abs_count,
                               exp_error,
                               alternative="greater")
            if p_val >= p_val_threshold:
                continue

            num_inconsistent_positions += 1

            # Partition the cluster's reads covering this position by allele
            zero_reads = []
            one_reads = []
            for read in clustering[c_id]:
                for var in readset[read]:
                    if index[var.position] == pos:
                        if var.allele == 0:
                            zero_reads.append(read)
                        else:
                            one_reads.append(read)

            # Every read with allele 0 conflicts with every read of the other group
            for r0 in zero_reads:
                for r1 in one_reads:
                    separated_pairs.append((r0, r1))

    return num_inconsistent_positions, separated_pairs
Example #3
0
def test_clusterbased_structures():
    """Check the start/end positions reported for each cluster of test instance 1."""
    readset, _, clustering, _ = create_testinstance1()
    index, _ = get_position_map(readset)

    # Expected clustering bounds, listed by cluster id
    expected_bounds = [
        (0, 11),
        (0, 9),
        (7, 19),
        (8, 19),
        (1, 13),
        (14, 21),
        (16, 21),
        (16, 21),
    ]

    cluster_start_ends = get_cluster_start_end_positions(readset, clustering, index)
    for cluster_id, bounds in enumerate(expected_bounds):
        assert cluster_start_ends[cluster_id] == bounds
Example #4
0
def draw_plots(
    block_readsets,
    clustering,
    threading,
    haplotypes,
    cut_positions,
    genotype_list_multi,
    phasable_variant_table,
    plot_clusters,
    plot_threading,
    output,
):
    """
    Write the requested plots for a phased set of blocks.

    The reads of all block readsets are merged into one ReadSet first. If
    plot_clusters is set, the clustering is drawn to output + ".clusters.pdf";
    if plot_threading is set, the threading is drawn to
    output + ".threading.pdf".
    """
    logger.info("Generating plots ...")

    # Merge the per-block readsets, the plotting helpers take a single readset
    merged_reads = ReadSet()
    for per_block_reads in block_readsets:
        for read in per_block_reads:
            merged_reads.add(read)

    if plot_clusters:
        cluster_pdf = output + ".clusters.pdf"
        draw_clustering(
            merged_reads, clustering, phasable_variant_table, cluster_pdf, genome_space=False
        )

    if not plot_threading:
        return

    # The threading plot additionally needs the per-position cluster coverage
    position_index, _ = get_position_map(merged_reads)
    cluster_coverage = get_coverage(merged_reads, clustering, position_index)
    threading_pdf = output + ".threading.pdf"
    draw_threading(
        merged_reads,
        clustering,
        cluster_coverage,
        threading,
        cut_positions,
        haplotypes,
        phasable_variant_table,
        genotype_list_multi,
        threading_pdf,
    )
Example #5
0
def phase_single_individual(readset, phasable_variant_table, sample,
                            phasing_param, output, timers):
    """
    Phase the variants of one sample block by block and summarize the result.

    The variants are split into blocks (depending on
    phasing_param.block_cut_sensitivity), every block is phased independently
    via phase_single_block and the blockwise results are merged by
    aggregate_phasing_blocks.

    Arguments:
    readset -- reads to phase; must support get_positions()
    phasable_variant_table -- variant table, forwarded to create_genotype_list
        and to the plotting code
    sample -- sample whose genotypes are taken from the variant table
    phasing_param -- parameter object; the attributes read here are
        block_cut_sensitivity, ploidy, plot_clusters and plot_threading
    output -- forwarded to draw_plots as the output file prefix
    timers -- timer object with start(name) and stop(name)

    Returns a tuple (components, haploid_components, superreads):
    - components maps a position to the first position of the phased block
      that contains it
    - haploid_components maps a position to a list with one such block start
      per haplotype
    - superreads is a ReadSet with one Read per haplotype, holding the phased
      alleles
    """
    # Compute the genotypes that belong to the variant table and create a list of all genotypes
    genotype_list = create_genotype_list(phasable_variant_table, sample)

    # Precompute block borders based on read coverage and linkage between variants
    logger.info("Detecting connected components with weak interconnect ..")
    timers.start("detecting_blocks")
    index, rev_index = get_position_map(readset)
    num_vars = len(rev_index)
    if phasing_param.block_cut_sensitivity == 0:
        # Sensitivity 0 disables cutting: everything is one block
        block_starts = [0]
    else:
        # Sensitivity 1 uses single linkage, everything above does not
        block_starts = compute_linkage_based_block_starts(
            readset,
            index,
            phasing_param.ploidy,
            single_linkage=(phasing_param.block_cut_sensitivity == 1),
        )

    # Set block borders and split readset
    ext_block_starts = block_starts + [num_vars]
    num_non_singleton_blocks = sum(
        1 for start, end in zip(ext_block_starts, ext_block_starts[1:])
        if start < end - 1
    )
    logger.info(
        "Split heterozygous variants into {} blocks (and {} singleton blocks)."
        .format(num_non_singleton_blocks,
                len(block_starts) - num_non_singleton_blocks))

    block_readsets = split_readset(readset, ext_block_starts, index)
    timers.stop("detecting_blocks")

    # Process blocks independently
    blockwise_clustering = []
    blockwise_paths = []
    blockwise_haplotypes = []
    blockwise_cut_positions = []
    blockwise_haploid_cuts = []
    processed_non_singleton_blocks = 0
    for block_id, block_readset in enumerate(block_readsets):
        block_start = ext_block_starts[block_id]
        block_end = ext_block_starts[block_id + 1]
        block_num_vars = block_end - block_start

        assert len(block_readset.get_positions()) == block_num_vars

        if block_num_vars > 1:
            # Only print for non-singleton block
            processed_non_singleton_blocks += 1
            logger.info(
                "Processing block {} of {} with {} reads and {} variants.".
                format(
                    processed_non_singleton_blocks,
                    num_non_singleton_blocks,
                    len(block_readset),
                    block_num_vars,
                ))

        genotype_slice = genotype_list[block_start:block_end]
        clustering, path, haplotypes, cut_positions, haploid_cuts = phase_single_block(
            block_readset, genotype_slice, phasing_param, timers)

        blockwise_clustering.append(clustering)
        blockwise_paths.append(path)
        blockwise_haplotypes.append(haplotypes)
        blockwise_cut_positions.append(cut_positions)
        blockwise_haploid_cuts.append(haploid_cuts)

    # Aggregate blockwise results
    clustering, threading, haplotypes, cut_positions, haploid_cuts = aggregate_phasing_blocks(
        block_starts,
        block_readsets,
        blockwise_clustering,
        blockwise_paths,
        blockwise_haplotypes,
        blockwise_cut_positions,
        blockwise_haploid_cuts,
        phasing_param,
    )

    # Summarize data for VCF file
    accessible_positions = sorted(readset.get_positions())
    components = {}
    haploid_components = {}

    # Every variant gets the first position of its phased block as component id.
    # NOTE(review): each entry is also stored under position + 1, presumably to
    # serve lookups with an off-by-one coordinate convention - confirm with the
    # consumer of these dictionaries.
    ext_cuts = cut_positions + [num_vars]
    for i in range(len(cut_positions)):
        for pos in range(ext_cuts[i], ext_cuts[i + 1]):
            block_anchor = accessible_positions[ext_cuts[i]]
            components[accessible_positions[pos]] = block_anchor
            components[accessible_positions[pos] + 1] = block_anchor
            # Placeholders, filled per haplotype in the loop below
            haploid_components[accessible_positions[pos]] = [0] * phasing_param.ploidy
            haploid_components[accessible_positions[pos] + 1] = [0] * phasing_param.ploidy

    for j in range(phasing_param.ploidy):
        ext_cuts = haploid_cuts[j] + [num_vars]
        for i in range(len(haploid_cuts[j])):
            for pos in range(ext_cuts[i], ext_cuts[i + 1]):
                block_anchor = accessible_positions[ext_cuts[i]]
                haploid_components[accessible_positions[pos]][j] = block_anchor
                haploid_components[accessible_positions[pos] + 1][j] = block_anchor

    superreads = ReadSet()
    for i in range(phasing_param.ploidy):
        read = Read("superread {}".format(i + 1), 0, 0)
        # insert alleles, "n" marks a position without a phased allele
        for j, allele in enumerate(haplotypes[i]):
            if allele == "n":
                continue
            allele = int(allele)
            # TODO: Needs changes for multi-allelic and we might give an actual quality value
            read.add_variant(accessible_positions[j], allele, 0)
        superreads.add(read)

    # Plot option
    if phasing_param.plot_clusters or phasing_param.plot_threading:
        timers.start("create_plots")
        # draw_plots takes the two plot switches as separate arguments,
        # not the whole parameter object
        draw_plots(
            block_readsets,
            clustering,
            threading,
            haplotypes,
            cut_positions,
            genotype_list,
            phasable_variant_table,
            phasing_param.plot_clusters,
            phasing_param.plot_threading,
            output,
        )
        timers.stop("create_plots")

    # Return results
    return components, haploid_components, superreads
Example #6
0
def test_auxiliary_datastructures():
    """Check position map, relative coverage and absolute coverage on test instance 1."""
    # test position map
    readset, var_pos, _, _ = create_testinstance1()
    index, rev_index = get_position_map(readset)
    for rank, position in enumerate(var_pos):
        assert index[position] == rank
    assert rev_index == var_pos

    clustering = [
        [0, 4, 6],
        [1, 2],
        [7, 10, 13],
        [9, 12, 14],
        [3, 5, 8, 11],
        [15, 16],
        [17],
        [18],
    ]

    # Expected relative coverage: one dict (cluster id -> fraction) per position
    expected_cov = [
        {0: 0.5, 1: 0.5},
        {0: 0.25, 1: 0.5, 4: 0.25},
        {0: 1 / 3, 1: 1 / 3, 4: 1 / 3},
        {0: 3 / 7, 1: 2 / 7, 4: 2 / 7},
        {0: 3 / 8, 1: 2 / 8, 4: 3 / 8},
        {0: 3 / 9, 1: 2 / 9, 4: 4 / 9},
        {0: 3 / 9, 1: 2 / 9, 4: 4 / 9},
        {0: 2 / 9, 1: 2 / 9, 2: 1 / 9, 4: 4 / 9},
        {0: 2 / 10, 1: 1 / 10, 2: 2 / 10, 3: 1 / 10, 4: 4 / 10},
        {0: 2 / 11, 1: 1 / 11, 2: 2 / 11, 3: 2 / 11, 4: 4 / 11},
        {0: 1 / 11, 2: 3 / 11, 3: 3 / 11, 4: 4 / 11},
        {0: 1 / 10, 2: 3 / 10, 3: 3 / 10, 4: 3 / 10},
        {2: 3 / 8, 3: 3 / 8, 4: 2 / 8},
        {2: 3 / 7, 3: 3 / 7, 4: 1 / 7},
        {2: 3 / 8, 3: 3 / 8, 5: 2 / 8},
        {2: 3 / 8, 3: 3 / 8, 5: 2 / 8},
        {2: 3 / 10, 3: 3 / 10, 5: 2 / 10, 6: 1 / 10, 7: 1 / 10},
        {2: 2 / 9, 3: 3 / 9, 5: 2 / 9, 6: 1 / 9, 7: 1 / 9},
        {2: 1 / 7, 3: 2 / 7, 5: 2 / 7, 6: 1 / 7, 7: 1 / 7},
        {2: 1 / 6, 3: 1 / 6, 5: 2 / 6, 6: 1 / 6, 7: 1 / 6},
        {5: 2 / 4, 6: 1 / 4, 7: 1 / 4},
        {5: 2 / 4, 6: 1 / 4, 7: 1 / 4},
    ]

    # Expected absolute coverage: one dict (cluster id -> read count) per position
    expected_abs_cov = [
        {0: 1, 1: 1},
        {0: 1, 1: 2, 4: 1},
        {0: 2, 1: 2, 4: 2},
        {0: 3, 1: 2, 4: 2},
        {0: 3, 1: 2, 4: 3},
        {0: 3, 1: 2, 4: 4},
        {0: 3, 1: 2, 4: 4},
        {0: 2, 1: 2, 2: 1, 4: 4},
        {0: 2, 1: 1, 2: 2, 3: 1, 4: 4},
        {0: 2, 1: 1, 2: 2, 3: 2, 4: 4},
        {0: 1, 2: 3, 3: 3, 4: 4},
        {0: 1, 2: 3, 3: 3, 4: 3},
        {2: 3, 3: 3, 4: 2},
        {2: 3, 3: 3, 4: 1},
        {2: 3, 3: 3, 5: 2},
        {2: 3, 3: 3, 5: 2},
        {2: 3, 3: 3, 5: 2, 6: 1, 7: 1},
        {2: 2, 3: 3, 5: 2, 6: 1, 7: 1},
        {2: 1, 3: 2, 5: 2, 6: 1, 7: 1},
        {2: 1, 3: 1, 5: 2, 6: 1, 7: 1},
        {5: 2, 6: 1, 7: 1},
        {5: 2, 6: 1, 7: 1},
    ]

    # test relative coverage
    cov = get_coverage(readset, clustering, index)
    for pos, expected in enumerate(expected_cov):
        assert cov[pos] == expected

    # test absolute coverage
    abs_cov = get_coverage_absolute(readset, clustering, index)
    for pos, expected in enumerate(expected_abs_cov):
        assert abs_cov[pos] == expected
Example #7
0
def phase_single_individual(readset, phasable_variant_table, sample, phasing_param, output, timers):
    """
    Phase the variants of one sample block by block and summarize the result.

    The variants are split into blocks (depending on
    phasing_param.block_cut_sensitivity). With phasing_param.threads == 1 the
    blocks are phased one after another via phase_single_block, otherwise they
    are distributed over a process pool via phase_single_block_mt. The
    blockwise results are merged by aggregate_phasing_blocks.

    Arguments:
    readset -- reads to phase; must support get_positions()
    phasable_variant_table -- variant table, forwarded to create_genotype_list
        and to draw_plots
    sample -- sample whose genotypes are taken from the variant table
    phasing_param -- parameter object; the attributes read here are
        block_cut_sensitivity, ploidy, threads, plot_clusters and plot_threading
    output -- forwarded to draw_plots as the output file prefix
    timers -- timer object with start(name) and stop(name)

    Returns a tuple (components, haploid_components, superreads):
    - components maps a position to the first position of the phased block
      that contains it
    - haploid_components maps a position to a list with one such block start
      per haplotype
    - superreads is a ReadSet with one Read per haplotype, holding the phased
      alleles
    """

    # Compute the genotypes that belong to the variant table and create a list of all genotypes
    genotype_list = create_genotype_list(phasable_variant_table, sample)

    # Select reads, only for debug
    # selected_reads = select_reads(readset, 120, preferred_source_ids = vcf_source_ids)
    # readset = selected_reads

    # Precompute block borders based on read coverage and linkage between variants
    logger.info("Detecting connected components with weak interconnect ..")
    timers.start("detecting_blocks")
    index, rev_index = get_position_map(readset)
    num_vars = len(rev_index)
    # Sensitivity 0 keeps everything in one block, 1 cuts with single linkage,
    # any other value cuts without single linkage.
    if phasing_param.block_cut_sensitivity == 0:
        block_starts = [0]
    elif phasing_param.block_cut_sensitivity == 1:
        block_starts = compute_linkage_based_block_starts(
            readset, index, phasing_param.ploidy, single_linkage=True
        )
    else:
        block_starts = compute_linkage_based_block_starts(
            readset, index, phasing_param.ploidy, single_linkage=False
        )

    # Set block borders and split readset
    # (num_vars is appended as sentinel, so block i spans
    # ext_block_starts[i] .. ext_block_starts[i + 1] - 1)
    ext_block_starts = block_starts + [num_vars]
    num_non_singleton_blocks = len(
        [i for i in range(len(block_starts)) if ext_block_starts[i] < ext_block_starts[i + 1] - 1]
    )
    logger.info(
        "Split heterozygous variants into {} blocks (and {} singleton blocks).".format(
            num_non_singleton_blocks, len(block_starts) - num_non_singleton_blocks
        )
    )

    block_readsets = split_readset(readset, ext_block_starts, index)
    timers.stop("detecting_blocks")

    # Process blocks independently
    (
        blockwise_clustering,
        blockwise_paths,
        blockwise_haplotypes,
        blockwise_cut_positions,
        blockwise_haploid_cuts,
    ) = ([], [], [], [], [])

    # Create genotype slices for blocks
    genotype_slices = []
    for block_id, block_readset in enumerate(block_readsets):
        block_start = ext_block_starts[block_id]
        block_end = ext_block_starts[block_id + 1]
        block_num_vars = block_end - block_start

        # Sanity check: the split readset must cover exactly the block's variants
        assert len(block_readset.get_positions()) == block_num_vars
        genotype_slices.append(genotype_list[block_start:block_end])

    processed_non_singleton_blocks = 0
    # use process pool for multiple threads
    if phasing_param.threads == 1:
        # for single-threading, process everything individually to minimize memory footprint
        for block_id, block_readset in enumerate(block_readsets):
            block_num_vars = ext_block_starts[block_id + 1] - ext_block_starts[block_id]
            if block_num_vars > 1:
                # Only print for non-singleton block
                processed_non_singleton_blocks += 1
                logger.info(
                    "Processing block {} of {} with {} reads and {} variants.".format(
                        processed_non_singleton_blocks,
                        num_non_singleton_blocks,
                        len(block_readset),
                        block_num_vars,
                    )
                )

            clustering, path, haplotypes, cut_positions, haploid_cuts = phase_single_block(
                block_readset, genotype_slices[block_id], phasing_param, timers
            )

            blockwise_clustering.append(clustering)
            blockwise_paths.append(path)
            blockwise_haplotypes.append(haplotypes)
            blockwise_cut_positions.append(cut_positions)
            blockwise_haploid_cuts.append(haploid_cuts)

    else:
        # sort block readsets in descending order by number of reads
        # (joblist holds (block_id, number_of_reads) tuples)
        joblist = [(i, len(block_readsets[i])) for i in range(len(block_readsets))]
        joblist.sort(key=lambda x: -x[1])

        timers.start("phase_blocks")

        # process large jobs first, 4/3-approximation for scheduling problem
        with Pool(processes=phasing_param.threads) as pool:
            """
            TODO: Python's multiprocessing makes hard copies of the passed
            arguments, which is not trivial for cython objects, especially when they
            contain pointers to other cython objects. Any passed object must be
            (de)serializable (in Python: pickle).
            All other objects created in the main thread are also accessible by the
            workers, but they are handled via the copy-on-write policy. This means,
            that e.g. the large main readset is not hardcopied for every thread,
            as long as it is not modified there. Since this would cause a massive
            waste of memory, this must not be done and the main readset must
            also never be passed as argument to the workers.
            """
            # Submit one asynchronous job per block, largest block first.
            # NOTE: the second element of every joblist tuple is the read count,
            # so the loop variable named block_readset below is bound to a
            # number and is not used; the readset is looked up via block_id.
            process_results = [
                pool.apply_async(
                    phase_single_block_mt,
                    (
                        block_readsets[block_id],
                        genotype_slices[block_id],
                        phasing_param,
                        timers,
                        block_id,
                        job_id,
                        num_non_singleton_blocks,
                    ),
                )
                for job_id, (block_id, block_readset) in enumerate(joblist)
            ]
            # get() blocks until the respective job has finished
            blockwise_results = [res.get() for res in process_results]

            # reorder results again
            # (the last element of every result tuple is unpacked as block_id
            # below, so this restores the original block order)
            blockwise_results.sort(key=lambda x: x[-1])

            # collect all blockwise results
            for (
                clustering,
                path,
                haplotypes,
                cut_positions,
                haploid_cuts,
                block_id,
            ) in blockwise_results:
                blockwise_clustering.append(clustering)
                blockwise_paths.append(path)
                blockwise_haplotypes.append(haplotypes)
                blockwise_cut_positions.append(cut_positions)
                blockwise_haploid_cuts.append(haploid_cuts)

        timers.stop("phase_blocks")

    # Aggregate blockwise results
    clustering, threading, haplotypes, cut_positions, haploid_cuts = aggregate_phasing_blocks(
        block_starts,
        block_readsets,
        blockwise_clustering,
        blockwise_paths,
        blockwise_haplotypes,
        blockwise_cut_positions,
        blockwise_haploid_cuts,
        phasing_param,
    )

    # Summarize data for VCF file
    accessible_positions = sorted(readset.get_positions())
    components = {}
    haploid_components = {}

    # Every variant gets the first position of its phased block as component id.
    # NOTE(review): each entry is also stored under position + 1, presumably to
    # serve lookups with an off-by-one coordinate convention - confirm with the
    # consumer of these dictionaries.
    ext_cuts = cut_positions + [num_vars]
    for i, cut_pos in enumerate(cut_positions):
        for pos in range(ext_cuts[i], ext_cuts[i + 1]):
            components[accessible_positions[pos]] = accessible_positions[ext_cuts[i]]
            components[accessible_positions[pos] + 1] = accessible_positions[ext_cuts[i]]
            # Placeholders, filled per haplotype in the loop below
            haploid_components[accessible_positions[pos]] = [0] * phasing_param.ploidy
            haploid_components[accessible_positions[pos] + 1] = [0] * phasing_param.ploidy

    # Same scheme as above, but with the individual cut positions of every haplotype
    for j in range(phasing_param.ploidy):
        ext_cuts = haploid_cuts[j] + [num_vars]
        for i, cut_pos in enumerate(haploid_cuts[j]):
            for pos in range(ext_cuts[i], ext_cuts[i + 1]):
                haploid_components[accessible_positions[pos]][j] = accessible_positions[ext_cuts[i]]
                haploid_components[accessible_positions[pos] + 1][j] = accessible_positions[
                    ext_cuts[i]
                ]

    # Encode every haplotype as one read ("superread") over the accessible positions
    superreads = ReadSet()
    for i in range(phasing_param.ploidy):
        read = Read("superread {}".format(i + 1), 0, 0)
        # insert alleles, entries equal to "n" are skipped
        for j, allele in enumerate(haplotypes[i]):
            if allele == "n":
                continue
            allele = int(allele)
            # TODO: Needs changes for multi-allelic and we might give an actual quality value
            read.add_variant(accessible_positions[j], allele, 0)
        superreads.add(read)

    # Plot option
    if phasing_param.plot_clusters or phasing_param.plot_threading:
        timers.start("create_plots")
        draw_plots(
            block_readsets,
            clustering,
            threading,
            haplotypes,
            cut_positions,
            genotype_list,
            phasable_variant_table,
            phasing_param.plot_clusters,
            phasing_param.plot_threading,
            output,
        )
        timers.stop("create_plots")

    # Return results
    return components, haploid_components, superreads