Example 1
def test_path_with_affine():
    readset, var_pos, clustering, genotypes = create_testinstance1()
    ploidy = 3

    index, rev_index = get_position_map(readset)
    num_vars = len(rev_index)
    positions = get_cluster_start_end_positions(readset, clustering, index)
    coverage = get_coverage(readset, clustering, index)
    cov_map = get_pos_to_clusters_map(coverage, ploidy)
    consensus = get_local_cluster_consensus(readset, clustering, cov_map, positions)

    path = compute_threading_path(
        readset, clustering, num_vars, coverage, cov_map, consensus, ploidy, genotypes
    )
    cluster_paths = ["".join([str(path[i][j]) for i in range(len(path))]) for j in range(ploidy)]

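    # With ploidy 3, the path threads three haplotypes through the clusters; the result is
    # checked blockwise: variants 0-8, 9-19, and 20-21.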
    first_block = set([cluster_paths[0][:9], cluster_paths[1][:9], cluster_paths[2][:9]])
    first_truth = set(["000000000", "111111111", "044444444"])
    second_block = set([cluster_paths[0][9:20], cluster_paths[1][9:20], cluster_paths[2][9:20]])
    second_truth = set(["33333333333", "22222222222", "44444555555"])
    third_block = set([cluster_paths[0][20:], cluster_paths[1][20:], cluster_paths[2][20:]])
    third_truth = set(["66", "77", "55"])

    print(cluster_paths)

    assert first_block == first_truth
    assert second_block == second_truth
    assert third_block == third_truth
Example 2
def find_inconsistencies(readset, clustering, ploidy):
    # Returns the number of cluster positions with inconsistencies
    # (a position is counted multiple times if multiple clusters are inconsistent there).
    # Also returns a list of read pairs that need to be separated.
    num_inconsistent_positions = 0
    separated_pairs = []
    exp_error = 0.05
    p_val_threshold = 0.02

    # Compute consensus and coverage
    index, rev_index = get_position_map(readset)
    num_vars = len(rev_index)

    coverage = get_coverage(readset, clustering, index)
    cov_map = get_pos_to_clusters_map(coverage, ploidy)
    positions = get_cluster_start_end_positions(readset, clustering, index)
    abs_coverage = get_coverage_absolute(readset, clustering, index)
    consensus = get_local_cluster_consensus_withfrac(readset, clustering,
                                                     cov_map, positions)

    # Search for positions in clusters with ambivalent consensus
    for pos in range(num_vars):
        # print(str(pos)+" -> "+str(len(coverage[pos]))+" , "+str(len(consensus[pos])))
        for c_id in coverage[pos]:
            if c_id not in consensus[pos]:
                continue
            # binomial hypothesis test: is the deviation from the majority allele significant enough to justify splitting the cluster?
            abs_count = abs_coverage[pos][c_id]
            abs_deviations = int(abs_count * (1 - consensus[pos][c_id][1]))
            p_val = binom_test(abs_deviations,
                               abs_count,
                               exp_error,
                               alternative="greater")
            if p_val < p_val_threshold:
                # print("   inconsistency in cluster "+str(c_id)+" at position"+str(pos)+" with coverage "+str(coverage[pos][c_id])+" and consensus "+str(consensus[pos][c_id]))
                num_inconsistent_positions += 1
                zero_reads = []
                one_reads = []
                for read in clustering[c_id]:
                    for var in readset[read]:
                        if index[var.position] == pos:
                            if var.allele == 0:
                                zero_reads.append(read)
                            else:
                                one_reads.append(read)
                for r0 in zero_reads:
                    for r1 in one_reads:
                        separated_pairs.append((r0, r1))

    return num_inconsistent_positions, separated_pairs
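
A minimal, self-contained sketch of the significance test used above. It assumes the binom_test called in the snippet behaves like SciPy's one-sided binomial test (its import is outside the excerpt); deviation_is_significant is a hypothetical helper introduced here only for illustration, with the same defaults as in the function (exp_error = 0.05, p_val_threshold = 0.02).

from scipy.stats import binomtest


def deviation_is_significant(abs_deviations, abs_count, exp_error=0.05, p_val_threshold=0.02):
    # Hypothetical helper, not part of the original module.
    # Null hypothesis: mismatches against the cluster consensus occur at the
    # expected error rate exp_error. A small one-sided p-value means the number
    # of deviating alleles is too high to be explained by errors alone, so the
    # cluster is considered inconsistent at this position.
    p_val = binomtest(abs_deviations, abs_count, exp_error, alternative="greater").pvalue
    return p_val < p_val_threshold


# Example: 6 deviating reads out of 20 at a 5% error rate is significant at threshold 0.02.
assert deviation_is_significant(6, 20)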
Example 3
def draw_plots(
    block_readsets,
    clustering,
    threading,
    haplotypes,
    cut_positions,
    genotype_list_multi,
    phasable_variant_table,
    plot_clusters,
    plot_threading,
    output,
):
    # Generate the requested plots
    logger.info("Generating plots ...")
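    # Merge the per-block readsets into one combined readset so the plots span all blocks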
    combined_readset = ReadSet()
    for block_readset in block_readsets:
        for read in block_readset:
            combined_readset.add(read)
    if plot_clusters:
        draw_clustering(
            combined_readset,
            clustering,
            phasable_variant_table,
            output + ".clusters.pdf",
            genome_space=False,
        )
    if plot_threading:
        index, rev_index = get_position_map(combined_readset)
        coverage = get_coverage(combined_readset, clustering, index)
        draw_threading(
            combined_readset,
            clustering,
            coverage,
            threading,
            cut_positions,
            haplotypes,
            phasable_variant_table,
            genotype_list_multi,
            output + ".threading.pdf",
        )
Example 4
def test_auxiliary_datastructures():
    # test position map
    readset, var_pos, _, _ = create_testinstance1()
    index, rev_index = get_position_map(readset)
    for i in range(len(var_pos)):
        assert index[var_pos[i]] == i
    assert rev_index == var_pos

    # test relative coverage
    clustering = [
        [0, 4, 6],
        [1, 2],
        [7, 10, 13],
        [9, 12, 14],
        [3, 5, 8, 11],
        [15, 16],
        [17],
        [18],
    ]
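    # Relative coverage: for each variant position, the fraction of covering reads assigned to each cluster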
    cov = get_coverage(readset, clustering, index)
    assert cov[0] == {0: 0.5, 1: 0.5}
    assert cov[1] == {0: 0.25, 1: 0.5, 4: 0.25}
    assert cov[2] == {0: 1 / 3, 1: 1 / 3, 4: 1 / 3}
    assert cov[3] == {0: 3 / 7, 1: 2 / 7, 4: 2 / 7}
    assert cov[4] == {0: 3 / 8, 1: 2 / 8, 4: 3 / 8}
    assert cov[5] == {0: 3 / 9, 1: 2 / 9, 4: 4 / 9}
    assert cov[6] == {0: 3 / 9, 1: 2 / 9, 4: 4 / 9}
    assert cov[7] == {0: 2 / 9, 1: 2 / 9, 2: 1 / 9, 4: 4 / 9}
    assert cov[8] == {0: 2 / 10, 1: 1 / 10, 2: 2 / 10, 3: 1 / 10, 4: 4 / 10}
    assert cov[9] == {0: 2 / 11, 1: 1 / 11, 2: 2 / 11, 3: 2 / 11, 4: 4 / 11}
    assert cov[10] == {0: 1 / 11, 2: 3 / 11, 3: 3 / 11, 4: 4 / 11}
    assert cov[11] == {0: 1 / 10, 2: 3 / 10, 3: 3 / 10, 4: 3 / 10}
    assert cov[12] == {2: 3 / 8, 3: 3 / 8, 4: 2 / 8}
    assert cov[13] == {2: 3 / 7, 3: 3 / 7, 4: 1 / 7}
    assert cov[14] == {2: 3 / 8, 3: 3 / 8, 5: 2 / 8}
    assert cov[15] == {2: 3 / 8, 3: 3 / 8, 5: 2 / 8}
    assert cov[16] == {2: 3 / 10, 3: 3 / 10, 5: 2 / 10, 6: 1 / 10, 7: 1 / 10}
    assert cov[17] == {2: 2 / 9, 3: 3 / 9, 5: 2 / 9, 6: 1 / 9, 7: 1 / 9}
    assert cov[18] == {2: 1 / 7, 3: 2 / 7, 5: 2 / 7, 6: 1 / 7, 7: 1 / 7}
    assert cov[19] == {2: 1 / 6, 3: 1 / 6, 5: 2 / 6, 6: 1 / 6, 7: 1 / 6}
    assert cov[20] == {5: 2 / 4, 6: 1 / 4, 7: 1 / 4}
    assert cov[21] == {5: 2 / 4, 6: 1 / 4, 7: 1 / 4}

    # test absolute coverage
    abs_cov = get_coverage_absolute(readset, clustering, index)
    assert abs_cov[0] == {0: 1, 1: 1}
    assert abs_cov[1] == {0: 1, 1: 2, 4: 1}
    assert abs_cov[2] == {0: 2, 1: 2, 4: 2}
    assert abs_cov[3] == {0: 3, 1: 2, 4: 2}
    assert abs_cov[4] == {0: 3, 1: 2, 4: 3}
    assert abs_cov[5] == {0: 3, 1: 2, 4: 4}
    assert abs_cov[6] == {0: 3, 1: 2, 4: 4}
    assert abs_cov[7] == {0: 2, 1: 2, 2: 1, 4: 4}
    assert abs_cov[8] == {0: 2, 1: 1, 2: 2, 3: 1, 4: 4}
    assert abs_cov[9] == {0: 2, 1: 1, 2: 2, 3: 2, 4: 4}
    assert abs_cov[10] == {0: 1, 2: 3, 3: 3, 4: 4}
    assert abs_cov[11] == {0: 1, 2: 3, 3: 3, 4: 3}
    assert abs_cov[12] == {2: 3, 3: 3, 4: 2}
    assert abs_cov[13] == {2: 3, 3: 3, 4: 1}
    assert abs_cov[14] == {2: 3, 3: 3, 5: 2}
    assert abs_cov[15] == {2: 3, 3: 3, 5: 2}
    assert abs_cov[16] == {2: 3, 3: 3, 5: 2, 6: 1, 7: 1}
    assert abs_cov[17] == {2: 2, 3: 3, 5: 2, 6: 1, 7: 1}
    assert abs_cov[18] == {2: 1, 3: 2, 5: 2, 6: 1, 7: 1}
    assert abs_cov[19] == {2: 1, 3: 1, 5: 2, 6: 1, 7: 1}
    assert abs_cov[20] == {5: 2, 6: 1, 7: 1}
    assert abs_cov[21] == {5: 2, 6: 1, 7: 1}