Esempio n. 1
0
def find_inconsistencies(readset, clustering, ploidy):
    """Find cluster positions whose allele consensus is ambiguous.

    For every position and every cluster that has both a coverage entry and
    a consensus entry at that position, a one-sided binomial test checks
    whether the number of reads deviating from the consensus is larger than
    expected under a fixed error rate (5%). If the p-value is below 0.02,
    the (position, cluster) combination counts as inconsistent, and every
    pair of reads from that cluster consisting of one read with allele 0 and
    one read with a non-zero allele at that position is reported.

    Args:
        readset: Collection indexable by read id; each read iterates over
            variants that expose ``position`` and ``allele``.
        clustering: Sequence of clusters, each an iterable of read ids.
        ploidy: Forwarded to ``get_pos_to_clusters_map``.

    Returns:
        A tuple ``(num_inconsistent_positions, separated_pairs)``. The count
        includes a position once per inconsistent cluster, so a position can
        be counted several times. ``separated_pairs`` is a list of
        ``(read_with_allele_0, read_with_other_allele)`` tuples of reads that
        need to be separated.
    """
    num_inconsistent_positions = 0
    separated_pairs = []
    exp_error = 0.05
    p_val_threshold = 0.02

    # Compute consensus and coverage
    index, rev_index = get_position_map(readset)
    num_vars = len(rev_index)

    coverage = get_coverage(readset, clustering, index)
    cov_map = get_pos_to_clusters_map(coverage, ploidy)
    positions = get_cluster_start_end_positions(readset, clustering, index)
    abs_coverage = get_coverage_absolute(readset, clustering, index)
    consensus = get_local_cluster_consensus_withfrac(readset, clustering,
                                                     cov_map, positions)

    # Search for positions in clusters with ambivalent consensus
    for pos in range(num_vars):
        for c_id in coverage[pos]:
            if c_id not in consensus[pos]:
                continue
            # Binomial hypothesis test: are the deviations from the majority
            # allele significant enough to justify splitting the cluster?
            abs_count = abs_coverage[pos][c_id]
            # consensus[pos][c_id][1] is presumably the fraction of reads
            # agreeing with the consensus allele (TODO confirm against
            # get_local_cluster_consensus_withfrac). The product below is
            # only approximately an integer in floating point, e.g.
            # 10 * (1 - 0.8) == 1.9999999999999996, so it must be rounded;
            # plain int() truncation would undercount the deviations.
            abs_deviations = int(round(abs_count * (1 - consensus[pos][c_id][1])))
            # NOTE(review): if binom_test is scipy.stats.binom_test, it was
            # deprecated in SciPy 1.10 and removed in 1.12 in favour of
            # scipy.stats.binomtest(...).pvalue - verify the file's import.
            p_val = binom_test(abs_deviations,
                               abs_count,
                               exp_error,
                               alternative="greater")
            if p_val < p_val_threshold:
                num_inconsistent_positions += 1
                zero_reads = []
                one_reads = []
                # Partition the cluster's reads covering this position by
                # whether they carry allele 0 or any other allele.
                for read in clustering[c_id]:
                    for var in readset[read]:
                        if index[var.position] == pos:
                            if var.allele == 0:
                                zero_reads.append(read)
                            else:
                                one_reads.append(read)
                # Every read with allele 0 conflicts with every read carrying
                # another allele.
                separated_pairs.extend(
                    (r0, r1) for r0 in zero_reads for r1 in one_reads
                )

    return num_inconsistent_positions, separated_pairs
Esempio n. 2
0
def test_auxiliary_datastructures():
    """Check the position map and both coverage helpers on test instance 1.

    The expected values are listed per variant index; each dictionary is
    keyed by the ids that the coverage helpers report for that position
    (they match indices into ``clustering``).
    """
    readset, var_pos, _, _ = create_testinstance1()

    # Position map: every variant position maps to its rank, and the reverse
    # index lists the positions in that same order.
    index, rev_index = get_position_map(readset)
    for i, position in enumerate(var_pos):
        assert index[position] == i
    assert rev_index == var_pos

    clustering = [
        [0, 4, 6],
        [1, 2],
        [7, 10, 13],
        [9, 12, 14],
        [3, 5, 8, 11],
        [15, 16],
        [17],
        [18],
    ]

    # Relative coverage, one expected dictionary per variant index.
    expected_cov = [
        {0: 0.5, 1: 0.5},
        {0: 0.25, 1: 0.5, 4: 0.25},
        {0: 1 / 3, 1: 1 / 3, 4: 1 / 3},
        {0: 3 / 7, 1: 2 / 7, 4: 2 / 7},
        {0: 3 / 8, 1: 2 / 8, 4: 3 / 8},
        {0: 3 / 9, 1: 2 / 9, 4: 4 / 9},
        {0: 3 / 9, 1: 2 / 9, 4: 4 / 9},
        {0: 2 / 9, 1: 2 / 9, 2: 1 / 9, 4: 4 / 9},
        {0: 2 / 10, 1: 1 / 10, 2: 2 / 10, 3: 1 / 10, 4: 4 / 10},
        {0: 2 / 11, 1: 1 / 11, 2: 2 / 11, 3: 2 / 11, 4: 4 / 11},
        {0: 1 / 11, 2: 3 / 11, 3: 3 / 11, 4: 4 / 11},
        {0: 1 / 10, 2: 3 / 10, 3: 3 / 10, 4: 3 / 10},
        {2: 3 / 8, 3: 3 / 8, 4: 2 / 8},
        {2: 3 / 7, 3: 3 / 7, 4: 1 / 7},
        {2: 3 / 8, 3: 3 / 8, 5: 2 / 8},
        {2: 3 / 8, 3: 3 / 8, 5: 2 / 8},
        {2: 3 / 10, 3: 3 / 10, 5: 2 / 10, 6: 1 / 10, 7: 1 / 10},
        {2: 2 / 9, 3: 3 / 9, 5: 2 / 9, 6: 1 / 9, 7: 1 / 9},
        {2: 1 / 7, 3: 2 / 7, 5: 2 / 7, 6: 1 / 7, 7: 1 / 7},
        {2: 1 / 6, 3: 1 / 6, 5: 2 / 6, 6: 1 / 6, 7: 1 / 6},
        {5: 2 / 4, 6: 1 / 4, 7: 1 / 4},
        {5: 2 / 4, 6: 1 / 4, 7: 1 / 4},
    ]
    cov = get_coverage(readset, clustering, index)
    for pos, expected in enumerate(expected_cov):
        assert cov[pos] == expected

    # Absolute coverage, again one expected dictionary per variant index.
    expected_abs_cov = [
        {0: 1, 1: 1},
        {0: 1, 1: 2, 4: 1},
        {0: 2, 1: 2, 4: 2},
        {0: 3, 1: 2, 4: 2},
        {0: 3, 1: 2, 4: 3},
        {0: 3, 1: 2, 4: 4},
        {0: 3, 1: 2, 4: 4},
        {0: 2, 1: 2, 2: 1, 4: 4},
        {0: 2, 1: 1, 2: 2, 3: 1, 4: 4},
        {0: 2, 1: 1, 2: 2, 3: 2, 4: 4},
        {0: 1, 2: 3, 3: 3, 4: 4},
        {0: 1, 2: 3, 3: 3, 4: 3},
        {2: 3, 3: 3, 4: 2},
        {2: 3, 3: 3, 4: 1},
        {2: 3, 3: 3, 5: 2},
        {2: 3, 3: 3, 5: 2},
        {2: 3, 3: 3, 5: 2, 6: 1, 7: 1},
        {2: 2, 3: 3, 5: 2, 6: 1, 7: 1},
        {2: 1, 3: 2, 5: 2, 6: 1, 7: 1},
        {2: 1, 3: 1, 5: 2, 6: 1, 7: 1},
        {5: 2, 6: 1, 7: 1},
        {5: 2, 6: 1, 7: 1},
    ]
    abs_cov = get_coverage_absolute(readset, clustering, index)
    for pos, expected in enumerate(expected_abs_cov):
        assert abs_cov[pos] == expected