Example #1
0
def test_path_with_affine():
    """Check the threading path computed for test instance 1 at ploidy 3.

    The path is transposed into one string of cluster ids per haplotype slot
    and compared block-wise (columns 0-8, 9-19, 20-end) against the expected
    strings. Each block is compared as a set, so the order of the three slots
    within a block does not matter.
    """
    readset, var_pos, clustering, genotypes = create_testinstance1()
    ploidy = 3

    pos_to_idx, idx_to_pos = get_position_map(readset)
    n_variants = len(idx_to_pos)
    cluster_bounds = get_cluster_start_end_positions(readset, clustering, pos_to_idx)
    cluster_coverage = get_coverage(readset, clustering, pos_to_idx)
    clusters_at_pos = get_pos_to_clusters_map(cluster_coverage, ploidy)
    local_consensus = get_local_cluster_consensus(
        readset, clustering, clusters_at_pos, cluster_bounds
    )

    threading = compute_threading_path(
        readset,
        clustering,
        n_variants,
        cluster_coverage,
        clusters_at_pos,
        local_consensus,
        ploidy,
        genotypes,
    )

    # One string per haplotype slot: the cluster id chosen at every position.
    cluster_paths = ["".join(str(row[slot]) for row in threading) for slot in range(3)]

    def observed(window):
        # Set of the three slot strings restricted to the given column window.
        return {slot_path[window] for slot_path in cluster_paths}

    first_block = observed(slice(None, 9))
    second_block = observed(slice(9, 20))
    third_block = observed(slice(20, None))

    first_truth = {"000000000", "111111111", "044444444"}
    second_truth = {"33333333333", "22222222222", "44444555555"}
    third_truth = {"66", "77", "55"}

    print(cluster_paths)

    assert first_block == first_truth
    assert second_block == second_truth
    assert third_block == third_truth
Example #2
0
def find_inconsistencies(readset, clustering, ploidy):
    """Detect cluster positions whose allele mix is significantly ambivalent.

    For every variant position and every cluster that has both a coverage and
    a consensus entry there, a one-sided binomial test checks whether the
    number of reads deviating from the consensus is larger than an expected
    per-read error rate of 5% would explain. If the p-value is below 0.02,
    the position counts as inconsistent for that cluster and every pair of
    cluster reads carrying allele 0 vs. a non-zero allele at that position is
    reported as a pair that needs to be separated.

    Args:
        readset: Indexable by read id; each read iterates over variants that
            expose ``position`` and ``allele``.
        clustering: Indexable by cluster id; each cluster iterates over read
            ids usable as indices into ``readset``.
        ploidy: Passed through to ``get_pos_to_clusters_map``.

    Returns:
        A tuple ``(num_inconsistent_positions, separated_pairs)``. The count
        includes a position once per inconsistent cluster, so a position can
        be counted multiple times. ``separated_pairs`` is a list of
        ``(read_with_allele_0, read_with_other_allele)`` tuples.
    """
    exp_error = 0.05
    p_val_threshold = 0.02

    num_inconsistent_positions = 0
    separated_pairs = []

    # Compute consensus and coverage
    index, rev_index = get_position_map(readset)
    num_vars = len(rev_index)

    coverage = get_coverage(readset, clustering, index)
    cov_map = get_pos_to_clusters_map(coverage, ploidy)
    positions = get_cluster_start_end_positions(readset, clustering, index)
    abs_coverage = get_coverage_absolute(readset, clustering, index)
    consensus = get_local_cluster_consensus_withfrac(readset, clustering, cov_map, positions)

    # Search for positions in clusters with ambivalent consensus
    for pos in range(num_vars):
        for c_id in coverage[pos]:
            if c_id not in consensus[pos]:
                continue

            # Binomial hypothesis test: are the deviations from the majority
            # allele significant enough to justify splitting the cluster?
            abs_count = abs_coverage[pos][c_id]
            # Presumably the fraction of reads supporting the consensus allele
            # (second tuple element of the "_withfrac" helper) - TODO confirm.
            majority_frac = consensus[pos][c_id][1]
            # Round to nearest instead of truncating: the product is a read
            # count reconstructed from a float fraction, and binary rounding
            # error can leave it just below the true integer
            # (10 * (1 - 0.9) == 0.9999999999999998), which int() alone would
            # turn into one deviation too few.
            abs_deviations = int(round(abs_count * (1 - majority_frac)))
            # NOTE(review): if binom_test is scipy.stats.binom_test, it was
            # removed in SciPy 1.12 in favour of binomtest(...).pvalue -
            # confirm against the file's imports.
            p_val = binom_test(abs_deviations, abs_count, exp_error, alternative="greater")
            if p_val >= p_val_threshold:
                continue

            num_inconsistent_positions += 1

            # Partition the cluster's reads by the allele they carry at pos;
            # every non-zero allele is grouped together.
            zero_reads = []
            one_reads = []
            for read in clustering[c_id]:
                for var in readset[read]:
                    if index[var.position] == pos:
                        if var.allele == 0:
                            zero_reads.append(read)
                        else:
                            one_reads.append(read)

            separated_pairs.extend((r0, r1) for r0 in zero_reads for r1 in one_reads)

    return num_inconsistent_positions, separated_pairs