コード例 #1
0
def test_selection_with_preferred_sources():
    """Check read selection with and without a preferred source id.

    A read set built with ``source_id=3`` is extended with the reads of a
    second set built with ``source_id=1``.  Selection then runs twice with
    ``max_cov=2`` and bridging enabled: once without preferred sources and
    once preferring source 3.
    """
    readset = string_to_readset("""
	  1        1
	""", source_id=3)
    extra_reads = string_to_readset("""
	  1111
	     111
	        1111
	""", source_id=1)

    # Fold the second read set into the first one, read by read.
    for extra_read in extra_reads:
        readset.add(extra_read)

    # (preferred_source_ids, expected selection), evaluated in this order.
    cases = (
        (None, {1, 2, 3}),
        ({3}, {0, 1, 3}),
    )
    for preferred, expected in cases:
        selected_reads = readselection(
            readset,
            max_cov=2,
            preferred_source_ids=preferred,
            bridging=True,
        )
        assert selected_reads == expected, str(selected_reads)
コード例 #2
0
def test_read_merging():
    """Merge a four-read set with ``ReadMerger`` and compare to the expected three reads."""
    input_alleles = """
      0 000000
      111
      11 00111101
      0 00000
    """
    input_weights = """
      1 523428
      714
      86 03158958
      8 46626
    """
    reads = string_to_readset(input_alleles, input_weights)

    # default parameter settings
    merged_reads = ReadMerger(0.15, 0.25, 100000, 1000).merge(reads)

    expected_alleles = """
      0 000000
      111
      11 00111101
    """
    expected_weights = """
      9 989688
      714
      86 03158958
    """
    expected = string_to_readset(expected_alleles, expected_weights)

    assert_variants(merged_reads, expected)
コード例 #3
0
def check_genotyping_single_individual(
    reads,
    weights=None,
    expected=None,
    genotypes=None,
    scaling=None,
    genotype_priors=None,
):
    """Genotype one individual and hand the DP table to ``compare_to_expected``.

    Args:
        reads: Read description passed to ``string_to_readset`` as ``s``.
        weights: Passed to ``string_to_readset`` as ``w``.
        expected: Forwarded unchanged to ``compare_to_expected``.
        genotypes: Forwarded unchanged to ``compare_to_expected``.
        scaling: Passed to ``string_to_readset`` as ``scale_quality``.
        genotype_priors: If not ``None``, used in place of the uniform
            per-position genotype likelihoods.
    """
    # 0) set up read set
    readset = string_to_readset(s=reads, w=weights, scale_quality=scaling)
    positions = readset.get_positions()
    n_positions = len(positions)

    # 1) Genotype using forward backward algorithm
    recombcost = [1] * n_positions
    numeric_sample_ids = NumericSampleIds()
    pedigree = Pedigree(numeric_sample_ids)

    # Uniform likelihoods over the three genotypes; one shared object per position.
    uniform_likelihoods = [
        PhredGenotypeLikelihoods([1.0 / 3.0, 1.0 / 3.0, 1.0 / 3.0])
    ] * n_positions
    genotype_likelihoods = (
        uniform_likelihoods if genotype_priors is None else genotype_priors
    )

    initial_genotypes = [
        canonic_index_to_biallelic_gt(1) for _ in range(n_positions)
    ]
    pedigree.add_individual("individual0", initial_genotypes, genotype_likelihoods)

    dp_forward_backward = GenotypeDPTable(
        numeric_sample_ids, readset, recombcost, pedigree
    )

    # check the results
    compare_to_expected(dp_forward_backward, positions, expected, genotypes)
コード例 #4
0
def test_selection():
    """Check which reads are selected for increasing ``max_cov`` values."""
    reads = string_to_readset("""
	  1  1
	  00
	  0   1
	  10  1
	  1   1
	    11
	  0   1
	  1    1
	""")

    def select(max_cov, bridging):
        # All runs share the read set and use no preferred sources.
        return readselection(
            reads,
            max_cov=max_cov,
            preferred_source_ids=None,
            bridging=bridging,
        )

    selected_reads = select(1, False)
    assert selected_reads == {1, 5}

    selected_reads = select(2, False)
    assert selected_reads == {1, 3, 5}, str(selected_reads)

    selected_reads = select(3, False)
    assert selected_reads == {1, 3, 5, 7}, str(selected_reads)

    selected_reads = select(3, True)
    # Original author's note: bridging makes no difference here. Every position
    # has to be covered at least once before bridging starts, and by then the
    # reads {1, 3, 5, 7} already reach coverage 3, so the result equals the
    # non-bridging run above.
    assert selected_reads == {1, 3, 5, 7}, str(selected_reads)
コード例 #5
0
def test_clusterediting2():
    """Run cluster editing on a gapped read set and check no read is clustered twice."""
    read_matrix = """
        000000 00 0 00000 0000 0
             1111 11111
               000 00000 0000000
               111111111
                 1000000000
                  0 00000
                    11111
                    1 1 1111 1111111111
                    111111111111
        """

    # construct a ReadSet
    readset = string_to_readset(read_matrix)

    # compute similarities
    similarities = scoreReadsetGlobal(readset, 5, 2)
    print(similarities)

    # run cluster editing
    solver = ClusterEditingSolver(similarities, False)
    readpartitioning = solver.run()

    print("computed clusters: ", readpartitioning)

    # make sure each read occurs only once: flatten all clusters and collect
    # every id that shows up more than once
    all_ids = list(itertools.chain.from_iterable(readpartitioning))
    duplicates = {read_id for read_id in all_ids if all_ids.count(read_id) > 1}
    print("duplicates:", duplicates)
    assert len(duplicates) == 0
コード例 #6
0
def test_similarities1():
    """Check that the similarity computed for reads 0 and 1 is not NaN."""
    readset = string_to_readset("""
    001001
    110101
    """)
    scores = scoreReadsetGlobal(readset, 4, 2)
    # original note: the computed similarity is 'nan' -- the assertion below
    # checks for exactly that
    print("computed similarities:", scores)
    pair_score = scores.get(0, 1)
    assert not math.isnan(pair_score)
コード例 #7
0
def test_string():
    """Build a read set from its string form and run ``verify`` with both flag values."""
    rs = string_to_readset("""
	  0             0
	  110111111111
	  00100
	       0001000000
	       000
	        10100
	              101
	""")
    # Same order as before: first True, then False.
    for flag in (True, False):
        verify(rs, flag)
コード例 #8
0
def test_selection2():
    """Check that ``max_cov=4`` without bridging selects reads 0 through 3."""
    read_matrix = """
	  1111
	     111
	     1  111
	     1     11
	    1      11
	"""
    reads = string_to_readset(read_matrix)
    selected_reads = readselection(
        reads,
        max_cov=4,
        preferred_source_ids=None,
        bridging=False,
    )
    assert selected_reads == {0, 1, 2, 3}, str(selected_reads)
コード例 #9
0
def test_similarities2():
    """Compute and print similarities for eight reads; only checks that the call completes."""
    readset = string_to_readset("""
    00000
    00000
    00000
    00000
    11111
    11111
    10101
    10101
    """)
    scores = scoreReadsetGlobal(readset, 4, 4)
    # No assertion on the values themselves; the result is only printed.
    print("computed similarities:", scores)
コード例 #10
0
def test_read_merging2():
    """Check that ``merge_reads`` leaves the read set unchanged for these parameters."""
    allele_rows = """
	  0 000000
	  111
	  11 00111101
	  0 00000
	"""
    weight_rows = """
	  1 523428
	  714
	  86 03158958
	  8 46626
	"""
    reads = string_to_readset(allele_rows, weight_rows)

    # error rates and thresholds so high that no merging occurs
    merged_reads = merge_reads(reads, 0.5, 0.5, 1000, 100000)

    # With nothing merged, the output must match the input.
    assert_variants(merged_reads, reads)
コード例 #11
0
def test_components_of_readselection():
    """Compare the selection at ``max_cov=2`` with bridging switched off and on."""
    reads = string_to_readset("""
	  111
	     000
	  00
	      00
	   1   1
	""")

    without_bridging = readselection(
        reads,
        max_cov=2,
        preferred_source_ids=None,
        bridging=False,
    )
    assert without_bridging == {0, 1, 2, 3}, str(without_bridging)
    # Disabled check kept from the original:
    #	assert len(set(new_components.values())) == 2

    with_bridging = readselection(
        reads,
        max_cov=2,
        preferred_source_ids=None,
        bridging=True,
    )
    assert with_bridging == {0, 1, 4}, str(with_bridging)
コード例 #12
0
def bridging():
    """Compare the selection at ``max_cov=2`` with bridging switched off and on.

    NOTE(review): the name has no ``test_`` prefix, so pytest's default
    discovery will not collect this function -- confirm whether that is
    intentional.
    """
    reads = string_to_readset("""
	  11
	  00
	    11
	    00
	      11
	      00
	  1    1
	""")

    without_bridging = readselection(
        reads,
        max_cov=2,
        preferred_source_ids=None,
        bridging=False,
    )
    assert without_bridging == {0, 1, 2, 3, 4, 5}

    with_bridging = readselection(
        reads,
        max_cov=2,
        preferred_source_ids=None,
        bridging=True,
    )
    # Original author's open question: unclear why read 0 is selected here
    # rather than read 1.
    assert with_bridging == {0, 3, 5, 6}
コード例 #13
0
def test_clusterediting3():
    """Run cluster editing on nine reads and print the clusters; no assertions are made."""
    read_matrix = """
    0010111110111111111001111
    111111111111111111111 111
    011011111011111 111001111
     11 11111111 111111111111
    1111111111111111111111 11
    0010111110111111111001111
    111111111111111111111 111
    011011111011111 111001111
    011011111011111 111001111
    """
    # construct a ReadSet
    readset = string_to_readset(read_matrix)

    # compute similarities
    similarities = scoreReadsetGlobal(readset, 5, 3)
    print(similarities)

    # run cluster editing
    solver = ClusterEditingSolver(similarities, False)
    readpartitioning = solver.run()

    print("computed clusters: ", readpartitioning)
コード例 #14
0
ファイル: test_phasing.py プロジェクト: pontushojer/whatshap
def check_phasing_single_individual(reads, algorithm="whatshap", weights=None):
    """Phase a read set and pass each result to ``compare_phasing_brute_force``.

    Args:
        reads: Read description passed to ``string_to_readset``.
        algorithm: ``"hapchat"`` runs ``HapChatCore`` only and returns; any
            other value runs the two ``PedigreeDPTable`` scenarios below.
        weights: Passed to ``string_to_readset`` and forwarded to
            ``compare_phasing_brute_force``.
    """
    # 0) set up read set
    readset = string_to_readset(reads, weights)
    positions = readset.get_positions()

    # for hapchat
    if algorithm == "hapchat":
        hapchat_table = HapChatCore(readset)
        superreads = hapchat_table.get_super_reads()
        cost = hapchat_table.get_optimal_cost()
        partition = hapchat_table.get_optimal_partitioning()
        compare_phasing_brute_force(
            superreads[0][0], cost, partition, readset, True, weights, algorithm
        )
        return

    n_positions = len(positions)

    def heterozygous_genotypes():
        # A fresh list per call: one genotype of canonic index 1 per position.
        return [canonic_index_to_biallelic_gt(1) for _ in range(n_positions)]

    # 1) Phase using PedMEC code for single individual
    for all_heterozygous in (False, True):
        # recombination costs 1, should not occur
        recombcost = [1] * n_positions
        pedigree = Pedigree(NumericSampleIds())
        per_position = (
            None if all_heterozygous else PhredGenotypeLikelihoods([0, 0, 0])
        )
        genotype_likelihoods = [per_position] * n_positions
        # all genotypes heterozygous
        pedigree.add_individual(
            "individual0", heterozygous_genotypes(), genotype_likelihoods
        )
        dp_table = PedigreeDPTable(
            readset,
            recombcost,
            pedigree,
            distrust_genotypes=not all_heterozygous,
        )
        superreads, transmission_vector = dp_table.get_super_reads()
        cost = dp_table.get_optimal_cost()
        # TODO: transmission vectors not returned properly, see issue 73
        assert len(set(transmission_vector)) == 1
        partition = dp_table.get_optimal_partitioning()
        compare_phasing_brute_force(
            superreads[0], cost, partition, readset, all_heterozygous, weights
        )

    # 2) Phase using PedMEC code for trios with two "empty" individuals (i.e. having no reads)
    for all_heterozygous in (False, True):
        # recombination costs 1, should not occur
        recombcost = [1] * n_positions
        pedigree = Pedigree(NumericSampleIds())
        per_position = (
            None if all_heterozygous else PhredGenotypeLikelihoods([0, 0, 0])
        )
        genotype_likelihoods = [per_position] * n_positions
        # all genotypes heterozygous; the three individuals share one
        # likelihood list but each gets its own genotype list
        for sample_name in ("individual0", "individual1", "individual2"):
            pedigree.add_individual(
                sample_name, heterozygous_genotypes(), genotype_likelihoods
            )
        pedigree.add_relationship("individual0", "individual1", "individual2")
        dp_table = PedigreeDPTable(
            readset,
            recombcost,
            pedigree,
            distrust_genotypes=not all_heterozygous,
        )
        # Here the cost is queried before the super reads (reverse of part 1).
        cost = dp_table.get_optimal_cost()
        superreads, transmission_vector = dp_table.get_super_reads()
        assert len(set(transmission_vector)) == 1
        partition = dp_table.get_optimal_partitioning()
        compare_phasing_brute_force(
            superreads[0], cost, partition, readset, all_heterozygous, weights
        )