def test_selection_with_preferred_sources():
    """Read selection must favor reads from a preferred source id."""
    readset = string_to_readset(""" 1 1 """, source_id=3)
    extra = string_to_readset(""" 1111 111 1111 """, source_id=1)
    for read in extra:
        readset.add(read)
    # No preference: the selection heuristic picks reads 1, 2 and 3.
    chosen = readselection(readset, max_cov=2, preferred_source_ids=None, bridging=True)
    assert chosen == {1, 2, 3}, str(chosen)
    # Preferring source 3 pulls read 0 (the only source-3 read) into the result.
    chosen = readselection(readset, max_cov=2, preferred_source_ids={3}, bridging=True)
    assert chosen == {0, 1, 3}, str(chosen)
def test_read_merging():
    """ReadMerger with the given thresholds merges compatible reads."""
    reads = string_to_readset(
        """ 0 000000 111 11 00111101 0 00000 """,
        """ 1 523428 714 86 03158958 8 46626 """,
    )
    merger = ReadMerger(0.15, 0.25, 100000, 1000)
    # default parameter settings
    merged = merger.merge(reads)
    expected = string_to_readset(
        """ 0 000000 111 11 00111101 """,
        """ 9 989688 714 86 03158958 """,
    )
    assert_variants(merged, expected)
def check_genotyping_single_individual(
    reads,
    weights=None,
    expected=None,
    genotypes=None,
    scaling=None,
    genotype_priors=None,
):
    """Genotype a single individual via forward-backward and compare to expectations.

    If genotype_priors is None, uniform priors (1/3 per genotype) are used at
    every position; otherwise the given priors are passed through unchanged.
    """
    # 0) set up read set
    readset = string_to_readset(s=reads, w=weights, scale_quality=scaling)
    positions = readset.get_positions()

    # 1) Genotype using the forward-backward algorithm.
    recombcost = [1] * len(positions)
    numeric_sample_ids = NumericSampleIds()
    pedigree = Pedigree(numeric_sample_ids)
    if genotype_priors is not None:
        likelihoods = genotype_priors
    else:
        # Uniform prior over the three biallelic genotypes at each position.
        likelihoods = [
            PhredGenotypeLikelihoods([1.0 / 3.0, 1.0 / 3.0, 1.0 / 3.0])
        ] * len(positions)
    pedigree.add_individual(
        "individual0",
        [canonic_index_to_biallelic_gt(1) for _ in range(len(positions))],
        likelihoods,
    )
    dp_forward_backward = GenotypeDPTable(numeric_sample_ids, readset, recombcost, pedigree)

    # check the results
    compare_to_expected(dp_forward_backward, positions, expected, genotypes)
def test_selection():
    """Read selection at increasing coverage limits, with and without bridging."""
    reads = string_to_readset(""" 1 1 00 0 1 10 1 1 1 11 0 1 1 1 """)

    chosen = readselection(reads, max_cov=1, preferred_source_ids=None, bridging=False)
    assert chosen == {1, 5}

    chosen = readselection(reads, max_cov=2, preferred_source_ids=None, bridging=False)
    assert chosen == {1, 3, 5}, str(chosen)

    chosen = readselection(reads, max_cov=3, preferred_source_ids=None, bridging=False)
    assert chosen == {1, 3, 5, 7}, str(chosen)

    # Bridging changes nothing here: before bridging starts, every position must
    # be covered at least once, and {1, 3, 5, 7} already brings the coverage in
    # the slice up to the max_cov=3 limit, so no bridging reads can be added.
    chosen = readselection(reads, max_cov=3, preferred_source_ids=None, bridging=True)
    assert chosen == {1, 3, 5, 7}, str(chosen)
def test_clusterediting2():
    """Cluster editing must assign every read to exactly one cluster."""
    reads = """ 000000 00 0 00000 0000 0 1111 11111 000 00000 0000000 111111111 1000000000 0 00000 11111 1 1 1111 1111111111 111111111111 """
    # construct a ReadSet
    readset = string_to_readset(reads)
    # compute similarities
    similarities = scoreReadsetGlobal(readset, 5, 2)
    print(similarities)
    # run cluster editing
    solver = ClusterEditingSolver(similarities, False)
    readpartitioning = solver.run()
    print("computed clusters: ", readpartitioning)
    # make sure each read occurs only once
    read_ids = list(itertools.chain.from_iterable(readpartitioning))
    duplicates = {r for r in read_ids if read_ids.count(r) > 1}
    print("duplicates:", duplicates)
    assert len(duplicates) == 0
def test_similarities1():
    """The similarity of two fully conflicting reads must not be NaN."""
    reads = """ 001001 110101 """
    readset = string_to_readset(reads)
    similarities = scoreReadsetGlobal(readset, 4, 2)
    # computed similarity is 'nan'
    print("computed similarities:", similarities)
    score = similarities.get(0, 1)
    assert not math.isnan(score)
def test_string():
    """Round-trip a read matrix through string_to_readset and verify both modes."""
    reads = """ 0 0 110111111111 00100 0001000000 000 10100 101 """
    rs = string_to_readset(reads)
    for flag in (True, False):
        verify(rs, flag)
def test_selection2():
    """With max_cov=4 and no bridging, all four reads are selected."""
    reads = string_to_readset(""" 1111 111 1 111 1 11 1 11 """)
    chosen = readselection(reads, max_cov=4, preferred_source_ids=None, bridging=False)
    assert chosen == {0, 1, 2, 3}, str(chosen)
def test_similarities2():
    """Smoke test: similarity scoring runs without raising.

    NOTE(review): this test has no assertion — it only verifies that
    scoreReadsetGlobal completes on this input. Consider asserting on
    the computed values; confirm intent with the original author.
    """
    reads = """ 00000 00000 00000 00000 11111 11111 10101 10101 """
    readset = string_to_readset(reads)
    similarities = scoreReadsetGlobal(readset, 4, 4)
    print("computed similarities:", similarities)
def test_read_merging2():
    """With extreme error rates and thresholds, merging must be a no-op."""
    reads = string_to_readset(
        """ 0 000000 111 11 00111101 0 00000 """,
        """ 1 523428 714 86 03158958 8 46626 """,
    )
    # error rates and thresholds so high that no merging occurs
    merged = merge_reads(reads, 0.5, 0.5, 1000, 100000)
    assert_variants(merged, reads)
def test_components_of_readselection():
    """Bridging changes which reads are selected across connected components."""
    reads = string_to_readset(""" 111 000 00 00 1 1 """)

    chosen = readselection(reads, max_cov=2, preferred_source_ids=None, bridging=False)
    assert chosen == {0, 1, 2, 3}, str(chosen)
    # assert len(set(new_components.values())) == 2

    chosen = readselection(reads, max_cov=2, preferred_source_ids=None, bridging=True)
    assert chosen == {0, 1, 4}, str(chosen)
def bridging():
    """Compare read selection with and without bridging on disjoint blocks.

    NOTE(review): this function lacks the ``test_`` prefix, so pytest will not
    collect it — confirm whether it was deliberately disabled.
    """
    reads = string_to_readset(""" 11 00 11 00 11 00 1 1 """)

    chosen = readselection(reads, max_cov=2, preferred_source_ids=None, bridging=False)
    assert chosen == {0, 1, 2, 3, 4, 5}

    # Not sure why read 0 is selected here rather than read 1.
    chosen = readselection(reads, max_cov=2, preferred_source_ids=None, bridging=True)
    assert chosen == {0, 3, 5, 6}
def test_clusterediting3():
    """Smoke test: cluster editing runs end-to-end on a larger read set."""
    reads = """ 0010111110111111111001111 111111111111111111111 111 011011111011111 111001111 11 11111111 111111111111 1111111111111111111111 11 0010111110111111111001111 111111111111111111111 111 011011111011111 111001111 011011111011111 111001111 """
    # construct a ReadSet
    readset = string_to_readset(reads)
    # compute similarities
    similarities = scoreReadsetGlobal(readset, 5, 3)
    print(similarities)
    # run cluster editing
    solver = ClusterEditingSolver(similarities, False)
    readpartitioning = solver.run()
    print("computed clusters: ", readpartitioning)
def check_phasing_single_individual(reads, algorithm="whatshap", weights=None):
    """Phase a single individual and compare the result to brute force.

    Runs either HapChat or the PedMEC code; the PedMEC path is exercised
    twice — once as a lone individual and once as a trio whose two extra
    members have no reads — and in each case both with and without
    distrusting genotypes.
    """
    # 0) set up read set
    readset = string_to_readset(reads, weights)
    positions = readset.get_positions()

    # for hapchat
    if algorithm == "hapchat":
        table = HapChatCore(readset)
        superreads = table.get_super_reads()
        cost = table.get_optimal_cost()
        partition = table.get_optimal_partitioning()
        compare_phasing_brute_force(
            superreads[0][0], cost, partition, readset, True, weights, algorithm
        )
        return

    # 1) Phase using PedMEC code for a single individual.
    for all_heterozygous in (False, True):
        # Recombination costs 1; recombination should not occur.
        recombcost = [1] * len(positions)
        pedigree = Pedigree(NumericSampleIds())
        genotype_likelihoods = [
            None if all_heterozygous else PhredGenotypeLikelihoods([0, 0, 0])
        ] * len(positions)
        # all genotypes heterozygous
        pedigree.add_individual(
            "individual0",
            [canonic_index_to_biallelic_gt(1) for _ in range(len(positions))],
            genotype_likelihoods,
        )
        table = PedigreeDPTable(
            readset, recombcost, pedigree, distrust_genotypes=not all_heterozygous
        )
        superreads, transmission_vector = table.get_super_reads()
        cost = table.get_optimal_cost()
        # TODO: transmission vectors not returned properly, see issue 73
        assert len(set(transmission_vector)) == 1
        partition = table.get_optimal_partitioning()
        compare_phasing_brute_force(
            superreads[0], cost, partition, readset, all_heterozygous, weights
        )

    # 2) Phase using PedMEC code for a trio with two "empty" individuals
    # (i.e. having no reads).
    for all_heterozygous in (False, True):
        # Recombination costs 1; recombination should not occur.
        recombcost = [1] * len(positions)
        pedigree = Pedigree(NumericSampleIds())
        genotype_likelihoods = [
            None if all_heterozygous else PhredGenotypeLikelihoods([0, 0, 0])
        ] * len(positions)
        # All three individuals get all-heterozygous genotypes.
        for name in ("individual0", "individual1", "individual2"):
            pedigree.add_individual(
                name,
                [canonic_index_to_biallelic_gt(1) for _ in range(len(positions))],
                genotype_likelihoods,
            )
        pedigree.add_relationship("individual0", "individual1", "individual2")
        table = PedigreeDPTable(
            readset, recombcost, pedigree, distrust_genotypes=not all_heterozygous
        )
        cost = table.get_optimal_cost()
        superreads, transmission_vector = table.get_super_reads()
        assert len(set(transmission_vector)) == 1
        partition = table.get_optimal_partitioning()
        compare_phasing_brute_force(
            superreads[0], cost, partition, readset, all_heterozygous, weights
        )