def test_genotyping_trio5():
    """Genotype three related individuals over three positions.

    Every individual starts from a uniform genotype prior; the expected
    genotypes are handed to ``genotype_pedigree`` for checking.
    """
    reads = """
      B 101
      B 101
      B 101
      A 111
      A 111
      A 111
      C 111
      C 111
      C 101
      C 101
    """
    expected_genotypes = [
        canonic_index_list_to_biallelic_gt_list(indices)
        for indices in ([2, 2, 2], [2, 0, 2], [2, 1, 2])
    ]
    numeric_sample_ids = NumericSampleIds()
    pedigree = Pedigree(numeric_sample_ids)

    # Each individual gets its own likelihood object, repeated for the
    # three positions.
    third = 1.0 / 3.0
    for name in ("individual0", "individual1", "individual2"):
        pedigree.add_individual(
            name,
            canonic_index_list_to_biallelic_gt_list([0, 0, 0]),
            [PhredGenotypeLikelihoods([third, third, third])] * 3,
        )
    pedigree.add_relationship("individual0", "individual1", "individual2")

    recombcost = [2] * 3
    genotype_pedigree(
        numeric_sample_ids, reads, recombcost, pedigree, expected_genotypes
    )
def verify(rs, all_heterozygous=False):
    """Phase read set ``rs`` as a single individual and check the result.

    Builds a one-person pedigree, constructs a ``PedigreeDPTable`` and
    passes it to ``verify_mec_score_and_partitioning``. When
    ``all_heterozygous`` is false, genotypes are distrusted and zero-valued
    genotype likelihoods are supplied; otherwise no likelihoods are given.
    """
    positions = rs.get_positions()
    n_positions = len(positions)

    # recombination costs 1, should not occur
    recombcost = [1] * n_positions
    pedigree = Pedigree(NumericSampleIds())

    # One shared entry repeated for every position.
    if all_heterozygous:
        prior = None
    else:
        prior = PhredGenotypeLikelihoods(0, 0, 0)
    genotype_likelihoods = [prior] * n_positions

    # all genotypes heterozygous
    pedigree.add_individual("individual0", [1] * n_positions, genotype_likelihoods)

    distrust = not all_heterozygous
    dp_table = PedigreeDPTable(rs, recombcost, pedigree, distrust_genotypes=distrust)
    verify_mec_score_and_partitioning(dp_table, rs)
def test_genotyping_trio14():
    """Genotype three related individuals over six positions.

    Uses uniform priors, a recombination cost of 1000000 at every position
    and a quality scaling of 1000.
    """
    reads = """
      A 111111
      A 111111
      B 111111
      B 000000
      C 000000
    """
    expected_genotypes = [
        canonic_index_list_to_biallelic_gt_list([index] * 6)
        for index in (2, 1, 1)
    ]
    numeric_sample_ids = NumericSampleIds()
    pedigree = Pedigree(numeric_sample_ids)

    # Each individual gets its own likelihood object, repeated for the
    # six positions.
    third = 1 / 3.0
    for name in ("individual0", "individual1", "individual2"):
        pedigree.add_individual(
            name,
            canonic_index_list_to_biallelic_gt_list([0] * 6),
            [PhredGenotypeLikelihoods([third, third, third])] * 6,
        )
    pedigree.add_relationship("individual0", "individual1", "individual2")

    recombcost = [1000000] * 6
    genotype_pedigree(
        numeric_sample_ids,
        reads,
        recombcost,
        pedigree,
        expected_genotypes,
        scaling=1000,
    )
def verify(rs, all_heterozygous=False):
    """Phase read set ``rs`` as a single individual and check the result.

    Builds a one-person pedigree whose genotype at every position is
    ``canonic_index_to_biallelic_gt(1)``, constructs a ``PedigreeDPTable``
    and passes it to ``verify_mec_score_and_partitioning``. When
    ``all_heterozygous`` is false, genotypes are distrusted and zero-valued
    genotype likelihoods are supplied; otherwise no likelihoods are given.
    """
    positions = rs.get_positions()
    n_positions = len(positions)

    # recombination costs 1, should not occur
    recombcost = [1] * n_positions
    pedigree = Pedigree(NumericSampleIds())

    # One shared entry repeated for every position.
    if all_heterozygous:
        prior = None
    else:
        prior = PhredGenotypeLikelihoods([0, 0, 0])
    genotype_likelihoods = [prior] * n_positions

    # all genotypes heterozygous (a fresh genotype object per position)
    het_genotypes = [canonic_index_to_biallelic_gt(1) for _ in range(n_positions)]
    pedigree.add_individual("individual0", het_genotypes, genotype_likelihoods)

    distrust = not all_heterozygous
    dp_table = PedigreeDPTable(rs, recombcost, pedigree, distrust_genotypes=distrust)
    verify_mec_score_and_partitioning(dp_table, rs)
def check_genotyping_single_individual(
    reads,
    weights=None,
    expected=None,
    genotypes=None,
    scaling=None,
    genotype_priors=None,
):
    """Genotype a single individual and compare against expectations.

    Args:
        reads: Read matrix string, passed to ``string_to_readset`` as ``s``.
        weights: Optional weights string, passed as ``w``.
        expected: Expected genotype likelihoods, forwarded to
            ``compare_to_expected``.
        genotypes: Expected genotypes, forwarded to ``compare_to_expected``.
        scaling: Quality scaling, passed as ``scale_quality``.
        genotype_priors: Optional per-position prior likelihoods. When
            ``None``, a uniform prior (1/3 each) is used at every position.
    """
    # 0) set up read set
    readset = string_to_readset(s=reads, w=weights, scale_quality=scaling)
    positions = readset.get_positions()

    # 1) Genotype using forward backward algorithm
    recombcost = [1] * len(positions)
    numeric_sample_ids = NumericSampleIds()
    pedigree = Pedigree(numeric_sample_ids)

    # Identity comparison with None; only build the uniform default when
    # the caller supplied no priors.
    if genotype_priors is None:
        genotype_likelihoods = [
            PhredGenotypeLikelihoods(1.0 / 3.0, 1.0 / 3.0, 1.0 / 3.0)
        ] * len(positions)
    else:
        genotype_likelihoods = genotype_priors

    pedigree.add_individual("individual0", [1] * len(positions), genotype_likelihoods)
    dp_forward_backward = GenotypeDPTable(
        numeric_sample_ids, readset, recombcost, pedigree
    )

    # check the results
    compare_to_expected(dp_forward_backward, positions, expected, genotypes)
def phase_MAV(reads, n_alleles, all_het, genos, genotypes, weights=None):
    """Phase a single individual's reads and return the DP table's results.

    Args:
        reads: Read matrix string, passed to ``string_to_readset``.
        n_alleles: Second positional argument to ``string_to_readset``.
        all_het: Iterable of booleans; one DP table is built per entry.
        genos: Values wrapped in ``PhredGenotypeLikelihoods`` and used at
            every position when the current flag is false.
        genotypes: Genotypes given to ``Pedigree.add_individual``.
        weights: Not used in this function body.

    Returns:
        Tuple ``(superreads_list, transmission_vector, cost)`` taken from
        the DP table.
    """
    readset = string_to_readset(reads, n_alleles)
    positions = readset.get_positions()
    for all_heterozygous in all_het:
        recombcost = [1] * len(
            positions)  # recombination costs 1, should not occur
        pedigree = Pedigree(NumericSampleIds())
        # One shared entry (None, or a single likelihood object) repeated
        # for every position.
        genotype_likelihoods = [
            None if all_heterozygous else PhredGenotypeLikelihoods(genos)
        ] * len(positions)
        pedigree.add_individual(
            'individual0', genotypes,
            genotype_likelihoods)  # all genotypes heterozygous
        # Genotypes are distrusted exactly when not all-heterozygous.
        dp_table = PedigreeDPTable(readset,
                                   recombcost,
                                   pedigree,
                                   distrust_genotypes=not all_heterozygous)
        superreads_list, transmission_vector = dp_table.get_super_reads()
        cost = dp_table.get_optimal_cost()
    # NOTE(review): as laid out here the returned values come from the final
    # loop iteration, and an empty all_het would leave them unbound -
    # confirm against callers that pass more than one flag.
    return superreads_list, transmission_vector, cost
def create_pedigree(
    default_gq,
    distrust_genotypes,
    family,
    gl_regularizer,
    numeric_sample_ids,
    phasable_variant_table,
    trios,
):
    """Build a ``Pedigree`` for ``family`` from a variant table.

    Every sample in ``family`` is added with its genotypes from
    ``phasable_variant_table``. Genotype likelihoods are only attached when
    ``distrust_genotypes`` is true; otherwise ``None`` is passed. Each trio
    in ``trios`` is then registered as a father/mother/child relationship.

    Args:
        default_gq: Phred value assigned to the non-called genotypes when a
            variant has no genotype likelihoods.
        distrust_genotypes: Whether to pass likelihoods to the pedigree.
        family: Iterable of sample names.
        gl_regularizer: Forwarded to ``as_phred(regularizer=...)``.
        numeric_sample_ids: Passed to the ``Pedigree`` constructor.
        phasable_variant_table: Source of genotypes and likelihoods.
        trios: Iterable of objects with ``father``, ``mother``, ``child``.

    Returns:
        The populated ``Pedigree``.
    """
    pedigree = Pedigree(numeric_sample_ids)

    for sample in family:
        genotype_likelihoods = None
        if distrust_genotypes:
            # If distrusting genotypes, we pass genotype likelihoods on to
            # the pedigree object.
            genotype_likelihoods = []
            per_variant = zip(
                phasable_variant_table.genotypes_of(sample),
                phasable_variant_table.genotype_likelihoods_of(sample),
            )
            for gt, gl in per_variant:
                assert gt.is_diploid_and_biallelic()
                if gl is not None:
                    genotype_likelihoods.append(
                        gl.as_phred(regularizer=gl_regularizer)
                    )
                    continue
                # All genotypes get default_gq as genotype likelihood,
                # except the called genotype, which gets a 0.
                phred_values = [default_gq] * 3
                phred_values[gt.get_index()] = 0
                genotype_likelihoods.append(PhredGenotypeLikelihoods(phred_values))

        pedigree.add_individual(
            sample,
            phasable_variant_table.genotypes_of(sample),
            genotype_likelihoods,
        )

    for trio in trios:
        pedigree.add_relationship(
            father_id=trio.father,
            mother_id=trio.mother,
            child_id=trio.child,
        )
    return pedigree
def test_geno_10():
    """Genotype six positions from six reads using non-uniform priors.

    Only expected genotypes are checked (no expected likelihoods); quality
    scaling is 50.
    """
    reads = """
          001100
          000000
          000000
          110011
          110011
          111111
          """
    genotypes = canonic_index_list_to_biallelic_gt_list([1, 1, 0, 0, 1, 1])

    # Two distinct prior vectors are used; a fresh likelihood object is
    # created for every position.
    middle_heavy = (0.1, 0.8, 0.1)
    first_heavy = (0.7, 0.2, 0.1)
    per_position = (
        middle_heavy,
        middle_heavy,
        first_heavy,
        first_heavy,
        middle_heavy,
        middle_heavy,
    )
    genotype_priors = [PhredGenotypeLikelihoods(list(prior)) for prior in per_position]

    check_genotyping_single_individual(
        reads, None, None, genotypes, 50, genotype_priors
    )
def test_geno_priors2():
    """Check posterior genotype likelihoods against hand-computed values.

    Three prior likelihood vectors are supplied and three posterior vectors
    are expected; quality scaling is 10.
    """
    # NOTE(review): three priors are supplied for two 2-column reads, so the
    # second read is presumably offset by one column so that the reads
    # jointly span three positions - confirm the exact whitespace against
    # string_to_readset's column handling.
    reads = """
          11
           01
          """
    # Priors, one per position.
    prior_likelihoods = [
        PhredGenotypeLikelihoods([0, 0.5, 0.5]),
        PhredGenotypeLikelihoods([0.25, 0.5, 0.25]),
        PhredGenotypeLikelihoods([0.1, 0.4, 0.5]),
    ]
    # Expected posteriors, one per position.
    expected_likelihoods = [
        PhredGenotypeLikelihoods([0.0, 0.35714285714285715, 0.6428571428571429]),
        PhredGenotypeLikelihoods([0.1323529411764706, 0.7352941176470589, 0.1323529411764706]),
        PhredGenotypeLikelihoods([0.015151515151515152, 0.30303030303030304, 0.6818181818181818]),
    ]
    check_genotyping_single_individual(
        reads, None, expected_likelihoods, None, 10, prior_likelihoods
    )
def check_phasing_single_individual(reads, algorithm="whatshap", weights=None):
    """Phase ``reads`` and compare the outcome with brute force.

    With ``algorithm == "hapchat"`` only ``HapChatCore`` is exercised.
    Otherwise ``PedigreeDPTable`` is run twice for a single individual and
    twice for a trio whose two extra members are added alongside the same
    read set, once with and once without the all-heterozygous assumption.
    """
    # 0) set up read set
    readset = string_to_readset(reads, weights)
    positions = readset.get_positions()

    # for hapchat
    if algorithm == "hapchat":
        hapchat_core = HapChatCore(readset)
        superreads = hapchat_core.get_super_reads()
        cost = hapchat_core.get_optimal_cost()
        partition = hapchat_core.get_optimal_partitioning()
        compare_phasing_brute_force(
            superreads[0][0], cost, partition, readset, True, weights, algorithm
        )
        return

    n_positions = len(positions)

    # 1) Phase using PedMEC code for single individual
    for all_heterozygous in (False, True):
        # recombination costs 1, should not occur
        recombcost = [1] * n_positions
        pedigree = Pedigree(NumericSampleIds())
        prior = None if all_heterozygous else PhredGenotypeLikelihoods([0, 0, 0])
        genotype_likelihoods = [prior] * n_positions
        # all genotypes heterozygous
        het_genotypes = [canonic_index_to_biallelic_gt(1) for _ in range(n_positions)]
        pedigree.add_individual("individual0", het_genotypes, genotype_likelihoods)

        dp_table = PedigreeDPTable(
            readset, recombcost, pedigree, distrust_genotypes=not all_heterozygous
        )
        superreads, transmission_vector = dp_table.get_super_reads()
        cost = dp_table.get_optimal_cost()
        # TODO: transmission vectors not returned properly, see issue 73
        assert len(set(transmission_vector)) == 1
        partition = dp_table.get_optimal_partitioning()
        compare_phasing_brute_force(
            superreads[0], cost, partition, readset, all_heterozygous, weights
        )

    # 2) Phase using PedMEC code for trios with two "empty" individuals
    #    (i.e. having no reads)
    for all_heterozygous in (False, True):
        # recombination costs 1, should not occur
        recombcost = [1] * n_positions
        pedigree = Pedigree(NumericSampleIds())
        prior = None if all_heterozygous else PhredGenotypeLikelihoods([0, 0, 0])
        # The same likelihood list is handed to all three individuals.
        genotype_likelihoods = [prior] * n_positions
        for name in ("individual0", "individual1", "individual2"):
            # all genotypes heterozygous (fresh genotype objects per member)
            het_genotypes = [
                canonic_index_to_biallelic_gt(1) for _ in range(n_positions)
            ]
            pedigree.add_individual(name, het_genotypes, genotype_likelihoods)
        pedigree.add_relationship("individual0", "individual1", "individual2")

        dp_table = PedigreeDPTable(
            readset, recombcost, pedigree, distrust_genotypes=not all_heterozygous
        )
        cost = dp_table.get_optimal_cost()
        superreads, transmission_vector = dp_table.get_super_reads()
        assert len(set(transmission_vector)) == 1
        partition = dp_table.get_optimal_partitioning()
        compare_phasing_brute_force(
            superreads[0], cost, partition, readset, all_heterozygous, weights
        )
def run_genotype(
    phase_input_files,
    variant_file,
    reference=None,
    output=sys.stdout,
    samples=None,
    chromosomes=None,
    ignore_read_groups=False,
    indels=True,
    mapping_quality=20,
    max_coverage=15,
    nopriors=False,
    ped=None,
    recombrate=1.26,
    genmap=None,
    gt_qual_threshold=0,
    prioroutput=None,
    constant=0.0,
    overhang=10,
    affine_gap=False,
    gap_start=10,
    gap_extend=7,
    mismatch=15,
    write_command_line_header=True,
    use_ped_samples=False,
):
    """
    For now: this function only runs the genotyping algorithm. Genotype likelihoods for
    all variants are computed using the forward backward algorithm

    For every chromosome table yielded by the VCF reader, prior genotype
    likelihoods are set per sample (computed from reads, or uniform when
    ``nopriors`` is set), then one ``GenotypeDPTable`` is built per family
    and the resulting genotypes and likelihoods are written to the output
    VCF. Timing information is logged at the end.

    Args:
        phase_input_files: Forwarded to ``PhasedInputReader``.
        variant_file: VCF read with ``VcfReader`` and used as ``in_path``
            of the VCF writers.
        reference: Forwarded to ``PhasedInputReader``.
        output: ``out_file`` of the main ``GenotypeVcfWriter``.
        samples: Samples to genotype. If empty/None, all samples of the VCF
            are used; replaced by the PED trio members when both ``ped``
            and ``use_ped_samples`` are set.
        chromosomes: If non-empty, only these chromosomes are genotyped;
            others are written with ``leave_unchanged=True``.
        ignore_read_groups: Forwarded to ``PhasedInputReader``; requires
            ``samples`` when the VCF has more than one sample.
        indels: Forwarded to the readers and to ``write_genotypes``.
        mapping_quality: Passed to ``PhasedInputReader`` as
            ``mapq_threshold``.
        max_coverage: Passed to ``setup_families`` and divided by the family
            size (minimum 1) to get the per-sample coverage for read
            selection.
        nopriors: If true, uniform priors (1/3 each) are used instead of
            priors computed from the reads.
        ped: PED file; passed to ``PedReader`` and ``setup_families``.
        recombrate: Argument of ``UniformRecombinationCostComputer``.
        genmap: Genetic map; used (together with ``ped``) to select
            ``GeneticMapRecombinationCostComputer``.
        gt_qual_threshold: Converted to the probability threshold
            ``1 - 10 ** (-gt_qual_threshold / 10)`` given to
            ``determine_genotype``.
        prioroutput: If not None, path opened for writing to which the
            prior likelihoods are written as a separate VCF.
        constant: Added to each of the three prior likelihoods before they
            are renormalised.
        overhang, affine_gap, gap_start, gap_extend, mismatch: Forwarded to
            ``PhasedInputReader`` (as ``overhang``, ``affine``,
            ``gap_start``, ``gap_extend``, ``default_mismatch``).
        write_command_line_header: If true, a command-line string built from
            ``sys.argv`` is passed to the VCF writers; otherwise ``None``.
        use_ped_samples: See ``samples``.

    Raises:
        CommandLineError: If ``ignore_read_groups`` is used on a
            multi-sample VCF without ``samples``, or if a requested sample
            is not present in the VCF.
    """
    timers = StageTimer()
    logger.info(
        "This is WhatsHap (genotyping) %s running under Python %s",
        __version__,
        platform.python_version(),
    )
    if write_command_line_header:
        command_line = "(whatshap {}) {}".format(__version__,
                                                 " ".join(sys.argv[1:]))
    else:
        command_line = None
    # The ExitStack owns every reader/writer entered below.
    with ExitStack() as stack:
        # read the given input files (BAMs, VCFs, ref...)
        numeric_sample_ids = NumericSampleIds()
        phased_input_reader = stack.enter_context(
            PhasedInputReader(
                phase_input_files,
                reference,
                numeric_sample_ids,
                ignore_read_groups,
                indels=indels,
                mapq_threshold=mapping_quality,
                overhang=overhang,
                affine=affine_gap,
                gap_start=gap_start,
                gap_extend=gap_extend,
                default_mismatch=mismatch,
            ))
        # Remembered for the summary: only report phasing-VCF parse time
        # when such inputs exist.
        show_phase_vcfs = phased_input_reader.has_vcfs
        # vcf writer for final genotype likelihoods
        vcf_writer = stack.enter_context(
            GenotypeVcfWriter(command_line=command_line,
                              in_path=variant_file,
                              out_file=output))
        # vcf writer for only the prior likelihoods (if output is desired)
        prior_vcf_writer = None
        if prioroutput is not None:
            prior_vcf_writer = stack.enter_context(
                GenotypeVcfWriter(
                    command_line=command_line,
                    in_path=variant_file,
                    # The file handle is registered on the stack as well.
                    out_file=stack.enter_context(open(prioroutput, "w")),
                ))
        # parse vcf with input variants
        # remove all likelihoods that may already be present
        vcf_reader = stack.enter_context(
            VcfReader(
                variant_file,
                indels=indels,
                genotype_likelihoods=False,
                ignore_genotypes=True,
            ))
        if ignore_read_groups and not samples and len(vcf_reader.samples) > 1:
            raise CommandLineError(
                "When using --ignore-read-groups on a VCF with "
                "multiple samples, --sample must also be used.")
        if not samples:
            samples = vcf_reader.samples
        # if --use-ped-samples is set, use only samples from PED file
        if ped and use_ped_samples:
            samples = set()
            for trio in PedReader(ped):
                # Skip PED entries that do not describe a complete trio.
                if trio.child is None or trio.mother is None or trio.father is None:
                    continue
                samples.add(trio.mother)
                samples.add(trio.father)
                samples.add(trio.child)
        # Every requested sample must exist in the VCF.
        vcf_sample_set = set(vcf_reader.samples)
        for sample in samples:
            if sample not in vcf_sample_set:
                raise CommandLineError(
                    "Sample {!r} requested on command-line not found in VCF".
                    format(sample))
        # A genetic map is only used together with a PED file; in every
        # other case the uniform cost computer is used.
        if ped and genmap:
            logger.info(
                "Using region-specific recombination rates from genetic map %s.",
                genmap,
            )
            recombination_cost_computer = GeneticMapRecombinationCostComputer(
                genmap)
        else:
            if ped:
                logger.info("Using uniform recombination rate of %g cM/Mb.",
                            recombrate)
            recombination_cost_computer = UniformRecombinationCostComputer(
                recombrate)
        samples = frozenset(samples)
        families, family_trios = setup_families(samples, ped,
                                                numeric_sample_ids,
                                                max_coverage)
        # Read phase information provided as VCF files, if provided.
        with timers("parse_phasing_vcfs"):
            phased_input_reader.read_vcfs()
        # compute genotype likelihood threshold
        gt_prob = 1.0 - (10**(-gt_qual_threshold / 10.0))
        for variant_table in timers.iterate("parse_vcf", vcf_reader):
            # create a mapping of genome positions to indices
            var_to_pos = dict()
            for i in range(len(variant_table.variants)):
                var_to_pos[variant_table.variants[i].position] = i
            chromosome = variant_table.chromosome
            if (not chromosomes) or (chromosome in chromosomes):
                logger.info("======== Working on chromosome %r", chromosome)
            else:
                # Not requested: copy the records through untouched and move
                # on to the next chromosome.
                logger.info(
                    "Leaving chromosome %r unchanged (present in VCF but not requested by option --chromosome)",
                    chromosome,
                )
                vcf_writer.write_genotypes(chromosome,
                                           variant_table,
                                           indels,
                                           leave_unchanged=True)
                if prioroutput is not None:
                    prior_vcf_writer.write_genotypes(chromosome,
                                                     variant_table,
                                                     indels,
                                                     leave_unchanged=True)
                continue
            positions = [v.position for v in variant_table.variants]
            if not nopriors:
                # compute prior genotype likelihoods based on all reads
                for sample in samples:
                    logger.info("---- Initial genotyping of %s", sample)
                    with timers("read_bam"):
                        readset, vcf_source_ids = phased_input_reader.read(
                            chromosome,
                            variant_table.variants,
                            sample,
                            read_vcf=False,
                        )
                        readset.sort()
                        genotypes, genotype_likelihoods = compute_genotypes(
                            readset, positions)
                        # recompute genotypes based on given threshold
                        reg_genotype_likelihoods = []
                        for gl in range(len(genotype_likelihoods)):
                            # Add `constant` to each of the three values and
                            # renormalise so they sum to one again.
                            norm_sum = (genotype_likelihoods[gl][0] +
                                        genotype_likelihoods[gl][1] +
                                        genotype_likelihoods[gl][2] +
                                        3 * constant)
                            regularized = PhredGenotypeLikelihoods([
                                (genotype_likelihoods[gl][0] + constant) /
                                norm_sum,
                                (genotype_likelihoods[gl][1] + constant) /
                                norm_sum,
                                (genotype_likelihoods[gl][2] + constant) /
                                norm_sum,
                            ])
                            genotypes[gl] = determine_genotype(
                                regularized, gt_prob)
                            assert isinstance(genotypes[gl], Genotype)
                            reg_genotype_likelihoods.append(regularized)
                        # Store copies of the regularized likelihoods.
                        variant_table.set_genotype_likelihoods_of(
                            sample,
                            [
                                PhredGenotypeLikelihoods(list(gl))
                                for gl in reg_genotype_likelihoods
                            ],
                        )
                        variant_table.set_genotypes_of(sample, genotypes)
            else:
                # use uniform genotype likelihoods for all individuals
                for sample in samples:
                    variant_table.set_genotype_likelihoods_of(
                        sample,
                        [PhredGenotypeLikelihoods([1 / 3, 1 / 3, 1 / 3])] *
                        len(positions),
                    )
            # if desired, output the priors in separate vcf
            if prioroutput is not None:
                prior_vcf_writer.write_genotypes(chromosome, variant_table,
                                                 indels)
            # Iterate over all families to process, i.e. a separate DP table is created
            # for each family.
            for representative_sample, family in sorted(families.items()):
                if len(family) == 1:
                    logger.info("---- Processing individual %s",
                                representative_sample)
                else:
                    logger.info("---- Processing family with individuals: %s",
                                ",".join(family))
                # Split the coverage budget across family members, but never
                # go below 1.
                max_coverage_per_sample = max(1, max_coverage // len(family))
                logger.info("Using maximum coverage per sample of %dX",
                            max_coverage_per_sample)
                trios = family_trios[representative_sample]
                # A family of more than one sample must have a trio.
                assert (len(family) == 1) or (len(trios) > 0)
                # Get the reads belonging to each sample
                readsets = dict()
                for sample in family:
                    with timers("read_bam"):
                        readset, vcf_source_ids = phased_input_reader.read(
                            chromosome,
                            variant_table.variants,
                            sample,
                        )
                    with timers("select"):
                        # Drop reads covering fewer than two variants.
                        readset = readset.subset([
                            i for i, read in enumerate(readset)
                            if len(read) >= 2
                        ])
                        logger.info(
                            "Kept %d reads that cover at least two variants each",
                            len(readset),
                        )
                        selected_reads = select_reads(
                            readset,
                            max_coverage_per_sample,
                            preferred_source_ids=vcf_source_ids,
                        )
                    readsets[sample] = selected_reads
                # Merge reads into one ReadSet (note that each Read object
                # knows the sample it originated from).
                all_reads = ReadSet()
                for sample, readset in readsets.items():
                    for read in readset:
                        assert read.is_sorted(), "Add a read.sort() here"
                        all_reads.add(read)
                all_reads.sort()
                # Determine which variants can (in principle) be phased
                accessible_positions = sorted(all_reads.get_positions())
                logger.info(
                    "Variants covered by at least one phase-informative "
                    "read in at least one individual after read selection: %d",
                    len(accessible_positions),
                )
                # Create Pedigree
                pedigree = Pedigree(numeric_sample_ids)
                for sample in family:
                    # genotypes are assumed to be unknown, so ignore information that
                    # might already be present in the input vcf
                    all_genotype_likelihoods = variant_table.genotype_likelihoods_of(
                        sample)
                    # Restrict the priors to the accessible positions.
                    genotype_l = [
                        all_genotype_likelihoods[var_to_pos[a_p]]
                        for a_p in accessible_positions
                    ]
                    pedigree.add_individual(
                        sample,
                        [
                            Genotype([])
                            for i in range(len(accessible_positions))
                        ],
                        genotype_l,
                    )
                for trio in trios:
                    pedigree.add_relationship(
                        father_id=trio.father,
                        mother_id=trio.mother,
                        child_id=trio.child,
                    )
                recombination_costs = recombination_cost_computer.compute(
                    accessible_positions)
                # Finally, run genotyping algorithm
                with timers("genotyping"):
                    problem_name = "genotyping"
                    logger.info(
                        "Genotype %d sample%s by solving the %s problem ...",
                        len(family),
                        "s" if len(family) > 1 else "",
                        problem_name,
                    )
                    forward_backward_table = GenotypeDPTable(
                        numeric_sample_ids,
                        all_reads,
                        recombination_costs,
                        pedigree,
                        accessible_positions,
                    )
                    # store results
                    for s in family:
                        likelihood_list = variant_table.genotype_likelihoods_of(
                            s)
                        genotypes_list = variant_table.genotypes_of(s)
                        # Only accessible positions are overwritten; all
                        # other entries keep their previous values.
                        for pos in range(len(accessible_positions)):
                            likelihoods = forward_backward_table.get_genotype_likelihoods(
                                s, pos)
                            # compute genotypes from likelihoods and store information
                            geno = determine_genotype(likelihoods, gt_prob)
                            assert isinstance(geno, Genotype)
                            genotypes_list[var_to_pos[
                                accessible_positions[pos]]] = geno
                            likelihood_list[var_to_pos[
                                accessible_positions[pos]]] = likelihoods
                        variant_table.set_genotypes_of(s, genotypes_list)
                        variant_table.set_genotype_likelihoods_of(
                            s, likelihood_list)
            with timers("write_vcf"):
                logger.info("======== Writing VCF")
                vcf_writer.write_genotypes(chromosome, variant_table, indels)
                logger.info("Done writing VCF")
            logger.debug("Chromosome %r finished", chromosome)
    # Summary of the time spent per stage.
    logger.info("\n== SUMMARY ==")
    total_time = timers.total()
    log_memory_usage()
    logger.info(
        "Time spent reading BAM: %6.1f s",
        timers.elapsed("read_bam"),
    )
    logger.info(
        "Time spent parsing VCF: %6.1f s",
        timers.elapsed("parse_vcf"),
    )
    if show_phase_vcfs:
        logger.info(
            "Time spent parsing input phasings from VCFs: %6.1f s",
            timers.elapsed("parse_phasing_vcfs"),
        )
    logger.info("Time spent selecting reads: %6.1f s",
                timers.elapsed("select"))
    logger.info(
        "Time spent genotyping: %6.1f s",
        timers.elapsed("genotyping"),
    )
    logger.info(
        "Time spent writing VCF: %6.1f s",
        timers.elapsed("write_vcf"),
    )
    # Whatever is not attributed to a named stage.
    logger.info(
        "Time spent on rest: %6.1f s",
        total_time - timers.sum(),
    )
    logger.info("Total elapsed time: %6.1f s", total_time)
def test_genotype_likelihoods():
    """Check iteration over phred likelihoods and log10-to-phred conversion."""
    # Default construction yields three zeros.
    default_phred = PhredGenotypeLikelihoods()
    assert list(default_phred) == [0, 0, 0]

    # Explicit values are returned unchanged and in order.
    explicit_phred = PhredGenotypeLikelihoods(7, 1, 12)
    assert list(explicit_phred) == [7, 1, 12]

    # GenotypeLikelihoods takes log10 values; check the phred conversion
    # without and with a regularizer.
    log10_values = [math.log10(p) for p in (1e-10, 0.5, 0.002)]
    gl = GenotypeLikelihoods(*log10_values)
    unregularized = list(gl.as_phred())
    assert unregularized == [97, 0, 24]
    regularized = list(gl.as_phred(regularizer=0.01))
    assert regularized == [20, 0, 19]