def bipartition(reads):
    """
    Phase a read set as one all-heterozygous individual and bi-partition its reads.

    reads -- read set to phase; must support get_positions() and len()

    Diagnostics (MEC score, MEC score per read, mean of the partition values)
    are written via eprint.

    Returns a tuple (phasing, partition): the super reads returned by
    PedigreeDPTable.get_super_reads() and the read partitioning returned by
    get_optimal_partitioning().
    """
    n_positions = len(reads.get_positions())

    # Genotypes over all variants: every position heterozygous (canonic index 1)
    genotypes = canonic_index_list_to_biallelic_gt_list([1] * n_positions)
    # No genotype likelihoods are supplied
    genotype_likelihoods = [None] * n_positions

    # Pedigree consisting of a single individual
    pedigree = Pedigree(NumericSampleIds())
    pedigree.add_individual('individual0', genotypes, genotype_likelihoods)

    # Recombination cost vector, irrelevant when phasing one individual
    recombcost = [1] * n_positions

    # Run the core phasing algorithm, creating a DP table
    dp_table = PedigreeDPTable(reads, recombcost, pedigree, distrust_genotypes=False)
    phasing, _transmission_vector = dp_table.get_super_reads()

    mec_score = dp_table.get_optimal_cost()
    eprint("MEC Score:", mec_score)

    # Use the length of the read set that was actually phased instead of a
    # name that is not defined in this function, and avoid dividing by zero
    # for an empty read set.
    n_reads = len(reads)
    mec_per_read = float(mec_score) / float(n_reads) if n_reads else float("nan")
    eprint("MEC Score / readset length:", mec_per_read)

    # In case the bi-partition of reads is of interest:
    partition = dp_table.get_optimal_partitioning()
    n_assigned = len(partition)
    # An empty partitioning has no meaningful fraction; report nan instead of crashing
    fraction = sum(partition) / float(n_assigned) if n_assigned else float("nan")
    eprint("partition fraction:", fraction)

    return phasing, partition
def test_phase_empty_trio():
    """
    Build a DP table for a three-individual pedigree with no reads and no variants.

    The test passes if construction succeeds and get_super_reads() returns a
    pair whose first element unpacks into exactly three items.
    """
    member_names = ('individual0', 'individual1', 'individual2')

    trio = Pedigree(NumericSampleIds())
    for name in member_names:
        # Each individual gets its own empty genotype list
        trio.add_individual(name, [])
    trio.add_relationship(*member_names)

    table = PedigreeDPTable(ReadSet(), [], trio)

    superreads_list, transmission_vector = table.get_super_reads()
    # One entry is expected per individual in the pedigree
    first_superreads, second_superreads, third_superreads = superreads_list
def phase_MAV(reads, n_alleles, all_het, genos, genotypes, weights=None):
    """
    Phase a single individual once per entry of all_het.

    reads -- string representation of the reads, parsed by string_to_readset
    n_alleles -- forwarded to string_to_readset
    all_het -- iterable of booleans; for each value one DP table is built.
        A true value means no genotype likelihoods are given and genotypes
        are trusted; a false value attaches PhredGenotypeLikelihoods(genos)
        to every position and distrusts genotypes.
    genos -- argument for PhredGenotypeLikelihoods
    genotypes -- genotypes of the individual, one per position
    weights -- accepted but not used by this function

    Returns (superreads_list, transmission_vector, cost) of the LAST
    configuration in all_het.
    """
    readset = string_to_readset(reads, n_alleles)
    n_positions = len(readset.get_positions())

    for all_heterozygous in all_het:
        # recombination costs 1, should not occur
        unit_recombination_costs = [1] * n_positions
        single_pedigree = Pedigree(NumericSampleIds())

        if all_heterozygous:
            per_position_likelihood = None
        else:
            per_position_likelihood = PhredGenotypeLikelihoods(genos)
        # The same likelihood object is shared by all positions
        likelihoods = [per_position_likelihood] * n_positions

        single_pedigree.add_individual('individual0', genotypes, likelihoods)

        table = PedigreeDPTable(
            readset,
            unit_recombination_costs,
            single_pedigree,
            distrust_genotypes=not all_heterozygous,
        )
        superreads_list, transmission_vector = table.get_super_reads()
        cost = table.get_optimal_cost()

    return superreads_list, transmission_vector, cost
def test_phase_empty_readset(algorithm):
    """
    Check that phasing an empty read set over two positions does not fail.

    algorithm -- "hapchat" selects HapChatCore; any other value selects
        PedigreeDPTable with a single all-heterozygous individual
    """
    empty_reads = ReadSet()

    # The pedigree is always built, even though HapChatCore does not receive it
    het_genotypes = canonic_index_list_to_biallelic_gt_list([1, 1])
    single_pedigree = Pedigree(NumericSampleIds())
    single_pedigree.add_individual("individual0", het_genotypes, [None, None])

    table = (
        HapChatCore(empty_reads)
        if algorithm == "hapchat"
        else PedigreeDPTable(empty_reads, [1, 1], single_pedigree)
    )
    # Only the absence of an exception is checked
    table.get_super_reads()
def verify(rs, all_heterozygous=False):
    """
    Phase read set rs as a single individual and verify the DP table.

    rs -- read set to phase
    all_heterozygous -- if true, no genotype likelihoods are given and
        genotypes are trusted; otherwise every position gets
        PhredGenotypeLikelihoods(0, 0, 0) and genotypes are distrusted

    The resulting table is handed to verify_mec_score_and_partitioning.
    """
    n_positions = len(rs.get_positions())

    # recombination costs 1, should not occur
    unit_recombination_costs = [1] * n_positions
    single_pedigree = Pedigree(NumericSampleIds())

    if all_heterozygous:
        per_position_likelihood = None
    else:
        per_position_likelihood = PhredGenotypeLikelihoods(0, 0, 0)

    # all genotypes heterozygous
    single_pedigree.add_individual(
        'individual0',
        [1] * n_positions,
        [per_position_likelihood] * n_positions,
    )

    table = PedigreeDPTable(
        rs,
        unit_recombination_costs,
        single_pedigree,
        distrust_genotypes=not all_heterozygous,
    )
    verify_mec_score_and_partitioning(table, rs)
def test_phase_empty_readset(algorithm):
    """
    Check that phasing an empty read set over two positions does not fail.

    algorithm -- 'hapchat' selects HapChatCore; any other value selects
        PedigreeDPTable with a single individual whose genotypes are [1, 1]
    """
    empty_reads = ReadSet()

    # The pedigree is always built, even though HapChatCore does not receive it
    single_pedigree = Pedigree(NumericSampleIds())
    single_pedigree.add_individual('individual0', [1, 1], [None, None])

    if algorithm != 'hapchat':
        table = PedigreeDPTable(empty_reads, [1, 1], single_pedigree)
    else:
        table = HapChatCore(empty_reads)

    # Only the absence of an exception is checked; the result is not inspected
    superreads = table.get_super_reads()
def verify(rs, all_heterozygous=False):
    """
    Phase read set rs as a single individual and verify the DP table.

    rs -- read set to phase
    all_heterozygous -- if true, no genotype likelihoods are given and
        genotypes are trusted; otherwise every position gets
        PhredGenotypeLikelihoods([0, 0, 0]) and genotypes are distrusted

    The resulting table is handed to verify_mec_score_and_partitioning.
    """
    n_positions = len(rs.get_positions())

    # recombination costs 1, should not occur
    unit_recombination_costs = [1] * n_positions
    single_pedigree = Pedigree(NumericSampleIds())

    if all_heterozygous:
        per_position_likelihood = None
    else:
        per_position_likelihood = PhredGenotypeLikelihoods([0, 0, 0])
    likelihoods = [per_position_likelihood] * n_positions

    # all genotypes heterozygous; one genotype object is created per position
    het_genotypes = [canonic_index_to_biallelic_gt(1) for _ in range(n_positions)]
    single_pedigree.add_individual("individual0", het_genotypes, likelihoods)

    table = PedigreeDPTable(
        rs,
        unit_recombination_costs,
        single_pedigree,
        distrust_genotypes=not all_heterozygous,
    )
    verify_mec_score_and_partitioning(table, rs)
def phase_pedigree(reads, recombcost, pedigree, distrust_genotypes=False, positions=None):
    """
    Phase a pedigree from a string representation of reads and print the result.

    reads -- string representation of the reads, parsed by string_to_readset_pedigree
    recombcost -- recombination costs passed to PedigreeDPTable
    pedigree -- pedigree passed to PedigreeDPTable
    distrust_genotypes -- passed to PedigreeDPTable
    positions -- passed to PedigreeDPTable

    Prints every super read, the optimal cost, the transmission vector and
    the optimal partitioning.

    Returns (superreads_list, transmission_vector, cost).
    """
    table = PedigreeDPTable(
        string_to_readset_pedigree(reads),
        recombcost,
        pedigree,
        distrust_genotypes,
        positions,
    )
    result = table.get_super_reads()
    superreads_list, transmission_vector = result
    cost = table.get_optimal_cost()

    # Print the super reads of all individuals, one per line
    for superread in (sr for group in superreads_list for sr in group):
        print(superread)

    print('Cost:', table.get_optimal_cost())
    print('Transmission vector:', transmission_vector)
    print('Partition:', table.get_optimal_partitioning())

    return superreads_list, transmission_vector, cost
def run_whatshap(
    phase_input_files: List[str],
    variant_file: str,
    reference: Union[None, bool, str] = False,
    output: TextIO = sys.stdout,
    samples: Optional[List[str]] = None,
    chromosomes: Optional[List[str]] = None,
    ignore_read_groups: bool = False,
    indels: bool = True,
    mapping_quality: int = 20,
    read_merging: bool = False,
    read_merging_error_rate: float = 0.15,
    read_merging_max_error_rate: float = 0.25,
    read_merging_positive_threshold: int = 1000000,
    read_merging_negative_threshold: int = 1000,
    max_coverage: int = 15,
    distrust_genotypes: bool = False,
    include_homozygous: bool = False,
    ped: Optional[str] = None,
    recombrate: float = 1.26,
    genmap: Optional[str] = None,
    genetic_haplotyping: bool = True,
    recombination_list_filename: Optional[str] = None,
    tag: str = "PS",
    read_list_filename: Optional[str] = None,
    gl_regularizer: Optional[float] = None,
    gtchange_list_filename: Optional[str] = None,
    default_gq: int = 30,
    write_command_line_header: bool = True,
    use_ped_samples: bool = False,
    algorithm: str = "whatshap",
):
    """
    Run WhatsHap.

    phase_input_files -- list of paths to BAM/CRAM/VCF files
    variant_file -- path to input VCF
    reference -- path to reference FASTA. If False: skip realignment.
        If None: complain if reference needed.
    output -- path to output VCF or a file-like object
    samples -- names of samples to phase. an empty list (or None) means: phase all samples
    chromosomes -- names of chromosomes to phase. an empty list (or None) means:
        phase all chromosomes. Chromosomes that are not requested are written
        to the output without phasing results.
    ignore_read_groups -- passed to PhasedInputReader. If the VCF contains more
        than one sample, samples must then be given explicitly.
    indels -- passed to PhasedVcfWriter, PhasedInputReader and VcfReader
    mapping_quality -- discard reads below this mapping quality
    read_merging -- whether or not to merge reads
    read_merging_error_rate -- probability that a nucleotide is wrong
    read_merging_max_error_rate -- max error rate on edge of merge graph considered
    read_merging_positive_threshold -- threshold on the ratio of the two probabilities
    read_merging_negative_threshold -- threshold on the opposite ratio of positive threshold
    max_coverage -- coverage budget for read selection; it is divided evenly
        among the members of a family (at least 1 per sample)
    distrust_genotypes -- read genotype likelihoods from the VCF and allow
        genotypes to change during phasing
    include_homozygous -- passed to find_phaseable_variants
    ped -- path to a PED file; enables pedigree phasing (not supported by hapchat)
    recombrate -- passed to make_recombination_cost_computer
    genmap -- passed to make_recombination_cost_computer
    genetic_haplotyping -- in ped mode, merge disconnected blocks based on genotype status
    recombination_list_filename -- filename to write putative recombination events to
    tag -- How to store phasing info in the VCF, can be 'PS' or 'HP'
    read_list_filename -- name of file to write list of used reads to
    algorithm -- algorithm to use, can be 'whatshap' or 'hapchat'
    gl_regularizer -- float to be passed as regularization constant to GenotypeLikelihoods.as_phred
    gtchange_list_filename -- filename to write list of changed genotypes to
    default_gq -- genotype likelihood to be used when GL or PL not available
    write_command_line_header -- whether to add a ##commandline header to the output VCF
    use_ped_samples -- if set together with ped, phase only the samples listed in the PED file

    Raises CommandLineError if hapchat is combined with a PED file, if the
    output VCF writer cannot be created, if alignments are given but
    reference is None, or if ignore_read_groups is used on a multi-sample
    VCF without explicit samples.
    """
    if algorithm == "hapchat" and ped is not None:
        raise CommandLineError("The hapchat algorithm cannot do pedigree phasing")

    timers = StageTimer()
    logger.info(
        f"This is WhatsHap {__version__} running under Python {platform.python_version()}"
    )
    numeric_sample_ids = NumericSampleIds()
    command_line: Optional[str]
    if write_command_line_header:
        command_line = "(whatshap {}) {}".format(__version__, " ".join(sys.argv[1:]))
    else:
        command_line = None

    read_merger: ReadMergerBase
    if read_merging:
        read_merger = ReadMerger(
            read_merging_error_rate,
            read_merging_max_error_rate,
            read_merging_positive_threshold,
            read_merging_negative_threshold,
        )
    else:
        read_merger = DoNothingReadMerger()

    # The ExitStack closes writer, readers and the read list on exit
    with ExitStack() as stack:
        try:
            vcf_writer = stack.enter_context(
                PhasedVcfWriter(
                    command_line=command_line,
                    in_path=variant_file,
                    out_file=output,
                    tag=tag,
                    indels=indels,
                )
            )
        except (OSError, VcfError) as e:
            # Chain explicitly so the original traceback stays attached
            raise CommandLineError(e) from e

        phased_input_reader = stack.enter_context(
            PhasedInputReader(
                phase_input_files,
                # reference=False means "no realignment"; the reader expects None for that
                None if reference is False else reference,
                numeric_sample_ids,
                ignore_read_groups,
                mapq_threshold=mapping_quality,
                indels=indels,
            )
        )
        show_phase_vcfs = phased_input_reader.has_vcfs

        if phased_input_reader.has_alignments and reference is None:
            raise CommandLineError(
                "A reference FASTA needs to be provided with -r/--reference; "
                "or use --no-reference at the expense of phasing quality."
            )

        # Only read genotype likelihoods from VCFs when distrusting genotypes
        vcf_reader = stack.enter_context(
            VcfReader(variant_file, indels=indels, genotype_likelihoods=distrust_genotypes)
        )

        if ignore_read_groups and not samples and len(vcf_reader.samples) > 1:
            raise CommandLineError(
                "When using --ignore-read-groups on a VCF with "
                "multiple samples, --sample must also be used."
            )
        if not samples:
            samples = vcf_reader.samples

        # if --use-ped-samples is set, use only samples from PED file
        if ped and use_ped_samples:
            samples = PedReader(ped).samples()

        raise_if_any_sample_not_in_vcf(vcf_reader, samples)

        recombination_cost_computer = make_recombination_cost_computer(ped, genmap, recombrate)

        families, family_trios = setup_families(samples, ped, max_coverage)
        del samples
        for trios in family_trios.values():
            for trio in trios:
                # Ensure that all mentioned individuals have a numeric id
                _ = numeric_sample_ids[trio.child]

        read_list = None
        if read_list_filename:
            read_list = stack.enter_context(ReadList(read_list_filename))
            if algorithm == "hapchat":
                logger.warning(
                    "On which haplotype a read occurs in the inferred solution is not yet "
                    "implemented in hapchat, and so the corresponding column in the "
                    "read list file contains no information about this"
                )

        with timers("parse_phasing_vcfs"):
            # TODO should this be done in PhasedInputReader.__init__?
            phased_input_reader.read_vcfs()

        superreads: Dict[str, ReadSet]
        components: Dict
        for variant_table in timers.iterate("parse_vcf", vcf_reader):
            chromosome = variant_table.chromosome
            if (not chromosomes) or (chromosome in chromosomes):
                logger.info("======== Working on chromosome %r", chromosome)
            else:
                logger.info(
                    "Leaving chromosome %r unchanged (present in VCF but not requested by option --chromosome)",
                    chromosome,
                )
                with timers("write_vcf"):
                    # Writing with empty results copies the chromosome through unphased
                    superreads, components = dict(), dict()
                    vcf_writer.write(chromosome, superreads, components)
                continue

            # These two variables hold the phasing results for all samples
            superreads, components = dict(), dict()

            # Iterate over all families to process, i.e. a separate DP table is created
            # for each family.
            # TODO: Can the body of this loop be factored out into a phase_family function?
            for representative_sample, family in sorted(families.items()):
                if len(family) == 1:
                    logger.info("---- Processing individual %s", representative_sample)
                else:
                    logger.info("---- Processing family with individuals: %s", ",".join(family))
                max_coverage_per_sample = max(1, max_coverage // len(family))
                logger.info("Using maximum coverage per sample of %dX", max_coverage_per_sample)
                trios = family_trios[representative_sample]

                assert len(family) == 1 or len(trios) > 0

                homozygous_positions, phasable_variant_table = find_phaseable_variants(
                    family, include_homozygous, trios, variant_table
                )

                # Get the reads belonging to each sample
                readsets = dict()  # TODO this could become a list
                for sample in family:
                    with timers("read_bam"):
                        readset, vcf_source_ids = phased_input_reader.read(
                            chromosome, phasable_variant_table.variants, sample
                        )

                    # TODO: Read selection done w.r.t. all variants, where using heterozygous
                    # variants only would probably give better results.
                    with timers("select"):
                        readset = readset.subset(
                            [i for i, read in enumerate(readset) if len(read) >= 2]
                        )
                        logger.info(
                            "Kept %d reads that cover at least two variants each", len(readset)
                        )
                        merged_reads = read_merger.merge(readset)
                        selected_reads = select_reads(
                            merged_reads,
                            max_coverage_per_sample,
                            preferred_source_ids=vcf_source_ids,
                        )

                    readsets[sample] = selected_reads
                    if len(family) == 1 and not distrust_genotypes:
                        # When having a pedigree (len(family) > 1), blocks are also merged after
                        # phasing based on the pedigree information and these statistics are not
                        # so useful. When distrust_genotypes, genotypes can change during phasing
                        # and so can the block structure. So don't print these stats in those cases
                        log_best_case_phasing_info(readset, selected_reads)

                all_reads = merge_readsets(readsets)

                # Determine which variants can (in principle) be phased
                accessible_positions = sorted(all_reads.get_positions())
                logger.info(
                    "Variants covered by at least one phase-informative "
                    "read in at least one individual after read selection: %d",
                    len(accessible_positions),
                )
                if len(family) > 1 and genetic_haplotyping:
                    # In case of genetic haplotyping, also retain all positions homozygous
                    # in at least one individual (because they might be phased based on genotypes)
                    accessible_positions = sorted(
                        set(accessible_positions).union(homozygous_positions)
                    )
                    logger.info(
                        "Variants either covered by phase-informative read or homozygous "
                        "in at least one individual: %d",
                        len(accessible_positions),
                    )

                # Keep only accessible positions
                phasable_variant_table.subset_rows_by_position(accessible_positions)
                assert len(phasable_variant_table.variants) == len(accessible_positions)

                pedigree = create_pedigree(
                    default_gq,
                    distrust_genotypes,
                    family,
                    gl_regularizer,
                    numeric_sample_ids,
                    phasable_variant_table,
                    trios,
                )
                recombination_costs = recombination_cost_computer.compute(accessible_positions)

                # Finally, run phasing algorithm
                with timers("phase"):
                    problem_name = "MEC" if len(family) == 1 else "PedMEC"
                    logger.info(
                        "Phasing %d sample%s by solving the %s problem ...",
                        len(family),
                        plural_s(len(family)),
                        problem_name,
                    )

                    dp_table: Union[HapChatCore, PedigreeDPTable]
                    if algorithm == "hapchat":
                        dp_table = HapChatCore(all_reads)
                    else:
                        dp_table = PedigreeDPTable(
                            all_reads,
                            recombination_costs,
                            pedigree,
                            distrust_genotypes,
                            accessible_positions,
                        )

                    superreads_list, transmission_vector = dp_table.get_super_reads()
                    logger.info("%s cost: %d", problem_name, dp_table.get_optimal_cost())

                with timers("components"):
                    overall_components = compute_overall_components(
                        accessible_positions,
                        all_reads,
                        distrust_genotypes,
                        family,
                        genetic_haplotyping,
                        homozygous_positions,
                        numeric_sample_ids,
                        superreads_list,
                    )
                    log_component_stats(overall_components, len(accessible_positions))

                if recombination_list_filename:
                    n_recombinations = write_recombination_list(
                        recombination_list_filename,
                        chromosome,
                        accessible_positions,
                        overall_components,
                        recombination_costs,
                        transmission_vector,
                        trios,
                    )
                    logger.info("Total no. of detected recombination events: %d", n_recombinations)

                # Superreads in superreads_list are in the same order as individuals were added to the pedigree
                for sample, sample_superreads in zip(family, superreads_list):
                    superreads[sample] = sample_superreads
                    assert len(sample_superreads) == 2
                    assert (
                        sample_superreads[0].sample_id
                        == sample_superreads[1].sample_id
                        == numeric_sample_ids[sample]
                    )
                    # identical for all samples
                    components[sample] = overall_components

                if read_list:
                    read_list.write(
                        all_reads,
                        dp_table.get_optimal_partitioning(),
                        components,
                        numeric_sample_ids,
                    )

            with timers("write_vcf"):
                logger.info("======== Writing VCF")
                changed_genotypes = vcf_writer.write(chromosome, superreads, components)
                logger.info("Done writing VCF")
                if changed_genotypes:
                    # Genotypes may only change when they were distrusted
                    assert distrust_genotypes
                    logger.info("Changed %d genotypes while writing VCF", len(changed_genotypes))

            if gtchange_list_filename:
                logger.info("Writing list of changed genotypes to %r", gtchange_list_filename)
                write_changed_genotypes(gtchange_list_filename, changed_genotypes)

            logger.debug("Chromosome %r finished", chromosome)

    log_time_and_memory_usage(timers, show_phase_vcfs=show_phase_vcfs)
def check_phasing_single_individual(reads, algorithm="whatshap", weights=None):
    """
    Phase a single individual and compare the result against brute force.

    reads -- string representation of the reads, parsed by string_to_readset
    algorithm -- "hapchat" runs HapChatCore once and returns; any other value
        runs PedigreeDPTable, first on a single individual and then on a
        trio, each time with genotypes distrusted and then trusted
    weights -- forwarded to string_to_readset and compare_phasing_brute_force
    """
    # 0) set up read set
    readset = string_to_readset(reads, weights)
    n_positions = len(readset.get_positions())

    # for hapchat
    if algorithm == "hapchat":
        core = HapChatCore(readset)
        hapchat_superreads = core.get_super_reads()
        hapchat_cost = core.get_optimal_cost()
        hapchat_partition = core.get_optimal_partitioning()
        compare_phasing_brute_force(
            hapchat_superreads[0][0],
            hapchat_cost,
            hapchat_partition,
            readset,
            True,
            weights,
            algorithm,
        )
        return

    def build_table(individual_names, all_heterozygous):
        """Create a DP table for the given individuals, all with heterozygous genotypes."""
        # recombination costs 1, should not occur
        unit_recombination_costs = [1] * n_positions
        pedigree = Pedigree(NumericSampleIds())
        if all_heterozygous:
            per_position_likelihood = None
        else:
            per_position_likelihood = PhredGenotypeLikelihoods([0, 0, 0])
        # One likelihood list is shared by all individuals
        likelihoods = [per_position_likelihood] * n_positions
        for name in individual_names:
            # all genotypes heterozygous
            het_genotypes = [canonic_index_to_biallelic_gt(1) for _ in range(n_positions)]
            pedigree.add_individual(name, het_genotypes, likelihoods)
        if len(individual_names) == 3:
            pedigree.add_relationship(*individual_names)
        return PedigreeDPTable(
            readset,
            unit_recombination_costs,
            pedigree,
            distrust_genotypes=not all_heterozygous,
        )

    # 1) Phase using PedMEC code for single individual
    for all_heterozygous in (False, True):
        table = build_table(("individual0",), all_heterozygous)
        superreads, transmission_vector = table.get_super_reads()
        cost = table.get_optimal_cost()
        # TODO: transmission vectors not returned properly, see issue 73
        assert len(set(transmission_vector)) == 1
        partition = table.get_optimal_partitioning()
        compare_phasing_brute_force(
            superreads[0], cost, partition, readset, all_heterozygous, weights
        )

    # 2) Phase using PedMEC code for trios with two "empty" individuals (i.e. having no reads)
    for all_heterozygous in (False, True):
        table = build_table(("individual0", "individual1", "individual2"), all_heterozygous)
        # Here the cost is queried before the super reads, as in the single-individual
        # case the other way round
        cost = table.get_optimal_cost()
        superreads, transmission_vector = table.get_super_reads()
        assert len(set(transmission_vector)) == 1
        partition = table.get_optimal_partitioning()
        compare_phasing_brute_force(
            superreads[0], cost, partition, readset, all_heterozygous, weights
        )
def run_whatshap(
    phase_input_files,
    variant_file,
    reference=None,
    output=sys.stdout,
    samples=None,
    chromosomes=None,
    ignore_read_groups=False,
    indels=True,
    mapping_quality=20,
    read_merging=False,
    read_merging_error_rate=0.15,
    read_merging_max_error_rate=0.25,
    read_merging_positive_threshold=1000000,
    read_merging_negative_threshold=1000,
    max_coverage=15,
    full_genotyping=False,
    distrust_genotypes=False,
    include_homozygous=False,
    ped=None,
    recombrate=1.26,
    genmap=None,
    genetic_haplotyping=True,
    recombination_list_filename=None,
    tag="PS",
    read_list_filename=None,
    gl_regularizer=None,
    gtchange_list_filename=None,
    default_gq=30,
    write_command_line_header=True,
    use_ped_samples=False,
    algorithm="whatshap",
):
    """
    Run WhatsHap.

    phase_input_files -- list of paths to BAM/CRAM/VCF files
    variant_file -- path to input VCF
    reference -- path to reference FASTA
    output -- path to output VCF or a file-like object
    samples -- names of samples to phase. an empty list (or None) means: phase all samples
    chromosomes -- names of chromosomes to phase. an empty list (or None) means:
        phase all chromosomes. Chromosomes that are not requested are written
        to the output without phasing results.
    ignore_read_groups -- passed to PhasedInputReader. If the VCF contains more
        than one sample, samples must then be given explicitly.
    indels -- passed to PhasedInputReader and VcfReader
    mapping_quality -- discard reads below this mapping quality
    read_merging -- whether or not to merge reads
    read_merging_error_rate -- probability that a nucleotide is wrong
    read_merging_max_error_rate -- max error rate on edge of merge graph considered
    read_merging_positive_threshold -- threshold on the ratio of the two probabilities
    read_merging_negative_threshold -- threshold on the opposite ratio of positive threshold
    max_coverage -- coverage budget for read selection; it is divided evenly
        among the members of a family (at least 1 per sample)
    full_genotyping -- re-genotype every sample from the reads before phasing;
        implies distrust_genotypes and include_homozygous
    distrust_genotypes -- read genotype likelihoods from the VCF and allow
        genotypes to change during phasing
    include_homozygous -- passed to find_phaseable_variants
    ped -- path to a PED file; enables pedigree phasing (not supported by hapchat)
    recombrate -- uniform recombination rate in cM/Mb, used when no genetic map is given
    genmap -- path to a genetic map; only used together with ped
    genetic_haplotyping -- in ped mode, merge disconnected blocks based on genotype status
    recombination_list_filename -- filename to write putative recombination events to
    tag -- How to store phasing info in the VCF, can be 'PS' or 'HP'
    read_list_filename -- name of file to write list of used reads to
    algorithm -- algorithm to use, can be 'whatshap' or 'hapchat'
    gl_regularizer -- float to be passed as regularization constant to GenotypeLikelihoods.as_phred
    gtchange_list_filename -- filename to write list of changed genotypes to
    default_gq -- genotype likelihood to be used when GL or PL not available
    write_command_line_header -- whether to add a ##commandline header to the output VCF
    use_ped_samples -- if set together with ped, phase only the samples listed in the PED file

    Raises CommandLineError if hapchat is combined with a PED file, if the
    output VCF writer cannot be created, if the genetic map cannot be parsed,
    or if ignore_read_groups is used on a multi-sample VCF without explicit
    samples.
    """
    if algorithm == "hapchat" and ped is not None:
        raise CommandLineError("The hapchat algorithm cannot do pedigree phasing")

    timers = StageTimer()
    logger.info(
        "This is WhatsHap %s running under Python %s",
        __version__,
        platform.python_version(),
    )
    if full_genotyping:
        # Full genotyping recomputes genotypes, so the input ones cannot be trusted
        distrust_genotypes = True
        include_homozygous = True
    numeric_sample_ids = NumericSampleIds()
    if write_command_line_header:
        command_line = "(whatshap {}) {}".format(__version__, " ".join(sys.argv[1:]))
    else:
        command_line = None

    if read_merging:
        read_merger = ReadMerger(
            read_merging_error_rate,
            read_merging_max_error_rate,
            read_merging_positive_threshold,
            read_merging_negative_threshold,
        )
    else:
        read_merger = DoNothingReadMerger()

    # The ExitStack closes writer, readers and the read list on exit
    with ExitStack() as stack:
        try:
            vcf_writer = stack.enter_context(
                PhasedVcfWriter(
                    command_line=command_line,
                    in_path=variant_file,
                    out_file=output,
                    tag=tag,
                )
            )
        except (OSError, VcfError) as e:
            # Chain explicitly so the original traceback stays attached
            raise CommandLineError(e) from e

        phased_input_reader = stack.enter_context(
            PhasedInputReader(
                phase_input_files,
                reference,
                numeric_sample_ids,
                ignore_read_groups,
                mapq_threshold=mapping_quality,
                indels=indels,
            )
        )
        show_phase_vcfs = phased_input_reader.has_vcfs

        # Only read genotype likelihoods from VCFs when distrusting genotypes
        vcf_reader = stack.enter_context(
            VcfReader(variant_file, indels=indels, genotype_likelihoods=distrust_genotypes)
        )

        if ignore_read_groups and not samples and len(vcf_reader.samples) > 1:
            raise CommandLineError(
                "When using --ignore-read-groups on a VCF with "
                "multiple samples, --sample must also be used."
            )
        if not samples:
            samples = vcf_reader.samples

        # if --use-ped-samples is set, use only samples from PED file
        if ped and use_ped_samples:
            samples = PedReader(ped).samples()

        raise_if_any_sample_not_in_vcf(vcf_reader, samples)

        if ped and genmap:
            logger.info(
                "Using region-specific recombination rates from genetic map %s.",
                genmap,
            )
            try:
                recombination_cost_computer = GeneticMapRecombinationCostComputer(genmap)
            except ParseError as e:
                raise CommandLineError(e) from e
        else:
            if ped:
                logger.info("Using uniform recombination rate of %g cM/Mb.", recombrate)
            recombination_cost_computer = UniformRecombinationCostComputer(recombrate)

        samples = frozenset(samples)
        families, family_trios = setup_families(samples, ped, numeric_sample_ids, max_coverage)

        read_list = None
        if read_list_filename:
            read_list = stack.enter_context(ReadList(read_list_filename))
            if algorithm == "hapchat":
                logger.warning(
                    "On which haplotype a read occurs in the inferred solution is not yet "
                    "implemented in hapchat, and so the corresponding column in the "
                    "read list file contains no information about this"
                )

        with timers("parse_phasing_vcfs"):
            # TODO should this be done in PhasedInputReader.__init__?
            phased_input_reader.read_vcfs()

        for variant_table in timers.iterate("parse_vcf", vcf_reader):
            chromosome = variant_table.chromosome
            if (not chromosomes) or (chromosome in chromosomes):
                logger.info("======== Working on chromosome %r", chromosome)
            else:
                logger.info(
                    "Leaving chromosome %r unchanged (present in VCF but not requested by option --chromosome)",
                    chromosome,
                )
                with timers("write_vcf"):
                    # Writing with empty results copies the chromosome through unphased
                    superreads, components = dict(), dict()
                    vcf_writer.write(chromosome, superreads, components)
                continue

            if full_genotyping:
                positions = [v.position for v in variant_table.variants]
                for sample in samples:
                    logger.info("---- Initial genotyping of %s", sample)
                    with timers("read_bam"):
                        bam_sample = None if ignore_read_groups else sample
                        readset, vcf_source_ids = phased_input_reader.read(
                            chromosome,
                            variant_table.variants,
                            bam_sample,
                            read_vcf=False,
                        )
                        readset.sort()  # TODO can be removed
                        genotypes, genotype_likelihoods = compute_genotypes(readset, positions)
                        # Overwrite the VCF genotypes with the recomputed ones
                        variant_table.set_genotypes_of(sample, genotypes)
                        variant_table.set_genotype_likelihoods_of(
                            sample,
                            [GenotypeLikelihoods(gl) for gl in genotype_likelihoods],
                        )

            # These two variables hold the phasing results for all samples
            superreads, components = dict(), dict()

            # Iterate over all families to process, i.e. a separate DP table is created
            # for each family.
            # TODO: Can the body of this loop be factored out into a phase_family function?
            for representative_sample, family in sorted(families.items()):
                if len(family) == 1:
                    logger.info("---- Processing individual %s", representative_sample)
                else:
                    logger.info("---- Processing family with individuals: %s", ",".join(family))
                max_coverage_per_sample = max(1, max_coverage // len(family))
                logger.info("Using maximum coverage per sample of %dX", max_coverage_per_sample)
                trios = family_trios[representative_sample]

                assert len(family) == 1 or len(trios) > 0

                homozygous_positions, phasable_variant_table = find_phaseable_variants(
                    family, include_homozygous, trios, variant_table
                )

                # Get the reads belonging to each sample
                readsets = dict()  # TODO this could become a list
                for sample in family:
                    with timers("read_bam"):
                        readset, vcf_source_ids = phased_input_reader.read(
                            chromosome,
                            phasable_variant_table.variants,
                            sample,
                        )

                    # TODO: Read selection done w.r.t. all variants, where using heterozygous
                    # variants only would probably give better results.
                    with timers("select"):
                        readset = readset.subset(
                            [i for i, read in enumerate(readset) if len(read) >= 2]
                        )
                        logger.info(
                            "Kept %d reads that cover at least two variants each",
                            len(readset),
                        )
                        merged_reads = read_merger.merge(readset)
                        selected_reads = select_reads(
                            merged_reads,
                            max_coverage_per_sample,
                            preferred_source_ids=vcf_source_ids,
                        )

                    readsets[sample] = selected_reads
                    if len(family) == 1 and not distrust_genotypes:
                        # When having a pedigree (len(family) > 1), blocks are also merged after
                        # phasing based on the pedigree information and these statistics are not
                        # so useful. When distrust_genotypes, genotypes can change during phasing
                        # and so can the block structure. So don't print these stats in those cases
                        log_best_case_phasing_info(readset, selected_reads)

                all_reads = merge_readsets(readsets)

                # Determine which variants can (in principle) be phased
                accessible_positions = sorted(all_reads.get_positions())
                logger.info(
                    "Variants covered by at least one phase-informative "
                    "read in at least one individual after read selection: %d",
                    len(accessible_positions),
                )
                if len(family) > 1 and genetic_haplotyping:
                    # In case of genetic haplotyping, also retain all positions homozygous
                    # in at least one individual (because they might be phased based on genotypes)
                    accessible_positions = sorted(
                        set(accessible_positions).union(homozygous_positions)
                    )
                    logger.info(
                        "Variants either covered by phase-informative read or homozygous "
                        "in at least one individual: %d",
                        len(accessible_positions),
                    )

                # Keep only accessible positions
                phasable_variant_table.subset_rows_by_position(accessible_positions)
                assert len(phasable_variant_table.variants) == len(accessible_positions)

                pedigree = create_pedigree(
                    default_gq,
                    distrust_genotypes,
                    family,
                    gl_regularizer,
                    numeric_sample_ids,
                    phasable_variant_table,
                    trios,
                )
                recombination_costs = recombination_cost_computer.compute(accessible_positions)

                # Finally, run phasing algorithm
                with timers("phase"):
                    problem_name = "MEC" if len(family) == 1 else "PedMEC"
                    logger.info(
                        "Phasing %d sample%s by solving the %s problem ...",
                        len(family),
                        plural_s(len(family)),
                        problem_name,
                    )
                    if algorithm == "hapchat":
                        dp_table = HapChatCore(all_reads)
                    else:
                        dp_table = PedigreeDPTable(
                            all_reads,
                            recombination_costs,
                            pedigree,
                            distrust_genotypes,
                            accessible_positions,
                        )
                    superreads_list, transmission_vector = dp_table.get_super_reads()
                    optimal_cost = dp_table.get_optimal_cost()
                    logger.info("%s cost: %d", problem_name, optimal_cost)

                with timers("components"):
                    master_block = None
                    heterozygous_positions_by_sample = None
                    # Set view of the accessible positions for constant-time membership
                    # tests; the sorted list itself is still what gets passed on.
                    accessible_position_set = set(accessible_positions)
                    # If we distrusted genotypes, we need to re-determine which sites are
                    # homo-/heterozygous after phasing
                    if distrust_genotypes:
                        hom_in_any_sample = set()
                        heterozygous_positions_by_sample = {}
                        heterozygous_gts = frozenset({(0, 1), (1, 0)})
                        homozygous_gts = frozenset({(0, 0), (1, 1)})
                        for sample, sample_superreads in zip(family, superreads_list):
                            hets = set()
                            # Walk both haplotypes of the sample in lockstep
                            for v1, v2 in zip(*sample_superreads):
                                assert v1.position == v2.position
                                if v1.position not in accessible_position_set:
                                    continue
                                gt = (v1.allele, v2.allele)
                                if gt in heterozygous_gts:
                                    hets.add(v1.position)
                                elif gt in homozygous_gts:
                                    hom_in_any_sample.add(v1.position)
                            heterozygous_positions_by_sample[numeric_sample_ids[sample]] = hets
                        if len(family) > 1 and genetic_haplotyping:
                            master_block = sorted(hom_in_any_sample)
                    else:
                        if len(family) > 1 and genetic_haplotyping:
                            master_block = sorted(
                                set(homozygous_positions).intersection(accessible_position_set)
                            )
                    overall_components = find_components(
                        accessible_positions,
                        all_reads,
                        master_block,
                        heterozygous_positions_by_sample,
                    )
                    n_phased_blocks = len(set(overall_components.values()))
                    logger.info("No. of phased blocks: %d", n_phased_blocks)
                    largest_component = find_largest_component(overall_components)
                    if len(largest_component) > 0:
                        logger.info(
                            "Largest component contains %d variants (%.1f%% of accessible variants) between position %d and %d",
                            len(largest_component),
                            len(largest_component) * 100.0 / len(accessible_positions),
                            # Positions are reported one higher than stored
                            largest_component[0] + 1,
                            largest_component[-1] + 1,
                        )

                if recombination_list_filename:
                    n_recombinations = write_recombination_list(
                        recombination_list_filename,
                        chromosome,
                        accessible_positions,
                        overall_components,
                        recombination_costs,
                        transmission_vector,
                        trios,
                    )
                    logger.info(
                        "Total no. of detected recombination events: %d",
                        n_recombinations,
                    )

                # Superreads in superreads_list are in the same order as individuals were added to the pedigree
                for sample, sample_superreads in zip(family, superreads_list):
                    superreads[sample] = sample_superreads
                    assert len(sample_superreads) == 2
                    assert (
                        sample_superreads[0].sample_id
                        == sample_superreads[1].sample_id
                        == numeric_sample_ids[sample]
                    )
                    # identical for all samples
                    components[sample] = overall_components

                if read_list:
                    read_list.write(
                        all_reads,
                        dp_table.get_optimal_partitioning(),
                        components,
                        numeric_sample_ids,
                    )

            with timers("write_vcf"):
                logger.info("======== Writing VCF")
                changed_genotypes = vcf_writer.write(chromosome, superreads, components)
                logger.info("Done writing VCF")
                if changed_genotypes:
                    # Genotypes may only change when they were distrusted
                    assert distrust_genotypes
                    logger.info("Changed %d genotypes while writing VCF", len(changed_genotypes))

            if gtchange_list_filename:
                logger.info("Writing list of changed genotypes to %r", gtchange_list_filename)
                write_changed_genotypes(gtchange_list_filename, changed_genotypes)

            logger.debug("Chromosome %r finished", chromosome)

    log_time_and_memory_usage(timers, show_phase_vcfs=show_phase_vcfs)