Example #1
def get_sample_to_work_on(vcf_readers: List[VcfReader], requested_sample: Optional[str]) -> str:
    all_samples = set()
    sample_intersection = None
    for vcf_reader in vcf_readers:
        if sample_intersection is None:
            sample_intersection = set(vcf_reader.samples)
        else:
            sample_intersection.intersection_update(vcf_reader.samples)
        all_samples.update(vcf_reader.samples)
    assert sample_intersection is not None
    if requested_sample:
        sample_intersection.intersection_update([requested_sample])
        if len(sample_intersection) == 0:
            raise CommandLineError(
                "Sample {!r} requested on command-line not found in all VCFs".format(
                    requested_sample
                )
            )
    else:
        if len(sample_intersection) == 0:
            raise CommandLineError("None of the samples is present in all VCFs")
        elif len(sample_intersection) == 1:
            requested_sample = list(sample_intersection)[0]
        else:
            raise CommandLineError(
                "More than one sample is present in all VCFs, please use"
                " --sample to specify which sample to work on."
            )
    return requested_sample
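
A minimal usage sketch (not part of the original module): the reader objects below are stand-ins that only expose the .samples attribute the helper inspects, and CommandLineError is assumed to be in scope as above.

class _ReaderStub:
    """Stand-in for VcfReader; only the .samples attribute is needed here."""
    def __init__(self, samples):
        self.samples = samples

readers = [_ReaderStub(["NA12878", "NA12891"]), _ReaderStub(["NA12878"])]

# No sample requested: exactly one sample is shared by all VCFs, so it is returned.
assert get_sample_to_work_on(readers, requested_sample=None) == "NA12878"

# Requesting a sample that is missing from one of the VCFs raises CommandLineError.
try:
    get_sample_to_work_on(readers, requested_sample="NA12891")
except CommandLineError as err:
    print(err)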
Example #2
def raise_if_any_sample_not_in_vcf(vcf_reader, samples):
    vcf_sample_set = set(vcf_reader.samples)
    for sample in samples:
        if sample not in vcf_sample_set:
            raise CommandLineError(
                "Sample {!r} requested on command-line not found in VCF".
                format(sample))
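
A brief usage sketch (illustrative only); the stub reader again just carries a .samples list, and CommandLineError is assumed to be importable.

class _ReaderStub:
    def __init__(self, samples):
        self.samples = samples

reader = _ReaderStub(["sampleA", "sampleB"])
raise_if_any_sample_not_in_vcf(reader, ["sampleA", "sampleB"])  # all present: returns quietly
try:
    raise_if_any_sample_not_in_vcf(reader, ["sampleC"])  # unknown sample
except CommandLineError as err:
    print(err)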
Example #3
def run_hapcut2vcf(hapcut, vcf, output=sys.stdout):
    command_line = "(whatshap {}) {}".format(__version__,
                                             " ".join(sys.argv[1:]))
    with ExitStack() as stack:
        if isinstance(output, str):
            output = stack.enter_context(open(output, "w"))

        writer = PhasedVcfWriter(vcf, command_line, out_file=output)
        if len(writer.samples) > 1:
            # This would be easy to support with a --sample command-line parameter,
            # but hapCUT does not seem to support multi-sample VCFs, so something
            # must be wrong anyway.
            raise CommandLineError("There is more than one sample in this VCF")
        sample = writer.samples[0]

        f = stack.enter_context(open(hapcut))
        parser = HapCutParser(f)
        for chromosome, blocks in parser:
            logger.info("Read %d phased blocks for chromosome %s", len(blocks),
                        chromosome)

            # Build one read for each haplotype and the connected components
            haplotypes = [Read(str(i)) for i in (1, 2)]
            components = dict()
            for block in blocks:
                for variant in block:
                    haplotypes[0].add_variant(variant.position,
                                              variant.haplotype1, 0)
                    haplotypes[1].add_variant(variant.position,
                                              variant.haplotype2, 0)
                    components[variant.position] = variant.component_id

            sample_superreads = {sample: haplotypes}
            sample_components = {sample: components}
            writer.write(chromosome, sample_superreads, sample_components)
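
A hedged usage sketch for the converter above; the file names are placeholders, and output may be either a path (opened via the ExitStack) or an already open file object.

# Convert a (hypothetical) hapCUT block file plus the matching single-sample VCF
# into a phased VCF.
run_hapcut2vcf(hapcut="sample.hapcut", vcf="sample.vcf", output="sample.phased.vcf")

# Equivalently, an open file object can be passed:
# with open("sample.phased.vcf", "w") as out:
#     run_hapcut2vcf("sample.hapcut", "sample.vcf", output=out)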
Example #4
def open_haplotag_writer(path):
    if path is None:
        path = os.devnull
    try:
        writer = xopen(path, "wt")
    except OSError as err:
        raise CommandLineError(
            "Error while initializing haplotag list output at path: {}\n{}".format(path, err)
        )
    logger.debug("Writing header line to haplotag list output file")
    print("#readname", "haplotype", "phaseset", "chromosome", sep="\t", file=writer)
    return writer
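
A hedged usage sketch: the path is a placeholder; xopen compresses transparently for a .gz suffix, and passing None routes the list to os.devnull.

writer = open_haplotag_writer("haplotags.tsv.gz")   # or open_haplotag_writer(None) to discard
print("read42", "H1", "60001", "chr1", sep="\t", file=writer)
writer.close()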
Example #5
def get_variant_tables(
    vcf_readers: List[VcfReader], vcf_filenames: List[str]
) -> List[Dict[str, VariantTable]]:
    vcfs = []
    for reader, filename in zip(vcf_readers, vcf_filenames):
        # create dict mapping chromosome names to VariantTables
        m = dict()
        logger.info("Reading phasing from %r", filename)
        try:
            for variant_table in reader:
                m[variant_table.chromosome] = variant_table
        except PloidyError as e:
            raise CommandLineError("Provided ploidy is invalid: {}. Aborting.".format(e))
        vcfs.append(m)
    return vcfs
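
A hedged usage sketch with placeholder file names; the VcfReader arguments mirror how run_compare() constructs its readers further below.

filenames = ["truth.vcf", "phased.vcf"]
readers = [VcfReader(f, indels=True, phases=True, ploidy=2) for f in filenames]
tables = get_variant_tables(readers, filenames)
# tables[0] maps chromosome names to VariantTable objects parsed from truth.vcf
print(sorted(tables[0]))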
Example #6
def make_recombination_cost_computer(
        ped: Optional[str], genmap: Optional[str],
        recombrate: float) -> RecombinationCostComputer:
    if ped and genmap:
        logger.info(
            "Using region-specific recombination rates from genetic map %s.",
            genmap)
        try:
            return GeneticMapRecombinationCostComputer(genmap)
        except ParseError as e:
            raise CommandLineError(e)
    else:
        if ped:
            logger.info("Using uniform recombination rate of %g cM/Mb.",
                        recombrate)
        return UniformRecombinationCostComputer(recombrate)
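
A hedged usage sketch: without a PED file and genetic map, a uniform-rate computer is returned; the commented-out call shows the genetic-map variant with placeholder paths.

computer = make_recombination_cost_computer(ped=None, genmap=None, recombrate=1.26)
# Later, per chromosome, costs are computed for the phasable positions:
# costs = computer.compute(sorted(accessible_positions))
# With both a PED file and a genetic map, region-specific rates are used instead:
# make_recombination_cost_computer(ped="family.ped", genmap="genetic_map.txt", recombrate=1.26)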
Example #7
def open_output_alignment_file(aln_output, reference, vcf_md5, bam_header):
    """
    :param aln_output:
    :param reference:
    :param vcf_md5:
    :param bam_header:
    :param exit_stack:
    :return:
    """
    # Prepare header
    # TODO: convince pysam to allow @HS header line
    command_line = " ".join(["whatshap"] + sys.argv[1:])
    PG_entry = {
        "ID": "whatshap",
        "PN": "whatshap",
        "VN": __version__,
        "CL": command_line,
        "m5": vcf_md5,
    }
    if "PG" in bam_header:
        bam_header["PG"].append(PG_entry)
    else:
        bam_header["PG"] = [PG_entry]
    if aln_output is None:
        aln_output = "-"
        kwargs = dict()
    elif str(aln_output).endswith(".cram"):  # FIXME hard-coded value
        if reference is None:
            raise ValueError(
                'Writing CRAM output requires FASTA reference file given via "--reference"'
            )
        kwargs = dict(mode="wc", reference_filename=reference)
    else:
        # Write BAM
        kwargs = dict(mode="wb")
    try:
        bam_writer = pysam.AlignmentFile(
            aln_output,
            header=pysam.AlignmentHeader.from_dict(bam_header),
            **kwargs)
    except OSError as err:
        raise CommandLineError(
            "Error while initializing alignment output file at path: {}\n{}".
            format(aln_output, err))

    return bam_writer
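
A hedged usage sketch: the header dictionary and MD5 value are made-up placeholders; in practice the header is taken from the input BAM/CRAM and vcf_md5 from the input VCF.

header = {"HD": {"VN": "1.6"}, "SQ": [{"SN": "chr1", "LN": 248956422}]}
writer = open_output_alignment_file(
    "tagged.bam",            # ends in .bam, so mode="wb" is chosen
    reference=None,          # only required for CRAM output
    vcf_md5="0" * 32,        # placeholder MD5 recorded in the @PG line
    bam_header=header,
)
writer.close()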
Example #8
def run_whatshap(
    phase_input_files: List[str],
    variant_file: str,
    reference: Union[None, bool, str] = False,
    output: TextIO = sys.stdout,
    samples: Optional[List[str]] = None,
    chromosomes: Optional[List[str]] = None,
    ignore_read_groups: bool = False,
    indels: bool = True,
    mapping_quality: int = 20,
    read_merging: bool = False,
    read_merging_error_rate: float = 0.15,
    read_merging_max_error_rate: float = 0.25,
    read_merging_positive_threshold: int = 1000000,
    read_merging_negative_threshold: int = 1000,
    max_coverage: int = 15,
    distrust_genotypes: bool = False,
    include_homozygous: bool = False,
    ped: Optional[str] = None,
    recombrate: float = 1.26,
    genmap: Optional[str] = None,
    genetic_haplotyping: bool = True,
    recombination_list_filename: Optional[str] = None,
    tag: str = "PS",
    read_list_filename: Optional[str] = None,
    gl_regularizer: Optional[float] = None,
    gtchange_list_filename: Optional[str] = None,
    default_gq: int = 30,
    write_command_line_header: bool = True,
    use_ped_samples: bool = False,
    algorithm: str = "whatshap",
):
    """
    Run WhatsHap.

    phase_input_files -- list of paths to BAM/CRAM/VCF files
    variant_file -- path to input VCF
    reference -- path to reference FASTA. If False: skip realignment. If None: complain if reference needed.
    output -- path to output VCF or a file-like object
    samples -- names of samples to phase. An empty list means: phase all samples
    chromosomes -- names of chromosomes to phase. An empty list means: phase all chromosomes
    ignore_read_groups
    mapping_quality -- discard reads below this mapping quality
    read_merging -- whether or not to merge reads
    read_merging_error_rate -- probability that a nucleotide is wrong
    read_merging_max_error_rate -- max error rate on edge of merge graph considered
    read_merging_positive_threshold -- threshold on the ratio of the two probabilities
    read_merging_negative_threshold -- threshold on the opposite ratio of positive threshold
    max_coverage
    distrust_genotypes
    include_homozygous
    genetic_haplotyping -- in ped mode, merge disconnected blocks based on genotype status
    recombination_list_filename -- filename to write putative recombination events to
    tag -- How to store phasing info in the VCF, can be 'PS' or 'HP'
    read_list_filename -- name of file to write list of used reads to
    algorithm -- algorithm to use, can be 'whatshap' or 'hapchat'
    gl_regularizer -- float to be passed as regularization constant to GenotypeLikelihoods.as_phred
    gtchange_list_filename -- filename to write list of changed genotypes to
    default_gq -- genotype likelihood to be used when GL or PL not available
    write_command_line_header -- whether to add a ##commandline header to the output VCF
    """

    if algorithm == "hapchat" and ped is not None:
        raise CommandLineError(
            "The hapchat algorithm cannot do pedigree phasing")

    timers = StageTimer()
    logger.info(
        f"This is WhatsHap {__version__} running under Python {platform.python_version()}"
    )
    numeric_sample_ids = NumericSampleIds()
    command_line: Optional[str]
    if write_command_line_header:
        command_line = "(whatshap {}) {}".format(__version__,
                                                 " ".join(sys.argv[1:]))
    else:
        command_line = None

    read_merger: ReadMergerBase
    if read_merging:
        read_merger = ReadMerger(
            read_merging_error_rate,
            read_merging_max_error_rate,
            read_merging_positive_threshold,
            read_merging_negative_threshold,
        )
    else:
        read_merger = DoNothingReadMerger()

    with ExitStack() as stack:
        try:
            vcf_writer = stack.enter_context(
                PhasedVcfWriter(
                    command_line=command_line,
                    in_path=variant_file,
                    out_file=output,
                    tag=tag,
                    indels=indels,
                ))
        except (OSError, VcfError) as e:
            raise CommandLineError(e)

        phased_input_reader = stack.enter_context(
            PhasedInputReader(
                phase_input_files,
                None if reference is False else reference,
                numeric_sample_ids,
                ignore_read_groups,
                mapq_threshold=mapping_quality,
                indels=indels,
            ))
        show_phase_vcfs = phased_input_reader.has_vcfs

        if phased_input_reader.has_alignments and reference is None:
            raise CommandLineError(
                "A reference FASTA needs to be provided with -r/--reference; "
                "or use --no-reference at the expense of phasing quality.")

        # Only read genotype likelihoods from VCFs when distrusting genotypes
        vcf_reader = stack.enter_context(
            VcfReader(variant_file,
                      indels=indels,
                      genotype_likelihoods=distrust_genotypes))

        if ignore_read_groups and not samples and len(vcf_reader.samples) > 1:
            raise CommandLineError(
                "When using --ignore-read-groups on a VCF with "
                "multiple samples, --sample must also be used.")
        if not samples:
            samples = vcf_reader.samples

        # if --use-ped-samples is set, use only samples from PED file
        if ped and use_ped_samples:
            samples = PedReader(ped).samples()

        raise_if_any_sample_not_in_vcf(vcf_reader, samples)

        recombination_cost_computer = make_recombination_cost_computer(
            ped, genmap, recombrate)

        families, family_trios = setup_families(samples, ped, max_coverage)
        del samples
        for trios in family_trios.values():
            for trio in trios:
                # Ensure that all mentioned individuals have a numeric id
                _ = numeric_sample_ids[trio.child]

        read_list = None
        if read_list_filename:
            read_list = stack.enter_context(ReadList(read_list_filename))
            if algorithm == "hapchat":
                logger.warning(
                    "On which haplotype a read occurs in the inferred solution is not yet "
                    "implemented in hapchat, and so the corresponding column in the "
                    "read list file contains no information about this")

        with timers("parse_phasing_vcfs"):
            # TODO should this be done in PhasedInputReader.__init__?
            phased_input_reader.read_vcfs()

        superreads: Dict[str, ReadSet]
        components: Dict
        for variant_table in timers.iterate("parse_vcf", vcf_reader):
            chromosome = variant_table.chromosome
            if (not chromosomes) or (chromosome in chromosomes):
                logger.info("======== Working on chromosome %r", chromosome)
            else:
                logger.info(
                    "Leaving chromosome %r unchanged (present in VCF but not requested by option --chromosome)",
                    chromosome,
                )
                with timers("write_vcf"):
                    superreads, components = dict(), dict()
                    vcf_writer.write(chromosome, superreads, components)
                continue

            # These two variables hold the phasing results for all samples
            superreads, components = dict(), dict()

            # Iterate over all families to process, i.e. a separate DP table is created
            # for each family.
            # TODO: Can the body of this loop be factored out into a phase_family function?
            for representative_sample, family in sorted(families.items()):
                if len(family) == 1:
                    logger.info("---- Processing individual %s",
                                representative_sample)
                else:
                    logger.info("---- Processing family with individuals: %s",
                                ",".join(family))
                max_coverage_per_sample = max(1, max_coverage // len(family))
                logger.info("Using maximum coverage per sample of %dX",
                            max_coverage_per_sample)
                trios = family_trios[representative_sample]
                assert len(family) == 1 or len(trios) > 0

                homozygous_positions, phasable_variant_table = find_phaseable_variants(
                    family, include_homozygous, trios, variant_table)

                # Get the reads belonging to each sample
                readsets = dict()  # TODO this could become a list
                for sample in family:
                    with timers("read_bam"):
                        readset, vcf_source_ids = phased_input_reader.read(
                            chromosome, phasable_variant_table.variants,
                            sample)

                    # TODO: Read selection done w.r.t. all variants, where using heterozygous
                    #  variants only would probably give better results.
                    with timers("select"):
                        readset = readset.subset([
                            i for i, read in enumerate(readset)
                            if len(read) >= 2
                        ])
                        logger.info(
                            "Kept %d reads that cover at least two variants each",
                            len(readset))
                        merged_reads = read_merger.merge(readset)
                        selected_reads = select_reads(
                            merged_reads,
                            max_coverage_per_sample,
                            preferred_source_ids=vcf_source_ids,
                        )

                    readsets[sample] = selected_reads
                    if len(family) == 1 and not distrust_genotypes:
                        # With a pedigree (len(family) > 1), blocks are also merged after phasing
                        # based on the pedigree information, so these statistics are not very useful.
                        # When distrust_genotypes is set, genotypes (and hence the block structure)
                        # can change during phasing; don't print the stats in either of those cases.
                        log_best_case_phasing_info(readset, selected_reads)

                all_reads = merge_readsets(readsets)

                # Determine which variants can (in principle) be phased
                accessible_positions = sorted(all_reads.get_positions())
                logger.info(
                    "Variants covered by at least one phase-informative "
                    "read in at least one individual after read selection: %d",
                    len(accessible_positions),
                )
                if len(family) > 1 and genetic_haplotyping:
                    # In case of genetic haplotyping, also retain all positions homozygous
                    # in at least one individual (because they might be phased based on genotypes)
                    accessible_positions = sorted(
                        set(accessible_positions).union(homozygous_positions))
                    logger.info(
                        "Variants either covered by phase-informative read or homozygous "
                        "in at least one individual: %d",
                        len(accessible_positions),
                    )

                # Keep only accessible positions
                phasable_variant_table.subset_rows_by_position(
                    accessible_positions)
                assert len(phasable_variant_table.variants) == len(
                    accessible_positions)

                pedigree = create_pedigree(
                    default_gq,
                    distrust_genotypes,
                    family,
                    gl_regularizer,
                    numeric_sample_ids,
                    phasable_variant_table,
                    trios,
                )
                recombination_costs = recombination_cost_computer.compute(
                    accessible_positions)

                # Finally, run phasing algorithm
                with timers("phase"):
                    problem_name = "MEC" if len(family) == 1 else "PedMEC"
                    logger.info(
                        "Phasing %d sample%s by solving the %s problem ...",
                        len(family),
                        plural_s(len(family)),
                        problem_name,
                    )

                    dp_table: Union[HapChatCore, PedigreeDPTable]
                    if algorithm == "hapchat":
                        dp_table = HapChatCore(all_reads)
                    else:
                        dp_table = PedigreeDPTable(
                            all_reads,
                            recombination_costs,
                            pedigree,
                            distrust_genotypes,
                            accessible_positions,
                        )

                    superreads_list, transmission_vector = dp_table.get_super_reads()
                    logger.info("%s cost: %d", problem_name,
                                dp_table.get_optimal_cost())

                with timers("components"):
                    overall_components = compute_overall_components(
                        accessible_positions,
                        all_reads,
                        distrust_genotypes,
                        family,
                        genetic_haplotyping,
                        homozygous_positions,
                        numeric_sample_ids,
                        superreads_list,
                    )
                    log_component_stats(overall_components,
                                        len(accessible_positions))

                if recombination_list_filename:
                    n_recombinations = write_recombination_list(
                        recombination_list_filename,
                        chromosome,
                        accessible_positions,
                        overall_components,
                        recombination_costs,
                        transmission_vector,
                        trios,
                    )
                    logger.info(
                        "Total no. of detected recombination events: %d",
                        n_recombinations)

                # Superreads in superreads_list are in the same order as individuals were added to the pedigree
                for sample, sample_superreads in zip(family, superreads_list):
                    superreads[sample] = sample_superreads
                    assert len(sample_superreads) == 2
                    assert (sample_superreads[0].sample_id ==
                            sample_superreads[1].sample_id ==
                            numeric_sample_ids[sample])
                    # identical for all samples
                    components[sample] = overall_components

                if read_list:
                    read_list.write(
                        all_reads,
                        dp_table.get_optimal_partitioning(),
                        components,
                        numeric_sample_ids,
                    )

            with timers("write_vcf"):
                logger.info("======== Writing VCF")
                changed_genotypes = vcf_writer.write(chromosome, superreads,
                                                     components)
                logger.info("Done writing VCF")
                if changed_genotypes:
                    assert distrust_genotypes
                    logger.info("Changed %d genotypes while writing VCF",
                                len(changed_genotypes))

            if gtchange_list_filename:
                logger.info("Writing list of changed genotypes to %r",
                            gtchange_list_filename)
                write_changed_genotypes(gtchange_list_filename,
                                        changed_genotypes)

            logger.debug("Chromosome %r finished", chromosome)

    log_time_and_memory_usage(timers, show_phase_vcfs=show_phase_vcfs)
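
A hedged usage sketch of the entry point above, with placeholder paths; this is roughly what the whatshap phase subcommand drives.

with open("phased.vcf", "w") as out:
    run_whatshap(
        phase_input_files=["reads.bam"],   # BAM/CRAM and/or phased VCFs
        variant_file="input.vcf",
        reference="ref.fa",                # or False to skip re-alignment
        output=out,
    )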
Example #9
def run_polyphase(
    phase_input_files,
    variant_file,
    ploidy,
    reference=None,
    output=sys.stdout,
    samples=None,
    chromosomes=None,
    verify_genotypes=False,
    ignore_read_groups=False,
    indels=True,
    mapping_quality=20,
    tag="PS",
    include_haploid_sets=False,
    write_command_line_header=True,
    read_list_filename=None,
    ce_bundle_edges=False,
    min_overlap=2,
    plot_clusters=False,
    plot_threading=False,
    ce_refinements=5,
    block_cut_sensitivity=4,
):
    """
    Run Polyploid Phasing.

    phase_input_files -- list of paths to BAM/CRAM/VCF files
    variant_file -- path to input VCF
    reference -- path to reference FASTA
    output -- path to output VCF or a file-like object
    samples -- names of samples to phase. An empty list means: phase all samples
    chromosomes -- names of chromosomes to phase. An empty list means: phase all chromosomes
    ignore_read_groups
    mapping_quality -- discard reads below this mapping quality
    tag -- How to store phasing info in the VCF, can be 'PS' or 'HP'
    write_command_line_header -- whether to add a ##commandline header to the output VCF
    """
    timers = StageTimer()
    logger.info(
        "This is WhatsHap (polyploid) %s running under Python %s",
        __version__,
        platform.python_version(),
    )
    numeric_sample_ids = NumericSampleIds()
    with ExitStack() as stack:
        assert phase_input_files
        phased_input_reader = stack.enter_context(
            PhasedInputReader(
                phase_input_files,
                reference,
                numeric_sample_ids,
                ignore_read_groups,
                indels=indels,
                mapq_threshold=mapping_quality,
            ))
        assert not phased_input_reader.has_vcfs

        if write_command_line_header:
            command_line = "(whatshap {}) {}".format(__version__,
                                                     " ".join(sys.argv[1:]))
        else:
            command_line = None
        try:
            vcf_writer = stack.enter_context(
                PhasedVcfWriter(
                    command_line=command_line,
                    in_path=variant_file,
                    out_file=output,
                    tag=tag,
                    ploidy=ploidy,
                    include_haploid_sets=include_haploid_sets,
                ))
        except OSError as e:
            raise CommandLineError(e)

        vcf_reader = stack.enter_context(
            VcfReader(
                variant_file,
                indels=indels,
                phases=True,
                genotype_likelihoods=False,
                ploidy=ploidy,
            ))

        if ignore_read_groups and not samples and len(vcf_reader.samples) > 1:
            raise CommandLineError(
                "When using --ignore-read-groups on a VCF with "
                "multiple samples, --sample must also be used.")
        if not samples:
            samples = vcf_reader.samples

        vcf_sample_set = set(vcf_reader.samples)
        for sample in samples:
            if sample not in vcf_sample_set:
                raise CommandLineError(
                    "Sample {!r} requested on command-line not found in VCF".
                    format(sample))

        if block_cut_sensitivity < 0:
            logger.warning(
                "Block cut sensitivity was set to negative value. Lowest value (0) is assumed instead."
            )
            block_cut_sensitivity = 0
        elif block_cut_sensitivity > 5:
            logger.warning(
                "Block cut sensitivity level too large. Assuming highest valid value (5) instead."
            )
            block_cut_sensitivity = 5

        samples = frozenset(samples)

        read_list_file = None
        if read_list_filename:
            raise NotImplementedError("create_read_list_file not implemented")
            # read_list_file = create_read_list_file(read_list_filename)

        # Store phasing parameters in tuple to keep function signatures cleaner
        phasing_param = PhasingParameter(
            ploidy=ploidy,
            verify_genotypes=verify_genotypes,
            ce_bundle_edges=ce_bundle_edges,
            min_overlap=min_overlap,
            ce_refinements=ce_refinements,
            block_cut_sensitivity=block_cut_sensitivity,
            plot_clusters=plot_clusters,
            plot_threading=plot_threading,
        )

        timers.start("parse_vcf")
        try:
            for variant_table in vcf_reader:
                chromosome = variant_table.chromosome
                timers.stop("parse_vcf")
                if (not chromosomes) or (chromosome in chromosomes):
                    logger.info("======== Working on chromosome %r",
                                chromosome)
                else:
                    logger.info(
                        "Leaving chromosome %r unchanged (present in VCF but not requested by option --chromosome)",
                        chromosome,
                    )
                    with timers("write_vcf"):
                        superreads, components = dict(), dict()
                        vcf_writer.write(chromosome, superreads, components)
                    continue

                # These two variables hold the phasing results for all samples
                superreads, components, haploid_components = dict(), dict(), dict()

                # Iterate over all samples to process
                for sample in samples:
                    logger.info("---- Processing individual %s", sample)

                    # Process inputs for this sample
                    missing_genotypes = set()
                    heterozygous = set()

                    genotypes = variant_table.genotypes_of(sample)
                    for index, gt in enumerate(genotypes):
                        if gt.is_none():
                            missing_genotypes.add(index)
                        elif not gt.is_homozygous():
                            heterozygous.add(index)
                        else:
                            assert gt.is_homozygous()
                    to_discard = set(range(
                        len(variant_table))).difference(heterozygous)
                    phasable_variant_table = deepcopy(variant_table)
                    # Remove calls to be discarded from variant table
                    phasable_variant_table.remove_rows_by_index(to_discard)

                    logger.info(
                        "Number of variants skipped due to missing genotypes: %d",
                        len(missing_genotypes),
                    )
                    logger.info(
                        "Number of remaining heterozygous variants: %d",
                        len(phasable_variant_table))

                    # Get the reads belonging to this sample
                    timers.start("read_bam")
                    readset, vcf_source_ids = phased_input_reader.read(
                        chromosome, phasable_variant_table.variants, sample)
                    readset.sort()
                    timers.stop("read_bam")

                    # Verify genotypes
                    if verify_genotypes:
                        timers.start("verify_genotypes")
                        logger.info("Verify genotyping of %s", sample)
                        positions = [
                            v.position for v in phasable_variant_table.variants
                        ]
                        computed_genotypes = [
                            Genotype(gt) for gt in compute_polyploid_genotypes(
                                readset, ploidy, positions)
                        ]
                        # skip all positions at which genotypes do not match
                        given_genotypes = phasable_variant_table.genotypes_of(
                            sample)
                        matching_genotypes = []
                        missing_genotypes = set()
                        # Use debug logging instead of bare print() so stdout (which may
                        # carry the output VCF) is not polluted with debug output
                        logger.debug("Computed genotypes (%d): %s", len(computed_genotypes), computed_genotypes)
                        logger.debug("Given genotypes (%d): %s", len(given_genotypes), given_genotypes)
                        logger.debug("Number of positions: %d", len(positions))
                        for i, g in enumerate(given_genotypes):
                            c_g = computed_genotypes[i]
                            if (g == c_g) or (c_g is None):
                                matching_genotypes.append(g)
                            else:
                                matching_genotypes.append(Genotype([]))
                                missing_genotypes.add(i)
                        phasable_variant_table.set_genotypes_of(
                            sample, matching_genotypes)

                        # Remove variants with deleted genotype
                        phasable_variant_table.remove_rows_by_index(
                            missing_genotypes)
                        logger.info(
                            "Number of variants removed due to inconsistent genotypes: %d",
                            len(missing_genotypes),
                        )
                        logger.info(
                            "Number of remaining heterozygous variants: %d",
                            len(phasable_variant_table),
                        )

                        # Re-read the readset to remove discarded variants
                        readset, vcf_source_ids = phased_input_reader.read(
                            chromosome, phasable_variant_table.variants,
                            sample)
                        readset.sort()
                        timers.stop("verify_genotypes")

                    # Remove reads with insufficient variants
                    readset = readset.subset([
                        i for i, read in enumerate(readset)
                        if len(read) >= max(2, min_overlap)
                    ])
                    logger.info(
                        "Kept %d reads that cover at least two variants each",
                        len(readset))

                    # Adapt the variant table to the subset of reads
                    phasable_variant_table.subset_rows_by_position(
                        readset.get_positions())

                    # Run the actual phasing
                    (
                        sample_components,
                        sample_haploid_components,
                        sample_superreads,
                    ) = phase_single_individual(readset,
                                                phasable_variant_table, sample,
                                                phasing_param, output, timers)

                    # Collect results
                    components[sample] = sample_components
                    haploid_components[sample] = sample_haploid_components
                    superreads[sample] = sample_superreads

                with timers("write_vcf"):
                    logger.info("======== Writing VCF")
                    vcf_writer.write(
                        chromosome,
                        superreads,
                        components,
                        haploid_components if include_haploid_sets else None,
                    )
                    # TODO: Use genotype information to polish results
                    # assert len(changed_genotypes) == 0
                    logger.info("Done writing VCF")
                logger.debug("Chromosome %r finished", chromosome)
                timers.start("parse_vcf")
            timers.stop("parse_vcf")
        except PloidyError as e:
            raise CommandLineError(e)

    if read_list_file:
        read_list_file.close()

    logger.info("\n== SUMMARY ==")

    log_memory_usage()
    logger.info("Time spent reading BAM/CRAM:                 %6.1f s",
                timers.elapsed("read_bam"))
    logger.info("Time spent parsing VCF:                      %6.1f s",
                timers.elapsed("parse_vcf"))
    if verify_genotypes:
        logger.info(
            "Time spent verifying genotypes:              %6.1f s",
            timers.elapsed("verify_genotypes"),
        )
    logger.info("Time spent detecting blocks:                 %6.1f s",
                timers.elapsed("detecting_blocks"))
    logger.info("Time spent scoring reads:                    %6.1f s",
                timers.elapsed("read_scoring"))
    logger.info(
        "Time spent solving cluster editing:          %6.1f s",
        timers.elapsed("solve_clusterediting"),
    )
    logger.info("Time spent threading haplotypes:             %6.1f s",
                timers.elapsed("threading"))
    if plot_clusters or plot_threading:
        logger.info("Time spent creating plots:                   %6.1f s",
                    timers.elapsed("create_plots"))
    logger.info("Time spent writing VCF:                      %6.1f s",
                timers.elapsed("write_vcf"))
    logger.info("Time spent on rest:                          %6.1f s",
                timers.total() - timers.sum())
    logger.info("Total elapsed time:                          %6.1f s",
                timers.total())
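
A hedged usage sketch with placeholder paths; compared to the diploid entry point, ploidy is the additional required argument.

with open("phased.polyploid.vcf", "w") as out:
    run_polyphase(
        phase_input_files=["reads.bam"],
        variant_file="tetraploid.vcf",
        ploidy=4,
        reference="ref.fa",
        output=out,
    )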
Example #10
def run_compare(
    vcf,
    ploidy,
    names=None,
    sample=None,
    tsv_pairwise=None,
    tsv_multiway=None,
    only_snvs=False,
    switch_error_bed=None,
    plot_blocksizes=None,
    plot_sum_of_blocksizes=None,
    longest_block_tsv=None,
):
    vcf_readers = [VcfReader(f, indels=not only_snvs, phases=True, ploidy=ploidy) for f in vcf]
    if names:
        dataset_names = names.split(",")
        if len(dataset_names) != len(vcf):
            raise CommandLineError(
                "Number of names given with --names does not equal number of VCFs."
            )
    else:
        dataset_names = ["file{}".format(i) for i in range(len(vcf))]
    longest_name = max(len(n) for n in dataset_names)

    sample = get_sample_to_work_on(vcf_readers, requested_sample=sample)

    with ExitStack() as stack:
        tsv_pairwise_file = tsv_multiway_file = longest_block_tsv_file = switch_error_bedfile = None
        if tsv_pairwise:
            tsv_pairwise_file = stack.enter_context(open(tsv_pairwise, "w"))

        if tsv_multiway:
            tsv_multiway_file = stack.enter_context(open(tsv_multiway, "w"))
            print(
                "#sample",
                "chromosome",
                "dataset_list0",
                "dataset_list1",
                "count",
                sep="\t",
                file=tsv_multiway_file,
            )

        if longest_block_tsv:
            longest_block_tsv_file = stack.enter_context(open(longest_block_tsv, "w"))
            print(
                "#dataset_name0",
                "dataset_name1",
                "#sample",
                "chromosome",
                "position",
                "phase_agreeing",
                sep="\t",
                file=longest_block_tsv_file,
            )

        print("Comparing phasings for sample", sample)

        vcfs = get_variant_tables(vcf_readers, vcf)
        chromosomes = get_common_chromosomes(vcfs)
        if len(chromosomes) == 0:
            raise CommandLineError("No chromosome is contained in all VCFs. Aborting.")
        logger.info("Chromosomes present in all VCFs: %s", ", ".join(chromosomes))

        if tsv_pairwise_file:
            fields = [
                "#sample",
                "chromosome",
                "dataset_name0",
                "dataset_name1",
                "file_name0",
                "file_name1",
            ]
            field_names = [f.name for f in dataclasses.fields(PairwiseComparisonResults)]
            fields.extend(field_names)
            fields.extend(["het_variants0", "only_snvs"])
            print(*fields, sep="\t", file=tsv_pairwise_file)

        if switch_error_bed:
            switch_error_bedfile = stack.enter_context(open(switch_error_bed, "w"))

        print("FILENAMES")
        for name, filename in zip(dataset_names, vcf):
            print(name.rjust(longest_name + 2), "=", filename)

        width = max(longest_name, 15) + 5

        all_block_stats = [[] for _ in vcfs]

        def add_block_stats(block_stats):
            assert len(block_stats) == len(all_block_stats)
            for big_list, new_list in zip(all_block_stats, block_stats):
                big_list.extend(new_list)

        for chromosome in sorted(chromosomes):
            print("---------------- Chromosome {} ----------------".format(chromosome))
            all_bed_records = []
            variant_tables = [vcf[chromosome] for vcf in vcfs]
            all_variants_union = set()
            all_variants_intersection = None
            het_variants_union = set()
            het_variants_intersection = None
            het_variant_sets = []
            het_variants0 = None
            print("VARIANT COUNTS (heterozygous / all): ")
            for variant_table, name in zip(variant_tables, dataset_names):
                all_variants_union.update(variant_table.variants)
                het_variants = [
                    v
                    for v, gt in zip(variant_table.variants, variant_table.genotypes_of(sample))
                    if not gt.is_homozygous()
                ]
                if het_variants0 is None:
                    het_variants0 = len(het_variants)
                het_variants_union.update(het_variants)
                if all_variants_intersection is None:
                    all_variants_intersection = set(variant_table.variants)
                    het_variants_intersection = set(het_variants)
                else:
                    all_variants_intersection.intersection_update(variant_table.variants)
                    het_variants_intersection.intersection_update(het_variants)
                het_variant_sets.append(set(het_variants))
                print(
                    "{}:".format(name).rjust(width),
                    str(len(het_variants)).rjust(count_width),
                    "/",
                    str(len(variant_table.variants)).rjust(count_width),
                )
            print(
                "UNION:".rjust(width),
                str(len(het_variants_union)).rjust(count_width),
                "/",
                str(len(all_variants_union)).rjust(count_width),
            )
            print(
                "INTERSECTION:".rjust(width),
                str(len(het_variants_intersection)).rjust(count_width),
                "/",
                str(len(all_variants_intersection)).rjust(count_width),
            )

            for i in range(len(vcfs)):
                for j in range(i + 1, len(vcfs)):
                    print(
                        "PAIRWISE COMPARISON: {} <--> {}:".format(
                            dataset_names[i], dataset_names[j]
                        )
                    )
                    (
                        results,
                        bed_records,
                        block_stats,
                        longest_block_positions,
                        longest_block_agreement,
                        multiway_results,
                    ) = compare(
                        [variant_tables[i], variant_tables[j]],
                        sample,
                        [dataset_names[i], dataset_names[j]],
                        ploidy,
                    )
                    if len(vcfs) == 2:
                        add_block_stats(block_stats)
                    all_bed_records.extend(bed_records)
                    if tsv_pairwise_file:
                        fields = [
                            sample,
                            chromosome,
                            dataset_names[i],
                            dataset_names[j],
                            vcf[i],
                            vcf[j],
                        ]
                        fields.extend(dataclasses.astuple(results))
                        fields.extend([het_variants0, int(only_snvs)])
                        print(*fields, sep="\t", file=tsv_pairwise_file)
                    if longest_block_tsv_file:
                        assert ploidy == 2
                        assert len(longest_block_positions) == len(longest_block_agreement)
                        for position, phase_agreeing in zip(
                            longest_block_positions, longest_block_agreement
                        ):
                            print(
                                dataset_names[i],
                                dataset_names[j],
                                sample,
                                chromosome,
                                position,
                                phase_agreeing,
                                sep="\t",
                                file=longest_block_tsv_file,
                            )

            # if requested, write all switch errors found in the current chromosome to the bed file
            if switch_error_bedfile:
                assert ploidy == 2
                all_bed_records.sort()
                for record in all_bed_records:
                    print(*record, sep="\t", file=switch_error_bedfile)

            if len(vcfs) > 2:
                assert ploidy == 2
                print("MULTIWAY COMPARISON OF ALL PHASINGS:")
                (
                    results,
                    bed_records,
                    block_stats,
                    longest_block_positions,
                    longest_block_agreement,
                    multiway_results,
                ) = compare(variant_tables, sample, dataset_names, ploidy)
                add_block_stats(block_stats)
                if tsv_multiway_file:
                    for ((dataset_list0, dataset_list1), count,) in multiway_results.items():
                        print(
                            sample,
                            chromosome,
                            "{" + dataset_list0 + "}",
                            "{" + dataset_list1 + "}",
                            count,
                            sep="\t",
                            file=tsv_multiway_file,
                        )

        if plot_blocksizes:
            create_blocksize_histogram(plot_blocksizes, all_block_stats, dataset_names)
        if plot_sum_of_blocksizes:
            create_blocksize_histogram(
                plot_sum_of_blocksizes, all_block_stats, dataset_names, use_weights=True
            )
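
A hedged usage sketch: two placeholder phased VCFs for one diploid sample are compared and the pairwise statistics written to a TSV.

run_compare(
    vcf=["truth.vcf", "phased.vcf"],
    ploidy=2,
    names="truth,whatshap",
    tsv_pairwise="pairwise.tsv",
)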
Example #11
def create_blocksize_histogram(filename, block_stats, names, use_weights=False):
    try:
        import matplotlib
        import numpy

        matplotlib.use("pdf")
        from matplotlib import pyplot
        from matplotlib.backends.backend_pdf import PdfPages
    except ImportError:
        raise CommandLineError(
            "To use option --plot-blocksizes, you need to have numpy and matplotlib installed."
        )

    assert len(block_stats) == len(names)

    color_list = ["#ffa347", "#0064c8", "#b42222", "#22a5b4", "#b47c22", "#6db6ff"]
    if len(color_list) < len(block_stats):
        color_count = len(block_stats)
        color_list = pyplot.cm.Set1([n / color_count for n in range(color_count)])
    colors = color_list[: len(block_stats)]

    with PdfPages(filename) as pdf:
        for what, xlabel in [
            (lambda stats: stats.variant_count, "variant count"),
            (lambda stats: stats.span, "span [bp]"),
        ]:
            pyplot.figure(figsize=(10, 8))
            max_value = max(what(stats) for stats in chain(*block_stats))
            common_bins = numpy.logspace(0, math.ceil(math.log10(max_value)), 50)
            for l, name, color in zip(block_stats, names, colors):
                x = [what(stats) for stats in l]
                n, bins, patches = pyplot.hist(
                    x,
                    bins=common_bins,
                    alpha=0.6,
                    color=color,
                    label=name,
                    weights=x if use_weights else None,
                )
            pyplot.xlabel(xlabel)
            pyplot.ylabel("Number of blocks")
            pyplot.gca().set_xscale("log")
            pyplot.gca().set_yscale("log")
            pyplot.grid(True)
            pyplot.legend()
            pdf.savefig()
            pyplot.close()

            pyplot.figure(figsize=(10, 8))
            common_bins = numpy.logspace(0, math.ceil(math.log10(max_value)), 25)
            x = [[what(stats) for stats in l] for l in block_stats]
            n, bins, patches = pyplot.hist(
                x,
                bins=common_bins,
                alpha=0.6,
                color=colors,
                label=names,
                weights=x if use_weights else None,
            )
            pyplot.xlabel(xlabel)
            pyplot.ylabel("Number of blocks")
            pyplot.gca().set_xscale("log")
            pyplot.gca().set_yscale("log")
            pyplot.grid(True)
            pyplot.legend()
            pdf.savefig()
            pyplot.close()
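
A hedged usage sketch: the real block statistics come from compare(); the stand-in below only provides the two attributes the plotting code reads (variant_count and span).

from collections import namedtuple

_BlockStats = namedtuple("_BlockStats", ["variant_count", "span"])  # illustrative stand-in
stats_truth = [_BlockStats(12, 3500), _BlockStats(80, 42000)]
stats_whatshap = [_BlockStats(5, 900), _BlockStats(150, 100000)]
create_blocksize_histogram("blocksizes.pdf", [stats_truth, stats_whatshap], ["truth", "whatshap"])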
Example #12
def run_genotype(
    phase_input_files,
    variant_file,
    reference=None,
    output=sys.stdout,
    samples=None,
    chromosomes=None,
    ignore_read_groups=False,
    indels=True,
    mapping_quality=20,
    max_coverage=15,
    nopriors=False,
    ped=None,
    recombrate=1.26,
    genmap=None,
    gt_qual_threshold=0,
    prioroutput=None,
    constant=0.0,
    overhang=10,
    affine_gap=False,
    gap_start=10,
    gap_extend=7,
    mismatch=15,
    write_command_line_header=True,
    use_ped_samples=False,
):
    """
    For now, this function only runs the genotyping algorithm. Genotype likelihoods for
    all variants are computed using the forward-backward algorithm.
    """
    timers = StageTimer()
    logger.info(
        "This is WhatsHap (genotyping) %s running under Python %s",
        __version__,
        platform.python_version(),
    )
    if write_command_line_header:
        command_line = "(whatshap {}) {}".format(__version__,
                                                 " ".join(sys.argv[1:]))
    else:
        command_line = None
    with ExitStack() as stack:
        # read the given input files (BAMs, VCFs, ref...)
        numeric_sample_ids = NumericSampleIds()
        phased_input_reader = stack.enter_context(
            PhasedInputReader(
                phase_input_files,
                reference,
                numeric_sample_ids,
                ignore_read_groups,
                indels=indels,
                mapq_threshold=mapping_quality,
                overhang=overhang,
                affine=affine_gap,
                gap_start=gap_start,
                gap_extend=gap_extend,
                default_mismatch=mismatch,
            ))
        show_phase_vcfs = phased_input_reader.has_vcfs

        # vcf writer for final genotype likelihoods
        vcf_writer = stack.enter_context(
            GenotypeVcfWriter(command_line=command_line,
                              in_path=variant_file,
                              out_file=output))
        # vcf writer for only the prior likelihoods (if output is desired)
        prior_vcf_writer = None
        if prioroutput is not None:
            prior_vcf_writer = stack.enter_context(
                GenotypeVcfWriter(
                    command_line=command_line,
                    in_path=variant_file,
                    out_file=stack.enter_context(open(prioroutput, "w")),
                ))

        # parse vcf with input variants
        # remove all likelihoods that may already be present
        vcf_reader = stack.enter_context(
            VcfReader(
                variant_file,
                indels=indels,
                genotype_likelihoods=False,
                ignore_genotypes=True,
            ))

        if ignore_read_groups and not samples and len(vcf_reader.samples) > 1:
            raise CommandLineError(
                "When using --ignore-read-groups on a VCF with "
                "multiple samples, --sample must also be used.")
        if not samples:
            samples = vcf_reader.samples

        # if --use-ped-samples is set, use only samples from PED file
        if ped and use_ped_samples:
            samples = set()
            for trio in PedReader(ped):
                if trio.child is None or trio.mother is None or trio.father is None:
                    continue
                samples.add(trio.mother)
                samples.add(trio.father)
                samples.add(trio.child)

        vcf_sample_set = set(vcf_reader.samples)
        for sample in samples:
            if sample not in vcf_sample_set:
                raise CommandLineError(
                    "Sample {!r} requested on command-line not found in VCF".
                    format(sample))

        if ped and genmap:
            logger.info(
                "Using region-specific recombination rates from genetic map %s.",
                genmap,
            )
            recombination_cost_computer = GeneticMapRecombinationCostComputer(
                genmap)
        else:
            if ped:
                logger.info("Using uniform recombination rate of %g cM/Mb.",
                            recombrate)
            recombination_cost_computer = UniformRecombinationCostComputer(
                recombrate)

        samples = frozenset(samples)
        families, family_trios = setup_families(samples, ped,
                                                numeric_sample_ids,
                                                max_coverage)

        # Read phase information provided as VCF files, if provided.
        with timers("parse_phasing_vcfs"):
            phased_input_reader.read_vcfs()

        # compute genotype likelihood threshold
        gt_prob = 1.0 - (10**(-gt_qual_threshold / 10.0))

        for variant_table in timers.iterate("parse_vcf", vcf_reader):

            # create a mapping of genome positions to indices
            var_to_pos = dict()
            for i in range(len(variant_table.variants)):
                var_to_pos[variant_table.variants[i].position] = i

            chromosome = variant_table.chromosome
            if (not chromosomes) or (chromosome in chromosomes):
                logger.info("======== Working on chromosome %r", chromosome)
            else:
                logger.info(
                    "Leaving chromosome %r unchanged (present in VCF but not requested by option --chromosome)",
                    chromosome,
                )
                vcf_writer.write_genotypes(chromosome,
                                           variant_table,
                                           indels,
                                           leave_unchanged=True)
                if prioroutput is not None:
                    prior_vcf_writer.write_genotypes(chromosome,
                                                     variant_table,
                                                     indels,
                                                     leave_unchanged=True)
                continue

            positions = [v.position for v in variant_table.variants]
            if not nopriors:
                # compute prior genotype likelihoods based on all reads
                for sample in samples:
                    logger.info("---- Initial genotyping of %s", sample)
                    with timers("read_bam"):
                        readset, vcf_source_ids = phased_input_reader.read(
                            chromosome,
                            variant_table.variants,
                            sample,
                            read_vcf=False,
                        )
                        readset.sort()
                        genotypes, genotype_likelihoods = compute_genotypes(
                            readset, positions)
                        # recompute genotypes based on given threshold
                        reg_genotype_likelihoods = []
                        for gl in range(len(genotype_likelihoods)):
                            norm_sum = (genotype_likelihoods[gl][0] +
                                        genotype_likelihoods[gl][1] +
                                        genotype_likelihoods[gl][2] +
                                        3 * constant)
                            regularized = PhredGenotypeLikelihoods([
                                (genotype_likelihoods[gl][0] + constant) /
                                norm_sum,
                                (genotype_likelihoods[gl][1] + constant) /
                                norm_sum,
                                (genotype_likelihoods[gl][2] + constant) /
                                norm_sum,
                            ])
                            genotypes[gl] = determine_genotype(
                                regularized, gt_prob)
                            assert isinstance(genotypes[gl], Genotype)
                            reg_genotype_likelihoods.append(regularized)
                        variant_table.set_genotype_likelihoods_of(
                            sample,
                            [
                                PhredGenotypeLikelihoods(list(gl))
                                for gl in reg_genotype_likelihoods
                            ],
                        )
                        variant_table.set_genotypes_of(sample, genotypes)
            else:

                # use uniform genotype likelihoods for all individuals
                for sample in samples:
                    variant_table.set_genotype_likelihoods_of(
                        sample,
                        [PhredGenotypeLikelihoods([1 / 3, 1 / 3, 1 / 3])] *
                        len(positions),
                    )

            # if desired, output the priors in separate vcf
            if prioroutput is not None:
                prior_vcf_writer.write_genotypes(chromosome, variant_table,
                                                 indels)

            # Iterate over all families to process, i.e. a separate DP table is created
            # for each family.
            for representative_sample, family in sorted(families.items()):
                if len(family) == 1:
                    logger.info("---- Processing individual %s",
                                representative_sample)
                else:
                    logger.info("---- Processing family with individuals: %s",
                                ",".join(family))
                max_coverage_per_sample = max(1, max_coverage // len(family))
                logger.info("Using maximum coverage per sample of %dX",
                            max_coverage_per_sample)
                trios = family_trios[representative_sample]
                assert (len(family) == 1) or (len(trios) > 0)

                # Get the reads belonging to each sample
                readsets = dict()
                for sample in family:
                    with timers("read_bam"):
                        readset, vcf_source_ids = phased_input_reader.read(
                            chromosome,
                            variant_table.variants,
                            sample,
                        )

                    with timers("select"):
                        readset = readset.subset([
                            i for i, read in enumerate(readset)
                            if len(read) >= 2
                        ])
                        logger.info(
                            "Kept %d reads that cover at least two variants each",
                            len(readset),
                        )
                        selected_reads = select_reads(
                            readset,
                            max_coverage_per_sample,
                            preferred_source_ids=vcf_source_ids,
                        )
                    readsets[sample] = selected_reads

                # Merge reads into one ReadSet (note that each Read object
                # knows the sample it originated from).
                all_reads = ReadSet()
                for sample, readset in readsets.items():
                    for read in readset:
                        assert read.is_sorted(), "Add a read.sort() here"
                        all_reads.add(read)

                all_reads.sort()

                # Determine which variants can (in principle) be phased
                accessible_positions = sorted(all_reads.get_positions())
                logger.info(
                    "Variants covered by at least one phase-informative "
                    "read in at least one individual after read selection: %d",
                    len(accessible_positions),
                )

                # Create Pedigree
                pedigree = Pedigree(numeric_sample_ids)
                for sample in family:
                    # Genotypes are assumed to be unknown, so ignore any genotype
                    # information that might already be present in the input VCF
                    all_genotype_likelihoods = variant_table.genotype_likelihoods_of(
                        sample)
                    genotype_l = [
                        all_genotype_likelihoods[var_to_pos[a_p]]
                        for a_p in accessible_positions
                    ]
                    pedigree.add_individual(
                        sample,
                        [
                            Genotype([])
                            for i in range(len(accessible_positions))
                        ],
                        genotype_l,
                    )
                for trio in trios:
                    pedigree.add_relationship(
                        father_id=trio.father,
                        mother_id=trio.mother,
                        child_id=trio.child,
                    )

                recombination_costs = recombination_cost_computer.compute(
                    accessible_positions)

                # Finally, run genotyping algorithm
                with timers("genotyping"):
                    problem_name = "genotyping"
                    logger.info(
                        "Genotyping %d sample%s by solving the %s problem ...",
                        len(family),
                        "s" if len(family) > 1 else "",
                        problem_name,
                    )
                    forward_backward_table = GenotypeDPTable(
                        numeric_sample_ids,
                        all_reads,
                        recombination_costs,
                        pedigree,
                        accessible_positions,
                    )
                    # store results
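                    # The forward-backward table provides, for every sample and
                    # accessible position, posterior genotype likelihoods that
                    # combine read and pedigree information; genotypes are
                    # re-called from these posteriors and written back into the
                    # variant table.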
                    for s in family:
                        likelihood_list = variant_table.genotype_likelihoods_of(
                            s)
                        genotypes_list = variant_table.genotypes_of(s)

                        for pos in range(len(accessible_positions)):
                            likelihoods = forward_backward_table.get_genotype_likelihoods(
                                s, pos)

                            # compute genotypes from likelihoods and store information
                            geno = determine_genotype(likelihoods, gt_prob)
                            assert isinstance(geno, Genotype)
                            idx = var_to_pos[accessible_positions[pos]]
                            genotypes_list[idx] = geno
                            likelihood_list[idx] = likelihoods

                        variant_table.set_genotypes_of(s, genotypes_list)
                        variant_table.set_genotype_likelihoods_of(
                            s, likelihood_list)

            with timers("write_vcf"):
                logger.info("======== Writing VCF")
                vcf_writer.write_genotypes(chromosome, variant_table, indels)
                logger.info("Done writing VCF")

            logger.debug("Chromosome %r finished", chromosome)

    logger.info("\n== SUMMARY ==")
    total_time = timers.total()
    log_memory_usage()
    logger.info(
        "Time spent reading BAM:                      %6.1f s",
        timers.elapsed("read_bam"),
    )
    logger.info(
        "Time spent parsing VCF:                      %6.1f s",
        timers.elapsed("parse_vcf"),
    )
    if show_phase_vcfs:
        logger.info(
            "Time spent parsing input phasings from VCFs: %6.1f s",
            timers.elapsed("parse_phasing_vcfs"),
        )
    logger.info("Time spent selecting reads:                  %6.1f s",
                timers.elapsed("select"))
    logger.info(
        "Time spent genotyping:                       %6.1f s",
        timers.elapsed("genotyping"),
    )
    logger.info(
        "Time spent writing VCF:                      %6.1f s",
        timers.elapsed("write_vcf"),
    )
    logger.info(
        "Time spent on rest:                          %6.1f s",
        total_time - timers.sum(),
    )
    logger.info("Total elapsed time:                          %6.1f s",
                total_time)
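
The regularization of genotype likelihoods in the example above can be illustrated in isolation. Below is a minimal sketch in plain Python (no whatshap classes); regularize and call_genotype are hypothetical helper names, and the constant and gt_prob defaults are stand-ins rather than the values the command actually uses:

def regularize(likelihoods, constant=0.001):
    # Add a pseudo-count to each of the three genotype likelihoods
    # (0/0, 0/1, 1/1) and renormalize so they sum to one.
    norm_sum = sum(likelihoods) + 3 * constant
    return [(gl + constant) / norm_sum for gl in likelihoods]


def call_genotype(likelihoods, gt_prob=0.99):
    # Report the most likely genotype index only if its regularized
    # posterior clears the threshold; otherwise leave it undecided.
    best = max(range(3), key=lambda i: likelihoods[i])
    return best if likelihoods[best] >= gt_prob else None


print(call_genotype(regularize([0.002, 0.180, 0.810])))  # None (not confident enough)
print(call_genotype(regularize([0.001, 0.001, 0.990])))  # 2 (confident 1/1 call)
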
Example #13
0
def run_haplotag(
    variant_file,
    alignment_file,
    output=None,
    reference=None,
    regions=None,
    ignore_linked_read=False,
    given_samples=None,
    linked_read_distance_cutoff=50000,
    ignore_read_groups=False,
    haplotag_list=None,
    tag_supplementary=False,
):

    timers = StageTimer()
    timers.start("haplotag-run")

    with ExitStack() as stack:
        timers.start("haplotag-init")
        try:
            vcf_reader = stack.enter_context(
                VcfReader(variant_file, indels=True, phases=True))
        except OSError as err:
            raise CommandLineError(
                "Error while loading variant file {}: {}".format(
                    variant_file, err))

        use_vcf_samples = compute_variant_file_samples_to_use(
            vcf_reader.samples, given_samples, ignore_read_groups)

        try:
            bam_reader = stack.enter_context(
                pysam.AlignmentFile(alignment_file, "rb", require_index=True))
        except OSError as err:
            raise CommandLineError(
                "Error while loading alignment file {}: {}".format(
                    alignment_file, err))
        # This also checks sample compatibility with the VCF
        shared_samples = compute_shared_samples(bam_reader, ignore_read_groups,
                                                use_vcf_samples)

        # Check if user has specified a subset of regions per chromosome
        user_regions = normalize_user_regions(regions, bam_reader.references)

        phased_input_reader = stack.enter_context(
            PhasedInputReader([alignment_file],
                              reference,
                              NumericSampleIds(),
                              ignore_read_groups,
                              indels=False))

        bam_writer = stack.enter_context(
            open_output_alignment_file(output, reference, md5_of(variant_file),
                                       bam_reader.header.to_dict()))
        haplotag_writer = stack.enter_context(
            open_haplotag_writer(haplotag_list))

        timers.stop("haplotag-init")
        logger.debug("All input/output files initialized (time: {})".format(
            timers.elapsed("haplotag-init")))
        timers.start("haplotag-process")

        n_alignments = 0
        n_tagged = 0
        n_multiple_phase_sets = 0

        for chrom, regions in user_regions.items():
            logger.debug("Processing chromosome {}".format(chrom))

            # If there are no alignments for this chromosome, skip it. This allows
            # the BAM to contain extra chromosomes that are not in the VCF, as long
            # as they are not actually used.
            has_any_alignments = False
            for _ in bam_reader.fetch(contig=chrom):
                has_any_alignments = True
                break
            if not has_any_alignments:
                continue
            try:
                variant_table = load_chromosome_variants(
                    vcf_reader, chrom, regions)
            except VcfError as e:
                raise CommandLineError(str(e))
            if variant_table is not None:
                logger.debug("Preparing haplotype information")
                (BX_tag_to_haplotype, read_to_haplotype,
                 n_mult) = prepare_haplotag_information(
                     variant_table,
                     shared_samples,
                     phased_input_reader,
                     regions,
                     ignore_linked_read,
                     linked_read_distance_cutoff,
                 )
                n_multiple_phase_sets += n_mult
            else:
                # avoid uninitialized variables
                BX_tag_to_haplotype = None
                read_to_haplotype = None

            for start, end in regions:
                logger.debug("Iterating chromosome regions")
                for alignment in bam_reader.fetch(contig=chrom,
                                                  start=start,
                                                  stop=end):
                    n_alignments += 1
                    haplotype_name = "none"
                    phaseset = "none"
                    alignment.set_tag("HP", value=None)
                    alignment.set_tag("PC", value=None)
                    alignment.set_tag("PS", value=None)
                    if variant_table is None or ignore_read(
                            alignment, tag_supplementary):
                        # - If there are no variants in the VCF for this chromosome,
                        #   alignments are just written to the output unchanged
                        # - Ignored reads are likewise simply written to the output BAM
                        pass
                    else:
                        (is_tagged, haplotype_name,
                         phaseset) = attempt_add_phase_information(
                             alignment,
                             read_to_haplotype,
                             BX_tag_to_haplotype,
                             linked_read_distance_cutoff,
                             ignore_linked_read,
                         )
                        n_tagged += is_tagged

                    bam_writer.write(alignment)
                    if not (alignment.is_secondary
                            or alignment.is_supplementary):
                        print(
                            alignment.query_name,
                            haplotype_name,
                            phaseset,
                            chrom,
                            sep="\t",
                            file=haplotag_writer,
                        )

                    if n_alignments % 100000 == 0:
                        logger.debug("Processed {} alignment records.".format(
                            n_alignments))
        timers.stop("haplotag-process")
        logger.debug("Processing complete (time: {})".format(
            timers.elapsed("haplotag-process")))

    timers.stop("haplotag-run")

    logger.info("\n== SUMMARY ==")
    logger.info("Total alignments processed:              %12d", n_alignments)
    logger.info("Alignments that could be tagged:         %12d", n_tagged)
    logger.info("Alignments spanning multiple phase sets: %12d",
                n_multiple_phase_sets)
    logger.info("haplotag - total processing time: {}".format(
        timers.elapsed("haplotag-run")))
Example #14
0
def run_whatshap(
    phase_input_files,
    variant_file,
    reference=None,
    output=sys.stdout,
    samples=None,
    chromosomes=None,
    ignore_read_groups=False,
    indels=True,
    mapping_quality=20,
    read_merging=False,
    read_merging_error_rate=0.15,
    read_merging_max_error_rate=0.25,
    read_merging_positive_threshold=1000000,
    read_merging_negative_threshold=1000,
    max_coverage=15,
    full_genotyping=False,
    distrust_genotypes=False,
    include_homozygous=False,
    ped=None,
    recombrate=1.26,
    genmap=None,
    genetic_haplotyping=True,
    recombination_list_filename=None,
    tag="PS",
    read_list_filename=None,
    gl_regularizer=None,
    gtchange_list_filename=None,
    default_gq=30,
    write_command_line_header=True,
    use_ped_samples=False,
    algorithm="whatshap",
):
    """
    Run WhatsHap.

    phase_input_files -- list of paths to BAM/CRAM/VCF files
    variant_file -- path to input VCF
    reference -- path to reference FASTA
    output -- path to output VCF or a file-like object
    samples -- names of samples to phase. An empty list means: phase all samples
    chromosomes -- names of chromosomes to phase. An empty list means: phase all chromosomes
    ignore_read_groups
    mapping_quality -- discard reads below this mapping quality
    read_merging -- whether or not to merge reads
    read_merging_error_rate -- probability that a nucleotide is wrong
    read_merging_max_error_rate -- maximum error rate an edge of the merge graph may have to be considered
    read_merging_positive_threshold -- threshold on the ratio of the two probabilities
    read_merging_negative_threshold -- threshold on the inverse of the ratio used for the positive threshold
    max_coverage
    full_genotyping
    distrust_genotypes
    include_homozygous
    genetic_haplotyping -- in ped mode, merge disconnected blocks based on genotype status
    recombination_list_filename -- filename to write putative recombination events to
    tag -- How to store phasing info in the VCF, can be 'PS' or 'HP'
    read_list_filename -- name of file to write list of used reads to
    algorithm -- algorithm to use, can be 'whatshap' or 'hapchat'
    gl_regularizer -- float to be passed as regularization constant to GenotypeLikelihoods.as_phred
    gtchange_list_filename -- filename to write list of changed genotypes to
    default_gq -- genotype likelihood to be used when GL or PL not available
    write_command_line_header -- whether to add a ##commandline header to the output VCF
    """

    if algorithm == "hapchat" and ped is not None:
        raise CommandLineError(
            "The hapchat algorithm cannot do pedigree phasing")

    timers = StageTimer()
    logger.info(
        "This is WhatsHap %s running under Python %s",
        __version__,
        platform.python_version(),
    )
    if full_genotyping:
        distrust_genotypes = True
        include_homozygous = True
    numeric_sample_ids = NumericSampleIds()
    if write_command_line_header:
        command_line = "(whatshap {}) {}".format(__version__,
                                                 " ".join(sys.argv[1:]))
    else:
        command_line = None

    if read_merging:
        read_merger = ReadMerger(
            read_merging_error_rate,
            read_merging_max_error_rate,
            read_merging_positive_threshold,
            read_merging_negative_threshold,
        )
    else:
        read_merger = DoNothingReadMerger()

    with ExitStack() as stack:
        try:
            vcf_writer = stack.enter_context(
                PhasedVcfWriter(
                    command_line=command_line,
                    in_path=variant_file,
                    out_file=output,
                    tag=tag,
                ))
        except (OSError, VcfError) as e:
            raise CommandLineError(e)

        phased_input_reader = stack.enter_context(
            PhasedInputReader(
                phase_input_files,
                reference,
                numeric_sample_ids,
                ignore_read_groups,
                mapq_threshold=mapping_quality,
                indels=indels,
            ))
        show_phase_vcfs = phased_input_reader.has_vcfs

        # Only read genotype likelihoods from VCFs when distrusting genotypes
        vcf_reader = stack.enter_context(
            VcfReader(variant_file,
                      indels=indels,
                      genotype_likelihoods=distrust_genotypes))

        if ignore_read_groups and not samples and len(vcf_reader.samples) > 1:
            raise CommandLineError(
                "When using --ignore-read-groups on a VCF with "
                "multiple samples, --sample must also be used.")
        if not samples:
            samples = vcf_reader.samples

        # if --use-ped-samples is set, use only samples from PED file
        if ped and use_ped_samples:
            samples = PedReader(ped).samples()

        raise_if_any_sample_not_in_vcf(vcf_reader, samples)

        if ped and genmap:
            logger.info(
                "Using region-specific recombination rates from genetic map %s.",
                genmap,
            )
            try:
                recombination_cost_computer = GeneticMapRecombinationCostComputer(
                    genmap)
            except ParseError as e:
                raise CommandLineError(e)
        else:
            if ped:
                logger.info("Using uniform recombination rate of %g cM/Mb.",
                            recombrate)
            recombination_cost_computer = UniformRecombinationCostComputer(
                recombrate)

        samples = frozenset(samples)
        families, family_trios = setup_families(samples, ped,
                                                numeric_sample_ids,
                                                max_coverage)

        read_list = None
        if read_list_filename:
            read_list = stack.enter_context(ReadList(read_list_filename))
            if algorithm == "hapchat":
                logger.warning(
                    "On which haplotype a read occurs in the inferred solution is not yet "
                    "implemented in hapchat, and so the corresponding column in the "
                    "read list file contains no information about this")

        with timers("parse_phasing_vcfs"):
            # TODO should this be done in PhasedInputReader.__init__?
            phased_input_reader.read_vcfs()

        for variant_table in timers.iterate("parse_vcf", vcf_reader):
            chromosome = variant_table.chromosome
            if (not chromosomes) or (chromosome in chromosomes):
                logger.info("======== Working on chromosome %r", chromosome)
            else:
                logger.info(
                    "Leaving chromosome %r unchanged (present in VCF but not requested by option --chromosome)",
                    chromosome,
                )
                with timers("write_vcf"):
                    superreads, components = dict(), dict()
                    vcf_writer.write(chromosome, superreads, components)
                continue

            if full_genotyping:
                positions = [v.position for v in variant_table.variants]
                for sample in samples:
                    logger.info("---- Initial genotyping of %s", sample)
                    with timers("read_bam"):
                        bam_sample = None if ignore_read_groups else sample
                        readset, vcf_source_ids = phased_input_reader.read(
                            chromosome,
                            variant_table.variants,
                            bam_sample,
                            read_vcf=False,
                        )
                        readset.sort()  # TODO can be removed
                        genotypes, genotype_likelihoods = compute_genotypes(
                            readset, positions)
                        variant_table.set_genotypes_of(sample, genotypes)
                        variant_table.set_genotype_likelihoods_of(
                            sample,
                            [
                                GenotypeLikelihoods(gl)
                                for gl in genotype_likelihoods
                            ],
                        )

            # These two variables hold the phasing results for all samples
            superreads, components = dict(), dict()

            # Iterate over all families to process, i.e. a separate DP table is created
            # for each family.
            # TODO: Can the body of this loop be factored out into a phase_family function?
            for representative_sample, family in sorted(families.items()):
                if len(family) == 1:
                    logger.info("---- Processing individual %s",
                                representative_sample)
                else:
                    logger.info("---- Processing family with individuals: %s",
                                ",".join(family))
                max_coverage_per_sample = max(1, max_coverage // len(family))
                logger.info("Using maximum coverage per sample of %dX",
                            max_coverage_per_sample)
                trios = family_trios[representative_sample]
                assert len(family) == 1 or len(trios) > 0

                homozygous_positions, phasable_variant_table = find_phaseable_variants(
                    family, include_homozygous, trios, variant_table)

                # Get the reads belonging to each sample
                readsets = dict()  # TODO this could become a list
                for sample in family:
                    with timers("read_bam"):
                        readset, vcf_source_ids = phased_input_reader.read(
                            chromosome,
                            phasable_variant_table.variants,
                            sample,
                        )

                    # TODO: Read selection is done w.r.t. all variants; using
                    #  heterozygous variants only would probably give better results.
                    with timers("select"):
                        readset = readset.subset([
                            i for i, read in enumerate(readset)
                            if len(read) >= 2
                        ])
                        logger.info(
                            "Kept %d reads that cover at least two variants each",
                            len(readset),
                        )
                        merged_reads = read_merger.merge(readset)
                        selected_reads = select_reads(
                            merged_reads,
                            max_coverage_per_sample,
                            preferred_source_ids=vcf_source_ids,
                        )

                    readsets[sample] = selected_reads
                    if len(family) == 1 and not distrust_genotypes:
                        # With a pedigree (len(family) > 1), blocks are additionally
                        # merged after phasing based on the pedigree information, so
                        # these statistics are not very useful. With distrust_genotypes,
                        # genotypes can change during phasing and so can the block
                        # structure. Therefore, skip these statistics in both cases.
                        log_best_case_phasing_info(readset, selected_reads)

                all_reads = merge_readsets(readsets)

                # Determine which variants can (in principle) be phased
                accessible_positions = sorted(all_reads.get_positions())
                logger.info(
                    "Variants covered by at least one phase-informative "
                    "read in at least one individual after read selection: %d",
                    len(accessible_positions),
                )
                if len(family) > 1 and genetic_haplotyping:
                    # In case of genetic haplotyping, also retain all positions homozygous
                    # in at least one individual (because they might be phased based on genotypes)
                    accessible_positions = sorted(
                        set(accessible_positions).union(homozygous_positions))
                    logger.info(
                        "Variants either covered by phase-informative read or homozygous "
                        "in at least one individual: %d",
                        len(accessible_positions),
                    )

                # Keep only accessible positions
                phasable_variant_table.subset_rows_by_position(
                    accessible_positions)
                assert len(phasable_variant_table.variants) == len(
                    accessible_positions)

                pedigree = create_pedigree(
                    default_gq,
                    distrust_genotypes,
                    family,
                    gl_regularizer,
                    numeric_sample_ids,
                    phasable_variant_table,
                    trios,
                )

                recombination_costs = recombination_cost_computer.compute(
                    accessible_positions)

                # Finally, run phasing algorithm
                with timers("phase"):
                    problem_name = "MEC" if len(family) == 1 else "PedMEC"
                    logger.info(
                        "Phasing %d sample%s by solving the %s problem ...",
                        len(family),
                        plural_s(len(family)),
                        problem_name,
                    )

                    if algorithm == "hapchat":
                        dp_table = HapChatCore(all_reads)
                    else:
                        dp_table = PedigreeDPTable(
                            all_reads,
                            recombination_costs,
                            pedigree,
                            distrust_genotypes,
                            accessible_positions,
                        )

                    superreads_list, transmission_vector = dp_table.get_super_reads()
                    optimal_cost = dp_table.get_optimal_cost()
                    logger.info("%s cost: %d", problem_name, optimal_cost)

                with timers("components"):
                    master_block = None
                    heterozygous_positions_by_sample = None
                    # If we distrusted genotypes, we need to re-determine which
                    # sites are homo-/heterozygous after phasing
                    if distrust_genotypes:
                        hom_in_any_sample = set()
                        heterozygous_positions_by_sample = {}
                        heterozygous_gts = frozenset({(0, 1), (1, 0)})
                        homozygous_gts = frozenset({(0, 0), (1, 1)})
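                        # Walk over each sample's two superreads (haplotypes) in
                        # parallel; the allele pair at each position is that
                        # sample's genotype after phasing.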
                        for sample, sample_superreads in zip(
                                family, superreads_list):
                            hets = set()
                            for v1, v2 in zip(*sample_superreads):
                                assert v1.position == v2.position
                                if v1.position not in accessible_positions:
                                    continue
                                gt = (v1.allele, v2.allele)
                                if gt in heterozygous_gts:
                                    hets.add(v1.position)
                                elif gt in homozygous_gts:
                                    hom_in_any_sample.add(v1.position)
                            heterozygous_positions_by_sample[
                                numeric_sample_ids[sample]] = hets
                        if len(family) > 1 and genetic_haplotyping:
                            master_block = sorted(hom_in_any_sample)
                    else:
                        if len(family) > 1 and genetic_haplotyping:
                            master_block = sorted(
                                set(homozygous_positions).intersection(
                                    set(accessible_positions)))
                    overall_components = find_components(
                        accessible_positions,
                        all_reads,
                        master_block,
                        heterozygous_positions_by_sample,
                    )
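                    # overall_components maps each phased position to a
                    # representative position of its connected component; this
                    # representative later serves as the phase-set (block) identifier.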
                    n_phased_blocks = len(set(overall_components.values()))
                    logger.info("No. of phased blocks: %d", n_phased_blocks)
                    largest_component = find_largest_component(
                        overall_components)
                    if len(largest_component) > 0:
                        logger.info(
                            "Largest component contains %d variants (%.1f%% of accessible variants) between position %d and %d",
                            len(largest_component),
                            len(largest_component) * 100.0 /
                            len(accessible_positions),
                            largest_component[0] + 1,
                            largest_component[-1] + 1,
                        )

                if recombination_list_filename:
                    n_recombinations = write_recombination_list(
                        recombination_list_filename,
                        chromosome,
                        accessible_positions,
                        overall_components,
                        recombination_costs,
                        transmission_vector,
                        trios,
                    )
                    logger.info(
                        "Total no. of detected recombination events: %d",
                        n_recombinations,
                    )

                # Superreads in superreads_list are in the same order as individuals were added to the pedigree
                for sample, sample_superreads in zip(family, superreads_list):
                    superreads[sample] = sample_superreads
                    assert len(sample_superreads) == 2
                    assert (sample_superreads[0].sample_id ==
                            sample_superreads[1].sample_id ==
                            numeric_sample_ids[sample])
                    # identical for all samples
                    components[sample] = overall_components

                if read_list:
                    read_list.write(
                        all_reads,
                        dp_table.get_optimal_partitioning(),
                        components,
                        numeric_sample_ids,
                    )

            with timers("write_vcf"):
                logger.info("======== Writing VCF")
                changed_genotypes = vcf_writer.write(chromosome, superreads,
                                                     components)
                logger.info("Done writing VCF")
                if changed_genotypes:
                    assert distrust_genotypes
                    logger.info("Changed %d genotypes while writing VCF",
                                len(changed_genotypes))

            if gtchange_list_filename:
                logger.info("Writing list of changed genotypes to %r",
                            gtchange_list_filename)
                write_changed_genotypes(gtchange_list_filename,
                                        changed_genotypes)

            logger.debug("Chromosome %r finished", chromosome)

    log_time_and_memory_usage(timers, show_phase_vcfs=show_phase_vcfs)
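
As a usage illustration, a call to run_whatshap for a single sample might look like the sketch below; the file paths and sample name are hypothetical, and phase_input_files may mix BAM/CRAM alignments with phased VCFs:

# Hypothetical paths and sample name, shown only to illustrate the signature above.
run_whatshap(
    phase_input_files=["reads.bam"],
    variant_file="variants.vcf.gz",
    reference="genome.fasta",
    output="phased.vcf",
    samples=["SAMPLE1"],
    chromosomes=["chr1"],
)
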