Code example #1
File: haplotag.py  Project: ekg/graphappy
def run_haplotag(
    variant_file,
    alignment_file,
    output=None,
    reference=None,
    regions=None,
    ignore_linked_read=False,
    given_samples=None,
    linked_read_distance_cutoff=50000,
    ignore_read_groups=False,
    haplotag_list=None,
    tag_supplementary=False,
):

    timers = StageTimer()
    timers.start("haplotag-run")

    with ExitStack() as stack:
        timers.start("haplotag-init")
        try:
            vcf_reader = stack.enter_context(
                VcfReader(variant_file, indels=True, phases=True))
        except OSError as err:
            logger.error("Error while loading variant file {}: {}".format(
                variant_file, err))
            raise err

        use_vcf_samples = compute_variant_file_samples_to_use(
            vcf_reader, given_samples, ignore_read_groups)

        try:
            bam_reader = stack.enter_context(
                pysam.AlignmentFile(alignment_file, "rb", require_index=True))
        except OSError as err:
            logger.error("Error while loading alignment file {}: {}".format(
                alignment_file, err))
            raise err
        # This also checks sample compatibility with the VCF
        shared_samples = compute_shared_samples(bam_reader, ignore_read_groups,
                                                use_vcf_samples)

        # Check if user has specified a subset of regions per chromosome
        user_regions = normalize_user_regions(regions, bam_reader.references)

        phased_input_reader = stack.enter_context(
            PhasedInputReader([alignment_file],
                              reference,
                              NumericSampleIds(),
                              ignore_read_groups,
                              indels=False))

        bam_writer = stack.enter_context(
            open_output_alignment_file(
                output,
                reference,
                md5_of(variant_file),
                bam_reader.header.to_dict(),
            ))
        haplotag_writer = stack.enter_context(
            open_haplotag_writer(haplotag_list))

        timers.stop("haplotag-init")
        logger.debug("All input/output files initialized (time: {})".format(
            timers.elapsed("haplotag-init")))
        timers.start("haplotag-process")

        n_alignments = 0
        n_tagged = 0
        n_multiple_phase_sets = 0

        for chrom, regions in user_regions.items():
            logger.debug("Processing chromosome {}".format(chrom))
            variant_table = load_chromosome_variants(vcf_reader, chrom,
                                                     regions)
            if variant_table is not None:
                logger.debug("Preparing haplotype information")

                (BX_tag_to_haplotype, read_to_haplotype,
                 n_mult) = prepare_haplotag_information(
                     variant_table,
                     shared_samples,
                     phased_input_reader,
                     regions,
                     ignore_linked_read,
                     linked_read_distance_cutoff,
                 )
                n_multiple_phase_sets += n_mult
            else:
                # avoid uninitialized variables
                BX_tag_to_haplotype = None
                read_to_haplotype = None

            for start, end in regions:
                logger.debug("Iterating chromosome regions")
                for alignment in bam_reader.fetch(contig=chrom,
                                                  start=start,
                                                  stop=end):
                    n_alignments += 1
                    haplotype_name = "none"
                    phaseset = "none"
                    alignment.set_tag("HP", value=None)
                    alignment.set_tag("PC", value=None)
                    alignment.set_tag("PS", value=None)
                    if variant_table is None or ignore_read(
                            alignment, tag_supplementary):
                        # - If there are no variants in the VCF for this
                        #   chromosome, alignments are just written to output
                        # - Ignored reads are simply written to the output BAM
                        pass
                    else:
                        (is_tagged, haplotype_name,
                         phaseset) = attempt_add_phase_information(
                             alignment,
                             read_to_haplotype,
                             BX_tag_to_haplotype,
                             linked_read_distance_cutoff,
                         )
                        n_tagged += is_tagged

                    bam_writer.write(alignment)
                    if not (alignment.is_secondary
                            or alignment.is_supplementary):
                        print(
                            alignment.query_name,
                            haplotype_name,
                            phaseset,
                            chrom,
                            sep="\t",
                            file=haplotag_writer,
                        )

                    if n_alignments % 100000 == 0:
                        logger.debug("Processed {} alignment records.".format(
                            n_alignments))
        timers.stop("haplotag-process")
        logger.debug("Processing complete (time: {})".format(
            timers.elapsed("haplotag-process")))

    timers.stop("haplotag-run")

    logger.info("\n== SUMMARY ==")
    logger.info("Total alignments processed:              %12d", n_alignments)
    logger.info("Alignments that could be tagged:         %12d", n_tagged)
    logger.info("Alignments spanning multiple phase sets: %12d",
                n_multiple_phase_sets)
    logger.info("haplotag - total processing time: {}".format(
        timers.elapsed("haplotag-run")))
Code example #2
def run_polyphase(
    phase_input_files,
    variant_file,
    ploidy,
    reference=None,
    output=sys.stdout,
    samples=None,
    chromosomes=None,
    verify_genotypes=False,
    ignore_read_groups=False,
    indels=True,
    mapping_quality=20,
    tag="PS",
    include_haploid_sets=False,
    write_command_line_header=True,
    read_list_filename=None,
    ce_bundle_edges=False,
    min_overlap=2,
    plot_clusters=False,
    plot_threading=False,
    ce_refinements=5,
    block_cut_sensitivity=4,
):
    """
    Run Polyploid Phasing.

    phase_input_files -- list of paths to BAM/CRAM/VCF files
    variant_file -- path to input VCF
    reference -- path to reference FASTA
    output -- path to output VCF or a file like object
    samples -- names of samples to phase. An empty list means: phase all samples
    chromosomes -- names of chromosomes to phase. An empty list means: phase all chromosomes
    ignore_read_groups
    mapping_quality -- discard reads below this mapping quality
    tag -- How to store phasing info in the VCF, can be 'PS' or 'HP'
    write_command_line_header -- whether to add a ##commandline header to the output VCF
    """
    timers = StageTimer()
    logger.info(
        "This is WhatsHap (polyploid) %s running under Python %s",
        __version__,
        platform.python_version(),
    )
    numeric_sample_ids = NumericSampleIds()
    with ExitStack() as stack:
        assert phase_input_files
        phased_input_reader = stack.enter_context(
            PhasedInputReader(
                phase_input_files,
                reference,
                numeric_sample_ids,
                ignore_read_groups,
                indels=indels,
                mapq_threshold=mapping_quality,
            ))
        assert not phased_input_reader.has_vcfs

        if write_command_line_header:
            command_line = "(whatshap {}) {}".format(__version__,
                                                     " ".join(sys.argv[1:]))
        else:
            command_line = None
        try:
            vcf_writer = stack.enter_context(
                PhasedVcfWriter(
                    command_line=command_line,
                    in_path=variant_file,
                    out_file=output,
                    tag=tag,
                    ploidy=ploidy,
                    include_haploid_sets=include_haploid_sets,
                ))
        except OSError as e:
            raise CommandLineError(e)

        vcf_reader = stack.enter_context(
            VcfReader(
                variant_file,
                indels=indels,
                phases=True,
                genotype_likelihoods=False,
                ploidy=ploidy,
            ))

        if ignore_read_groups and not samples and len(vcf_reader.samples) > 1:
            raise CommandLineError(
                "When using --ignore-read-groups on a VCF with "
                "multiple samples, --sample must also be used.")
        if not samples:
            samples = vcf_reader.samples

        vcf_sample_set = set(vcf_reader.samples)
        for sample in samples:
            if sample not in vcf_sample_set:
                raise CommandLineError(
                    "Sample {!r} requested on command-line not found in VCF".
                    format(sample))

        if block_cut_sensitivity < 0:
            logger.warning(
                "Block cut sensitivity was set to negative value. Lowest value (0) is assumed instead."
            )
            block_cut_sensitivity = 0
        elif block_cut_sensitivity > 5:
            logger.warning(
                "Block cut sensitivity level too large. Assuming highest valid value (5) instead."
            )
            block_cut_sensitivity = 5

        samples = frozenset(samples)

        read_list_file = None
        if read_list_filename:
            raise NotImplementedError("create_read_list_file not implemented")
            # read_list_file = create_read_list_file(read_list_filename)

        # Store phasing parameters in tuple to keep function signatures cleaner
        phasing_param = PhasingParameter(
            ploidy=ploidy,
            verify_genotypes=verify_genotypes,
            ce_bundle_edges=ce_bundle_edges,
            min_overlap=min_overlap,
            ce_refinements=ce_refinements,
            block_cut_sensitivity=block_cut_sensitivity,
            plot_clusters=plot_clusters,
            plot_threading=plot_threading,
        )

        timers.start("parse_vcf")
        try:
            for variant_table in vcf_reader:
                chromosome = variant_table.chromosome
                timers.stop("parse_vcf")
                if (not chromosomes) or (chromosome in chromosomes):
                    logger.info("======== Working on chromosome %r",
                                chromosome)
                else:
                    logger.info(
                        "Leaving chromosome %r unchanged (present in VCF but not requested by option --chromosome)",
                        chromosome,
                    )
                    with timers("write_vcf"):
                        superreads, components = dict(), dict()
                        vcf_writer.write(chromosome, superreads, components)
                    continue

                # These variables hold the phasing results for all samples
                superreads, components, haploid_components = dict(), dict(), dict()

                # Iterate over all samples to process
                for sample in samples:
                    logger.info("---- Processing individual %s", sample)

                    # Process inputs for this sample
                    missing_genotypes = set()
                    heterozygous = set()

                    genotypes = variant_table.genotypes_of(sample)
                    for index, gt in enumerate(genotypes):
                        if gt.is_none():
                            missing_genotypes.add(index)
                        elif not gt.is_homozygous():
                            heterozygous.add(index)
                        else:
                            assert gt.is_homozygous()
                    to_discard = set(range(
                        len(variant_table))).difference(heterozygous)
                    phasable_variant_table = deepcopy(variant_table)
                    # Remove calls to be discarded from variant table
                    phasable_variant_table.remove_rows_by_index(to_discard)

                    logger.info(
                        "Number of variants skipped due to missing genotypes: %d",
                        len(missing_genotypes),
                    )
                    logger.info(
                        "Number of remaining heterozygous variants: %d",
                        len(phasable_variant_table))

                    # Get the reads belonging to this sample
                    timers.start("read_bam")
                    readset, vcf_source_ids = phased_input_reader.read(
                        chromosome, phasable_variant_table.variants, sample)
                    readset.sort()
                    timers.stop("read_bam")

                    # Verify genotypes
                    if verify_genotypes:
                        timers.start("verify_genotypes")
                        logger.info("Verify genotyping of %s", sample)
                        positions = [
                            v.position for v in phasable_variant_table.variants
                        ]
                        computed_genotypes = [
                            Genotype(gt) for gt in compute_polyploid_genotypes(
                                readset, ploidy, positions)
                        ]
                        # skip all positions at which genotypes do not match
                        given_genotypes = phasable_variant_table.genotypes_of(
                            sample)
                        matching_genotypes = []
                        missing_genotypes = set()
                        logger.debug("Computed genotypes: %s (%d)",
                                     computed_genotypes, len(computed_genotypes))
                        logger.debug("Given genotypes: %s (%d)",
                                     given_genotypes, len(given_genotypes))
                        logger.debug("Number of positions: %d", len(positions))
                        for i, g in enumerate(given_genotypes):
                            c_g = computed_genotypes[i]
                            if (g == c_g) or (c_g is None):
                                matching_genotypes.append(g)
                            else:
                                matching_genotypes.append(Genotype([]))
                                missing_genotypes.add(i)
                        phasable_variant_table.set_genotypes_of(
                            sample, matching_genotypes)

                        # Remove variants with deleted genotype
                        phasable_variant_table.remove_rows_by_index(
                            missing_genotypes)
                        logger.info(
                            "Number of variants removed due to inconsistent genotypes: %d",
                            len(missing_genotypes),
                        )
                        logger.info(
                            "Number of remaining heterozygous variants: %d",
                            len(phasable_variant_table),
                        )

                        # Re-read the readset to remove discarded variants
                        readset, vcf_source_ids = phased_input_reader.read(
                            chromosome, phasable_variant_table.variants,
                            sample)
                        readset.sort()
                        timers.stop("verify_genotypes")

                    # Remove reads with insufficient variants
                    readset = readset.subset([
                        i for i, read in enumerate(readset)
                        if len(read) >= max(2, min_overlap)
                    ])
                    logger.info(
                        "Kept %d reads that cover at least two variants each",
                        len(readset))

                    # Adapt the variant table to the subset of reads
                    phasable_variant_table.subset_rows_by_position(
                        readset.get_positions())

                    # Run the actual phasing
                    (
                        sample_components,
                        sample_haploid_components,
                        sample_superreads,
                    ) = phase_single_individual(readset,
                                                phasable_variant_table, sample,
                                                phasing_param, output, timers)

                    # Collect results
                    components[sample] = sample_components
                    haploid_components[sample] = sample_haploid_components
                    superreads[sample] = sample_superreads

                with timers("write_vcf"):
                    logger.info("======== Writing VCF")
                    vcf_writer.write(
                        chromosome,
                        superreads,
                        components,
                        haploid_components if include_haploid_sets else None,
                    )
                    # TODO: Use genotype information to polish results
                    # assert len(changed_genotypes) == 0
                    logger.info("Done writing VCF")
                logger.debug("Chromosome %r finished", chromosome)
                timers.start("parse_vcf")
            timers.stop("parse_vcf")
        except PloidyError as e:
            raise CommandLineError(e)

    if read_list_file:
        read_list_file.close()

    logger.info("\n== SUMMARY ==")

    log_memory_usage()
    logger.info("Time spent reading BAM/CRAM:                 %6.1f s",
                timers.elapsed("read_bam"))
    logger.info("Time spent parsing VCF:                      %6.1f s",
                timers.elapsed("parse_vcf"))
    if verify_genotypes:
        logger.info(
            "Time spent verifying genotypes:              %6.1f s",
            timers.elapsed("verify_genotypes"),
        )
    logger.info("Time spent detecting blocks:                 %6.1f s",
                timers.elapsed("detecting_blocks"))
    logger.info("Time spent scoring reads:                    %6.1f s",
                timers.elapsed("read_scoring"))
    logger.info(
        "Time spent solving cluster editing:          %6.1f s",
        timers.elapsed("solve_clusterediting"),
    )
    logger.info("Time spent threading haplotypes:             %6.1f s",
                timers.elapsed("threading"))
    if plot_clusters or plot_threading:
        logger.info("Time spent creating plots:                   %6.1f s",
                    timers.elapsed("create_plots"))
    logger.info("Time spent writing VCF:                      %6.1f s",
                timers.elapsed("write_vcf"))
    logger.info("Time spent on rest:                          %6.1f s",
                timers.total() - timers.sum())
    logger.info("Total elapsed time:                          %6.1f s",
                timers.total())
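
A minimal invocation sketch for run_polyphase follows; the file names, sample name and ploidy are hypothetical placeholders chosen for illustration, not values taken from the original example.

# Hypothetical call; paths, sample name and ploidy are placeholders.
run_polyphase(
    phase_input_files=["reads.bam"],     # BAM/CRAM providing phase information
    variant_file="variants.vcf.gz",      # input VCF to be phased
    ploidy=4,                            # e.g. a tetraploid sample
    reference="reference.fasta",
    output="phased.polyploid.vcf",
    samples=["SAMPLE1"],                 # None/empty means: phase all samples in the VCF
    block_cut_sensitivity=4,             # clamped to the valid range 0..5 above
)
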
Code example #3
def run_whatshap(
    phase_input_files: List[str],
    variant_file: str,
    reference: Union[None, bool, str] = False,
    output: TextIO = sys.stdout,
    samples: Optional[List[str]] = None,
    chromosomes: Optional[List[str]] = None,
    ignore_read_groups: bool = False,
    indels: bool = True,
    mapping_quality: int = 20,
    read_merging: bool = False,
    read_merging_error_rate: float = 0.15,
    read_merging_max_error_rate: float = 0.25,
    read_merging_positive_threshold: int = 1000000,
    read_merging_negative_threshold: int = 1000,
    max_coverage: int = 15,
    distrust_genotypes: bool = False,
    include_homozygous: bool = False,
    ped: Optional[str] = None,
    recombrate: float = 1.26,
    genmap: Optional[str] = None,
    genetic_haplotyping: bool = True,
    recombination_list_filename: Optional[str] = None,
    tag: str = "PS",
    read_list_filename: Optional[str] = None,
    gl_regularizer: Optional[float] = None,
    gtchange_list_filename: Optional[str] = None,
    default_gq: int = 30,
    write_command_line_header: bool = True,
    use_ped_samples: bool = False,
    algorithm: str = "whatshap",
):
    """
    Run WhatsHap.

    phase_input_files -- list of paths to BAM/CRAM/VCF files
    variant_file -- path to input VCF
    reference -- path to reference FASTA. If False: skip realignment. If None: complain if reference needed.
    output -- path to output VCF or a file-like object
    samples -- names of samples to phase. an empty list means: phase all samples
    chromosomes -- names of chromosomes to phase. an empty list means: phase all chromosomes
    ignore_read_groups
    mapping_quality -- discard reads below this mapping quality
    read_merging -- whether or not to merge reads
    read_merging_error_rate -- probability that a nucleotide is wrong
    read_merging_max_error_rate -- max error rate on edge of merge graph considered
    read_merging_positive_threshold -- threshold on the ratio of the two probabilities
    read_merging_negative_threshold -- threshold on the opposite ratio of positive threshold
    max_coverage
    distrust_genotypes
    include_homozygous
    genetic_haplotyping -- in ped mode, merge disconnected blocks based on genotype status
    recombination_list_filename -- filename to write putative recombination events to
    tag -- How to store phasing info in the VCF, can be 'PS' or 'HP'
    read_list_filename -- name of file to write list of used reads to
    algorithm -- algorithm to use, can be 'whatshap' or 'hapchat'
    gl_regularizer -- float to be passed as regularization constant to GenotypeLikelihoods.as_phred
    gtchange_list_filename -- filename to write list of changed genotypes to
    default_gq -- genotype likelihood to be used when GL or PL not available
    write_command_line_header -- whether to add a ##commandline header to the output VCF
    """

    if algorithm == "hapchat" and ped is not None:
        raise CommandLineError(
            "The hapchat algorithm cannot do pedigree phasing")

    timers = StageTimer()
    logger.info(
        f"This is WhatsHap {__version__} running under Python {platform.python_version()}"
    )
    numeric_sample_ids = NumericSampleIds()
    command_line: Optional[str]
    if write_command_line_header:
        command_line = "(whatshap {}) {}".format(__version__,
                                                 " ".join(sys.argv[1:]))
    else:
        command_line = None

    read_merger: ReadMergerBase
    if read_merging:
        read_merger = ReadMerger(
            read_merging_error_rate,
            read_merging_max_error_rate,
            read_merging_positive_threshold,
            read_merging_negative_threshold,
        )
    else:
        read_merger = DoNothingReadMerger()

    with ExitStack() as stack:
        try:
            vcf_writer = stack.enter_context(
                PhasedVcfWriter(
                    command_line=command_line,
                    in_path=variant_file,
                    out_file=output,
                    tag=tag,
                    indels=indels,
                ))
        except (OSError, VcfError) as e:
            raise CommandLineError(e)

        phased_input_reader = stack.enter_context(
            PhasedInputReader(
                phase_input_files,
                None if reference is False else reference,
                numeric_sample_ids,
                ignore_read_groups,
                mapq_threshold=mapping_quality,
                indels=indels,
            ))
        show_phase_vcfs = phased_input_reader.has_vcfs

        if phased_input_reader.has_alignments and reference is None:
            raise CommandLineError(
                "A reference FASTA needs to be provided with -r/--reference; "
                "or use --no-reference at the expense of phasing quality.")

        # Only read genotype likelihoods from VCFs when distrusting genotypes
        vcf_reader = stack.enter_context(
            VcfReader(variant_file,
                      indels=indels,
                      genotype_likelihoods=distrust_genotypes))

        if ignore_read_groups and not samples and len(vcf_reader.samples) > 1:
            raise CommandLineError(
                "When using --ignore-read-groups on a VCF with "
                "multiple samples, --sample must also be used.")
        if not samples:
            samples = vcf_reader.samples

        # if --use-ped-samples is set, use only samples from PED file
        if ped and use_ped_samples:
            samples = PedReader(ped).samples()

        raise_if_any_sample_not_in_vcf(vcf_reader, samples)

        recombination_cost_computer = make_recombination_cost_computer(
            ped, genmap, recombrate)

        families, family_trios = setup_families(samples, ped, max_coverage)
        del samples
        for trios in family_trios.values():
            for trio in trios:
                # Ensure that all mentioned individuals have a numeric id
                _ = numeric_sample_ids[trio.child]

        read_list = None
        if read_list_filename:
            read_list = stack.enter_context(ReadList(read_list_filename))
            if algorithm == "hapchat":
                logger.warning(
                    "On which haplotype a read occurs in the inferred solution is not yet "
                    "implemented in hapchat, and so the corresponding column in the "
                    "read list file contains no information about this")

        with timers("parse_phasing_vcfs"):
            # TODO should this be done in PhasedInputReader.__init__?
            phased_input_reader.read_vcfs()

        superreads: Dict[str, ReadSet]
        components: Dict
        for variant_table in timers.iterate("parse_vcf", vcf_reader):
            chromosome = variant_table.chromosome
            if (not chromosomes) or (chromosome in chromosomes):
                logger.info("======== Working on chromosome %r", chromosome)
            else:
                logger.info(
                    "Leaving chromosome %r unchanged (present in VCF but not requested by option --chromosome)",
                    chromosome,
                )
                with timers("write_vcf"):
                    superreads, components = dict(), dict()
                    vcf_writer.write(chromosome, superreads, components)
                continue

            # These two variables hold the phasing results for all samples
            superreads, components = dict(), dict()

            # Iterate over all families to process, i.e. a separate DP table is created
            # for each family.
            # TODO: Can the body of this loop be factored out into a phase_family function?
            for representative_sample, family in sorted(families.items()):
                if len(family) == 1:
                    logger.info("---- Processing individual %s",
                                representative_sample)
                else:
                    logger.info("---- Processing family with individuals: %s",
                                ",".join(family))
                max_coverage_per_sample = max(1, max_coverage // len(family))
                logger.info("Using maximum coverage per sample of %dX",
                            max_coverage_per_sample)
                trios = family_trios[representative_sample]
                assert len(family) == 1 or len(trios) > 0

                homozygous_positions, phasable_variant_table = find_phaseable_variants(
                    family, include_homozygous, trios, variant_table)

                # Get the reads belonging to each sample
                readsets = dict()  # TODO this could become a list
                for sample in family:
                    with timers("read_bam"):
                        readset, vcf_source_ids = phased_input_reader.read(
                            chromosome, phasable_variant_table.variants,
                            sample)

                    # TODO: Read selection done w.r.t. all variants, where using heterozygous
                    #  variants only would probably give better results.
                    with timers("select"):
                        readset = readset.subset([
                            i for i, read in enumerate(readset)
                            if len(read) >= 2
                        ])
                        logger.info(
                            "Kept %d reads that cover at least two variants each",
                            len(readset))
                        merged_reads = read_merger.merge(readset)
                        selected_reads = select_reads(
                            merged_reads,
                            max_coverage_per_sample,
                            preferred_source_ids=vcf_source_ids,
                        )

                    readsets[sample] = selected_reads
                    if len(family) == 1 and not distrust_genotypes:
                        # When having a pedigree (len(family) > 1), blocks are also merged after
                        # phasing based on the pedigree information and these statistics are not
                        # so useful. When distrust_genotypes, genotypes can change during phasing
                        # and so can the block structure. So don't print these stats in those cases
                        log_best_case_phasing_info(readset, selected_reads)

                all_reads = merge_readsets(readsets)

                # Determine which variants can (in principle) be phased
                accessible_positions = sorted(all_reads.get_positions())
                logger.info(
                    "Variants covered by at least one phase-informative "
                    "read in at least one individual after read selection: %d",
                    len(accessible_positions),
                )
                if len(family) > 1 and genetic_haplotyping:
                    # In case of genetic haplotyping, also retain all positions homozygous
                    # in at least one individual (because they might be phased based on genotypes)
                    accessible_positions = sorted(
                        set(accessible_positions).union(homozygous_positions))
                    logger.info(
                        "Variants either covered by phase-informative read or homozygous "
                        "in at least one individual: %d",
                        len(accessible_positions),
                    )

                # Keep only accessible positions
                phasable_variant_table.subset_rows_by_position(
                    accessible_positions)
                assert len(phasable_variant_table.variants) == len(
                    accessible_positions)

                pedigree = create_pedigree(
                    default_gq,
                    distrust_genotypes,
                    family,
                    gl_regularizer,
                    numeric_sample_ids,
                    phasable_variant_table,
                    trios,
                )
                recombination_costs = recombination_cost_computer.compute(
                    accessible_positions)

                # Finally, run phasing algorithm
                with timers("phase"):
                    problem_name = "MEC" if len(family) == 1 else "PedMEC"
                    logger.info(
                        "Phasing %d sample%s by solving the %s problem ...",
                        len(family),
                        plural_s(len(family)),
                        problem_name,
                    )

                    dp_table: Union[HapChatCore, PedigreeDPTable]
                    if algorithm == "hapchat":
                        dp_table = HapChatCore(all_reads)
                    else:
                        dp_table = PedigreeDPTable(
                            all_reads,
                            recombination_costs,
                            pedigree,
                            distrust_genotypes,
                            accessible_positions,
                        )

                    superreads_list, transmission_vector = dp_table.get_super_reads()
                    logger.info("%s cost: %d", problem_name,
                                dp_table.get_optimal_cost())

                with timers("components"):
                    overall_components = compute_overall_components(
                        accessible_positions,
                        all_reads,
                        distrust_genotypes,
                        family,
                        genetic_haplotyping,
                        homozygous_positions,
                        numeric_sample_ids,
                        superreads_list,
                    )
                    log_component_stats(overall_components,
                                        len(accessible_positions))

                if recombination_list_filename:
                    n_recombinations = write_recombination_list(
                        recombination_list_filename,
                        chromosome,
                        accessible_positions,
                        overall_components,
                        recombination_costs,
                        transmission_vector,
                        trios,
                    )
                    logger.info(
                        "Total no. of detected recombination events: %d",
                        n_recombinations)

                # Superreads in superreads_list are in the same order as individuals were added to the pedigree
                for sample, sample_superreads in zip(family, superreads_list):
                    superreads[sample] = sample_superreads
                    assert len(sample_superreads) == 2
                    assert (sample_superreads[0].sample_id ==
                            sample_superreads[1].sample_id ==
                            numeric_sample_ids[sample])
                    # identical for all samples
                    components[sample] = overall_components

                if read_list:
                    read_list.write(
                        all_reads,
                        dp_table.get_optimal_partitioning(),
                        components,
                        numeric_sample_ids,
                    )

            with timers("write_vcf"):
                logger.info("======== Writing VCF")
                changed_genotypes = vcf_writer.write(chromosome, superreads,
                                                     components)
                logger.info("Done writing VCF")
                if changed_genotypes:
                    assert distrust_genotypes
                    logger.info("Changed %d genotypes while writing VCF",
                                len(changed_genotypes))

            if gtchange_list_filename:
                logger.info("Writing list of changed genotypes to %r",
                            gtchange_list_filename)
                write_changed_genotypes(gtchange_list_filename,
                                        changed_genotypes)

            logger.debug("Chromosome %r finished", chromosome)

    log_time_and_memory_usage(timers, show_phase_vcfs=show_phase_vcfs)
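
As with the polyploid case, a minimal invocation sketch for run_whatshap; all file names below are hypothetical placeholders, not part of the original example.

# Hypothetical call; every path is a placeholder.
run_whatshap(
    phase_input_files=["reads.bam"],
    variant_file="calls.vcf.gz",
    reference="reference.fasta",   # False skips realignment, None raises if a reference is needed
    output="phased.vcf",
    ignore_read_groups=False,
    algorithm="whatshap",          # "hapchat" is also accepted but cannot do pedigree phasing
)
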
Code example #4
def run_split(
    reads_file,
    list_file,
    output_h1=None,
    output_h2=None,
    output_untagged=None,
    add_untagged=False,
    pigz_deprecated=False,
    only_largest_block=False,
    discard_unknown_reads=False,
    read_lengths_histogram=None,
):
    if pigz_deprecated:
        logger.warning("Ignoring deprecated --pigz option")
    timers = StageTimer()
    timers.start("split-run")

    with ExitStack() as stack:
        timers.start("split-init")

        # TODO: obviously this won't work for more than two haplotypes
        haplotype_to_int = {"none": 0, "H1": 1, "H2": 2}

        haplo_list, has_haplo_chrom_info, line_parser = check_haplotag_list_information(
            list_file, stack)

        if only_largest_block:
            logger.debug(
                'User selected "--only-largest-block", this requires chromosome '
                "and phaseset information to be present in the haplotag list file."
            )
            if not has_haplo_chrom_info:
                raise ValueError(
                    "The haplotag list file does not contain phaseset and chromosome "
                    "information, which is required to select only reads from the "
                    "largest phased block. Columns 3 and 4 are missing.")

        timers.start("split-process-haplotag-list")

        readname_to_haplotype, known_reads = process_haplotag_list_file(
            haplo_list,
            line_parser,
            haplotype_to_int,
            only_largest_block,
            discard_unknown_reads,
        )
        if discard_unknown_reads:
            logger.debug(
                "User selected to discard unknown reads, i.e., ignore all reads "
                "that are not part of the haplotag list input file.")
            assert (
                len(known_reads) > 0
            ), "No known reads in input set - would discard everything, this is probably wrong"
            missing_reads = len(known_reads)
        else:
            missing_reads = -1

        timers.stop("split-process-haplotag-list")

        input_reader, input_iterator, output_writers = initialize_io_files(
            reads_file,
            output_h1,
            output_h2,
            output_untagged,
            stack,
        )

        timers.stop("split-init")

        histogram_data = {
            0: Counter(),
            1: Counter(),
            2: Counter(),
        }

        # holds count statistics about total processed reads etc.
        read_counter = Counter()

        process_haplotype = {
            0: output_untagged is not None or add_untagged,
            1: output_h1 is not None,
            2: output_h2 is not None,
        }

        timers.start("split-iter-input")

        for read_name, read_length, record in input_iterator(input_reader):
            read_counter["total_reads"] += 1
            if discard_unknown_reads and read_name not in known_reads:
                read_counter["unknown_reads"] += 1
                continue
            read_haplotype = readname_to_haplotype[read_name]
            if not process_haplotype[read_haplotype]:
                read_counter["skipped_reads"] += 1
                continue
            histogram_data[read_haplotype][read_length] += 1
            read_counter[read_haplotype] += 1

            output_writers[read_haplotype].write(record)
            if read_haplotype == 0 and add_untagged:
                output_writers[1].write(record)
                output_writers[2].write(record)

            if discard_unknown_reads:
                missing_reads -= 1
                if missing_reads == 0:
                    logger.info(
                        "All known reads processed - cancel processing...")
                    break

        timers.stop("split-iter-input")

        if read_lengths_histogram is not None:
            timers.start("split-length-histogram")
            write_read_length_histogram(histogram_data, read_lengths_histogram)
            timers.stop("split-length-histogram")

    timers.stop("split-run")

    logger.info("\n== SUMMARY ==")
    logger.info("Total reads processed: {}".format(
        read_counter["total_reads"]))
    logger.info('Number of output reads "untagged": {}'.format(
        read_counter[0]))
    logger.info("Number of output reads haplotype 1: {}".format(
        read_counter[1]))
    logger.info("Number of output reads haplotype 2: {}".format(
        read_counter[2]))
    logger.info("Number of unknown (dropped) reads: {}".format(
        read_counter["unknown_reads"]))
    logger.info("Number of skipped reads (per user request): {}".format(
        read_counter["skipped_reads"]))

    logger.info("Time for processing haplotag list: {} sec".format(
        round(timers.elapsed("split-process-haplotag-list"), 3)))

    logger.info("Time for total initial setup: {} sec".format(
        round(timers.elapsed("split-init"), 3)))

    logger.info("Time for iterating input reads: {} sec".format(
        round(timers.elapsed("split-iter-input"), 3)))

    if read_lengths_histogram is not None:
        logger.info("Time for creating histogram output: {} sec".format(
            round(timers.elapsed("split-length-histogram"), 3)))

    logger.info("Total run time: {} sec".format(
        round(timers.elapsed("split-run"), 3)))
Code example #5
def run_genotype(
    phase_input_files,
    variant_file,
    reference=None,
    output=sys.stdout,
    samples=None,
    chromosomes=None,
    ignore_read_groups=False,
    indels=True,
    mapping_quality=20,
    max_coverage=15,
    nopriors=False,
    ped=None,
    recombrate=1.26,
    genmap=None,
    gt_qual_threshold=0,
    prioroutput=None,
    constant=0.0,
    overhang=10,
    affine_gap=False,
    gap_start=10,
    gap_extend=7,
    mismatch=15,
    write_command_line_header=True,
    use_ped_samples=False,
):
    """
    For now, this function only runs the genotyping algorithm. Genotype likelihoods for
    all variants are computed using the forward-backward algorithm.
    """
    timers = StageTimer()
    logger.info(
        "This is WhatsHap (genotyping) %s running under Python %s",
        __version__,
        platform.python_version(),
    )
    if write_command_line_header:
        command_line = "(whatshap {}) {}".format(__version__,
                                                 " ".join(sys.argv[1:]))
    else:
        command_line = None
    with ExitStack() as stack:
        # read the given input files (BAMs, VCFs, ref...)
        numeric_sample_ids = NumericSampleIds()
        phased_input_reader = stack.enter_context(
            PhasedInputReader(
                phase_input_files,
                reference,
                numeric_sample_ids,
                ignore_read_groups,
                indels=indels,
                mapq_threshold=mapping_quality,
                overhang=overhang,
                affine=affine_gap,
                gap_start=gap_start,
                gap_extend=gap_extend,
                default_mismatch=mismatch,
            ))
        show_phase_vcfs = phased_input_reader.has_vcfs

        # vcf writer for final genotype likelihoods
        vcf_writer = stack.enter_context(
            GenotypeVcfWriter(command_line=command_line,
                              in_path=variant_file,
                              out_file=output))
        # vcf writer for only the prior likelihoods (if output is desired)
        prior_vcf_writer = None
        if prioroutput is not None:
            prior_vcf_writer = stack.enter_context(
                GenotypeVcfWriter(
                    command_line=command_line,
                    in_path=variant_file,
                    out_file=stack.enter_context(open(prioroutput, "w")),
                ))

        # parse vcf with input variants
        # remove all likelihoods that may already be present
        vcf_reader = stack.enter_context(
            VcfReader(
                variant_file,
                indels=indels,
                genotype_likelihoods=False,
                ignore_genotypes=True,
            ))

        if ignore_read_groups and not samples and len(vcf_reader.samples) > 1:
            raise CommandLineError(
                "When using --ignore-read-groups on a VCF with "
                "multiple samples, --sample must also be used.")
        if not samples:
            samples = vcf_reader.samples

        # if --use-ped-samples is set, use only samples from PED file
        if ped and use_ped_samples:
            samples = set()
            for trio in PedReader(ped):
                if trio.child is None or trio.mother is None or trio.father is None:
                    continue
                samples.add(trio.mother)
                samples.add(trio.father)
                samples.add(trio.child)

        vcf_sample_set = set(vcf_reader.samples)
        for sample in samples:
            if sample not in vcf_sample_set:
                raise CommandLineError(
                    "Sample {!r} requested on command-line not found in VCF".
                    format(sample))

        if ped and genmap:
            logger.info(
                "Using region-specific recombination rates from genetic map %s.",
                genmap,
            )
            recombination_cost_computer = GeneticMapRecombinationCostComputer(
                genmap)
        else:
            if ped:
                logger.info("Using uniform recombination rate of %g cM/Mb.",
                            recombrate)
            recombination_cost_computer = UniformRecombinationCostComputer(
                recombrate)

        samples = frozenset(samples)
        families, family_trios = setup_families(samples, ped,
                                                numeric_sample_ids,
                                                max_coverage)

        # Read phase information provided as VCF files, if provided.
        with timers("parse_phasing_vcfs"):
            phased_input_reader.read_vcfs()

        # compute genotype likelihood threshold
        gt_prob = 1.0 - (10**(-gt_qual_threshold / 10.0))

        for variant_table in timers.iterate("parse_vcf", vcf_reader):

            # create a mapping of genome positions to indices
            var_to_pos = dict()
            for i in range(len(variant_table.variants)):
                var_to_pos[variant_table.variants[i].position] = i

            chromosome = variant_table.chromosome
            if (not chromosomes) or (chromosome in chromosomes):
                logger.info("======== Working on chromosome %r", chromosome)
            else:
                logger.info(
                    "Leaving chromosome %r unchanged (present in VCF but not requested by option --chromosome)",
                    chromosome,
                )
                vcf_writer.write_genotypes(chromosome,
                                           variant_table,
                                           indels,
                                           leave_unchanged=True)
                if prioroutput is not None:
                    prior_vcf_writer.write_genotypes(chromosome,
                                                     variant_table,
                                                     indels,
                                                     leave_unchanged=True)
                continue

            positions = [v.position for v in variant_table.variants]
            if not nopriors:
                # compute prior genotype likelihoods based on all reads
                for sample in samples:
                    logger.info("---- Initial genotyping of %s", sample)
                    with timers("read_bam"):
                        readset, vcf_source_ids = phased_input_reader.read(
                            chromosome,
                            variant_table.variants,
                            sample,
                            read_vcf=False,
                        )
                        readset.sort()
                        genotypes, genotype_likelihoods = compute_genotypes(
                            readset, positions)
                        # recompute genotypes based on given threshold
                        reg_genotype_likelihoods = []
                        for gl in range(len(genotype_likelihoods)):
                            norm_sum = (genotype_likelihoods[gl][0] +
                                        genotype_likelihoods[gl][1] +
                                        genotype_likelihoods[gl][2] +
                                        3 * constant)
                            regularized = PhredGenotypeLikelihoods([
                                (genotype_likelihoods[gl][0] + constant) /
                                norm_sum,
                                (genotype_likelihoods[gl][1] + constant) /
                                norm_sum,
                                (genotype_likelihoods[gl][2] + constant) /
                                norm_sum,
                            ])
                            genotypes[gl] = determine_genotype(
                                regularized, gt_prob)
                            assert isinstance(genotypes[gl], Genotype)
                            reg_genotype_likelihoods.append(regularized)
                        variant_table.set_genotype_likelihoods_of(
                            sample,
                            [
                                PhredGenotypeLikelihoods(list(gl))
                                for gl in reg_genotype_likelihoods
                            ],
                        )
                        variant_table.set_genotypes_of(sample, genotypes)
            else:

                # use uniform genotype likelihoods for all individuals
                for sample in samples:
                    variant_table.set_genotype_likelihoods_of(
                        sample,
                        [PhredGenotypeLikelihoods([1 / 3, 1 / 3, 1 / 3])] *
                        len(positions),
                    )

            # if desired, output the priors in separate vcf
            if prioroutput is not None:
                prior_vcf_writer.write_genotypes(chromosome, variant_table,
                                                 indels)

            # Iterate over all families to process, i.e. a separate DP table is created
            # for each family.
            for representative_sample, family in sorted(families.items()):
                if len(family) == 1:
                    logger.info("---- Processing individual %s",
                                representative_sample)
                else:
                    logger.info("---- Processing family with individuals: %s",
                                ",".join(family))
                max_coverage_per_sample = max(1, max_coverage // len(family))
                logger.info("Using maximum coverage per sample of %dX",
                            max_coverage_per_sample)
                trios = family_trios[representative_sample]
                assert (len(family) == 1) or (len(trios) > 0)

                # Get the reads belonging to each sample
                readsets = dict()
                for sample in family:
                    with timers("read_bam"):
                        readset, vcf_source_ids = phased_input_reader.read(
                            chromosome,
                            variant_table.variants,
                            sample,
                        )

                    with timers("select"):
                        readset = readset.subset([
                            i for i, read in enumerate(readset)
                            if len(read) >= 2
                        ])
                        logger.info(
                            "Kept %d reads that cover at least two variants each",
                            len(readset),
                        )
                        selected_reads = select_reads(
                            readset,
                            max_coverage_per_sample,
                            preferred_source_ids=vcf_source_ids,
                        )
                    readsets[sample] = selected_reads

                # Merge reads into one ReadSet (note that each Read object
                # knows the sample it originated from).
                all_reads = ReadSet()
                for sample, readset in readsets.items():
                    for read in readset:
                        assert read.is_sorted(), "Add a read.sort() here"
                        all_reads.add(read)

                all_reads.sort()

                # Determine which variants can (in principle) be phased
                accessible_positions = sorted(all_reads.get_positions())
                logger.info(
                    "Variants covered by at least one phase-informative "
                    "read in at least one individual after read selection: %d",
                    len(accessible_positions),
                )

                # Create Pedigree
                pedigree = Pedigree(numeric_sample_ids)
                for sample in family:
                    # genotypes are assumed to be unknown, so ignore information that
                    # might already be present in the input vcf
                    all_genotype_likelihoods = variant_table.genotype_likelihoods_of(
                        sample)
                    genotype_l = [
                        all_genotype_likelihoods[var_to_pos[a_p]]
                        for a_p in accessible_positions
                    ]
                    pedigree.add_individual(
                        sample,
                        [
                            Genotype([])
                            for i in range(len(accessible_positions))
                        ],
                        genotype_l,
                    )
                for trio in trios:
                    pedigree.add_relationship(
                        father_id=trio.father,
                        mother_id=trio.mother,
                        child_id=trio.child,
                    )

                recombination_costs = recombination_cost_computer.compute(
                    accessible_positions)

                # Finally, run genotyping algorithm
                with timers("genotyping"):
                    problem_name = "genotyping"
                    logger.info(
                        "Genotype %d sample%s by solving the %s problem ...",
                        len(family),
                        "s" if len(family) > 1 else "",
                        problem_name,
                    )
                    forward_backward_table = GenotypeDPTable(
                        numeric_sample_ids,
                        all_reads,
                        recombination_costs,
                        pedigree,
                        accessible_positions,
                    )
                    # store results
                    for s in family:
                        likelihood_list = variant_table.genotype_likelihoods_of(
                            s)
                        genotypes_list = variant_table.genotypes_of(s)

                        for pos in range(len(accessible_positions)):
                            likelihoods = forward_backward_table.get_genotype_likelihoods(
                                s, pos)

                            # compute genotypes from likelihoods and store information
                            geno = determine_genotype(likelihoods, gt_prob)
                            assert isinstance(geno, Genotype)
                            genotypes_list[var_to_pos[
                                accessible_positions[pos]]] = geno
                            likelihood_list[var_to_pos[
                                accessible_positions[pos]]] = likelihoods

                        variant_table.set_genotypes_of(s, genotypes_list)
                        variant_table.set_genotype_likelihoods_of(
                            s, likelihood_list)

            with timers("write_vcf"):
                logger.info("======== Writing VCF")
                vcf_writer.write_genotypes(chromosome, variant_table, indels)
                logger.info("Done writing VCF")

            logger.debug("Chromosome %r finished", chromosome)

    logger.info("\n== SUMMARY ==")
    total_time = timers.total()
    log_memory_usage()
    logger.info(
        "Time spent reading BAM:                      %6.1f s",
        timers.elapsed("read_bam"),
    )
    logger.info(
        "Time spent parsing VCF:                      %6.1f s",
        timers.elapsed("parse_vcf"),
    )
    if show_phase_vcfs:
        logger.info(
            "Time spent parsing input phasings from VCFs: %6.1f s",
            timers.elapsed("parse_phasing_vcfs"),
        )
    logger.info("Time spent selecting reads:                  %6.1f s",
                timers.elapsed("select"))
    logger.info(
        "Time spent genotyping:                          %6.1f s",
        timers.elapsed("genotyping"),
    )
    logger.info(
        "Time spent writing VCF:                      %6.1f s",
        timers.elapsed("write_vcf"),
    )
    logger.info(
        "Time spent on rest:                          %6.1f s",
        total_time - timers.sum(),
    )
    logger.info("Total elapsed time:                          %6.1f s",
                total_time)
Code example #6
0
File: haplotag.py Project: adamnovak/gwhatshap
def run_haplotag(variant_file, alignment_file, output=None, reference=None):
	timers = StageTimer()
	timers.start('overall')

	with ExitStack() as stack:
		numeric_sample_ids = NumericSampleIds()
		try:
			readset_reader = stack.enter_context(ReadSetReader([alignment_file], numeric_sample_ids, mapq_threshold=0))
		except (OSError, BamIndexingError) as e:
			logger.error(e)
			sys.exit(1)
		if reference:
			try:
				fasta = stack.enter_context(pyfaidx.Fasta(reference, as_raw=True))
			except OSError as e:
				logger.error('%s', e)
				sys.exit(1)
		else:
			fasta = None

		vcf_reader = VcfReader(variant_file, indels=True, phases=True)
		vcf_samples = set(vcf_reader.samples)
		logger.info('Found %d samples in VCF file', len(vcf_samples))

		bam_reader = pysam.AlignmentFile(alignment_file)
		read_groups = bam_reader.header.get('RG', []) 
		bam_samples = set( (rg['SM'] if 'SM' in rg else None) for rg in read_groups )
		rg_to_sample = { rg['ID']:rg['SM'] for rg in read_groups if ('ID' in rg) and ('SM' in rg) }
		logger.info('Samples in BAM file: %s', ','.join(bam_samples))
		samples = bam_samples.intersection(vcf_samples)
		if len(samples) == 0:
			logger.error('No common samples between VCF and BAM file. Aborting.')
			sys.exit(1)
		elif len(samples) < len(bam_samples):
			logger.warning('Not adding phase information for sample(s) %s to BAM file, since they are not present in the VCF', ','.join(bam_samples.difference(vcf_samples)))

		# Prepare header
		# TODO: convince pysam to allow @HS header line
		header = bam_reader.header
		command_line = ' '.join(['whatshap'] + sys.argv[1:])
		PG_entry = { 'PN':'whatshap', 'VN':__version__, 'CL':command_line, 'm5': md5_of(variant_file)}
		if 'PG' in header:
			header['PG'].append(PG_entry)
		else:
			header['PG'] = [PG_entry]
		if output:
			bam_writer = pysam.AlignmentFile(output, 'wb', header=header)
		else:
			bam_writer = pysam.AlignmentFile('-', 'wb', header=header)

		chromosome_name = None
		chromosome_id = None
		skipped_vcf_chromosomes = set()
		vcf_iter = iter(vcf_reader)
		n_alignments = 0
		n_tagged = 0
		n_multiple_phase_sets = 0
		for alignment in bam_reader:
			n_alignments += 1
			alignment.set_tag('HP', value=None)
			alignment.set_tag('PC', value=None)
			alignment.set_tag('PS', value=None)
			if not alignment.is_unmapped:
				# Has chromosome changed?
				if chromosome_id != alignment.reference_id:
					chromosome_id = alignment.reference_id
					chromosome_name = alignment.reference_name
					logger.info('Processing alignments on chromosome %s', chromosome_name)
					if chromosome_name in skipped_vcf_chromosomes:
						logger.error('Chromosome records in alignment file and VCF are sorted differently.')
						sys.exit(1)
					# Read information on this chromosome from VCF
					while True:
						variant_table = next(vcf_iter, None)
						if variant_table is None:
							break
						if variant_table.chromosome == chromosome_name:
							logger.info('... found %s variants for chromosome %s in VCF', len(variant_table), chromosome_name)
							break
						else:
							skipped_vcf_chromosomes.add(variant_table.chromosome)
					# maps read name to (haplotype, quality, phaseset)
					read_to_haplotype = {}
					# Read all reads for this chromosome once to create one core.ReadSet per sample
					# this allows assigning a phase to paired-end reads based on both reads of the pair
					if variant_table is not None:
						for sample in samples:
							genotypes = variant_table.genotypes_of(sample)
							phases = variant_table.phases_of(sample)
							variantpos_to_phaseset = {
								v.position:int(phases[i].block_id) for i,v in enumerate(variant_table.variants) if phases[i] is not None
							}
							variants = [
								v for v, gt, phase in zip(variant_table.variants, genotypes, phases) if gt == 1 and phase is not None
							]
							read_set = read_reads(readset_reader, chromosome_name, variants, sample, fasta)
							for read in read_set:
								# mapping: phaseset --> phred scaled difference between costs of assigning reads to haplotype 0 or 1
								haplotype_costs = defaultdict(int)
								for v in read:
									assert v.allele in [0,1]
									phaseset = variantpos_to_phaseset[v.position]
									if v.allele == 0:
										haplotype_costs[phaseset] += v.quality
									else:
										haplotype_costs[phaseset] -= v.quality
								l = list(haplotype_costs.items())
								l.sort(key=lambda t:-abs(t[1]))
								#logger.info('Read %s: %s', read.name, str(l))
								if len(l) > 0:
									if len(l) > 1:
										n_multiple_phase_sets += 1
									phaseset, quality = l[0]
									if quality != 0:
										haplotype = 0 if quality > 0 else 1
										read_to_haplotype[read.name] = (haplotype, abs(quality), phaseset)
										#logger.debug('Assigned read %s to haplotype %d with a quality of %d based on %d covered variants', read.name, haplotype, quality, len(read))

				# Only attempt to assign a phase to alignments that are neither secondary nor supplementary (flag 2048)
				if (not alignment.is_secondary) and (alignment.flag & 2048 == 0):
					try:
						haplotype, quality, phaseset = read_to_haplotype[alignment.query_name]
						alignment.set_tag('HP', haplotype + 1)
						alignment.set_tag('PC', quality)
						alignment.set_tag('PS', phaseset)
						n_tagged += 1
					except KeyError:
						pass
			bam_writer.write(alignment)
			if n_alignments % 100000 == 0:
				logger.info('Processed %d alignment records.', n_alignments)

	logger.info('\n== SUMMARY ==')
	logger.info('Total alignments processed:              %12d', n_alignments)
	logger.info('Alignments that could be tagged:         %12d', n_tagged)
	logger.info('Alignments spanning multiple phase sets: %12d', n_multiple_phase_sets)
	bam_writer.close()
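
The core of this version is the per-read vote accumulated in haplotype_costs: for each phased heterozygous variant covered by a read, the variant's quality is added when the observed allele matches haplotype 0 and subtracted otherwise, and the phase set with the largest absolute score decides the HP/PC/PS tags. The following self-contained sketch, using made-up tuples instead of the project's Read objects, shows just that scoring step:

from collections import defaultdict

def assign_haplotype(read_variants, variantpos_to_phaseset):
    """read_variants: (position, allele, quality) tuples covered by one read.
    Returns (haplotype, quality, phaseset) or None if the vote is empty or a tie."""
    haplotype_costs = defaultdict(int)
    for position, allele, quality in read_variants:
        phaseset = variantpos_to_phaseset[position]
        # an allele-0 observation votes for haplotype 0, an allele-1 observation against it
        haplotype_costs[phaseset] += quality if allele == 0 else -quality
    if not haplotype_costs:
        return None
    phaseset, score = max(haplotype_costs.items(), key=lambda t: abs(t[1]))
    if score == 0:
        return None
    haplotype = 0 if score > 0 else 1
    return haplotype, abs(score), phaseset

# assign_haplotype([(100, 0, 30), (250, 1, 20)], {100: 100, 250: 100})
# -> (0, 10, 100): the read leans towards haplotype 0 with a phred-scale margin of 10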
Code example #7
0
File: haplotag.py Project: sarangian/WHdenovo
def run_haplotag(variant_file,
                 alignment_file,
                 output=None,
                 reference=None,
                 ignore_linked_read=False,
                 given_samples=None,
                 linked_read_distance_cutoff=50000,
                 ignore_read_groups=False):

    timers = StageTimer()
    timers.start('overall')

    with ExitStack() as stack:
        numeric_sample_ids = NumericSampleIds()
        try:
            readset_reader = stack.enter_context(
                ReadSetReader([alignment_file],
                              reference=reference,
                              numeric_sample_ids=numeric_sample_ids,
                              mapq_threshold=0))
        except OSError as e:
            logger.error(e)
            sys.exit(1)
        except AlignmentFileNotIndexedError as e:
            logger.error(
                'The file %r is not indexed. Please create the appropriate BAM/CRAM '
                'index with "samtools index"', str(e))
            sys.exit(1)

        if reference:
            try:
                fasta = stack.enter_context(IndexedFasta(reference))
            except OSError as e:
                logger.error('%s', e)
                sys.exit(1)
            except FastaNotIndexedError as e:
                logger.error(
                    'An index file (.fai) for the reference %r could not be found. '
                    'Please create one with "samtools faidx".', str(e))
                sys.exit(1)
        else:
            fasta = None

        # require input VCF to be compressed
        if not variant_file.endswith('.gz'):
            logger.error('The input VCF must be compressed (vcf.gz).')
            sys.exit(1)

        vcf_reader = VcfReader(variant_file, indels=True, phases=True)
        vcf_samples = set(vcf_reader.samples)
        logger.info('Found %d samples in VCF file', len(vcf_samples))

        # determine which samples to consider
        if ignore_read_groups and not given_samples and len(
                vcf_reader.samples) > 1:
            logger.error('When using --ignore-read-groups on a VCF with '
                         'multiple samples, --sample must also be used.')
            sys.exit(1)

        if not given_samples:
            given_samples = vcf_reader.samples

        for sample in given_samples:
            if sample not in vcf_samples:
                logger.error(
                    'Sample %r requested on command-line not found in VCF',
                    sample)
                sys.exit(1)

        # keep only requested samples
        vcf_samples = vcf_samples.intersection(given_samples)

        # determine which samples are in BAM file
        bam_reader = pysam.AlignmentFile(alignment_file)
        read_groups = bam_reader.header.get('RG', [])
        bam_samples = set(
            (rg['SM'] if 'SM' in rg else None) for rg in read_groups)
        rg_to_sample = {
            rg['ID']: rg['SM']
            for rg in read_groups if ('ID' in rg) and ('SM' in rg)
        }
        logger.info('Samples in BAM file: %s', ','.join(bam_samples))
        samples = vcf_samples
        if not ignore_read_groups:
            samples = bam_samples.intersection(vcf_samples)
            if len(samples) == 0:
                logger.error(
                    'No common samples between VCF and BAM file. Aborting.')
                sys.exit(1)
            elif len(samples) < len(bam_samples):
                logger.warning(
                    'Not adding phase information for sample(s) %s to BAM file, since they are not present in the VCF or were not given using --sample.',
                    ','.join(bam_samples.difference(vcf_samples)))
        else:
            if len(samples) == 0:
                logger.error(
                    'No samples present in VCF. In case --sample was used, the requested sample(s) are not present in the VCF. Aborting.'
                )
                sys.exit(1)

        # Prepare header
        # TODO: convince pysam to allow @HS header line
        header = bam_reader.header.to_dict()
        command_line = ' '.join(['whatshap'] + sys.argv[1:])
        PG_entry = {
            'ID': 'whatshap',
            'PN': 'whatshap',
            'VN': __version__,
            'CL': command_line,
            'm5': md5_of(variant_file)
        }
        if 'PG' in header:
            header['PG'].append(PG_entry)
        else:
            header['PG'] = [PG_entry]
        if output:
            bam_writer = pysam.AlignmentFile(
                output, 'wb', header=pysam.AlignmentHeader.from_dict(header))
        else:
            bam_writer = pysam.AlignmentFile(
                '-', 'wb', header=pysam.AlignmentHeader.from_dict(header))

        chromosome_name = None
        chromosome_id = None
        skipped_vcf_chromosomes = set()
        n_alignments = 0
        n_tagged = 0
        n_multiple_phase_sets = 0

        # map BX tag to assigned haplotype
        BX_tag_to_haplotype = defaultdict(list)

        for alignment in bam_reader:
            n_alignments += 1
            alignment.set_tag('HP', value=None)
            alignment.set_tag('PC', value=None)
            alignment.set_tag('PS', value=None)
            if not alignment.is_unmapped:
                # Has chromosome changed?
                if chromosome_id != alignment.reference_id:
                    chromosome_id = alignment.reference_id
                    chromosome_name = alignment.reference_name
                    BX_tag_to_haplotype = defaultdict(list)
                    logger.info('Processing alignments on chromosome %s',
                                chromosome_name)
                    # Read information on this chromosome from VCF
                    variant_table = None
                    try:
                        variant_table = vcf_reader._fetch(chromosome_name)
                        logger.info(
                            '... found %s variants for chromosome %s in VCF',
                            len(variant_table), chromosome_name)
                    except OSError as e:
                        logger.error(str(e))
                        sys.exit(1)
                    except ValueError:
                        logger.info(
                            'No variants given for chromosome {} in the input VCF.'
                            .format(chromosome_name))

                    # maps read name to (haplotype, quality, phaseset)
                    read_to_haplotype = {}
                    # Read all reads for this chromosome once to create one core.ReadSet per sample
                    # this allows assigning a phase to paired-end reads based on both reads of the pair
                    if variant_table is not None:
                        for sample in samples:
                            genotypes = variant_table.genotypes_of(sample)
                            phases = variant_table.phases_of(sample)
                            variantpos_to_phaseinfo = {
                                v.position:
                                (int(phases[i].block_id), phases[i].phase)
                                for i, v in enumerate(variant_table.variants)
                                if phases[i] is not None
                            }
                            variants = [
                                v for v, gt, phase in zip(
                                    variant_table.variants, genotypes, phases)
                                if gt == 1 and phase is not None
                            ]
                            bam_sample = None if ignore_read_groups else sample
                            read_set = read_reads(readset_reader,
                                                  chromosome_name, variants,
                                                  bam_sample, fasta)

                            # map tag --> set of reads
                            BX_tag_to_readlist = defaultdict(list)
                            for read in read_set:
                                if read.has_BX_tag():
                                    BX_tag_to_readlist[read.BX_tag].append(
                                        read)
                            # all reads processed so far
                            processed_reads = set()
                            for read in read_set:
                                if read.name in processed_reads:
                                    continue
                                # mapping: phaseset --> phred scaled difference between costs of assigning reads to haplotype 0 or 1
                                haplotype_costs = defaultdict(int)
                                reads_to_consider = set()

                                processed_reads.add(read.name)
                                reads_to_consider.add(read)

                                # reads with same BX tag need to be considered too (unless --ignore-linked-read is set)
                                if read.has_BX_tag(
                                ) and not ignore_linked_read:
                                    for r in BX_tag_to_readlist[read.BX_tag]:
                                        if r.name not in processed_reads:
                                            # only select reads close to current one
                                            if abs(
                                                    read.reference_start -
                                                    r.reference_start
                                            ) <= linked_read_distance_cutoff:
                                                reads_to_consider.add(r)
                                for r in reads_to_consider:
                                    processed_reads.add(r.name)
                                    for v in r:
                                        assert v.allele in [0, 1]
                                        phaseset, allele = variantpos_to_phaseinfo[
                                            v.position]
                                        if v.allele == allele:
                                            haplotype_costs[
                                                phaseset] += v.quality
                                        else:
                                            haplotype_costs[
                                                phaseset] -= v.quality

                                l = list(haplotype_costs.items())
                                l.sort(key=lambda t: -abs(t[1]))
                                #logger.info('Read %s: %s', read.name, str(l))
                                if len(l) > 0:
                                    if len(l) > 1:
                                        n_multiple_phase_sets += 1
                                    phaseset, quality = l[0]
                                    if quality != 0:
                                        haplotype = 0 if quality > 0 else 1
                                        BX_tag_to_haplotype[
                                            read.BX_tag].append(
                                                (read.reference_start,
                                                 haplotype, phaseset))
                                        for r in reads_to_consider:
                                            read_to_haplotype[r.name] = (
                                                haplotype, abs(quality),
                                                phaseset)
                                            logger.debug(
                                                'Assigned read %s to haplotype %d with a quality of %d based on %d covered variants',
                                                r.name, haplotype, quality,
                                                len(r))

                # Only attempt to assign a phase to alignments that are neither secondary nor supplementary (flag 2048)
                if (not alignment.is_secondary) and (alignment.flag & 2048
                                                     == 0):
                    try:
                        haplotype, quality, phaseset = read_to_haplotype[
                            alignment.query_name]
                        alignment.set_tag('HP', haplotype + 1)
                        alignment.set_tag('PC', quality)
                        alignment.set_tag('PS', phaseset)
                        n_tagged += 1
                    except KeyError:
                        # check if reads with same tag have been assigned
                        if alignment.has_tag('BX'):
                            read_clouds = BX_tag_to_haplotype[
                                alignment.get_tag('BX')]
                            for (reference_start, haplotype,
                                 phaseset) in read_clouds:
                                if abs(reference_start -
                                       alignment.reference_start
                                       ) <= linked_read_distance_cutoff:
                                    alignment.set_tag('HP', haplotype + 1)
                                    alignment.set_tag('PS', phaseset)
                                    n_tagged += 1
                                    break
            bam_writer.write(alignment)
            if n_alignments % 100000 == 0:
                logger.info('Processed %d alignment records.', n_alignments)

    logger.info('\n== SUMMARY ==')
    logger.info('Total alignments processed:              %12d', n_alignments)
    logger.info('Alignments that could be tagged:         %12d', n_tagged)
    logger.info('Alignments spanning multiple phase sets: %12d',
                n_multiple_phase_sets)
    bam_writer.close()
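
Compared with example #6, this version also propagates assignments across linked reads: alignments that share a 10x-style BX barcode inherit a haplotype from an already tagged read with the same barcode, but only if their start positions lie within linked_read_distance_cutoff. A minimal sketch of that fallback lookup, assuming plain tuples rather than pysam alignment objects:

def haplotype_from_barcode(bx_assignments, reference_start, distance_cutoff=50000):
    """bx_assignments: (reference_start, haplotype, phaseset) tuples already assigned
    to reads carrying the same BX tag. Returns (haplotype, phaseset) for an untagged
    alignment starting at reference_start, or None if no assignment is close enough."""
    for assigned_start, haplotype, phaseset in bx_assignments:
        if abs(assigned_start - reference_start) <= distance_cutoff:
            return haplotype, phaseset
    return None

# haplotype_from_barcode([(1200000, 1, 55)], 1230000) -> (1, 55)
# haplotype_from_barcode([(1200000, 1, 55)], 2000000) -> None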
Code example #8
0
def run_whatshap(
    phase_input_files,
    variant_file,
    reference=None,
    output=sys.stdout,
    samples=None,
    chromosomes=None,
    ignore_read_groups=False,
    indels=True,
    mapping_quality=20,
    read_merging=False,
    read_merging_error_rate=0.15,
    read_merging_max_error_rate=0.25,
    read_merging_positive_threshold=1000000,
    read_merging_negative_threshold=1000,
    max_coverage=15,
    full_genotyping=False,
    distrust_genotypes=False,
    include_homozygous=False,
    ped=None,
    recombrate=1.26,
    genmap=None,
    genetic_haplotyping=True,
    recombination_list_filename=None,
    tag="PS",
    read_list_filename=None,
    gl_regularizer=None,
    gtchange_list_filename=None,
    default_gq=30,
    write_command_line_header=True,
    use_ped_samples=False,
    algorithm="whatshap",
):
    """
    Run WhatsHap.

    phase_input_files -- list of paths to BAM/CRAM/VCF files
    variant_file -- path to input VCF
    reference -- path to reference FASTA
    output -- path to output VCF or a file-like object
    samples -- names of samples to phase. An empty list means: phase all samples
    chromosomes -- names of chromosomes to phase. An empty list means: phase all chromosomes
    ignore_read_groups
    mapping_quality -- discard reads below this mapping quality
    read_merging -- whether or not to merge reads
    read_merging_error_rate -- probability that a nucleotide is wrong
    read_merging_max_error_rate -- max error rate on edge of merge graph considered
    read_merging_positive_threshold -- threshold on the ratio of the two probabilities
    read_merging_negative_threshold -- threshold on the opposite ratio of positive threshold
    max_coverage
    full_genotyping
    distrust_genotypes
    include_homozygous
    genetic_haplotyping -- in ped mode, merge disconnected blocks based on genotype status
    recombination_list_filename -- filename to write putative recombination events to
    tag -- How to store phasing info in the VCF, can be 'PS' or 'HP'
    read_list_filename -- name of file to write list of used reads to
    algorithm -- algorithm to use, can be 'whatshap' or 'hapchat'
    gl_regularizer -- float to be passed as regularization constant to GenotypeLikelihoods.as_phred
    gtchange_list_filename -- filename to write list of changed genotypes to
    default_gq -- genotype likelihood to be used when GL or PL not available
    write_command_line_header -- whether to add a ##commandline header to the output VCF
    """

    if algorithm == "hapchat" and ped is not None:
        raise CommandLineError(
            "The hapchat algorithm cannot do pedigree phasing")

    timers = StageTimer()
    logger.info(
        "This is WhatsHap %s running under Python %s",
        __version__,
        platform.python_version(),
    )
    if full_genotyping:
        distrust_genotypes = True
        include_homozygous = True
    numeric_sample_ids = NumericSampleIds()
    if write_command_line_header:
        command_line = "(whatshap {}) {}".format(__version__,
                                                 " ".join(sys.argv[1:]))
    else:
        command_line = None

    if read_merging:
        read_merger = ReadMerger(
            read_merging_error_rate,
            read_merging_max_error_rate,
            read_merging_positive_threshold,
            read_merging_negative_threshold,
        )
    else:
        read_merger = DoNothingReadMerger()

    with ExitStack() as stack:
        try:
            vcf_writer = stack.enter_context(
                PhasedVcfWriter(
                    command_line=command_line,
                    in_path=variant_file,
                    out_file=output,
                    tag=tag,
                ))
        except (OSError, VcfError) as e:
            raise CommandLineError(e)

        phased_input_reader = stack.enter_context(
            PhasedInputReader(
                phase_input_files,
                reference,
                numeric_sample_ids,
                ignore_read_groups,
                mapq_threshold=mapping_quality,
                indels=indels,
            ))
        show_phase_vcfs = phased_input_reader.has_vcfs

        # Only read genotype likelihoods from VCFs when distrusting genotypes
        vcf_reader = stack.enter_context(
            VcfReader(variant_file,
                      indels=indels,
                      genotype_likelihoods=distrust_genotypes))

        if ignore_read_groups and not samples and len(vcf_reader.samples) > 1:
            raise CommandLineError(
                "When using --ignore-read-groups on a VCF with "
                "multiple samples, --sample must also be used.")
        if not samples:
            samples = vcf_reader.samples

        # if --use-ped-samples is set, use only samples from PED file
        if ped and use_ped_samples:
            samples = PedReader(ped).samples()

        raise_if_any_sample_not_in_vcf(vcf_reader, samples)

        if ped and genmap:
            logger.info(
                "Using region-specific recombination rates from genetic map %s.",
                genmap,
            )
            try:
                recombination_cost_computer = GeneticMapRecombinationCostComputer(
                    genmap)
            except ParseError as e:
                raise CommandLineError(e)
        else:
            if ped:
                logger.info("Using uniform recombination rate of %g cM/Mb.",
                            recombrate)
            recombination_cost_computer = UniformRecombinationCostComputer(
                recombrate)

        samples = frozenset(samples)
        families, family_trios = setup_families(samples, ped,
                                                numeric_sample_ids,
                                                max_coverage)

        read_list = None
        if read_list_filename:
            read_list = stack.enter_context(ReadList(read_list_filename))
            if algorithm == "hapchat":
                logger.warning(
                    "On which haplotype a read occurs in the inferred solution is not yet "
                    "implemented in hapchat, and so the corresponding column in the "
                    "read list file contains no information about this")

        with timers("parse_phasing_vcfs"):
            # TODO should this be done in PhasedInputReader.__init__?
            phased_input_reader.read_vcfs()

        for variant_table in timers.iterate("parse_vcf", vcf_reader):
            chromosome = variant_table.chromosome
            if (not chromosomes) or (chromosome in chromosomes):
                logger.info("======== Working on chromosome %r", chromosome)
            else:
                logger.info(
                    "Leaving chromosome %r unchanged (present in VCF but not requested by option --chromosome)",
                    chromosome,
                )
                with timers("write_vcf"):
                    superreads, components = dict(), dict()
                    vcf_writer.write(chromosome, superreads, components)
                continue

            if full_genotyping:
                positions = [v.position for v in variant_table.variants]
                for sample in samples:
                    logger.info("---- Initial genotyping of %s", sample)
                    with timers("read_bam"):
                        bam_sample = None if ignore_read_groups else sample
                        readset, vcf_source_ids = phased_input_reader.read(
                            chromosome,
                            variant_table.variants,
                            bam_sample,
                            read_vcf=False,
                        )
                        readset.sort()  # TODO can be removed
                        genotypes, genotype_likelihoods = compute_genotypes(
                            readset, positions)
                        variant_table.set_genotypes_of(sample, genotypes)
                        variant_table.set_genotype_likelihoods_of(
                            sample,
                            [
                                GenotypeLikelihoods(gl)
                                for gl in genotype_likelihoods
                            ],
                        )

            # These two variables hold the phasing results for all samples
            superreads, components = dict(), dict()

            # Iterate over all families to process, i.e. a separate DP table is created
            # for each family.
            # TODO: Can the body of this loop be factored out into a phase_family function?
            for representative_sample, family in sorted(families.items()):
                if len(family) == 1:
                    logger.info("---- Processing individual %s",
                                representative_sample)
                else:
                    logger.info("---- Processing family with individuals: %s",
                                ",".join(family))
                max_coverage_per_sample = max(1, max_coverage // len(family))
                logger.info("Using maximum coverage per sample of %dX",
                            max_coverage_per_sample)
                trios = family_trios[representative_sample]
                assert len(family) == 1 or len(trios) > 0

                homozygous_positions, phasable_variant_table = find_phaseable_variants(
                    family, include_homozygous, trios, variant_table)

                # Get the reads belonging to each sample
                readsets = dict()  # TODO this could become a list
                for sample in family:
                    with timers("read_bam"):
                        readset, vcf_source_ids = phased_input_reader.read(
                            chromosome,
                            phasable_variant_table.variants,
                            sample,
                        )

                    # TODO: Read selection done w.r.t. all variants, where using heterozygous
                    #  variants only would probably give better results.
                    with timers("select"):
                        readset = readset.subset([
                            i for i, read in enumerate(readset)
                            if len(read) >= 2
                        ])
                        logger.info(
                            "Kept %d reads that cover at least two variants each",
                            len(readset),
                        )
                        merged_reads = read_merger.merge(readset)
                        selected_reads = select_reads(
                            merged_reads,
                            max_coverage_per_sample,
                            preferred_source_ids=vcf_source_ids,
                        )

                    readsets[sample] = selected_reads
                    if len(family) == 1 and not distrust_genotypes:
                        # When having a pedigree (len(family) > 1), blocks are also merged after
                        # phasing based on the pedigree information and these statistics are not
                        # so useful. When distrust_genotypes, genotypes can change during phasing
                        # and so can the block structure. So don't print these stats in those cases
                        log_best_case_phasing_info(readset, selected_reads)

                all_reads = merge_readsets(readsets)

                # Determine which variants can (in principle) be phased
                accessible_positions = sorted(all_reads.get_positions())
                logger.info(
                    "Variants covered by at least one phase-informative "
                    "read in at least one individual after read selection: %d",
                    len(accessible_positions),
                )
                if len(family) > 1 and genetic_haplotyping:
                    # In case of genetic haplotyping, also retain all positions homozygous
                    # in at least one individual (because they might be phased based on genotypes)
                    accessible_positions = sorted(
                        set(accessible_positions).union(homozygous_positions))
                    logger.info(
                        "Variants either covered by phase-informative read or homozygous "
                        "in at least one individual: %d",
                        len(accessible_positions),
                    )

                # Keep only accessible positions
                phasable_variant_table.subset_rows_by_position(
                    accessible_positions)
                assert len(phasable_variant_table.variants) == len(
                    accessible_positions)

                pedigree = create_pedigree(
                    default_gq,
                    distrust_genotypes,
                    family,
                    gl_regularizer,
                    numeric_sample_ids,
                    phasable_variant_table,
                    trios,
                )

                recombination_costs = recombination_cost_computer.compute(
                    accessible_positions)

                # Finally, run phasing algorithm
                with timers("phase"):
                    problem_name = "MEC" if len(family) == 1 else "PedMEC"
                    logger.info(
                        "Phasing %d sample%s by solving the %s problem ...",
                        len(family),
                        plural_s(len(family)),
                        problem_name,
                    )

                    if algorithm == "hapchat":
                        dp_table = HapChatCore(all_reads)
                    else:
                        dp_table = PedigreeDPTable(
                            all_reads,
                            recombination_costs,
                            pedigree,
                            distrust_genotypes,
                            accessible_positions,
                        )

                    superreads_list, transmission_vector = dp_table.get_super_reads(
                    )
                    optimal_cost = dp_table.get_optimal_cost()
                    logger.info("%s cost: %d", problem_name, optimal_cost)

                with timers("components"):
                    master_block = None
                    heterozygous_positions_by_sample = None
                    # If we distrusted genotypes, we need to re-determine which sites are homo-/heterozygous after phasing
                    if distrust_genotypes:
                        hom_in_any_sample = set()
                        heterozygous_positions_by_sample = {}
                        heterozygous_gts = frozenset({(0, 1), (1, 0)})
                        homozygous_gts = frozenset({(0, 0), (1, 1)})
                        for sample, sample_superreads in zip(
                                family, superreads_list):
                            hets = set()
                            for v1, v2 in zip(*sample_superreads):
                                assert v1.position == v2.position
                                if v1.position not in accessible_positions:
                                    continue
                                gt = (v1.allele, v2.allele)
                                if gt in heterozygous_gts:
                                    hets.add(v1.position)
                                elif gt in homozygous_gts:
                                    hom_in_any_sample.add(v1.position)
                            heterozygous_positions_by_sample[
                                numeric_sample_ids[sample]] = hets
                        if len(family) > 1 and genetic_haplotyping:
                            master_block = sorted(hom_in_any_sample)
                    else:
                        if len(family) > 1 and genetic_haplotyping:
                            master_block = sorted(
                                set(homozygous_positions).intersection(
                                    set(accessible_positions)))
                    overall_components = find_components(
                        accessible_positions,
                        all_reads,
                        master_block,
                        heterozygous_positions_by_sample,
                    )
                    n_phased_blocks = len(set(overall_components.values()))
                    logger.info("No. of phased blocks: %d", n_phased_blocks)
                    largest_component = find_largest_component(
                        overall_components)
                    if len(largest_component) > 0:
                        logger.info(
                            "Largest component contains %d variants (%.1f%% of accessible variants) between position %d and %d",
                            len(largest_component),
                            len(largest_component) * 100.0 /
                            len(accessible_positions),
                            largest_component[0] + 1,
                            largest_component[-1] + 1,
                        )

                if recombination_list_filename:
                    n_recombinations = write_recombination_list(
                        recombination_list_filename,
                        chromosome,
                        accessible_positions,
                        overall_components,
                        recombination_costs,
                        transmission_vector,
                        trios,
                    )
                    logger.info(
                        "Total no. of detected recombination events: %d",
                        n_recombinations,
                    )

                # Superreads in superreads_list are in the same order as individuals were added to the pedigree
                for sample, sample_superreads in zip(family, superreads_list):
                    superreads[sample] = sample_superreads
                    assert len(sample_superreads) == 2
                    assert (sample_superreads[0].sample_id ==
                            sample_superreads[1].sample_id ==
                            numeric_sample_ids[sample])
                    # identical for all samples
                    components[sample] = overall_components

                if read_list:
                    read_list.write(
                        all_reads,
                        dp_table.get_optimal_partitioning(),
                        components,
                        numeric_sample_ids,
                    )

            with timers("write_vcf"):
                logger.info("======== Writing VCF")
                changed_genotypes = vcf_writer.write(chromosome, superreads,
                                                     components)
                logger.info("Done writing VCF")
                if changed_genotypes:
                    assert distrust_genotypes
                    logger.info("Changed %d genotypes while writing VCF",
                                len(changed_genotypes))

            if gtchange_list_filename:
                logger.info("Writing list of changed genotypes to %r",
                            gtchange_list_filename)
                write_changed_genotypes(gtchange_list_filename,
                                        changed_genotypes)

            logger.debug("Chromosome %r finished", chromosome)

    log_time_and_memory_usage(timers, show_phase_vcfs=show_phase_vcfs)
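
run_whatshap above is the library-level entry point corresponding to the whatshap phase command, with keyword arguments mirroring the CLI options. A hypothetical direct call, using placeholder file names rather than files from the examples above, could look like this:

run_whatshap(
    phase_input_files=["sample.bam"],
    variant_file="variants.vcf.gz",
    reference="genome.fasta",
    output="phased.vcf",
    mapping_quality=20,
    max_coverage=15,
    tag="PS",
)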