Exemples Python de PhasedVcfWriter (whatshap.vcf.PhasedVcfWriter)

Exemple #1

0

Afficher le fichier

Fichier : hapcut2vcf.py Projet : erikvdp/whatshap-TLA

def run_hapcut2vcf(hapcut, vcf, output=sys.stdout):
    """
    Transfer the phasing stored in a hapCUT result file into a VCF.

    hapcut -- path to the hapCUT output file that is parsed with HapCutParser
    vcf -- input VCF handed to PhasedVcfWriter
    output -- path to the output file or an already opened file-like object

    Raises CommandLineError if the VCF contains more than one sample.
    """
    arguments = " ".join(sys.argv[1:])
    command_line = "(whatshap {}) {}".format(__version__, arguments)
    with ExitStack() as stack:
        # A string is interpreted as a path; the stack closes the file again
        if isinstance(output, str):
            output = stack.enter_context(open(output, "w"))

        writer = PhasedVcfWriter(vcf, command_line, out_file=output)
        if len(writer.samples) > 1:
            # Supporting this would only need a --sample command-line parameter,
            # but hapCUT does not appear to handle multi-sample VCFs, so such
            # an input indicates that something else went wrong.
            raise CommandLineError("There is more than one sample in this VCF")
        sample = writer.samples[0]

        hapcut_file = stack.enter_context(open(hapcut))
        for chromosome, blocks in HapCutParser(hapcut_file):
            logger.info("Read %d phased blocks for chromosome %s", len(blocks),
                        chromosome)

            # One read per haplotype, plus a position -> component id mapping
            first_haplotype = Read("1")
            second_haplotype = Read("2")
            component_of = {}
            for block in blocks:
                for variant in block:
                    position = variant.position
                    first_haplotype.add_variant(position, variant.haplotype1,
                                                0)
                    second_haplotype.add_variant(position, variant.haplotype2,
                                                 0)
                    component_of[position] = variant.component_id

            writer.write(
                chromosome,
                {sample: [first_haplotype, second_haplotype]},
                {sample: component_of},
            )

Exemple #2

0

Afficher le fichier

def run_whatshap(
    phase_input_files: List[str],
    variant_file: str,
    reference: Union[None, bool, str] = False,
    output: TextIO = sys.stdout,
    samples: Optional[List[str]] = None,
    chromosomes: Optional[List[str]] = None,
    ignore_read_groups: bool = False,
    indels: bool = True,
    mapping_quality: int = 20,
    read_merging: bool = False,
    read_merging_error_rate: float = 0.15,
    read_merging_max_error_rate: float = 0.25,
    read_merging_positive_threshold: int = 1000000,
    read_merging_negative_threshold: int = 1000,
    max_coverage: int = 15,
    distrust_genotypes: bool = False,
    include_homozygous: bool = False,
    ped: Optional[str] = None,
    recombrate: float = 1.26,
    genmap: Optional[str] = None,
    genetic_haplotyping: bool = True,
    recombination_list_filename: Optional[str] = None,
    tag: str = "PS",
    read_list_filename: Optional[str] = None,
    gl_regularizer: Optional[float] = None,
    gtchange_list_filename: Optional[str] = None,
    default_gq: int = 30,
    write_command_line_header: bool = True,
    use_ped_samples: bool = False,
    algorithm: str = "whatshap",
) -> None:
    """
    Run WhatsHap.

    Reads the input VCF chromosome by chromosome, phases every requested
    family (or single individual) and writes the result with PhasedVcfWriter.
    Chromosomes that were not requested are written out unchanged.

    phase_input_files -- list of paths to BAM/CRAM/VCF files
    variant_file -- path to input VCF
    reference -- path to reference FASTA. If False: skip realignment. If None: complain if reference needed.
    output -- path to output VCF or a file-like object
    samples -- names of samples to phase. an empty list means: phase all samples
    chromosomes -- names of chromosomes to phase. an empty list means: phase all chromosomes
    ignore_read_groups -- forwarded to PhasedInputReader; on a multi-sample VCF
        it requires an explicit sample selection
    indels -- forwarded as indels= to PhasedVcfWriter, PhasedInputReader and VcfReader
    mapping_quality -- discard reads below this mapping quality
    read_merging -- whether or not to merge reads
    read_merging_error_rate -- probability that a nucleotide is wrong
    read_merging_max_error_rate -- max error rate on edge of merge graph considered
    read_merging_positive_threshold -- threshold on the ratio of the two probabilities
    read_merging_negative_threshold -- threshold on the opposite ratio of positive threshold
    max_coverage -- coverage budget of a family; it is divided evenly among the
        family members (but at least 1 per sample) before read selection
    distrust_genotypes -- if set, genotype likelihoods are read from the VCF and
        genotypes can change during phasing
    include_homozygous -- forwarded to find_phaseable_variants
    ped -- PED file; read with PedReader and used to set up families. Cannot be
        combined with the 'hapchat' algorithm
    recombrate -- forwarded to make_recombination_cost_computer
    genmap -- forwarded to make_recombination_cost_computer
    genetic_haplotyping -- in ped mode, merge disconnected blocks based on genotype status
    recombination_list_filename -- filename to write putative recombination events to
    tag -- How to store phasing info in the VCF, can be 'PS' or 'HP'
    read_list_filename -- name of file to write list of used reads to
    algorithm -- algorithm to use, can be 'whatshap' or 'hapchat'
    gl_regularizer -- float to be passed as regularization constant to GenotypeLikelihoods.as_phred
    gtchange_list_filename -- filename to write list of changed genotypes to
    default_gq -- genotype likelihood to be used when GL or PL not available
    write_command_line_header -- whether to add a ##commandline header to the output VCF
    use_ped_samples -- if set together with ped, only the samples listed in the
        PED file are phased

    Raises CommandLineError for invalid option combinations, for a missing
    reference when alignments are given, and when the VCF writer cannot be
    set up (OSError or VcfError).
    """

    # Pedigree phasing is only available with the default algorithm
    if algorithm == "hapchat" and ped is not None:
        raise CommandLineError(
            "The hapchat algorithm cannot do pedigree phasing")

    timers = StageTimer()
    logger.info(
        f"This is WhatsHap {__version__} running under Python {platform.python_version()}"
    )
    numeric_sample_ids = NumericSampleIds()
    # None means: do not write a ##commandline header
    command_line: Optional[str]
    if write_command_line_header:
        command_line = "(whatshap {}) {}".format(__version__,
                                                 " ".join(sys.argv[1:]))
    else:
        command_line = None

    # Pick the merging strategy once; both variants offer the same merge() call
    read_merger: ReadMergerBase
    if read_merging:
        read_merger = ReadMerger(
            read_merging_error_rate,
            read_merging_max_error_rate,
            read_merging_positive_threshold,
            read_merging_negative_threshold,
        )
    else:
        read_merger = DoNothingReadMerger()

    # All readers/writers are registered on the stack so that they are
    # released together when the block is left
    with ExitStack() as stack:
        try:
            vcf_writer = stack.enter_context(
                PhasedVcfWriter(
                    command_line=command_line,
                    in_path=variant_file,
                    out_file=output,
                    tag=tag,
                    indels=indels,
                ))
        except (OSError, VcfError) as e:
            raise CommandLineError(e)

        phased_input_reader = stack.enter_context(
            PhasedInputReader(
                phase_input_files,
                # reference=False ("skip realignment") is passed on as None
                None if reference is False else reference,
                numeric_sample_ids,
                ignore_read_groups,
                mapq_threshold=mapping_quality,
                indels=indels,
            ))
        # Remembered for the final time/memory summary
        show_phase_vcfs = phased_input_reader.has_vcfs

        # reference=None means the caller neither gave a reference nor
        # explicitly opted out of using one
        if phased_input_reader.has_alignments and reference is None:
            raise CommandLineError(
                "A reference FASTA needs to be provided with -r/--reference; "
                "or use --no-reference at the expense of phasing quality.")

        # Only read genotype likelihoods from VCFs when distrusting genotypes
        vcf_reader = stack.enter_context(
            VcfReader(variant_file,
                      indels=indels,
                      genotype_likelihoods=distrust_genotypes))

        if ignore_read_groups and not samples and len(vcf_reader.samples) > 1:
            raise CommandLineError(
                "When using --ignore-read-groups on a VCF with "
                "multiple samples, --sample must also be used.")
        # No explicit selection: phase every sample found in the VCF
        if not samples:
            samples = vcf_reader.samples

        # if --use-ped-samples is set, use only samples from PED file
        if ped and use_ped_samples:
            samples = PedReader(ped).samples()

        raise_if_any_sample_not_in_vcf(vcf_reader, samples)

        recombination_cost_computer = make_recombination_cost_computer(
            ped, genmap, recombrate)

        families, family_trios = setup_families(samples, ped, max_coverage)
        # From here on only 'families' is used; drop the name to avoid misuse
        del samples
        for trios in family_trios.values():
            for trio in trios:
                # Ensure that all mentioned individuals have a numeric id
                _ = numeric_sample_ids[trio.child]

        read_list = None
        if read_list_filename:
            read_list = stack.enter_context(ReadList(read_list_filename))
            if algorithm == "hapchat":
                logger.warning(
                    "On which haplotype a read occurs in the inferred solution is not yet "
                    "implemented in hapchat, and so the corresponding column in the "
                    "read list file contains no information about this")

        with timers("parse_phasing_vcfs"):
            # TODO should this be done in PhasedInputReader.__init__?
            phased_input_reader.read_vcfs()

        superreads: Dict[str, ReadSet]
        components: Dict
        # One variant table per chromosome of the input VCF
        for variant_table in timers.iterate("parse_vcf", vcf_reader):
            chromosome = variant_table.chromosome
            if (not chromosomes) or (chromosome in chromosomes):
                logger.info("======== Working on chromosome %r", chromosome)
            else:
                logger.info(
                    "Leaving chromosome %r unchanged (present in VCF but not requested by option --chromosome)",
                    chromosome,
                )
                # Writing with empty phasing results passes the records through
                with timers("write_vcf"):
                    superreads, components = dict(), dict()
                    vcf_writer.write(chromosome, superreads, components)
                continue

            # These two variables hold the phasing results for all samples
            superreads, components = dict(), dict()

            # Iterate over all families to process, i.e. a separate DP table is created
            # for each family.
            # TODO: Can the body of this loop be factored out into a phase_family function?
            for representative_sample, family in sorted(families.items()):
                if len(family) == 1:
                    logger.info("---- Processing individual %s",
                                representative_sample)
                else:
                    logger.info("---- Processing family with individuals: %s",
                                ",".join(family))
                # Split the coverage budget among family members, never below 1
                max_coverage_per_sample = max(1, max_coverage // len(family))
                logger.info("Using maximum coverage per sample of %dX",
                            max_coverage_per_sample)
                trios = family_trios[representative_sample]
                assert len(family) == 1 or len(trios) > 0

                homozygous_positions, phasable_variant_table = find_phaseable_variants(
                    family, include_homozygous, trios, variant_table)

                # Get the reads belonging to each sample
                readsets = dict()  # TODO this could become a list
                for sample in family:
                    with timers("read_bam"):
                        readset, vcf_source_ids = phased_input_reader.read(
                            chromosome, phasable_variant_table.variants,
                            sample)

                    # TODO: Read selection done w.r.t. all variants, where using heterozygous
                    #  variants only would probably give better results.
                    with timers("select"):
                        # A read covering a single variant carries no phase information
                        readset = readset.subset([
                            i for i, read in enumerate(readset)
                            if len(read) >= 2
                        ])
                        logger.info(
                            "Kept %d reads that cover at least two variants each",
                            len(readset))
                        merged_reads = read_merger.merge(readset)
                        selected_reads = select_reads(
                            merged_reads,
                            max_coverage_per_sample,
                            preferred_source_ids=vcf_source_ids,
                        )

                    readsets[sample] = selected_reads
                    if len(family) == 1 and not distrust_genotypes:
                        # When having a pedigree (len(family) > 1), blocks are also merged after
                        # phasing based on the pedigree information and these statistics are not
                        # so useful. When distrust_genotypes, genotypes can change during phasing
                        # and so can the block structure. So don't print these stats in those cases
                        log_best_case_phasing_info(readset, selected_reads)

                all_reads = merge_readsets(readsets)

                # Determine which variants can (in principle) be phased
                accessible_positions = sorted(all_reads.get_positions())
                logger.info(
                    "Variants covered by at least one phase-informative "
                    "read in at least one individual after read selection: %d",
                    len(accessible_positions),
                )
                if len(family) > 1 and genetic_haplotyping:
                    # In case of genetic haplotyping, also retain all positions homozygous
                    # in at least one individual (because they might be phased based on genotypes)
                    accessible_positions = sorted(
                        set(accessible_positions).union(homozygous_positions))
                    logger.info(
                        "Variants either covered by phase-informative read or homozygous "
                        "in at least one individual: %d",
                        len(accessible_positions),
                    )

                # Keep only accessible positions
                phasable_variant_table.subset_rows_by_position(
                    accessible_positions)
                assert len(phasable_variant_table.variants) == len(
                    accessible_positions)

                pedigree = create_pedigree(
                    default_gq,
                    distrust_genotypes,
                    family,
                    gl_regularizer,
                    numeric_sample_ids,
                    phasable_variant_table,
                    trios,
                )
                recombination_costs = recombination_cost_computer.compute(
                    accessible_positions)

                # Finally, run phasing algorithm
                with timers("phase"):
                    problem_name = "MEC" if len(family) == 1 else "PedMEC"
                    logger.info(
                        "Phasing %d sample%s by solving the %s problem ...",
                        len(family),
                        plural_s(len(family)),
                        problem_name,
                    )

                    # HapChatCore only receives the reads; the pedigree-aware
                    # table additionally gets costs, pedigree and positions
                    dp_table: Union[HapChatCore, PedigreeDPTable]
                    if algorithm == "hapchat":
                        dp_table = HapChatCore(all_reads)
                    else:
                        dp_table = PedigreeDPTable(
                            all_reads,
                            recombination_costs,
                            pedigree,
                            distrust_genotypes,
                            accessible_positions,
                        )

                    superreads_list, transmission_vector = dp_table.get_super_reads(
                    )
                    logger.info("%s cost: %d", problem_name,
                                dp_table.get_optimal_cost())

                with timers("components"):
                    overall_components = compute_overall_components(
                        accessible_positions,
                        all_reads,
                        distrust_genotypes,
                        family,
                        genetic_haplotyping,
                        homozygous_positions,
                        numeric_sample_ids,
                        superreads_list,
                    )
                    log_component_stats(overall_components,
                                        len(accessible_positions))

                if recombination_list_filename:
                    # NOTE(review): called once per family and chromosome with the
                    # same filename; confirm the helper does not overwrite earlier output
                    n_recombinations = write_recombination_list(
                        recombination_list_filename,
                        chromosome,
                        accessible_positions,
                        overall_components,
                        recombination_costs,
                        transmission_vector,
                        trios,
                    )
                    logger.info(
                        "Total no. of detected recombination events: %d",
                        n_recombinations)

                # Superreads in superreads_list are in the same order as individuals were added to the pedigree
                for sample, sample_superreads in zip(family, superreads_list):
                    superreads[sample] = sample_superreads
                    # Two superreads per sample, both tagged with that sample's numeric id
                    assert len(sample_superreads) == 2
                    assert (sample_superreads[0].sample_id ==
                            sample_superreads[1].sample_id ==
                            numeric_sample_ids[sample])
                    # identical for all samples
                    components[sample] = overall_components

                if read_list:
                    read_list.write(
                        all_reads,
                        dp_table.get_optimal_partitioning(),
                        components,
                        numeric_sample_ids,
                    )

            with timers("write_vcf"):
                logger.info("======== Writing VCF")
                changed_genotypes = vcf_writer.write(chromosome, superreads,
                                                     components)
                logger.info("Done writing VCF")
                if changed_genotypes:
                    # Genotypes may only change when the caller asked to distrust them
                    assert distrust_genotypes
                    logger.info("Changed %d genotypes while writing VCF",
                                len(changed_genotypes))

            if gtchange_list_filename:
                # NOTE(review): invoked once per chromosome with the same filename;
                # confirm the helper does not overwrite earlier output
                logger.info("Writing list of changed genotypes to %r",
                            gtchange_list_filename)
                write_changed_genotypes(gtchange_list_filename,
                                        changed_genotypes)

            logger.debug("Chromosome %r finished", chromosome)

    log_time_and_memory_usage(timers, show_phase_vcfs=show_phase_vcfs)

Exemple #3

0

Afficher le fichier

def run_polyphase(
    phase_input_files,
    variant_file,
    ploidy,
    reference=None,
    output=sys.stdout,
    samples=None,
    chromosomes=None,
    verify_genotypes=False,
    ignore_read_groups=False,
    indels=True,
    mapping_quality=20,
    tag="PS",
    include_haploid_sets=False,
    write_command_line_header=True,
    read_list_filename=None,
    ce_bundle_edges=False,
    min_overlap=2,
    plot_clusters=False,
    plot_threading=False,
    ce_refinements=5,
    block_cut_sensitivity=4,
):
    """
    Run Polyploid Phasing.

    Reads the input VCF chromosome by chromosome, phases every requested
    sample on its own with phase_single_individual and writes the result with
    PhasedVcfWriter. Chromosomes that were not requested are written unchanged.

    phase_input_files -- list of paths to BAM/CRAM files; must not be empty and
        must not contain VCFs (both are checked with assertions)
    variant_file -- path to input VCF
    ploidy -- ploidy handed to the VCF reader/writer and the phasing parameters
    reference -- path to reference FASTA
    output -- path to output VCF or a file like object
    samples -- names of samples to phase. An empty list means: phase all samples
    chromosomes -- names of chromosomes to phase. An empty list means: phase all chromosomes
    verify_genotypes -- if set, genotypes are re-computed from the reads and
        variants whose given genotype disagrees are removed before phasing
    ignore_read_groups -- forwarded to PhasedInputReader; on a multi-sample VCF
        it requires an explicit sample selection
    indels -- forwarded as indels= to PhasedInputReader and VcfReader
    mapping_quality -- discard reads below this mapping quality
    tag -- How to store phasing info in the VCF, can be 'PS' or 'HP'
    include_haploid_sets -- whether the haploid components are passed on to the
        VCF writer
    write_command_line_header -- whether to add a ##commandline header to the output VCF
    read_list_filename -- not supported yet: any non-empty value raises
        NotImplementedError
    ce_bundle_edges, ce_refinements, plot_clusters, plot_threading -- stored in
        PhasingParameter and handed to phase_single_individual
    min_overlap -- reads covering fewer than max(2, min_overlap) variants are
        discarded; also stored in PhasingParameter
    block_cut_sensitivity -- clamped to the range 0..5 (with a warning)

    Raises CommandLineError if the VCF writer cannot be opened, a requested
    sample is missing from the VCF, --ignore-read-groups is ambiguous, or a
    PloidyError occurs while processing.
    """
    timers = StageTimer()
    logger.info(
        "This is WhatsHap (polyploid) %s running under Python %s",
        __version__,
        platform.python_version(),
    )
    numeric_sample_ids = NumericSampleIds()
    # All readers/writers are registered on the stack so that they are
    # released together when the block is left
    with ExitStack() as stack:
        assert phase_input_files
        phased_input_reader = stack.enter_context(
            PhasedInputReader(
                phase_input_files,
                reference,
                numeric_sample_ids,
                ignore_read_groups,
                indels=indels,
                mapq_threshold=mapping_quality,
            ))
        assert not phased_input_reader.has_vcfs

        # None means: do not write a ##commandline header
        if write_command_line_header:
            command_line = "(whatshap {}) {}".format(__version__,
                                                     " ".join(sys.argv[1:]))
        else:
            command_line = None
        try:
            vcf_writer = stack.enter_context(
                PhasedVcfWriter(
                    command_line=command_line,
                    in_path=variant_file,
                    out_file=output,
                    tag=tag,
                    ploidy=ploidy,
                    include_haploid_sets=include_haploid_sets,
                ))
        except OSError as e:
            raise CommandLineError(e) from e

        vcf_reader = stack.enter_context(
            VcfReader(
                variant_file,
                indels=indels,
                phases=True,
                genotype_likelihoods=False,
                ploidy=ploidy,
            ))

        if ignore_read_groups and not samples and len(vcf_reader.samples) > 1:
            raise CommandLineError(
                "When using --ignore-read-groups on a VCF with "
                "multiple samples, --sample must also be used.")
        # No explicit selection: phase every sample found in the VCF
        if not samples:
            samples = vcf_reader.samples

        vcf_sample_set = set(vcf_reader.samples)
        for sample in samples:
            if sample not in vcf_sample_set:
                raise CommandLineError(
                    "Sample {!r} requested on command-line not found in VCF".
                    format(sample))

        # Clamp the sensitivity level to the supported range 0..5
        if block_cut_sensitivity < 0:
            logger.warning(
                "Block cut sensitivity was set to negative value. Lowest value (0) is assumed instead."
            )
            block_cut_sensitivity = 0
        elif block_cut_sensitivity > 5:
            logger.warning(
                "Block cut sensitivity level too large. Assuming highest valid value (5) instead."
            )
            block_cut_sensitivity = 5

        samples = frozenset(samples)

        # Writing a read list is not implemented, so read_list_file stays None
        read_list_file = None
        if read_list_filename:
            raise NotImplementedError("create_read_list_file not implemented")
            # read_list_file = create_read_list_file(read_list_filename)

        # Store phasing parameters in tuple to keep function signatures cleaner
        phasing_param = PhasingParameter(
            ploidy=ploidy,
            verify_genotypes=verify_genotypes,
            ce_bundle_edges=ce_bundle_edges,
            min_overlap=min_overlap,
            ce_refinements=ce_refinements,
            block_cut_sensitivity=block_cut_sensitivity,
            plot_clusters=plot_clusters,
            plot_threading=plot_threading,
        )

        # Minimum number of variants a read must cover to be kept
        min_variants_per_read = max(2, min_overlap)

        # The "parse_vcf" timer runs while the reader produces the next table:
        # it is stopped at the top of the loop body and restarted on every path
        # that returns to the loop header.
        timers.start("parse_vcf")
        try:
            for variant_table in vcf_reader:
                chromosome = variant_table.chromosome
                timers.stop("parse_vcf")
                if (not chromosomes) or (chromosome in chromosomes):
                    logger.info("======== Working on chromosome %r",
                                chromosome)
                else:
                    logger.info(
                        "Leaving chromosome %r unchanged (present in VCF but not requested by option --chromosome)",
                        chromosome,
                    )
                    # Writing with empty phasing results passes the records through
                    with timers("write_vcf"):
                        superreads, components = dict(), dict()
                        vcf_writer.write(chromosome, superreads, components)
                    # Restart the timer so that the next stop() is balanced
                    timers.start("parse_vcf")
                    continue

                # These variables hold the phasing results for all samples
                superreads, components, haploid_components = {}, {}, {}

                # Iterate over all samples to process
                for sample in samples:
                    logger.info("---- Processing individual %s", sample)

                    # Process inputs for this sample
                    missing_genotypes = set()
                    heterozygous = set()

                    genotypes = variant_table.genotypes_of(sample)
                    for index, gt in enumerate(genotypes):
                        if gt.is_none():
                            missing_genotypes.add(index)
                        elif not gt.is_homozygous():
                            heterozygous.add(index)
                        else:
                            assert gt.is_homozygous()
                    # Everything that is not heterozygous (missing or
                    # homozygous) is dropped from this sample's table
                    to_discard = set(range(
                        len(variant_table))).difference(heterozygous)
                    phasable_variant_table = deepcopy(variant_table)
                    # Remove calls to be discarded from variant table
                    phasable_variant_table.remove_rows_by_index(to_discard)

                    logger.info(
                        "Number of variants skipped due to missing genotypes: %d",
                        len(missing_genotypes),
                    )
                    logger.info(
                        "Number of remaining heterozygous variants: %d",
                        len(phasable_variant_table))

                    # Get the reads belonging to this sample
                    timers.start("read_bam")
                    readset, vcf_source_ids = phased_input_reader.read(
                        chromosome, phasable_variant_table.variants, sample)
                    readset.sort()
                    timers.stop("read_bam")

                    # Verify genotypes
                    if verify_genotypes:
                        timers.start("verify_genotypes")
                        logger.info("Verify genotyping of %s", sample)
                        positions = [
                            v.position for v in phasable_variant_table.variants
                        ]
                        computed_genotypes = [
                            Genotype(gt) for gt in compute_polyploid_genotypes(
                                readset, ploidy, positions)
                        ]
                        # skip all positions at which genotypes do not match
                        given_genotypes = phasable_variant_table.genotypes_of(
                            sample)
                        matching_genotypes = []
                        missing_genotypes = set()
                        # Diagnostics go to the logger: printing to stdout would
                        # end up inside the VCF when output is sys.stdout
                        logger.debug("Computed genotypes (%d): %s",
                                     len(computed_genotypes),
                                     computed_genotypes)
                        logger.debug("Given genotypes (%d): %s",
                                     len(given_genotypes), given_genotypes)
                        logger.debug("Number of positions: %d", len(positions))
                        for i, g in enumerate(given_genotypes):
                            c_g = computed_genotypes[i]
                            # NOTE(review): every c_g was built via Genotype(...)
                            # above, so `c_g is None` looks unreachable; confirm
                            # whether c_g.is_none() was intended.
                            if (g == c_g) or (c_g is None):
                                matching_genotypes.append(g)
                            else:
                                matching_genotypes.append(Genotype([]))
                                missing_genotypes.add(i)
                        phasable_variant_table.set_genotypes_of(
                            sample, matching_genotypes)

                        # Remove variants with deleted genotype
                        phasable_variant_table.remove_rows_by_index(
                            missing_genotypes)
                        logger.info(
                            "Number of variants removed due to inconsistent genotypes: %d",
                            len(missing_genotypes),
                        )
                        logger.info(
                            "Number of remaining heterozygous variants: %d",
                            len(phasable_variant_table),
                        )

                        # Re-read the readset to remove discarded variants
                        readset, vcf_source_ids = phased_input_reader.read(
                            chromosome, phasable_variant_table.variants,
                            sample)
                        readset.sort()
                        timers.stop("verify_genotypes")

                    # Remove reads with insufficient variants
                    readset = readset.subset([
                        i for i, read in enumerate(readset)
                        if len(read) >= min_variants_per_read
                    ])
                    logger.info(
                        "Kept %d reads that cover at least %d variants each",
                        len(readset), min_variants_per_read)

                    # Adapt the variant table to the subset of reads
                    phasable_variant_table.subset_rows_by_position(
                        readset.get_positions())

                    # Run the actual phasing
                    (
                        sample_components,
                        sample_haploid_components,
                        sample_superreads,
                    ) = phase_single_individual(readset,
                                                phasable_variant_table, sample,
                                                phasing_param, output, timers)

                    # Collect results
                    components[sample] = sample_components
                    haploid_components[sample] = sample_haploid_components
                    superreads[sample] = sample_superreads

                with timers("write_vcf"):
                    logger.info("======== Writing VCF")
                    vcf_writer.write(
                        chromosome,
                        superreads,
                        components,
                        haploid_components if include_haploid_sets else None,
                    )
                    # TODO: Use genotype information to polish results
                    # assert len(changed_genotypes) == 0
                    logger.info("Done writing VCF")
                logger.debug("Chromosome %r finished", chromosome)
                timers.start("parse_vcf")
            timers.stop("parse_vcf")
        except PloidyError as e:
            raise CommandLineError(e) from e

    # Currently always None (see above); kept for when read lists are supported
    if read_list_file:
        read_list_file.close()

    logger.info("\n== SUMMARY ==")

    log_memory_usage()
    logger.info("Time spent reading BAM/CRAM:                 %6.1f s",
                timers.elapsed("read_bam"))
    logger.info("Time spent parsing VCF:                      %6.1f s",
                timers.elapsed("parse_vcf"))
    if verify_genotypes:
        logger.info(
            "Time spent verifying genotypes:              %6.1f s",
            timers.elapsed("verify_genotypes"),
        )
    logger.info("Time spent detecting blocks:                 %6.1f s",
                timers.elapsed("detecting_blocks"))
    logger.info("Time spent scoring reads:                    %6.1f s",
                timers.elapsed("read_scoring"))
    logger.info(
        "Time spent solving cluster editing:          %6.1f s",
        timers.elapsed("solve_clusterediting"),
    )
    logger.info("Time spent threading haplotypes:             %6.1f s",
                timers.elapsed("threading"))
    if plot_clusters or plot_threading:
        logger.info("Time spent creating plots:                   %6.1f s",
                    timers.elapsed("create_plots"))
    logger.info("Time spent writing VCF:                      %6.1f s",
                timers.elapsed("write_vcf"))
    logger.info("Time spent on rest:                          %6.1f s",
                timers.total() - timers.sum())
    logger.info("Total elapsed time:                          %6.1f s",
                timers.total())

Exemple #4

0

Afficher le fichier

def run_whatshap(
    phase_input_files,
    variant_file,
    reference=None,
    output=sys.stdout,
    samples=None,
    chromosomes=None,
    ignore_read_groups=False,
    indels=True,
    mapping_quality=20,
    read_merging=False,
    read_merging_error_rate=0.15,
    read_merging_max_error_rate=0.25,
    read_merging_positive_threshold=1000000,
    read_merging_negative_threshold=1000,
    max_coverage=15,
    full_genotyping=False,
    distrust_genotypes=False,
    include_homozygous=False,
    ped=None,
    recombrate=1.26,
    genmap=None,
    genetic_haplotyping=True,
    recombination_list_filename=None,
    tag="PS",
    read_list_filename=None,
    gl_regularizer=None,
    gtchange_list_filename=None,
    default_gq=30,
    write_command_line_header=True,
    use_ped_samples=False,
    algorithm="whatshap",
):
    """
    Run WhatsHap.

    For every chromosome in the input VCF, reads are loaded per sample,
    filtered/selected, phased family by family, and the result is written to
    the output VCF. Chromosomes not requested via ``chromosomes`` are written
    out unchanged.

    phase_input_files -- list of paths to BAM/CRAM/VCF files
    variant_file -- path to input VCF
    reference -- path to reference FASTA
    output -- path to output VCF or a file-like object
    samples -- names of samples to phase. an empty list means: phase all samples
    chromosomes -- names of chromosomes to phase. an empty list means: phase all chromosomes
    ignore_read_groups -- forwarded to PhasedInputReader; on a multi-sample VCF
        this requires ``samples`` to be given
    indels -- forwarded to both PhasedInputReader and VcfReader
        (presumably: whether indels are considered -- confirm in those classes)
    mapping_quality -- discard reads below this mapping quality
    read_merging -- whether or not to merge reads
    read_merging_error_rate -- probability that a nucleotide is wrong
    read_merging_max_error_rate -- max error rate on edge of merge graph considered
    read_merging_positive_threshold -- threshold on the ratio of the two probabilities
    read_merging_negative_threshold -- threshold on the opposite ratio of positive threshold
    max_coverage -- coverage budget used for read selection; it is divided evenly
        among the members of a family (but never drops below 1 per sample)
    full_genotyping -- re-genotype all samples from the reads before phasing;
        implies distrust_genotypes and include_homozygous
    distrust_genotypes -- read genotype likelihoods from the VCF; genotypes may
        change during phasing
    include_homozygous -- forwarded to find_phaseable_variants
    ped -- PED file handed to PedReader/setup_families; not supported together
        with the hapchat algorithm
    recombrate -- uniform recombination rate in cM/Mb, used unless both ped and
        genmap are given
    genmap -- genetic map with region-specific recombination rates; only used
        when ped is given as well
    genetic_haplotyping -- in ped mode, merge disconnected blocks based on genotype status
    recombination_list_filename -- filename to write putative recombination events to
    tag -- How to store phasing info in the VCF, can be 'PS' or 'HP'
    read_list_filename -- name of file to write list of used reads to
    algorithm -- algorithm to use, can be 'whatshap' or 'hapchat'
    gl_regularizer -- float to be passed as regularization constant to GenotypeLikelihoods.as_phred
    gtchange_list_filename -- filename to write list of changed genotypes to
    default_gq -- genotype likelihood to be used when GL or PL not available
    write_command_line_header -- whether to add a ##commandline header to the output VCF
    use_ped_samples -- if set together with ped, only the samples listed in the
        PED file are phased

    Raises CommandLineError for invalid option combinations and when opening
    the VCF or parsing the genetic map fails.
    """

    if algorithm == "hapchat" and ped is not None:
        raise CommandLineError(
            "The hapchat algorithm cannot do pedigree phasing")

    timers = StageTimer()
    logger.info(
        "This is WhatsHap %s running under Python %s",
        __version__,
        platform.python_version(),
    )
    if full_genotyping:
        # Full genotyping only makes sense if genotypes may change and
        # homozygous sites are considered as well.
        distrust_genotypes = True
        include_homozygous = True
    numeric_sample_ids = NumericSampleIds()
    if write_command_line_header:
        command_line = "(whatshap {}) {}".format(__version__,
                                                 " ".join(sys.argv[1:]))
    else:
        command_line = None

    if read_merging:
        read_merger = ReadMerger(
            read_merging_error_rate,
            read_merging_max_error_rate,
            read_merging_positive_threshold,
            read_merging_negative_threshold,
        )
    else:
        read_merger = DoNothingReadMerger()

    with ExitStack() as stack:
        try:
            vcf_writer = stack.enter_context(
                PhasedVcfWriter(
                    command_line=command_line,
                    in_path=variant_file,
                    out_file=output,
                    tag=tag,
                ))
        except (OSError, VcfError) as e:
            # Chain explicitly so the underlying I/O or VCF error is recorded
            # as the cause of the user-facing error.
            raise CommandLineError(e) from e

        phased_input_reader = stack.enter_context(
            PhasedInputReader(
                phase_input_files,
                reference,
                numeric_sample_ids,
                ignore_read_groups,
                mapq_threshold=mapping_quality,
                indels=indels,
            ))
        show_phase_vcfs = phased_input_reader.has_vcfs

        # Only read genotype likelihoods from VCFs when distrusting genotypes
        vcf_reader = stack.enter_context(
            VcfReader(variant_file,
                      indels=indels,
                      genotype_likelihoods=distrust_genotypes))

        if ignore_read_groups and not samples and len(vcf_reader.samples) > 1:
            raise CommandLineError(
                "When using --ignore-read-groups on a VCF with "
                "multiple samples, --sample must also be used.")
        if not samples:
            samples = vcf_reader.samples

        # if --use-ped-samples is set, use only samples from PED file
        if ped and use_ped_samples:
            samples = PedReader(ped).samples()

        raise_if_any_sample_not_in_vcf(vcf_reader, samples)

        if ped and genmap:
            logger.info(
                "Using region-specific recombination rates from genetic map %s.",
                genmap,
            )
            try:
                recombination_cost_computer = GeneticMapRecombinationCostComputer(
                    genmap)
            except ParseError as e:
                raise CommandLineError(e) from e
        else:
            if ped:
                logger.info("Using uniform recombination rate of %g cM/Mb.",
                            recombrate)
            recombination_cost_computer = UniformRecombinationCostComputer(
                recombrate)

        samples = frozenset(samples)
        families, family_trios = setup_families(samples, ped,
                                                numeric_sample_ids,
                                                max_coverage)

        read_list = None
        if read_list_filename:
            read_list = stack.enter_context(ReadList(read_list_filename))
            if algorithm == "hapchat":
                logger.warning(
                    "On which haplotype a read occurs in the inferred solution is not yet "
                    "implemented in hapchat, and so the corresponding column in the "
                    "read list file contains no information about this")

        with timers("parse_phasing_vcfs"):
            # TODO should this be done in PhasedInputReader.__init__?
            phased_input_reader.read_vcfs()

        for variant_table in timers.iterate("parse_vcf", vcf_reader):
            chromosome = variant_table.chromosome
            if (not chromosomes) or (chromosome in chromosomes):
                logger.info("======== Working on chromosome %r", chromosome)
            else:
                logger.info(
                    "Leaving chromosome %r unchanged (present in VCF but not requested by option --chromosome)",
                    chromosome,
                )
                # Writing with empty results passes the chromosome through
                # without any phasing information added by this run.
                with timers("write_vcf"):
                    superreads, components = dict(), dict()
                    vcf_writer.write(chromosome, superreads, components)
                continue

            if full_genotyping:
                positions = [v.position for v in variant_table.variants]
                for sample in samples:
                    logger.info("---- Initial genotyping of %s", sample)
                    with timers("read_bam"):
                        bam_sample = None if ignore_read_groups else sample
                        readset, vcf_source_ids = phased_input_reader.read(
                            chromosome,
                            variant_table.variants,
                            bam_sample,
                            read_vcf=False,
                        )
                        readset.sort()  # TODO can be removed
                        genotypes, genotype_likelihoods = compute_genotypes(
                            readset, positions)
                        variant_table.set_genotypes_of(sample, genotypes)
                        variant_table.set_genotype_likelihoods_of(
                            sample,
                            [
                                GenotypeLikelihoods(gl)
                                for gl in genotype_likelihoods
                            ],
                        )

            # These two variables hold the phasing results for all samples
            superreads, components = dict(), dict()

            # Iterate over all families to process, i.e. a separate DP table is created
            # for each family.
            # TODO: Can the body of this loop be factored out into a phase_family function?
            for representative_sample, family in sorted(families.items()):
                if len(family) == 1:
                    logger.info("---- Processing individual %s",
                                representative_sample)
                else:
                    logger.info("---- Processing family with individuals: %s",
                                ",".join(family))
                max_coverage_per_sample = max(1, max_coverage // len(family))
                logger.info("Using maximum coverage per sample of %dX",
                            max_coverage_per_sample)
                trios = family_trios[representative_sample]
                assert len(family) == 1 or len(trios) > 0

                homozygous_positions, phasable_variant_table = find_phaseable_variants(
                    family, include_homozygous, trios, variant_table)

                # Get the reads belonging to each sample
                readsets = dict()  # TODO this could become a list
                for sample in family:
                    with timers("read_bam"):
                        readset, vcf_source_ids = phased_input_reader.read(
                            chromosome,
                            phasable_variant_table.variants,
                            sample,
                        )

                    # TODO: Read selection done w.r.t. all variants, where using heterozygous
                    #  variants only would probably give better results.
                    with timers("select"):
                        readset = readset.subset([
                            i for i, read in enumerate(readset)
                            if len(read) >= 2
                        ])
                        logger.info(
                            "Kept %d reads that cover at least two variants each",
                            len(readset),
                        )
                        merged_reads = read_merger.merge(readset)
                        selected_reads = select_reads(
                            merged_reads,
                            max_coverage_per_sample,
                            preferred_source_ids=vcf_source_ids,
                        )

                    readsets[sample] = selected_reads
                    if len(family) == 1 and not distrust_genotypes:
                        # When having a pedigree (len(family) > 1), blocks are also merged after
                        # phasing based on the pedigree information and these statistics are not
                        # so useful. When distrust_genotypes, genotypes can change during phasing
                        # and so can the block structure. So don't print these stats in those cases
                        log_best_case_phasing_info(readset, selected_reads)

                all_reads = merge_readsets(readsets)

                # Determine which variants can (in principle) be phased
                accessible_positions = sorted(all_reads.get_positions())
                logger.info(
                    "Variants covered by at least one phase-informative "
                    "read in at least one individual after read selection: %d",
                    len(accessible_positions),
                )
                if len(family) > 1 and genetic_haplotyping:
                    # In case of genetic haplotyping, also retain all positions homozygous
                    # in at least one individual (because they might be phased based on genotypes)
                    accessible_positions = sorted(
                        set(accessible_positions).union(homozygous_positions))
                    logger.info(
                        "Variants either covered by phase-informative read or homozygous "
                        "in at least one individual: %d",
                        len(accessible_positions),
                    )

                # Keep only accessible positions
                phasable_variant_table.subset_rows_by_position(
                    accessible_positions)
                assert len(phasable_variant_table.variants) == len(
                    accessible_positions)

                pedigree = create_pedigree(
                    default_gq,
                    distrust_genotypes,
                    family,
                    gl_regularizer,
                    numeric_sample_ids,
                    phasable_variant_table,
                    trios,
                )

                recombination_costs = recombination_cost_computer.compute(
                    accessible_positions)

                # Finally, run phasing algorithm
                with timers("phase"):
                    problem_name = "MEC" if len(family) == 1 else "PedMEC"
                    logger.info(
                        "Phasing %d sample%s by solving the %s problem ...",
                        len(family),
                        plural_s(len(family)),
                        problem_name,
                    )

                    if algorithm == "hapchat":
                        dp_table = HapChatCore(all_reads)
                    else:
                        dp_table = PedigreeDPTable(
                            all_reads,
                            recombination_costs,
                            pedigree,
                            distrust_genotypes,
                            accessible_positions,
                        )

                    superreads_list, transmission_vector = dp_table.get_super_reads()
                    optimal_cost = dp_table.get_optimal_cost()
                    logger.info("%s cost: %d", problem_name, optimal_cost)

                with timers("components"):
                    master_block = None
                    heterozygous_positions_by_sample = None
                    # If we distrusted genotypes, we need to re-determine which sites are
                    # homo-/heterozygous after phasing
                    if distrust_genotypes:
                        hom_in_any_sample = set()
                        heterozygous_positions_by_sample = {}
                        heterozygous_gts = frozenset({(0, 1), (1, 0)})
                        homozygous_gts = frozenset({(0, 0), (1, 1)})
                        # Build the lookup set once: membership tests against the
                        # sorted list would be linear for every superread variant.
                        accessible_position_set = frozenset(
                            accessible_positions)
                        for sample, sample_superreads in zip(
                                family, superreads_list):
                            hets = set()
                            for v1, v2 in zip(*sample_superreads):
                                assert v1.position == v2.position
                                if v1.position not in accessible_position_set:
                                    continue
                                gt = (v1.allele, v2.allele)
                                if gt in heterozygous_gts:
                                    hets.add(v1.position)
                                elif gt in homozygous_gts:
                                    hom_in_any_sample.add(v1.position)
                            heterozygous_positions_by_sample[
                                numeric_sample_ids[sample]] = hets
                        if len(family) > 1 and genetic_haplotyping:
                            master_block = sorted(hom_in_any_sample)
                    else:
                        if len(family) > 1 and genetic_haplotyping:
                            master_block = sorted(
                                set(homozygous_positions).intersection(
                                    set(accessible_positions)))
                    overall_components = find_components(
                        accessible_positions,
                        all_reads,
                        master_block,
                        heterozygous_positions_by_sample,
                    )
                    n_phased_blocks = len(set(overall_components.values()))
                    logger.info("No. of phased blocks: %d", n_phased_blocks)
                    largest_component = find_largest_component(
                        overall_components)
                    if len(largest_component) > 0:
                        # Positions are reported 1-based in the log message.
                        logger.info(
                            "Largest component contains %d variants (%.1f%% of accessible variants) between position %d and %d",
                            len(largest_component),
                            len(largest_component) * 100.0 /
                            len(accessible_positions),
                            largest_component[0] + 1,
                            largest_component[-1] + 1,
                        )

                if recombination_list_filename:
                    # NOTE(review): this is called once per family and chromosome
                    # with the same filename -- confirm the helper does not
                    # overwrite earlier output.
                    n_recombinations = write_recombination_list(
                        recombination_list_filename,
                        chromosome,
                        accessible_positions,
                        overall_components,
                        recombination_costs,
                        transmission_vector,
                        trios,
                    )
                    logger.info(
                        "Total no. of detected recombination events: %d",
                        n_recombinations,
                    )

                # Superreads in superreads_list are in the same order as individuals were added to the pedigree
                for sample, sample_superreads in zip(family, superreads_list):
                    superreads[sample] = sample_superreads
                    assert len(sample_superreads) == 2
                    assert (sample_superreads[0].sample_id ==
                            sample_superreads[1].sample_id ==
                            numeric_sample_ids[sample])
                    # identical for all samples
                    components[sample] = overall_components

                if read_list:
                    read_list.write(
                        all_reads,
                        dp_table.get_optimal_partitioning(),
                        components,
                        numeric_sample_ids,
                    )

            with timers("write_vcf"):
                logger.info("======== Writing VCF")
                changed_genotypes = vcf_writer.write(chromosome, superreads,
                                                     components)
                logger.info("Done writing VCF")
                if changed_genotypes:
                    assert distrust_genotypes
                    logger.info("Changed %d genotypes while writing VCF",
                                len(changed_genotypes))

            if gtchange_list_filename:
                # NOTE(review): invoked once per chromosome with the same
                # filename -- confirm the helper appends rather than overwrites.
                logger.info("Writing list of changed genotypes to %r",
                            gtchange_list_filename)
                write_changed_genotypes(gtchange_list_filename,
                                        changed_genotypes)

            logger.debug("Chromosome %r finished", chromosome)

    log_time_and_memory_usage(timers, show_phase_vcfs=show_phase_vcfs)