Example #1
0
def test_genotyping_trio5():
    """Genotype a three-individual pedigree over three variant positions.

    Each individual is registered with genotype index 0 at every position and
    a uniform (1/3, 1/3, 1/3) likelihood triple per position.  The pedigree,
    the reads and the expected genotypes are then handed to
    ``genotype_pedigree``.
    """
    reads = """
      B 101
      B 101
      B 101
      A 111
      A 111
      A 111
      C 111
      C 111
      C 101
      C 101
    """
    n_variants = 3
    # One list of expected genotype indices per individual.
    expected_genotypes = [
        canonic_index_list_to_biallelic_gt_list(indices)
        for indices in ([2, 2, 2], [2, 0, 2], [2, 1, 2])
    ]

    numeric_sample_ids = NumericSampleIds()
    pedigree = Pedigree(numeric_sample_ids)
    members = ("individual0", "individual1", "individual2")
    for member in members:
        start_genotypes = canonic_index_list_to_biallelic_gt_list([0] * n_variants)
        # A single likelihood object is shared by all positions of one member.
        uniform = PhredGenotypeLikelihoods([1.0 / 3.0, 1.0 / 3.0, 1.0 / 3.0])
        pedigree.add_individual(member, start_genotypes, [uniform] * n_variants)
    pedigree.add_relationship(*members)

    recombcost = [2] * n_variants
    genotype_pedigree(numeric_sample_ids, reads, recombcost, pedigree, expected_genotypes)
Example #2
0
def verify(rs, all_heterozygous=False):
    """Phase read set ``rs`` as a single individual and verify the result.

    A one-member pedigree is built whose genotype is 1 at every position.
    When ``all_heterozygous`` is false, genotypes are distrusted and each
    position gets ``PhredGenotypeLikelihoods(0, 0, 0)``; otherwise no
    likelihoods are supplied.  The resulting DP table is passed to
    ``verify_mec_score_and_partitioning``.
    """
    n_positions = len(rs.get_positions())
    # recombination costs 1, should not occur
    recombcost = [1] * n_positions
    pedigree = Pedigree(NumericSampleIds())

    if all_heterozygous:
        likelihood = None
    else:
        likelihood = PhredGenotypeLikelihoods(0, 0, 0)

    # all genotypes heterozygous
    pedigree.add_individual('individual0', [1] * n_positions, [likelihood] * n_positions)
    dp_table = PedigreeDPTable(
        rs, recombcost, pedigree, distrust_genotypes=not all_heterozygous
    )
    verify_mec_score_and_partitioning(dp_table, rs)
def test_genotyping_trio14():
    """Genotype a three-individual pedigree over six variant positions.

    All individuals start with genotype index 0 and uniform likelihoods at
    every position.  Recombination is given a cost of 1000000 per position
    and the reads are passed on with ``scaling=1000``.
    """
    reads = """
      A 111111
      A 111111
      B 111111
      B 000000
      C 000000
    """

    n_variants = 6
    # Expected genotype index per individual, constant across all positions.
    expected_genotypes = [
        canonic_index_list_to_biallelic_gt_list([index] * n_variants) for index in (2, 1, 1)
    ]

    numeric_sample_ids = NumericSampleIds()
    pedigree = Pedigree(numeric_sample_ids)
    members = ("individual0", "individual1", "individual2")
    for member in members:
        start_genotypes = canonic_index_list_to_biallelic_gt_list([0] * n_variants)
        # A single likelihood object is shared by all positions of one member.
        uniform = PhredGenotypeLikelihoods([1 / 3.0, 1 / 3.0, 1 / 3.0])
        pedigree.add_individual(member, start_genotypes, [uniform] * n_variants)
    pedigree.add_relationship(*members)

    recombcost = [1000000] * n_variants
    genotype_pedigree(
        numeric_sample_ids, reads, recombcost, pedigree, expected_genotypes, scaling=1000,
    )
def verify(rs, all_heterozygous=False):
    """Phase read set ``rs`` as a single individual and verify the result.

    The individual is registered with ``canonic_index_to_biallelic_gt(1)`` at
    every position.  When ``all_heterozygous`` is false, genotypes are
    distrusted and each position gets ``PhredGenotypeLikelihoods([0, 0, 0])``;
    otherwise no likelihoods are supplied.  The resulting DP table is passed
    to ``verify_mec_score_and_partitioning``.
    """
    n_positions = len(rs.get_positions())
    # A cost of 1 per position; no recombination is expected to occur.
    recombcost = [1] * n_positions
    pedigree = Pedigree(NumericSampleIds())

    if all_heterozygous:
        likelihood = None
    else:
        likelihood = PhredGenotypeLikelihoods([0, 0, 0])
    genotype_likelihoods = [likelihood] * n_positions

    # Every position carries the genotype with canonic index 1.
    het_genotypes = [canonic_index_to_biallelic_gt(1) for _ in range(n_positions)]
    pedigree.add_individual("individual0", het_genotypes, genotype_likelihoods)

    dp_table = PedigreeDPTable(
        rs,
        recombcost,
        pedigree,
        distrust_genotypes=not all_heterozygous,
    )
    verify_mec_score_and_partitioning(dp_table, rs)
Example #5
0
def check_genotyping_single_individual(reads, weights=None, expected=None, genotypes=None, scaling=None, genotype_priors=None):
    """Genotype a single individual and compare the result to expectations.

    Args:
        reads: Textual read matrix, passed to ``string_to_readset`` as ``s``.
        weights: Optional weights, passed to ``string_to_readset`` as ``w``.
        expected: Expected likelihoods, forwarded to ``compare_to_expected``.
        genotypes: Expected genotypes, forwarded to ``compare_to_expected``.
        scaling: Passed to ``string_to_readset`` as ``scale_quality``.
        genotype_priors: Optional per-position prior likelihoods.  When
            ``None``, a uniform (1/3, 1/3, 1/3) prior is used at every
            position.
    """
    # 0) set up read set
    readset = string_to_readset(s=reads, w=weights, scale_quality=scaling)
    positions = readset.get_positions()

    # 1) Genotype using forward backward algorithm
    recombcost = [1] * len(positions)
    numeric_sample_ids = NumericSampleIds()
    pedigree = Pedigree(numeric_sample_ids)

    # Identity test against None: ``!= None`` would dispatch to the argument's
    # own ``__ne__`` and can misbehave for array-like priors.
    if genotype_priors is None:
        genotype_likelihoods = [
            PhredGenotypeLikelihoods(1.0 / 3.0, 1.0 / 3.0, 1.0 / 3.0)
        ] * len(positions)
    else:
        genotype_likelihoods = genotype_priors

    pedigree.add_individual('individual0', [1] * len(positions), genotype_likelihoods)
    dp_forward_backward = GenotypeDPTable(numeric_sample_ids, readset, recombcost, pedigree)

    # check the results
    compare_to_expected(dp_forward_backward, positions, expected, genotypes)
Example #6
0
def phase_MAV(reads, n_alleles, all_het, genos, genotypes, weights=None):
    """Phase ``reads`` as one individual, once per flag in ``all_het``.

    For every flag a fresh single-member pedigree with the given ``genotypes``
    is phased with ``PedigreeDPTable``.  When the flag is false, genotypes are
    distrusted and each position gets ``PhredGenotypeLikelihoods(genos)``.

    Returns:
        ``(super_reads, transmission_vector, optimal_cost)`` from the LAST
        iteration only; results of earlier iterations are discarded.

    Note:
        ``weights`` is accepted but not used.  An empty ``all_het`` leaves the
        result unbound, so the ``return`` raises ``UnboundLocalError``.
    """
    readset = string_to_readset(reads, n_alleles)
    n_positions = len(readset.get_positions())

    for all_heterozygous in all_het:
        # recombination costs 1, should not occur
        recombcost = [1] * n_positions
        pedigree = Pedigree(NumericSampleIds())

        if all_heterozygous:
            likelihood = None
        else:
            likelihood = PhredGenotypeLikelihoods(genos)

        pedigree.add_individual('individual0', genotypes, [likelihood] * n_positions)
        dp_table = PedigreeDPTable(
            readset, recombcost, pedigree, distrust_genotypes=not all_heterozygous
        )
        super_reads, transmission = dp_table.get_super_reads()
        outcome = (super_reads, transmission, dp_table.get_optimal_cost())

    return outcome
Example #7
0
def create_pedigree(
    default_gq,
    distrust_genotypes,
    family,
    gl_regularizer,
    numeric_sample_ids,
    phasable_variant_table,
    trios,
):
    """Build a ``Pedigree`` holding every sample of ``family`` plus its trios.

    When ``distrust_genotypes`` is set, each sample is registered with one
    phred-scaled likelihood object per variant:

    * if the variant table provides likelihoods, they are converted with
      ``as_phred(regularizer=gl_regularizer)``;
    * otherwise the called genotype gets 0 and the two other genotypes get
      ``default_gq``.

    When it is not set, ``None`` is passed instead of likelihoods.

    Returns:
        The populated ``Pedigree``.
    """
    pedigree = Pedigree(numeric_sample_ids)

    for sample in family:
        genotype_likelihoods = None
        if distrust_genotypes:
            # Likelihoods are only handed to the pedigree when genotypes are distrusted.
            genotype_likelihoods = []
            called = phasable_variant_table.genotypes_of(sample)
            given = phasable_variant_table.genotype_likelihoods_of(sample)
            for gt, gl in zip(called, given):
                assert gt.is_diploid_and_biallelic()
                if gl is not None:
                    genotype_likelihoods.append(gl.as_phred(regularizer=gl_regularizer))
                    continue
                # No likelihoods available: default_gq everywhere except
                # for the called genotype, which gets 0.
                phred = [default_gq] * 3
                phred[gt.get_index()] = 0
                genotype_likelihoods.append(PhredGenotypeLikelihoods(phred))

        pedigree.add_individual(
            sample,
            phasable_variant_table.genotypes_of(sample),
            genotype_likelihoods,
        )

    for trio in trios:
        pedigree.add_relationship(
            father_id=trio.father, mother_id=trio.mother, child_id=trio.child
        )

    return pedigree
Example #8
0
def test_geno_10():
    """Genotype six positions of one individual with per-position priors.

    Positions 0, 1, 4 and 5 use the prior (0.1, 0.8, 0.1); positions 2 and 3
    use (0.7, 0.2, 0.1).  Only the expected genotypes are checked (no expected
    likelihoods are passed).
    """
    reads = """
    001100
    000000
    000000
    110011
    110011
    111111
         """
    genotypes = canonic_index_list_to_biallelic_gt_list([1, 1, 0, 0, 1, 1])

    prior_a = (0.1, 0.8, 0.1)
    prior_b = (0.7, 0.2, 0.1)
    # One separate likelihood object per position.
    genotype_priors = [
        PhredGenotypeLikelihoods(list(prior))
        for prior in (prior_a, prior_a, prior_b, prior_b, prior_a, prior_a)
    ]

    check_genotyping_single_individual(reads, None, None, genotypes, 50, genotype_priors)
Example #9
0
def test_geno_priors2():
    """Genotype three positions with explicit priors and check likelihoods.

    Only the expected likelihoods are checked (no expected genotypes are
    passed); the reads are given with a scaling value of 10.
    """
    reads = """
            11
             01
             """

    prior_values = (
        [0, 0.5, 0.5],
        [0.25, 0.5, 0.25],
        [0.1, 0.4, 0.5],
    )
    expected_values = (
        [0.0, 0.35714285714285715, 0.6428571428571429],
        [0.1323529411764706, 0.7352941176470589, 0.1323529411764706],
        [0.015151515151515152, 0.30303030303030304, 0.6818181818181818],
    )
    prior_likelihoods = [PhredGenotypeLikelihoods(values) for values in prior_values]
    expected_likelihoods = [PhredGenotypeLikelihoods(values) for values in expected_values]

    check_genotyping_single_individual(
        reads, None, expected_likelihoods, None, 10, prior_likelihoods
    )
Example #10
0
def check_phasing_single_individual(reads, algorithm="whatshap", weights=None):
    """Phase ``reads`` and compare the outcome against brute force.

    With ``algorithm == "hapchat"`` the reads are phased once with
    ``HapChatCore`` and compared, after which the function returns.

    Otherwise ``PedigreeDPTable`` is run four times: as a single individual
    and as a trio (all three members registered with identical genotypes and
    likelihoods), each with and without distrusting genotypes.  Every run
    asserts that the transmission vector holds a single distinct value and
    is checked with ``compare_phasing_brute_force``.
    """
    # 0) set up read set
    readset = string_to_readset(reads, weights)
    positions = readset.get_positions()

    # for hapchat
    if algorithm == "hapchat":
        hapchat_table = HapChatCore(readset)
        superreads = hapchat_table.get_super_reads()
        cost = hapchat_table.get_optimal_cost()
        partition = hapchat_table.get_optimal_partitioning()
        compare_phasing_brute_force(
            superreads[0][0], cost, partition, readset, True, weights, algorithm
        )
        return

    n_positions = len(positions)

    # 1) Phase using PedMEC code for single individual
    for all_heterozygous in (False, True):
        # recombination costs 1, should not occur
        recombcost = [1] * n_positions
        pedigree = Pedigree(NumericSampleIds())
        if all_heterozygous:
            likelihood = None
        else:
            likelihood = PhredGenotypeLikelihoods([0, 0, 0])
        genotype_likelihoods = [likelihood] * n_positions
        # all genotypes heterozygous
        het_genotypes = [canonic_index_to_biallelic_gt(1) for _ in range(n_positions)]
        pedigree.add_individual("individual0", het_genotypes, genotype_likelihoods)
        dp_table = PedigreeDPTable(
            readset, recombcost, pedigree, distrust_genotypes=not all_heterozygous
        )
        superreads, transmission_vector = dp_table.get_super_reads()
        cost = dp_table.get_optimal_cost()
        # TODO: transmission vectors not returned properly, see issue 73
        assert len(set(transmission_vector)) == 1
        partition = dp_table.get_optimal_partitioning()
        compare_phasing_brute_force(
            superreads[0], cost, partition, readset, all_heterozygous, weights
        )

    # 2) Phase using PedMEC code for trios with two "empty" individuals (i.e. having no reads)
    trio_members = ("individual0", "individual1", "individual2")
    for all_heterozygous in (False, True):
        # recombination costs 1, should not occur
        recombcost = [1] * n_positions
        pedigree = Pedigree(NumericSampleIds())
        if all_heterozygous:
            likelihood = None
        else:
            likelihood = PhredGenotypeLikelihoods([0, 0, 0])
        # The same likelihood list is shared by all three members.
        genotype_likelihoods = [likelihood] * n_positions
        for member in trio_members:
            # all genotypes heterozygous; each member gets its own genotype list
            pedigree.add_individual(
                member,
                [canonic_index_to_biallelic_gt(1) for _ in range(n_positions)],
                genotype_likelihoods,
            )
        pedigree.add_relationship(*trio_members)
        dp_table = PedigreeDPTable(
            readset, recombcost, pedigree, distrust_genotypes=not all_heterozygous
        )
        cost = dp_table.get_optimal_cost()
        superreads, transmission_vector = dp_table.get_super_reads()
        assert len(set(transmission_vector)) == 1
        partition = dp_table.get_optimal_partitioning()
        compare_phasing_brute_force(
            superreads[0], cost, partition, readset, all_heterozygous, weights
        )
Example #11
0
def run_genotype(
    phase_input_files,
    variant_file,
    reference=None,
    output=sys.stdout,
    samples=None,
    chromosomes=None,
    ignore_read_groups=False,
    indels=True,
    mapping_quality=20,
    max_coverage=15,
    nopriors=False,
    ped=None,
    recombrate=1.26,
    genmap=None,
    gt_qual_threshold=0,
    prioroutput=None,
    constant=0.0,
    overhang=10,
    affine_gap=False,
    gap_start=10,
    gap_extend=7,
    mismatch=15,
    write_command_line_header=True,
    use_ped_samples=False,
):
    """
    Run the genotyping algorithm on the variants of a VCF file.

    For now: this function only runs the genotyping algorithm. Genotype likelihoods for
    all variants are computed using the forward backward algorithm.

    The variant file is processed one variant table (chromosome) at a time:

    1. Unless ``nopriors`` is set, prior genotype likelihoods are computed for every
       sample with ``compute_genotypes``; ``constant`` is added to each of the three
       likelihoods before renormalizing. With ``nopriors``, uniform likelihoods
       (1/3 each) are used instead.
    2. If ``prioroutput`` is not None, the priors are written to that file.
    3. For every family (or single individual), reads covering at least two variants
       are kept, selected with a per-sample coverage limit of
       ``max(1, max_coverage // len(family))``, merged into one read set and passed,
       together with a ``Pedigree`` and recombination costs, to ``GenotypeDPTable``.
    4. The resulting likelihoods and genotypes are stored in the variant table, which
       is then written to ``output``.

    Chromosomes that are not contained in a non-empty ``chromosomes`` are written
    unchanged. Timing information is logged at the end.

    Parameters:
        phase_input_files, reference, ignore_read_groups, indels, mapping_quality,
        overhang, affine_gap, gap_start, gap_extend, mismatch:
            Forwarded to ``PhasedInputReader`` (``indels`` is also passed to
            ``VcfReader`` and to the VCF writers).
        variant_file: Input VCF; read with ``VcfReader`` and given to the writers as
            ``in_path``.
        output: File object the genotyped VCF is written to.
        samples: Samples to genotype. If empty/None, all samples of the VCF are used.
        chromosomes: If non-empty, only these chromosomes are genotyped.
        max_coverage: Coverage budget per family, split evenly among its members.
        nopriors: Skip the initial genotyping and use uniform priors.
        ped: PED file; passed to ``setup_families`` and, with ``use_ped_samples``, to
            ``PedReader``.
        recombrate: Uniform recombination rate in cM/Mb (used unless both ``ped`` and
            ``genmap`` are given).
        genmap: Genetic map for region-specific recombination costs; only used when
            ``ped`` is given as well.
        gt_qual_threshold: Phred-scaled quality threshold; converted to the
            probability threshold handed to ``determine_genotype``.
        prioroutput: Path of a file to which the prior likelihoods are written.
        constant: Regularization constant added to each prior likelihood.
        write_command_line_header: If true, a command-line string is passed to the
            VCF writers; otherwise None is passed.
        use_ped_samples: With ``ped``, restrict the samples to the members of
            complete trios (child, mother and father all present) of the PED file.

    Raises:
        CommandLineError: If ``ignore_read_groups`` is set without ``samples`` on a
            VCF with more than one sample, or if a requested sample is not in the VCF.
    """
    timers = StageTimer()
    logger.info(
        "This is WhatsHap (genotyping) %s running under Python %s",
        __version__,
        platform.python_version(),
    )
    # Command line string handed to the VCF writers (None if it should be omitted)
    if write_command_line_header:
        command_line = "(whatshap {}) {}".format(__version__,
                                                 " ".join(sys.argv[1:]))
    else:
        command_line = None
    # All readers/writers are registered on the ExitStack so they are closed together
    with ExitStack() as stack:
        # read the given input files (BAMs, VCFs, ref...)
        numeric_sample_ids = NumericSampleIds()
        phased_input_reader = stack.enter_context(
            PhasedInputReader(
                phase_input_files,
                reference,
                numeric_sample_ids,
                ignore_read_groups,
                indels=indels,
                mapq_threshold=mapping_quality,
                overhang=overhang,
                affine=affine_gap,
                gap_start=gap_start,
                gap_extend=gap_extend,
                default_mismatch=mismatch,
            ))
        # remembered for the summary after the stack has been closed
        show_phase_vcfs = phased_input_reader.has_vcfs

        # vcf writer for final genotype likelihoods
        vcf_writer = stack.enter_context(
            GenotypeVcfWriter(command_line=command_line,
                              in_path=variant_file,
                              out_file=output))
        # vcf writer for only the prior likelihoods (if output is desired)
        prior_vcf_writer = None
        if prioroutput is not None:
            prior_vcf_writer = stack.enter_context(
                GenotypeVcfWriter(
                    command_line=command_line,
                    in_path=variant_file,
                    out_file=stack.enter_context(open(prioroutput, "w")),
                ))

        # parse vcf with input variants
        # remove all likelihoods that may already be present
        vcf_reader = stack.enter_context(
            VcfReader(
                variant_file,
                indels=indels,
                genotype_likelihoods=False,
                ignore_genotypes=True,
            ))

        if ignore_read_groups and not samples and len(vcf_reader.samples) > 1:
            raise CommandLineError(
                "When using --ignore-read-groups on a VCF with "
                "multiple samples, --sample must also be used.")
        # default: genotype every sample present in the VCF
        if not samples:
            samples = vcf_reader.samples

        # if --use-ped-samples is set, use only samples from PED file
        if ped and use_ped_samples:
            samples = set()
            for trio in PedReader(ped):
                # incomplete trios do not contribute any samples
                if trio.child is None or trio.mother is None or trio.father is None:
                    continue
                samples.add(trio.mother)
                samples.add(trio.father)
                samples.add(trio.child)

        # every requested sample must exist in the VCF
        vcf_sample_set = set(vcf_reader.samples)
        for sample in samples:
            if sample not in vcf_sample_set:
                raise CommandLineError(
                    "Sample {!r} requested on command-line not found in VCF".
                    format(sample))

        # a genetic map is only used in combination with a PED file
        if ped and genmap:
            logger.info(
                "Using region-specific recombination rates from genetic map %s.",
                genmap,
            )
            recombination_cost_computer = GeneticMapRecombinationCostComputer(
                genmap)
        else:
            if ped:
                logger.info("Using uniform recombination rate of %g cM/Mb.",
                            recombrate)
            recombination_cost_computer = UniformRecombinationCostComputer(
                recombrate)

        samples = frozenset(samples)
        families, family_trios = setup_families(samples, ped,
                                                numeric_sample_ids,
                                                max_coverage)

        # Read phase information provided as VCF files, if provided.
        with timers("parse_phasing_vcfs"):
            phased_input_reader.read_vcfs()

        # compute genotype likelihood threshold
        # (phred-scaled quality q is turned into the probability 1 - 10^(-q/10))
        gt_prob = 1.0 - (10**(-gt_qual_threshold / 10.0))

        for variant_table in timers.iterate("parse_vcf", vcf_reader):

            # create a mapping of genome positions to indices
            var_to_pos = dict()
            for i in range(len(variant_table.variants)):
                var_to_pos[variant_table.variants[i].position] = i

            chromosome = variant_table.chromosome
            if (not chromosomes) or (chromosome in chromosomes):
                logger.info("======== Working on chromosome %r", chromosome)
            else:
                # chromosome was not requested: copy it to the output(s) and move on
                logger.info(
                    "Leaving chromosome %r unchanged (present in VCF but not requested by option --chromosome)",
                    chromosome,
                )
                vcf_writer.write_genotypes(chromosome,
                                           variant_table,
                                           indels,
                                           leave_unchanged=True)
                if prioroutput is not None:
                    prior_vcf_writer.write_genotypes(chromosome,
                                                     variant_table,
                                                     indels,
                                                     leave_unchanged=True)
                continue

            positions = [v.position for v in variant_table.variants]
            if not nopriors:
                # compute prior genotype likelihoods based on all reads
                for sample in samples:
                    logger.info("---- Initial genotyping of %s", sample)
                    # NOTE(review): the prior computation below is also accounted
                    # to the "read_bam" timer, not only the reading itself
                    with timers("read_bam"):
                        readset, vcf_source_ids = phased_input_reader.read(
                            chromosome,
                            variant_table.variants,
                            sample,
                            read_vcf=False,
                        )
                        readset.sort()
                        genotypes, genotype_likelihoods = compute_genotypes(
                            readset, positions)
                        # recompute genotypes based on given threshold
                        reg_genotype_likelihoods = []
                        for gl in range(len(genotype_likelihoods)):
                            # add `constant` to each of the three likelihoods and
                            # renormalize so that they sum to one again
                            norm_sum = (genotype_likelihoods[gl][0] +
                                        genotype_likelihoods[gl][1] +
                                        genotype_likelihoods[gl][2] +
                                        3 * constant)
                            regularized = PhredGenotypeLikelihoods([
                                (genotype_likelihoods[gl][0] + constant) /
                                norm_sum,
                                (genotype_likelihoods[gl][1] + constant) /
                                norm_sum,
                                (genotype_likelihoods[gl][2] + constant) /
                                norm_sum,
                            ])
                            genotypes[gl] = determine_genotype(
                                regularized, gt_prob)
                            assert isinstance(genotypes[gl], Genotype)
                            reg_genotype_likelihoods.append(regularized)
                        variant_table.set_genotype_likelihoods_of(
                            sample,
                            [
                                PhredGenotypeLikelihoods(list(gl))
                                for gl in reg_genotype_likelihoods
                            ],
                        )
                        variant_table.set_genotypes_of(sample, genotypes)
            else:

                # use uniform genotype likelihoods for all individuals
                for sample in samples:
                    variant_table.set_genotype_likelihoods_of(
                        sample,
                        [PhredGenotypeLikelihoods([1 / 3, 1 / 3, 1 / 3])] *
                        len(positions),
                    )

            # if desired, output the priors in separate vcf
            if prioroutput is not None:
                prior_vcf_writer.write_genotypes(chromosome, variant_table,
                                                 indels)

            # Iterate over all families to process, i.e. a separate DP table is created
            # for each family.
            for representative_sample, family in sorted(families.items()):
                if len(family) == 1:
                    logger.info("---- Processing individual %s",
                                representative_sample)
                else:
                    logger.info("---- Processing family with individuals: %s",
                                ",".join(family))
                # split the coverage budget evenly among the family members (at least 1)
                max_coverage_per_sample = max(1, max_coverage // len(family))
                logger.info("Using maximum coverage per sample of %dX",
                            max_coverage_per_sample)
                trios = family_trios[representative_sample]
                assert (len(family) == 1) or (len(trios) > 0)

                # Get the reads belonging to each sample
                readsets = dict()
                for sample in family:
                    with timers("read_bam"):
                        readset, vcf_source_ids = phased_input_reader.read(
                            chromosome,
                            variant_table.variants,
                            sample,
                        )

                    with timers("select"):
                        # drop reads covering fewer than two variants
                        readset = readset.subset([
                            i for i, read in enumerate(readset)
                            if len(read) >= 2
                        ])
                        logger.info(
                            "Kept %d reads that cover at least two variants each",
                            len(readset),
                        )
                        selected_reads = select_reads(
                            readset,
                            max_coverage_per_sample,
                            preferred_source_ids=vcf_source_ids,
                        )
                    readsets[sample] = selected_reads

                # Merge reads into one ReadSet (note that each Read object
                # knows the sample it originated from).
                all_reads = ReadSet()
                for sample, readset in readsets.items():
                    for read in readset:
                        assert read.is_sorted(), "Add a read.sort() here"
                        all_reads.add(read)

                all_reads.sort()

                # Determine which variants can (in principle) be phased
                accessible_positions = sorted(all_reads.get_positions())
                logger.info(
                    "Variants covered by at least one phase-informative "
                    "read in at least one individual after read selection: %d",
                    len(accessible_positions),
                )

                # Create Pedigree
                pedigree = Pedigree(numeric_sample_ids)
                for sample in family:
                    # genotypes are assumed to be unknown, so ignore information that
                    # might already be present in the input vcf
                    all_genotype_likelihoods = variant_table.genotype_likelihoods_of(
                        sample)
                    # restrict the likelihoods to the accessible positions
                    genotype_l = [
                        all_genotype_likelihoods[var_to_pos[a_p]]
                        for a_p in accessible_positions
                    ]
                    pedigree.add_individual(
                        sample,
                        [
                            Genotype([])
                            for i in range(len(accessible_positions))
                        ],
                        genotype_l,
                    )
                for trio in trios:
                    pedigree.add_relationship(
                        father_id=trio.father,
                        mother_id=trio.mother,
                        child_id=trio.child,
                    )

                recombination_costs = recombination_cost_computer.compute(
                    accessible_positions)

                # Finally, run genotyping algorithm
                with timers("genotyping"):
                    problem_name = "genotyping"
                    logger.info(
                        "Genotype %d sample%s by solving the %s problem ...",
                        len(family),
                        "s" if len(family) > 1 else "",
                        problem_name,
                    )
                    forward_backward_table = GenotypeDPTable(
                        numeric_sample_ids,
                        all_reads,
                        recombination_costs,
                        pedigree,
                        accessible_positions,
                    )
                    # store results
                    for s in family:
                        likelihood_list = variant_table.genotype_likelihoods_of(
                            s)
                        genotypes_list = variant_table.genotypes_of(s)

                        # only accessible positions are updated; all other entries
                        # of the two lists are written back as they were read
                        for pos in range(len(accessible_positions)):
                            likelihoods = forward_backward_table.get_genotype_likelihoods(
                                s, pos)

                            # compute genotypes from likelihoods and store information
                            geno = determine_genotype(likelihoods, gt_prob)
                            assert isinstance(geno, Genotype)
                            genotypes_list[var_to_pos[
                                accessible_positions[pos]]] = geno
                            likelihood_list[var_to_pos[
                                accessible_positions[pos]]] = likelihoods

                        variant_table.set_genotypes_of(s, genotypes_list)
                        variant_table.set_genotype_likelihoods_of(
                            s, likelihood_list)

            with timers("write_vcf"):
                logger.info("======== Writing VCF")
                vcf_writer.write_genotypes(chromosome, variant_table, indels)
                logger.info("Done writing VCF")

            logger.debug("Chromosome %r finished", chromosome)

    # Summary of the time spent in each stage
    logger.info("\n== SUMMARY ==")
    total_time = timers.total()
    log_memory_usage()
    logger.info(
        "Time spent reading BAM:                      %6.1f s",
        timers.elapsed("read_bam"),
    )
    logger.info(
        "Time spent parsing VCF:                      %6.1f s",
        timers.elapsed("parse_vcf"),
    )
    # only reported when the phased input reader had VCFs
    if show_phase_vcfs:
        logger.info(
            "Time spent parsing input phasings from VCFs: %6.1f s",
            timers.elapsed("parse_phasing_vcfs"),
        )
    logger.info("Time spent selecting reads:                  %6.1f s",
                timers.elapsed("select"))
    logger.info(
        "Time spent genotyping:                          %6.1f s",
        timers.elapsed("genotyping"),
    )
    logger.info(
        "Time spent writing VCF:                      %6.1f s",
        timers.elapsed("write_vcf"),
    )
    # whatever is not covered by one of the named timers
    logger.info(
        "Time spent on rest:                          %6.1f s",
        total_time - timers.sum(),
    )
    logger.info("Total elapsed time:                          %6.1f s",
                total_time)
Example #12
0
def test_genotype_likelihoods():
    """Check construction of phred likelihoods and log10-to-phred conversion."""
    # Default construction yields three zeros; explicit values are kept as given.
    default_phred = PhredGenotypeLikelihoods()
    assert list(default_phred) == [0, 0, 0]
    explicit_phred = PhredGenotypeLikelihoods(7, 1, 12)
    assert list(explicit_phred) == [7, 1, 12]

    # GenotypeLikelihoods is constructed from log10-scaled probabilities.
    log10_probs = [math.log10(p) for p in (1e-10, 0.5, 0.002)]
    gl = GenotypeLikelihoods(*log10_probs)
    assert list(gl.as_phred()) == [97, 0, 24]
    assert list(gl.as_phred(regularizer=0.01)) == [20, 0, 19]