Ejemplo n.º 1
0
def test_init():
    """Check the attributes Genotyper derives from its constructor arguments."""
    # Zero mean depth: every derived attribute is expected to be zero.
    zero_depth = genotyper.Genotyper(0, 20, 0.0001)
    assert zero_depth.min_cov_more_than_error == 0
    assert zero_depth.no_of_successes == 0
    assert zero_depth.prob_of_success == 0

    # Each row: (mean depth, depth variance, third constructor argument
    # (presumably the read error rate), expected no_of_successes,
    # expected min_cov_more_than_error).
    # prob_of_success is expected to be 0.5 for all of them.
    cases = [
        (10, 20, 0.0001, 10, 1),
        (10, 20, 0.01, 10, 2),
        (100, 200, 0.001, 100, 8),
        # variance < mean, so will hit the code where it forces
        # variance = 2 * mean = 20
        (10, 5, 0.01, 10, 2),
    ]
    for mean, variance, rate, want_successes, want_min_cov in cases:
        gtyper = genotyper.Genotyper(mean, variance, rate)
        assert gtyper.no_of_successes == want_successes
        assert gtyper.prob_of_success == 0.5
        assert gtyper.min_cov_more_than_error == want_min_cov
Ejemplo n.º 2
0
    def _simulate_confidence_scores(cls, mean_depth, depth_variance, error_rate, iterations, allele_length=1, seed=42):
        '''Simulate genotype confidence scores for a two-allele site.

        Each iteration draws a coverage for the "correct" allele from a
        negative binomial distribution parameterised from mean_depth and
        depth_variance, and a coverage for the "incorrect" allele from a
        binomial distribution (mean_depth trials, probability error_rate).
        The pair is genotyped and the rounded genotype confidence is recorded.
        Draws where both coverages are zero are discarded and redrawn.

        seed is passed to np.random.seed, so results are reproducible, but the
        global numpy random state is reset as a side effect.

        Returns a sorted list of `iterations` rounded confidence scores.'''
        np.random.seed(seed)
        allele_groups_dict = {'1': {0}, '2': {1}}
        confidences = []
        # We can't use the negative binomial unless depth_variance > mean_depth.
        # So force it to be so. Equality must be caught as well, otherwise
        # (depth_variance - mean_depth) is zero and the division below fails.
        if depth_variance <= mean_depth:
            depth_variance = 2 * mean_depth
            logging.warning('Variance in read depth is smaller than mean read depth. Setting variance = 2 * mean, so that variant simulations can run. GT_CONF_PERCENTILE in the output VCF file may not be very useful as a result of this.')
        no_of_successes = (mean_depth ** 2) / (depth_variance - mean_depth)
        prob_of_success = 1 - (depth_variance - mean_depth) / depth_variance

        while len(confidences) < iterations:
            correct_coverage = np.random.negative_binomial(no_of_successes, prob_of_success)
            incorrect_coverage = np.random.binomial(mean_depth, error_rate)
            # A site with no reads at all is not simulated; draw again.
            if correct_coverage + incorrect_coverage == 0:
                continue

            # Only alleles that actually received reads get an entry.
            allele_combination_cov = {}
            if incorrect_coverage > 0:
                allele_combination_cov['1'] = incorrect_coverage
            if correct_coverage > 0:
                allele_combination_cov['2'] = correct_coverage
            allele_per_base_cov = [[incorrect_coverage] * allele_length, [correct_coverage] * allele_length]
            gtyper = genotyper.Genotyper(mean_depth, error_rate, allele_combination_cov, allele_per_base_cov, allele_groups_dict)
            gtyper.run()
            confidences.append(round(gtyper.genotype_confidence))

        assert len(confidences) == iterations
        confidences.sort()
        return confidences
Ejemplo n.º 3
0
def test_log_likelihood_homozygous():
    """Check _log_likelihood_homozygous against known values (2 d.p.)."""
    # Both scenarios use the same allele length, with every position non-zero.
    allele_length = 5
    non_zeros = allele_length

    # Most of the depth sits on the allele.
    high_support = genotyper.Genotyper(100, 200, 0.01)
    log_lik = high_support._log_likelihood_homozygous(
        90, 95, allele_length, non_zeros
    )
    assert round(log_lik, 2) == -26.78

    # Hardly any of the depth sits on the allele.
    low_support = genotyper.Genotyper(10, 200, 0.01)
    log_lik = low_support._log_likelihood_homozygous(
        1, 9, allele_length, non_zeros
    )
    assert round(log_lik, 2) == -39.34
Ejemplo n.º 4
0
def test_run_zero_coverage():
    """run() must give a null call when no allele has any coverage."""
    groups = {"1": {0}, "2": {1}, "3": {0, 1}, "4": {2}}
    per_base_cov = [[0], [0, 0]]
    combination_cov = {}  # nothing was covered

    caller = genotyper.Genotyper(20, 40, 0.01)
    caller.run(combination_cov, per_base_cov, groups)

    # Null genotype, zero confidence, and no FRS value.
    assert caller.genotype == {"."}
    assert caller.genotype_confidence == 0.0
    assert caller.genotype_frs == "."
Ejemplo n.º 5
0
 def test_run_zero_coverage(self):
     '''Genotyping a site where every allele has zero coverage gives a null call.'''
     per_base_cov = [[0], [0, 0]]
     groups = {'1': {0}, '2': {1}, '3': {0, 1}, '4': {2}}
     gtyper = genotyper.Genotyper(
         20,  # mean depth
         0.01,  # error rate
         {},  # no allele combination has any coverage
         per_base_cov,
         groups,
     )
     gtyper.run()
     self.assertEqual({'.'}, gtyper.genotype)
     self.assertEqual(0.0, gtyper.genotype_confidence)
Ejemplo n.º 6
0
def test_nomatherror_mean_depth0():
    """
    Mean depth can be zero while a site still has non-zero coverage, due to
    rounding imprecision. The likelihood calculation must not attempt log(0)
    in that case; the site should be returned as a no-call instead.
    """
    groups = {"1": {0}, "2": {1}}
    per_base_cov = [[1], [0, 0]]
    combination_cov = {"1": 1}

    # Mean depth and variance are both zero, yet one read is present.
    caller = genotyper.Genotyper(0, 0, 0.01)
    caller.run(combination_cov, per_base_cov, groups)

    assert caller.genotype == {"."}
    assert caller.genotype_confidence == 0.0
    assert caller.genotype_frs == "."
Ejemplo n.º 7
0
    def _simulate_confidence_scores(
        cls,
        mean_depth,
        depth_variance,
        error_rate,
        iterations,
        allele_length=1,
        seed=42,
        call_hets=False,
    ):
        """Simulate genotype confidences for a two-allele site.

        Seeds numpy's global random generator with `seed`, then repeatedly
        draws a "correct" allele coverage (negative binomial, using the
        Genotyper's no_of_successes and prob_of_success) and an "incorrect"
        allele coverage (binomial with mean_depth trials and probability
        error_rate). Draws with no coverage at all are thrown away. Each
        remaining draw is genotyped and its rounded confidence recorded.

        Returns a sorted list of `iterations` rounded confidences.
        """
        np.random.seed(seed)
        groups = {"1": {0}, "2": {1}}
        scores = []
        simulator = genotyper.Genotyper(
            mean_depth, depth_variance, error_rate, call_hets=call_hets,
        )
        logging.debug(
            "Simulation:\titeration\tcorrect_coverage\tincorrect_coverage\tgenotype_confidence"
        )

        while len(scores) < iterations:
            good_cov = np.random.negative_binomial(
                simulator.no_of_successes, simulator.prob_of_success
            )
            bad_cov = np.random.binomial(mean_depth, error_rate)
            # Nothing to genotype without any reads: redraw.
            if good_cov + bad_cov == 0:
                continue

            # Number of scores collected so far, used only for logging.
            iteration = len(scores)

            # Alleles without reads are left out; "1" is inserted before "2".
            combination_cov = {
                key: cov
                for key, cov in (("1", bad_cov), ("2", good_cov))
                if cov > 0
            }
            per_base_cov = [[bad_cov] * allele_length, [good_cov] * allele_length]
            simulator.run(combination_cov, per_base_cov, groups)
            logging.debug(
                f"Simulation:\t{iteration}\t{good_cov}\t{bad_cov}\t{simulator.genotype_confidence}"
            )
            scores.append(round(simulator.genotype_confidence))

        assert len(scores) == iterations
        scores.sort()
        return scores
Ejemplo n.º 8
0
def test_run():
    """Check the likelihoods that run() produces for a covered two-allele site."""
    groups = {"1": {0}, "2": {1}, "3": {0, 1}, "4": {2}}
    combination_cov = {"1": 2, "2": 20, "3": 1}
    per_base_cov = [[0, 1], [20, 19]]

    caller = genotyper.Genotyper(20, 40, 0.01)
    caller.run(combination_cov, per_base_cov, groups)

    # Third element of each expected likelihood entry, to 4 decimal places.
    fraction_allele0 = round(3 / 23, 4)
    fraction_allele1 = round(21 / 23, 4)
    expected = [
        ({1}, -12.03, fraction_allele1),
        ({0}, -114.57, fraction_allele0),
    ]

    assert len(caller.likelihoods) == len(expected)
    for got, want in zip(caller.likelihoods, expected):
        assert got[0] == want[0]
        # Log likelihoods are only compared to 2 decimal places.
        assert round(got[1], 2) == round(want[1], 2)
        assert got[2] == want[2]
Ejemplo n.º 9
0
 def test_run_zero_coverage(self):
     """A site with no coverage on any allele must come out as a null call."""
     constructor_args = (
         20,  # mean_depth
         0.01,  # error_rate
         {},  # allele_combination_cov: nothing covered
         [[0], [0, 0]],  # allele_per_base_cov
         {"1": {0}, "2": {1}, "3": {0, 1}, "4": {2}},  # allele_groups_dict
     )
     gtyper = genotyper.Genotyper(*constructor_args)
     gtyper.run()
     self.assertEqual({"."}, gtyper.genotype)
     self.assertEqual(0.0, gtyper.genotype_confidence)
Ejemplo n.º 10
0
 def test_calculate_log_likelihoods(self):
     '''Check the (genotype, log likelihood) pairs from _calculate_log_likelihoods.'''
     groups = {'1': {0}, '2': {1}, '3': {0, 1}, '4': {2}}
     combination_cov = {'1': 2, '2': 20, '3': 1}
     per_base_cov = [[0, 1], [20, 19]]
     gtyper = genotyper.Genotyper(20, 0.01, combination_cov, per_base_cov, groups)
     gtyper._calculate_log_likelihoods()
     self.assertEqual(3, len(gtyper.likelihoods))

     # Round the log likelihoods to 2 decimal places so that they can be
     # compared exactly with the expected values.
     rounded = []
     for entry in gtyper.likelihoods:
         rounded.append((entry[0], round(entry[1], 2)))
     gtyper.likelihoods = rounded

     self.assertEqual(
         [({1}, -11.68), ({0, 1}, -22.92), ({0}, -124.91)],
         gtyper.likelihoods,
     )
Ejemplo n.º 11
0
 def test_nomatherror_mean_depth0(self):
     """
     Mean depth can be zero while a site still has non-zero coverage, due to
     rounding imprecision. The likelihood calculation must not attempt log(0)
     in that case; the site should be returned as a no-call instead.
     """
     zero_mean_depth = 0
     gtyper = genotyper.Genotyper(
         zero_mean_depth,
         0.01,  # error rate
         {"1": 1},  # one read on the first allele combination
         [[1], [0, 0]],  # per-base coverage
         {"1": {0}, "2": {1}},  # allele groups
     )
     gtyper.run()
     self.assertEqual({"."}, gtyper.genotype)
     self.assertEqual(0.0, gtyper.genotype_confidence)
Ejemplo n.º 12
0
 def test_run(self):
     '''Check the ordered (genotype, log likelihood) pairs produced by run().'''
     groups = {'1': {0}, '2': {1}, '3': {0, 1}, '4': {2}}
     combination_cov = {'1': 2, '2': 20, '3': 1}
     per_base_cov = [[0, 1], [20, 19]]
     gtyper = genotyper.Genotyper(20, 0.01, combination_cov, per_base_cov, groups)
     expected = [({1}, -11.68), ({0, 1}, -22.92), ({0}, -124.91)]
     gtyper.run()
     self.assertEqual(len(expected), len(gtyper.likelihoods))
     for want, got in zip(expected, gtyper.likelihoods):
         self.assertEqual(want[0], got[0])
         # Log likelihoods only need to agree to 2 decimal places.
         self.assertAlmostEqual(want[1], got[1], places=2)
Ejemplo n.º 13
0
def test_calculate_log_likelihoods():
    """Check the likelihood entries built by _calculate_log_likelihoods.

    The alleles are set up directly with _init_alleles_and_genotypes rather
    than through run(), so only the likelihood calculation is exercised.
    """
    gtyper = genotyper.Genotyper(20, 40, 0.01)
    allele_combination_cov = {"1": 2, "2": 20, "3": 1}
    allele_groups_dict = {"1": {0}, "2": {1}, "3": {0, 1}, "4": {2}}
    allele_per_base_cov = [[0, 1], [20, 19]]
    # Third element of each expected likelihood entry, to 4 decimal places.
    depth0 = round(3 / 23, 4)
    depth1 = round(21 / 23, 4)
    gtyper._init_alleles_and_genotypes(
        allele_combination_cov=allele_combination_cov,
        allele_per_base_cov=allele_per_base_cov,
        allele_groups_dict=allele_groups_dict,
    )
    gtyper._calculate_log_likelihoods()
    assert len(gtyper.likelihoods) == 2
    expected = [
        ({1}, -12.03, depth1),
        ({0}, -114.57, depth0),
    ]
    # Round the log likelihoods to 2 decimal places so that they can be
    # compared exactly with the expected values.
    gtyper.likelihoods = [(x[0], round(x[1], 2), x[2])
                          for x in gtyper.likelihoods]
    assert gtyper.likelihoods == expected
Ejemplo n.º 14
0
def update_vcf_record_using_gramtools_allele_depths(
    vcf_record,
    allele_combination_cov,
    allele_per_base_cov,
    allele_groups_dict,
    mean_depth,
    read_error_rate,
    min_cov_more_than_error=None,
):
    """Genotype one VCF record from gramtools coverage and update it in place.

    A Genotyper is built from the coverage arguments, mean_depth,
    read_error_rate and min_cov_more_than_error, and run. vcf_record is then
    modified: QUAL is set to None, INFO and FORMAT are cleared, FILTER becomes
    an empty set, and the FORMAT keys DP (sum of allele_combination_cov
    values), GT, COV and GT_CONF are set from the genotyper results.

    Returns a deep copy of the updated record in which ALT alleles with zero
    coverage have been removed (alleles in the called genotype are always
    kept), with COV and GT adjusted to the remaining alleles. If the genotype
    is "./." or "0/0", the copy is returned without removing anything.
    """
    gtyper = genotyper.Genotyper(
        mean_depth,
        read_error_rate,
        allele_combination_cov,
        allele_per_base_cov,
        allele_groups_dict,
        min_cov_more_than_error=min_cov_more_than_error,
    )
    gtyper.run()
    genotype_indexes = set()

    if "." in gtyper.genotype:
        genotype = "./."
    else:
        # Keep only called alleles that exist in this record
        # (index 0 is REF, index i + 1 is ALT[i]).
        genotype_indexes = {
            i for i in range(1 + len(vcf_record.ALT)) if i in gtyper.genotype
        }

        if len(genotype_indexes) == 1:
            # Homozygous call: write the single index on both sides.
            (genotype_index,) = genotype_indexes
            genotype = str(genotype_index) + "/" + str(genotype_index)
        else:
            genotype = "/".join([str(x) for x in sorted(genotype_indexes)])

    cov_values = [
        gtyper.singleton_allele_coverages.get(x, 0)
        for x in range(1 + len(vcf_record.ALT))
    ]
    cov_string = ",".join([str(x) for x in cov_values])
    vcf_record.QUAL = None
    vcf_record.INFO.clear()
    vcf_record.FILTER = set()
    vcf_record.FORMAT.clear()
    vcf_record.set_format_key_value("DP",
                                    str(sum(allele_combination_cov.values())))
    vcf_record.set_format_key_value("GT", genotype)
    vcf_record.set_format_key_value("COV", cov_string)
    vcf_record.set_format_key_value("GT_CONF", str(gtyper.genotype_confidence))

    # Make new record where all zero coverage alleles are removed
    filtered_record = copy.deepcopy(vcf_record)
    if genotype in ["./.", "0/0"]:
        return filtered_record

    # REF (index 0) is always kept, as is every allele in the genotype,
    # even if its singleton coverage is zero.
    indexes_to_keep = sorted(
        {i for i, cov in enumerate(cov_values) if i == 0 or cov > 0}
        | genotype_indexes
    )
    filtered_record.set_format_key_value(
        "COV", ",".join([str(cov_values[i]) for i in indexes_to_keep]))
    assert indexes_to_keep[0] == 0
    filtered_record.ALT = [
        filtered_record.ALT[i - 1] for i in indexes_to_keep[1:]
    ]

    # The indexes of the genotype string 'n/m' are shifted because
    # we probably removed some alleles
    genotype_strings = {
        vcf_record.REF if i == 0 else vcf_record.ALT[i - 1]
        for i in genotype_indexes
    }
    new_genotype_indexes = set()
    if 0 in genotype_indexes:
        new_genotype_indexes.add(0)
    for i, genotype_string in enumerate(filtered_record.ALT):
        if genotype_string in genotype_strings:
            new_genotype_indexes.add(i + 1)
            if len(genotype_strings) == len(new_genotype_indexes):
                break

    # Sort explicitly: set iteration order is not guaranteed to be ascending,
    # and the GT written to the unfiltered record above is sorted as well.
    new_genotype_indexes = sorted(new_genotype_indexes)
    if len(new_genotype_indexes) == 1:
        new_genotype_indexes.append(new_genotype_indexes[0])
    assert len(new_genotype_indexes) == 2
    filtered_record.set_format_key_value(
        "GT", "/".join([str(x) for x in new_genotype_indexes]))
    return filtered_record
Ejemplo n.º 15
0
def write_vcf_annotated_using_coverage_from_gramtools(
    mean_depth,
    depth_variance,
    vcf_records,
    all_allele_coverage,
    allele_groups,
    read_error_rate,
    outfile,
    sample_name="SAMPLE",
    filtered_outfile=None,
    ref_seq_lengths=None,
    call_hets=False,
):
    """mean_depth, vcf_records, all_allele_coverage, allele_groups should be those
    returned by load_gramtools_vcf_and_allele_coverage_files().
    Writes a new VCF that has allele counts for all the ALTs.

    Every record in vcf_records is genotyped (which updates the record) and
    written to outfile. If filtered_outfile is given, the filtered version of
    each record returned by the genotyping step is written there, under the
    same header. If ref_seq_lengths (a dict of name -> length) is given, one
    ##contig header line is written per entry, sorted by name.

    Raises NotImplementedError if call_hets is true."""
    assert len(vcf_records) == len(all_allele_coverage)
    if call_hets:
        raise NotImplementedError("Heterozygous calling is not implemented")

    header_lines = [
        "##fileformat=VCFv4.2",
        "##source=minos, version " + minos_version,
        "##fileDate=" + str(datetime.date.today()),
        '##FORMAT=<ID=COV,Number=R,Type=Integer,Description="Number of reads on ref and alt alleles">',
        '##FORMAT=<ID=FRS,Number=1,Type=Float,Description="Fraction of reads that support the genotype call">',
        '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">',
        '##FORMAT=<ID=DP,Number=1,Type=Integer,Description="total read depth from gramtools">',
        '##FORMAT=<ID=DPF,Number=1,Type=Float,Description="Depth Fraction, defined as DP divided by mean depth">',
        '##FORMAT=<ID=GT_CONF,Number=1,Type=Float,Description="Genotype confidence. Difference in log likelihood of most likely and next most likely genotype">',
        f"##minosMeanReadDepth={mean_depth}",
    ]

    if ref_seq_lengths is not None:
        for name, length in sorted(ref_seq_lengths.items()):
            header_lines.append(f"##contig=<ID={name},length={length}>")

    header_lines.append(
        "\t".join(
            [
                "#CHROM",
                "POS",
                "ID",
                "REF",
                "ALT",
                "QUAL",
                "FILTER",
                "INFO",
                "FORMAT",
                sample_name,
            ]
        )
    )

    gtyper = genotyper.Genotyper(
        mean_depth, depth_variance, read_error_rate, call_hets=call_hets,
    )

    # The filtered file is optional, so it cannot share the `with` statement
    # below. try/finally guarantees it is closed even if genotyping or
    # writing raises part way through.
    f_filter = None
    try:
        if filtered_outfile is not None:
            f_filter = open(filtered_outfile, "w")
            print(*header_lines, sep="\n", file=f_filter)

        with open(outfile, "w") as f:
            print(*header_lines, sep="\n", file=f)

            for i, vcf_record in enumerate(vcf_records):
                logging.debug("Genotyping: " + str(vcf_record))
                # Updates vcf_record in place and returns its filtered copy.
                filtered_record = update_vcf_record_using_gramtools_allele_depths(
                    vcf_record,
                    gtyper,
                    all_allele_coverage[i][0],
                    all_allele_coverage[i][1],
                    allele_groups,
                )
                print(vcf_record, file=f)
                if f_filter is not None:
                    print(filtered_record, file=f_filter)
    finally:
        if f_filter is not None:
            f_filter.close()
Ejemplo n.º 16
0
def update_vcf_record_using_gramtools_allele_depths(
        vcf_record, allele_combination_cov, allele_per_base_cov,
        allele_groups_dict, mean_depth, read_error_rate, kmer_size):
    '''Genotype one VCF record from gramtools coverage and update it in place.

    A Genotyper is built from mean_depth, read_error_rate and the coverage
    arguments, and run. vcf_record is then modified: QUAL is set to None,
    FILTER to '.', INFO to {'KMER': str(kmer_size)}, and FORMAT is replaced
    with the keys DP (sum of allele_combination_cov values), GT, COV and
    GT_CONF taken from the genotyper results.

    Returns a deep copy of the updated record in which ALT alleles with zero
    coverage have been removed (alleles in the called genotype are always
    kept), with COV and GT adjusted to the remaining alleles. If the genotype
    is './.' or '0/0', the copy is returned without removing anything.'''
    gtyper = genotyper.Genotyper(mean_depth, read_error_rate,
                                 allele_combination_cov, allele_per_base_cov,
                                 allele_groups_dict)
    gtyper.run()
    genotype_indexes = set()

    if '.' in gtyper.genotype:
        genotype = './.'
    else:
        # Keep only called alleles that exist in this record
        # (index 0 is REF, index i + 1 is ALT[i]).
        genotype_indexes = {
            i for i in range(1 + len(vcf_record.ALT)) if i in gtyper.genotype
        }

        if len(genotype_indexes) == 1:
            # Homozygous call: write the single index on both sides.
            (genotype_index,) = genotype_indexes
            genotype = str(genotype_index) + '/' + str(genotype_index)
        else:
            genotype = '/'.join([str(x) for x in sorted(genotype_indexes)])

    cov_values = [
        gtyper.singleton_alleles_cov.get(x, 0)
        for x in range(1 + len(vcf_record.ALT))
    ]
    cov_string = ','.join([str(x) for x in cov_values])
    vcf_record.QUAL = None
    vcf_record.FILTER = '.'
    vcf_record.INFO = {'KMER': str(kmer_size)}
    vcf_record.format_keys = ['DP', 'GT', 'COV', 'GT_CONF']
    vcf_record.FORMAT = {
        'DP': str(sum(allele_combination_cov.values())),
        'GT': genotype,
        'COV': cov_string,
        'GT_CONF': str(gtyper.genotype_confidence)
    }

    # Make new record where all zero coverage alleles are removed
    filtered_record = copy.deepcopy(vcf_record)
    if genotype in ['./.', '0/0']:
        return filtered_record

    # REF (index 0) is always kept, as is every allele in the genotype,
    # even if its singleton coverage is zero.
    indexes_to_keep = sorted(
        {i for i, cov in enumerate(cov_values) if i == 0 or cov > 0}
        | genotype_indexes
    )
    filtered_record.FORMAT['COV'] = ','.join(
        [str(cov_values[i]) for i in indexes_to_keep])
    assert indexes_to_keep[0] == 0
    filtered_record.ALT = [
        filtered_record.ALT[i - 1] for i in indexes_to_keep[1:]
    ]

    # The indexes of the genotype string 'n/m' are shifted because
    # we probably removed some alleles
    genotype_strings = {
        vcf_record.REF if i == 0 else vcf_record.ALT[i - 1]
        for i in genotype_indexes
    }
    new_genotype_indexes = set()
    if 0 in genotype_indexes:
        new_genotype_indexes.add(0)
    for i, genotype_string in enumerate(filtered_record.ALT):
        if genotype_string in genotype_strings:
            new_genotype_indexes.add(i + 1)
            if len(genotype_strings) == len(new_genotype_indexes):
                break

    # Sort explicitly: set iteration order is not guaranteed to be ascending,
    # and the GT written to the unfiltered record above is sorted as well.
    new_genotype_indexes = sorted(new_genotype_indexes)
    if len(new_genotype_indexes) == 1:
        new_genotype_indexes.append(new_genotype_indexes[0])
    assert len(new_genotype_indexes) == 2
    filtered_record.FORMAT['GT'] = '/'.join(
        [str(x) for x in new_genotype_indexes])
    return filtered_record