Exemple #1
0
def smn_cn_caller(
        bam, region_dic, gmm_parameter,
        snp_db, variant_db, threads, count_file=None, reference_fasta=None):
    """Return SMN copy-number calls for the sample in ``bam``.

    Args:
        bam: Alignment file location, forwarded to ``open_alignment_file``,
            ``get_normed_depth`` and ``get_supporting_reads``.
        region_dic: Region definition forwarded to the depth normalization
            helpers.
        gmm_parameter: Parameters handed to ``Gmm.set_gmm_par`` for the
            ``'exon1-6'`` and ``'exon7-8'`` models.
        snp_db: Object exposing ``dsnp1``, ``dsnp2``, ``nchr`` and ``dindex``;
            used to count SMN1/SMN2 supporting reads.
        variant_db: Object with the same four attributes; used to count
            reference/alternate reads for the reported variant.
        threads: Forwarded to ``get_normed_depth``.
        count_file: Optional pre-computed count file. When given, depth is
            normalized from it and the alignment file is opened only to
            determine the read length.
        reference_fasta: Optional reference forwarded to the helpers that
            open or read the alignment file.

    Returns:
        The mapping produced by ``_asdict()`` on the per-sample summary
        (coverage MAD rounded to 3 decimals, raw depth values, read support,
        SMN1 fractions rounded to 2 decimals, variant read counts), updated
        with the fields of the final call returned by ``get_smn12_call``.
    """
    # 1. read counting, normalization
    if count_file is not None:
        bamfile = open_alignment_file(bam, reference_fasta)
        try:
            read_length = get_read_length(bamfile.fetch())
        finally:
            # Release the handle even when the read-length probe raises.
            bamfile.close()
        normalized_depth = get_normed_depth_from_count(
            count_file, region_dic, read_length, gc_correct=False)
    else:
        normalized_depth = get_normed_depth(
            bam, region_dic, threads, reference=reference_fasta, gc_correct=False)

    # 2. GMM and CN call
    cn_call = namedtuple(
        'cn_call', 'exon16_cn exon16_depth exon78_cn exon78_depth'
    )
    gmm_exon16 = Gmm()
    gmm_exon16.set_gmm_par(gmm_parameter, 'exon1-6')
    gcall_exon16 = gmm_exon16.gmm_call(normalized_depth.normalized['exon16'])
    gmm_exon78 = Gmm()
    gmm_exon78.set_gmm_par(gmm_parameter, 'exon7-8')
    gcall_exon78 = gmm_exon78.gmm_call(normalized_depth.normalized['exon78'])
    raw_cn_call = cn_call(
        gcall_exon16.cn, gcall_exon16.depth_value,
        gcall_exon78.cn, gcall_exon78.depth_value
    )

    # 3. Get SNP ratios
    smn1_read_count, smn2_read_count = get_supporting_reads(
        bam, snp_db.dsnp1, snp_db.dsnp2, snp_db.nchr, snp_db.dindex,
        reference=reference_fasta
    )
    smn1_fraction = get_fraction(smn1_read_count, smn2_read_count)
    var_ref_count, var_alt_count = get_supporting_reads(
        bam, variant_db.dsnp1, variant_db.dsnp2, variant_db.nchr,
        variant_db.dindex, reference=reference_fasta
    )

    # 4. Call CN of SMN1 and SMN2
    final_call = get_smn12_call(
        raw_cn_call, smn1_read_count, smn2_read_count,
        var_ref_count, var_alt_count,
        normalized_depth.mediandepth
    )

    # 5. Prepare final call set
    sample_call = namedtuple(
        'sample_call',
        [
            'Coverage_MAD',
            'Full_length_CN_raw', 'Total_CN_raw',
            'SMN1_read_support', 'SMN2_read_support', 'SMN1_fraction',
            'g27134TG_REF_count', 'g27134TG_ALT_count',
        ]
    )
    sample_cn_call = sample_call(
        round(normalized_depth.mad, 3),
        # Full-length CN comes from the exon 7-8 depth, total CN from exon 1-6.
        raw_cn_call.exon78_depth, raw_cn_call.exon16_depth,
        smn1_read_count, smn2_read_count, [round(a, 2) for a in smn1_fraction],
        var_ref_count, var_alt_count
    )

    doutput = sample_cn_call._asdict()
    doutput.update(final_call._asdict())
    return doutput
Exemple #2
0
def d6_star_caller(bam,
                   call_parameters,
                   threads,
                   count_file=None,
                   reference_fasta=None):
    """Return the CYP2D6 star allele diplotype call for one sample.

    The alignment file is opened once and is always closed before this
    function exits, including on the early no-call returns and when a helper
    raises.

    Args:
        bam: Alignment file location, forwarded to ``open_alignment_file``,
            ``get_normed_depth`` and the read-counting helpers.
        call_parameters: Object exposing ``region_dic``, ``gmm_parameter``,
            ``snp_db``, ``var_db``, ``var_homo_db``, ``var_list``, ``genome``
            and ``star_combinations``.
        threads: Forwarded to ``get_normed_depth``.
        count_file: Optional pre-computed count file. When given, depth is
            normalized from it using the read length taken from the
            alignment file.
        reference_fasta: Optional reference forwarded to the helpers that
            open or read the alignment file.

    Returns:
        A ``d6_call`` namedtuple. On a no-call the fields that were not
        determined are ``None``:

        * no normalized ``d67`` depth: only ``Coverage_MAD`` and
          ``Median_depth`` are set;
        * no total copy number: depth/CN fields and ``Variant_raw_count``
          are set;
        * CNV group missing or not in ``CNV_ACCEPTED``: CNV related fields
          and ``Variant_raw_count`` are set, genotype fields are ``None``.
    """
    d6_call = namedtuple(
        "d6_call",
        "Coverage_MAD Median_depth Total_CN Spacer_CN Total_CN_raw \
        Spacer_CN_raw Variants_called CNV_group Genotype Filter Raw_star_allele \
        Call_info Exon9_CN CNV_consensus d67_snp_call d67_snp_raw \
        Variant_raw_count",
    )
    bamfile = open_alignment_file(bam, reference_fasta)
    try:
        # 1. Read counting and normalization
        if count_file is not None:
            read_length = get_read_length(bamfile.fetch())
            normalized_depth = get_normed_depth_from_count(
                count_file, call_parameters.region_dic, read_length)
        else:
            normalized_depth = get_normed_depth(bam,
                                                call_parameters.region_dic,
                                                threads,
                                                reference=reference_fasta)

        # no-call after normalization: only the two coverage fields are
        # known, the remaining 15 fields are None.
        if normalized_depth.normalized["d67"] is None:
            return d6_call(
                normalized_depth.mad,
                normalized_depth.mediandepth,
                *([None] * 15)
            )

        # 2. GMM and CN call
        # There are two regions to call CN based on depth: total
        # CYP2D6+CYP2D7, and CYP2D7 spacer region
        cn_call = namedtuple("cn_call",
                             "d67_cn d67_depth spacer_cn spacer_depth")
        gmm_d67 = Gmm()
        gmm_d67.set_gmm_par(call_parameters.gmm_parameter, "d67")
        gcall_d67 = gmm_d67.gmm_call(normalized_depth.normalized["d67"])
        gmm_spacer = Gmm()
        gmm_spacer.set_gmm_par(call_parameters.gmm_parameter, "spacer")
        gcall_spacer = gmm_spacer.gmm_call(
            normalized_depth.normalized["spacer"])
        d67_cn = gcall_d67.cn
        high_cn_low_confidence = False
        if d67_cn is None and gcall_d67.depth_value > HIGH_CN_DEPTH_THRESHOLD:
            # The GMM made no call but depth is above the threshold: fall
            # back to the rounded depth and remember the low confidence.
            high_cn_low_confidence = True
            d67_cn = int(round(gcall_d67.depth_value))
        raw_cn_call = cn_call(
            d67_cn,
            gcall_d67.depth_value,
            gcall_spacer.cn,
            gcall_spacer.depth_value,
        )

        # 3. Get allele counts at D6/D7 SNP (base difference) sites and
        # target variant sites
        # D6/D7 base difference sites. Get read counts at both D6/D7
        # positions.
        snp_db = call_parameters.snp_db
        snp_d6, snp_d7 = get_supporting_reads(
            bam,
            snp_db.dsnp1,
            snp_db.dsnp2,
            snp_db.nchr,
            snp_db.dindex,
            reference=reference_fasta,
        )
        site42126938 = [snp_d6[VAR42126938_SITE], snp_d7[VAR42126938_SITE]]
        # Remove the entries at VAR42126938_SITE and the one before it; the
        # higher index is popped first so the lower index stays valid.
        snp_d6.pop(VAR42126938_SITE)
        snp_d6.pop(VAR42126938_SITE - 1)
        snp_d7.pop(VAR42126938_SITE)
        snp_d7.pop(VAR42126938_SITE - 1)
        # Variants not in homology regions. Get read counts only at D6
        # positions.
        var_db = call_parameters.var_db
        var_alt, var_ref = get_supporting_reads_single_region(
            bam,
            var_db.dsnp1,
            var_db.nchr,
            var_db.dindex,
            reference=reference_fasta)
        # Look more carefully for insertions at 42128936 from reads
        var_list = call_parameters.var_list
        ref_read, long_ins_read, short_ins_read = get_allele_counts_42128936(
            bamfile, call_parameters.genome)
        if "g.42128936-42128937insGGGGCGAAAGGGGCGAAA" in var_list:
            long_ins_index = var_list.index(
                "g.42128936-42128937insGGGGCGAAAGGGGCGAAA")
            var_alt[long_ins_index] = long_ins_read
            var_ref[long_ins_index] = short_ins_read + ref_read
        if "g.42128936-42128937insGGGGCGAAA" in var_list:
            short_ins_index = var_list.index(
                "g.42128936-42128937insGGGGCGAAA")
            var_alt[short_ins_index] = short_ins_read
            var_ref[short_ins_index] = long_ins_read + ref_read
        # Variants in homology regions. Get read counts at both D6/D7
        # positions.
        var_homo_db = call_parameters.var_homo_db
        var_homo_alt, var_homo_ref = get_supporting_reads(
            bam,
            var_homo_db.dsnp1,
            var_homo_db.dsnp2,
            var_homo_db.nchr,
            var_homo_db.dindex,
            reference=reference_fasta,
        )
        # This ordered dictionary is for final reporting. The first
        # len(var_alt) names in var_list take their counts from the
        # non-homology lists, the rest from the homology lists.
        raw_count = OrderedDict()
        non_homology_variant_count = len(var_alt)
        for i, var_name in enumerate(var_list):
            if i < non_homology_variant_count:
                raw_count.setdefault(var_name,
                                     "%i,%i" % (var_alt[i], var_ref[i]))
            else:
                homo_index = i - non_homology_variant_count
                raw_count.setdefault(
                    var_name,
                    "%i,%i" % (var_homo_alt[homo_index],
                               var_homo_ref[homo_index]),
                )
        raw_count.setdefault("g.42126938C>T",
                             "%i,%i" % (site42126938[0], site42126938[1]))

        # no-call due to total copy number calling
        if raw_cn_call.d67_cn is None:
            return d6_call(
                normalized_depth.mad,
                normalized_depth.mediandepth,
                raw_cn_call.d67_cn,
                raw_cn_call.spacer_cn,
                raw_cn_call.d67_depth,
                raw_cn_call.spacer_depth,
                None,
                None,
                None,
                None,
                None,
                None,
                None,
                None,
                None,
                None,
                raw_count,
            )

        # 4. Call CNV and hybrids
        d6_fraction = get_fraction(snp_d6, snp_d7)
        raw_d6_cn = [round(raw_cn_call.d67_cn * a, 3) for a in d6_fraction]
        cn_call_snp = call_cn_snp(raw_cn_call.d67_cn, snp_d6, snp_d7)

        # exon9gc
        exon9gc_call_stringent = call_exon9gc(snp_d6[EXON9_SITE1],
                                              snp_d7[EXON9_SITE1],
                                              raw_cn_call.d67_cn)
        cnvtag, consensus = get_cnvtag(
            raw_cn_call.d67_cn,
            raw_d6_cn,
            cn_call_snp,
            exon9gc_call_stringent,
            raw_cn_call.spacer_cn,
        )

        # no-call due to CNV group calling
        if cnvtag is None or cnvtag not in CNV_ACCEPTED:
            return d6_call(
                normalized_depth.mad,
                normalized_depth.mediandepth,
                raw_cn_call.d67_cn,
                raw_cn_call.spacer_cn,
                raw_cn_call.d67_depth,
                raw_cn_call.spacer_depth,
                None,
                cnvtag,
                None,
                None,
                None,
                None,
                exon9gc_call_stringent,
                ",".join(str(a) for a in consensus),
                ",".join(str(a) for a in cn_call_snp),
                ",".join(str(a) for a in raw_d6_cn),
                raw_count,
            )

        # 5. Call variants
        # homology region
        cn_call_var_homo = call_cn_var_homo(raw_cn_call.d67_cn, var_homo_alt,
                                            var_homo_ref)
        # non-homology region
        cn_call_var = call_cn_var(cnvtag, var_alt, var_ref, var_list, var_db)
        # call g.42126938C>T (only attempted for the star5 and cn2 groups)
        if cnvtag in ["star5", "cn2"]:
            var42126938, G_haplotype = call_var42126938(
                bamfile,
                cnvtag,
                site42126938,
                snp_db,
                [VAR42126938_SITE - 2, VAR42126938_SITE - 1,
                 VAR42126938_SITE],
            )
        else:
            var42126938 = []
            G_haplotype = False

        # 6. Call star allele
        total_callset = get_called_variants(var_list, cn_call_var)
        called_var_homo = get_called_variants(var_list, cn_call_var_homo,
                                              len(cn_call_var))
        total_callset += called_var_homo
        total_callset += var42126938

        exon9_values = namedtuple(
            "exon9_values",
            "exon9_cn exon9cn_in_consensus exon9_raw_site1 exon9_raw_site2")

        star_called = match_star(
            total_callset,
            cnvtag,
            raw_cn_call.spacer_cn,
            call_parameters.star_combinations,
            exon9_values(
                exon9gc_call_stringent,
                consensus.exon9_and_downstream,
                raw_d6_cn[EXON9_SITE1],
                raw_d6_cn[EXON9_SITE2],
            ),
        )

        genotype_filter = None
        # no-call due to star allele matching
        if "no_match" in star_called[0]:
            final_star_allele_call = None
        elif (star_called[0] == "more_than_one_match"
              and star_called[-1] == "*1/*32;*27/*41"):
            # This specific ambiguity is resolved with the haplotype flag
            # returned by call_var42126938.
            genotype_filter = "PASS"
            if G_haplotype:
                final_star_allele_call = "*1/*32"
            else:
                final_star_allele_call = "*27/*41"
        else:
            final_star_allele_call = star_called[-1]
            if ";" in final_star_allele_call:
                genotype_filter = "More_than_one_possible_genotype"
            elif "/" not in final_star_allele_call:
                genotype_filter = "Not_assigned_to_haplotypes"
            elif high_cn_low_confidence:
                genotype_filter = "LowQ_high_CN"
            else:
                genotype_filter = "PASS"

        return d6_call(
            normalized_depth.mad,
            normalized_depth.mediandepth,
            raw_cn_call.d67_cn,
            raw_cn_call.spacer_cn,
            raw_cn_call.d67_depth,
            raw_cn_call.spacer_depth,
            star_called.variants_called.split(),
            cnvtag,
            final_star_allele_call,
            genotype_filter,
            star_called.raw_call,
            star_called.call_info,
            exon9gc_call_stringent,
            ",".join(str(a) for a in consensus),
            ",".join(str(a) for a in cn_call_snp),
            ",".join(str(a) for a in raw_d6_cn),
            raw_count,
        )
    finally:
        # Runs on every exit path, so the early no-call returns and
        # exceptions no longer leak the alignment file handle.
        bamfile.close()
Exemple #3
0
def d6_star_caller(
    bam, call_parameters, threads, count_file=None, reference_fasta=None, index_name=None
):
    """Return the CYP2D6 star allele diplotype call for one sample.

    The alignment file is opened once and is always closed before this
    function exits, including on the early no-call returns and when a helper
    raises.

    Args:
        bam: Alignment file location, forwarded to ``open_alignment_file``
            and ``get_normed_depth``.
        call_parameters: Object exposing ``region_dic``, ``gmm_parameter``,
            ``snp_db``, ``var_db``, ``var_homo_db``, ``var_list``, ``genome``,
            ``haplotype_db`` and ``star_combinations``.
        threads: Forwarded to ``get_normed_depth``.
        count_file: Optional pre-computed count file. When given, depth is
            normalized from it using the read length taken from the
            alignment file.
        reference_fasta: Optional reference forwarded to
            ``open_alignment_file`` and ``get_normed_depth``.
        index_name: Optional value passed to ``open_alignment_file`` as
            ``index_filename``.

    Returns:
        A ``d6_call`` namedtuple. On a no-call the fields that were not
        determined are ``None``:

        * no normalized ``d67`` depth: only ``Coverage_MAD`` and
          ``Median_depth`` are set;
        * no total copy number: depth/CN fields and ``Variant_raw_count``
          are set;
        * CNV group missing or not in ``CNV_ACCEPTED``: CNV related fields
          and ``Variant_raw_count`` are set, genotype fields are ``None``.
    """
    d6_call = namedtuple(
        "d6_call",
        "Coverage_MAD Median_depth Total_CN Spacer_CN Total_CN_raw \
        Spacer_CN_raw Variants_called CNV_group Genotype Filter Raw_star_allele \
        Call_info Exon9_CN CNV_consensus d67_snp_call d67_snp_raw \
        Variant_raw_count",
    )
    bamfile = open_alignment_file(bam, reference_fasta, index_filename=index_name)
    try:
        # 1. Read counting and normalization
        if count_file is not None:
            read_length = get_read_length(bamfile.fetch())
            normalized_depth = get_normed_depth_from_count(
                count_file, call_parameters.region_dic, read_length
            )
        else:
            normalized_depth = get_normed_depth(
                bam, call_parameters.region_dic, threads, reference=reference_fasta
            )

        # no-call after normalization: only the two coverage fields are
        # known, the remaining 15 fields are None.
        if normalized_depth.normalized["d67"] is None:
            return d6_call(
                normalized_depth.mad,
                normalized_depth.mediandepth,
                *([None] * 15)
            )

        # 2. GMM and CN call
        # There are two regions to call CN based on depth: total
        # CYP2D6+CYP2D7, and CYP2D7 spacer region
        cn_call = namedtuple("cn_call", "d67_cn d67_depth spacer_cn spacer_depth")
        gmm_d67 = Gmm()
        gmm_d67.set_gmm_par(call_parameters.gmm_parameter, "d67")
        gcall_d67 = gmm_d67.gmm_call(normalized_depth.normalized["d67"])
        gmm_spacer = Gmm()
        gmm_spacer.set_gmm_par(call_parameters.gmm_parameter, "spacer")
        gcall_spacer = gmm_spacer.gmm_call(normalized_depth.normalized["spacer"])
        d67_cn = gcall_d67.cn
        high_cn_low_confidence = False
        if d67_cn is None and gcall_d67.depth_value > HIGH_CN_DEPTH_THRESHOLD:
            # The GMM made no call but depth is above the threshold: fall
            # back to the rounded depth and remember the low confidence.
            high_cn_low_confidence = True
            d67_cn = int(round(gcall_d67.depth_value))
        raw_cn_call = cn_call(
            d67_cn,
            gcall_d67.depth_value,
            gcall_spacer.cn,
            gcall_spacer.depth_value,
        )

        # 3. Get allele counts at D6/D7 SNP (base difference) sites and
        # target variant sites
        # D6/D7 base difference sites. Get read counts at both D6/D7
        # positions.
        snp_db = call_parameters.snp_db
        snp_d6, snp_d7 = get_supporting_reads(
            bamfile, snp_db.dsnp1, snp_db.dsnp2, snp_db.nchr, snp_db.dindex
        )

        # Variants not in homology regions. Get read counts only at D6
        # positions.
        var_db = call_parameters.var_db
        (
            var_alt,
            var_ref,
            var_alt_forward,
            var_alt_reverse,
        ) = get_supporting_reads_single_region(
            bamfile, var_db.dsnp1, var_db.nchr, var_db.dindex
        )
        # Look more carefully for insertions at 42128936 from reads
        var_list = call_parameters.var_list
        ref_read, long_ins_read, short_ins_read = get_allele_counts_var42128936(
            bamfile, call_parameters.genome
        )
        var_alt, var_ref = update_var42128936(
            var_list, var_alt, var_ref, ref_read, long_ins_read, short_ins_read
        )
        # Variants in homology regions. Get read counts at both D6/D7
        # positions.
        var_homo_db = call_parameters.var_homo_db
        var_homo_alt, var_homo_ref = get_supporting_reads(
            bamfile,
            var_homo_db.dsnp1,
            var_homo_db.dsnp2,
            var_homo_db.nchr,
            var_homo_db.dindex,
        )
        # This ordered dictionary is for final reporting. The first
        # len(var_alt) names in var_list take their counts from the
        # non-homology lists, the rest from the homology lists.
        raw_count = OrderedDict()
        non_homology_variant_count = len(var_alt)
        for i, var_name in enumerate(var_list):
            if i >= non_homology_variant_count:
                homo_index = i - non_homology_variant_count
                raw_count.setdefault(
                    var_name,
                    "%i,%i" % (var_homo_alt[homo_index], var_homo_ref[homo_index]),
                )
            elif var_name in NOISY_VAR:
                # Variants listed in NOISY_VAR also report the
                # forward/reverse split of the alternate-supporting reads.
                raw_count.setdefault(
                    var_name,
                    "%i(%i:%i),%i"
                    % (var_alt[i], var_alt_forward[i], var_alt_reverse[i], var_ref[i]),
                )
            else:
                raw_count.setdefault(var_name, "%i,%i" % (var_alt[i], var_ref[i]))

        # no-call due to total copy number calling
        if raw_cn_call.d67_cn is None:
            return d6_call(
                normalized_depth.mad,
                normalized_depth.mediandepth,
                raw_cn_call.d67_cn,
                raw_cn_call.spacer_cn,
                raw_cn_call.d67_depth,
                raw_cn_call.spacer_depth,
                None,
                None,
                None,
                None,
                None,
                None,
                None,
                None,
                None,
                None,
                raw_count,
            )

        # 4. Call CNV and hybrids
        d6_fraction = get_fraction(snp_d6, snp_d7)
        raw_d6_cn = [round(raw_cn_call.d67_cn * a, 3) for a in d6_fraction]
        cn_call_snp = call_cn_snp(raw_cn_call.d67_cn, snp_d6, snp_d7)

        # exon9gc
        exon9gc_call_stringent = call_exon9gc(
            snp_d6[EXON9_SITE1 : EXON9_SITE2 + 1],
            snp_d7[EXON9_SITE1 : EXON9_SITE2 + 1],
            raw_cn_call.d67_cn,
        )
        cnvtag, consensus = get_cnvtag(
            raw_cn_call.d67_cn,
            raw_d6_cn,
            cn_call_snp,
            exon9gc_call_stringent,
            raw_cn_call.spacer_cn,
        )

        # no-call due to CNV group calling
        if cnvtag is None or cnvtag not in CNV_ACCEPTED:
            return d6_call(
                normalized_depth.mad,
                normalized_depth.mediandepth,
                raw_cn_call.d67_cn,
                raw_cn_call.spacer_cn,
                raw_cn_call.d67_depth,
                raw_cn_call.spacer_depth,
                None,
                cnvtag,
                None,
                None,
                None,
                None,
                exon9gc_call_stringent,
                ",".join(str(a) for a in consensus),
                ",".join(str(a) for a in cn_call_snp),
                ",".join(str(a) for a in raw_d6_cn),
                raw_count,
            )

        # 5. Call variants
        # homology region
        cn_call_var_homo = call_cn_var_homo(
            raw_cn_call.d67_cn, var_homo_alt, var_homo_ref
        )
        # non-homology region
        cn_call_var = call_cn_var(
            cnvtag, var_alt, var_ref, var_alt_forward, var_alt_reverse, var_list, var_db
        )
        # call haplotypes
        haplotype_db = call_parameters.haplotype_db
        site42126938_count, var42126938, var42126938_G_haplotype = call_var42126938(
            bamfile, raw_cn_call.d67_cn, haplotype_db["g.42126938C>T"]
        )
        # The raw-count string is written as element [1] then element [0].
        raw_count.setdefault(
            "g.42126938C>T", "%i,%i" % (site42126938_count[1], site42126938_count[0])
        )

        (
            site42127526_count,
            site42127556_count,
            var42127526,
        ) = call_var42127526_var42127556(
            bamfile, cnvtag, haplotype_db["g.42127526C>T_g.42127556T>C"]
        )
        raw_count.setdefault(
            "g.42127526C>T", "%i,%i" % (site42127526_count[1], site42127526_count[0])
        )
        raw_count.setdefault(
            "g.42127556T>C", "%i,%i" % (site42127556_count[1], site42127556_count[0])
        )

        var42127803_diff_haplotype = call_var42127803hap(
            bamfile, cnvtag, haplotype_db["g.42127803C>T"]
        )

        # 6. Call star allele
        total_callset = get_called_variants(var_list, cn_call_var)
        called_var_homo = get_called_variants(
            var_list, cn_call_var_homo, len(cn_call_var)
        )
        total_callset += called_var_homo
        total_callset += var42126938
        total_callset += var42127526

        star_called = match_star(
            total_callset,
            cnvtag,
            raw_cn_call.spacer_cn,
            call_parameters.star_combinations,
            exon9_values(
                exon9gc_call_stringent,
                consensus.exon9_and_downstream,
                raw_d6_cn[EXON9_SITE1],
                raw_d6_cn[EXON9_SITE2],
            ),
            var42126938_G_haplotype,
            var42127803_diff_haplotype,
        )

        genotype_filter = None
        # no-call due to star allele matching
        if "no_match" in star_called[0]:
            final_star_allele_call = None
        else:
            final_star_allele_call = star_called[-1]
            if ";" in final_star_allele_call:
                genotype_filter = "More_than_one_possible_genotype"
            elif "/" not in final_star_allele_call:
                genotype_filter = "Not_assigned_to_haplotypes"
            elif high_cn_low_confidence:
                genotype_filter = "LowQ_high_CN"
            else:
                genotype_filter = "PASS"

        return d6_call(
            normalized_depth.mad,
            normalized_depth.mediandepth,
            raw_cn_call.d67_cn,
            raw_cn_call.spacer_cn,
            raw_cn_call.d67_depth,
            raw_cn_call.spacer_depth,
            star_called.variants_called.split(),
            cnvtag,
            final_star_allele_call,
            genotype_filter,
            star_called.raw_call,
            star_called.call_info,
            exon9gc_call_stringent,
            ",".join(str(a) for a in consensus),
            ",".join(str(a) for a in cn_call_snp),
            ",".join(str(a) for a in raw_d6_cn),
            raw_count,
        )
    finally:
        # Runs on every exit path, so the early no-call returns and
        # exceptions no longer leak the alignment file handle.
        bamfile.close()