def smn_cn_caller(
    bam,
    region_dic,
    gmm_parameter,
    snp_db,
    variant_db,
    threads,
    count_file=None,
    reference_fasta=None,
):
    """Return SMN CN calls for each sample.

    The function normalizes depth, calls copy number with a GMM for the
    'exon1-6' and 'exon7-8' parameter sets, counts supporting reads for the
    sites in ``snp_db`` and ``variant_db``, and hands everything to
    ``get_smn12_call``.

    Args:
        bam: Alignment file; passed to ``open_alignment_file`` (only when
            ``count_file`` is given), ``get_normed_depth`` and
            ``get_supporting_reads``.
        region_dic: Region definition forwarded to the depth helpers.
        gmm_parameter: GMM parameters forwarded to ``Gmm.set_gmm_par``.
        snp_db: Object exposing ``dsnp1``, ``dsnp2``, ``nchr`` and ``dindex``
            for the SMN1/SMN2 differentiating sites.
        variant_db: Object with the same attributes as ``snp_db``, for the
            variant site(s) reported as ``g27134TG_*`` counts.
        threads: Forwarded to ``get_normed_depth``.
        count_file: Optional count file. When not None, normalized depth is
            computed by ``get_normed_depth_from_count`` and the alignment file
            is opened only to obtain the read length.
        reference_fasta: Optional reference, forwarded to the helpers that
            read the alignment file.

    Returns:
        The mapping produced by ``sample_call._asdict()`` (keys
        ``Coverage_MAD``, ``Full_length_CN_raw``, ``Total_CN_raw``,
        ``SMN1_read_support``, ``SMN2_read_support``, ``SMN1_fraction``,
        ``g27134TG_REF_count``, ``g27134TG_ALT_count``) updated with the
        fields of the namedtuple returned by ``get_smn12_call``.
    """
    # 1. Read counting and normalization
    if count_file is not None:
        bamfile = open_alignment_file(bam, reference_fasta)
        try:
            read_length = get_read_length(bamfile.fetch())
        finally:
            # Release the handle even if reading the alignments fails.
            bamfile.close()
        normalized_depth = get_normed_depth_from_count(
            count_file, region_dic, read_length, gc_correct=False
        )
    else:
        normalized_depth = get_normed_depth(
            bam, region_dic, threads, reference=reference_fasta, gc_correct=False
        )

    # 2. GMM and CN call
    cn_call = namedtuple(
        'cn_call', ['exon16_cn', 'exon16_depth', 'exon78_cn', 'exon78_depth']
    )
    gmm_exon16 = Gmm()
    gmm_exon16.set_gmm_par(gmm_parameter, 'exon1-6')
    gcall_exon16 = gmm_exon16.gmm_call(normalized_depth.normalized['exon16'])

    gmm_exon78 = Gmm()
    gmm_exon78.set_gmm_par(gmm_parameter, 'exon7-8')
    gcall_exon78 = gmm_exon78.gmm_call(normalized_depth.normalized['exon78'])

    raw_cn_call = cn_call(
        exon16_cn=gcall_exon16.cn,
        exon16_depth=gcall_exon16.depth_value,
        exon78_cn=gcall_exon78.cn,
        exon78_depth=gcall_exon78.depth_value,
    )

    # 3. Get SNP ratios
    smn1_read_count, smn2_read_count = get_supporting_reads(
        bam,
        snp_db.dsnp1,
        snp_db.dsnp2,
        snp_db.nchr,
        snp_db.dindex,
        reference=reference_fasta,
    )
    smn1_fraction = get_fraction(smn1_read_count, smn2_read_count)
    var_ref_count, var_alt_count = get_supporting_reads(
        bam,
        variant_db.dsnp1,
        variant_db.dsnp2,
        variant_db.nchr,
        variant_db.dindex,
        reference=reference_fasta,
    )

    # 4. Call CN of SMN1 and SMN2
    final_call = get_smn12_call(
        raw_cn_call,
        smn1_read_count,
        smn2_read_count,
        var_ref_count,
        var_alt_count,
        normalized_depth.mediandepth,
    )

    # 5. Prepare final call set
    sample_call = namedtuple(
        'sample_call',
        [
            'Coverage_MAD',
            'Full_length_CN_raw',
            'Total_CN_raw',
            'SMN1_read_support',
            'SMN2_read_support',
            'SMN1_fraction',
            'g27134TG_REF_count',
            'g27134TG_ALT_count',
        ],
    )
    sample_cn_call = sample_call(
        Coverage_MAD=round(normalized_depth.mad, 3),
        # The exon7-8 depth is reported as the full-length raw value and the
        # exon1-6 depth as the total raw value.
        Full_length_CN_raw=raw_cn_call.exon78_depth,
        Total_CN_raw=raw_cn_call.exon16_depth,
        SMN1_read_support=smn1_read_count,
        SMN2_read_support=smn2_read_count,
        SMN1_fraction=[round(a, 2) for a in smn1_fraction],
        g27134TG_REF_count=var_ref_count,
        g27134TG_ALT_count=var_alt_count,
    )
    doutput = sample_cn_call._asdict()
    doutput.update(final_call._asdict())
    return doutput
def d6_star_caller(bam, call_parameters, threads, count_file=None, reference_fasta=None):
    """Return CYP2D6 star allele diplotype calls for each sample.

    The result is filled in stage by stage (depth normalization, copy-number
    call, raw read counts, CNV group, star allele match). When a stage cannot
    produce a call, the partially filled result is returned with the remaining
    fields left as None.

    Args:
        bam: Alignment file; opened with ``open_alignment_file`` and also
            passed directly to the depth and read-counting helpers.
        call_parameters: Object providing ``region_dic``, ``gmm_parameter``,
            ``snp_db``, ``var_db``, ``var_homo_db``, ``var_list``, ``genome``
            and ``star_combinations``.
        threads: Forwarded to ``get_normed_depth``.
        count_file: Optional count file. When not None, normalized depth is
            computed by ``get_normed_depth_from_count`` using the read length
            taken from the alignment file.
        reference_fasta: Optional reference, forwarded to the helpers that
            read the alignment file.

    Returns:
        A ``d6_call`` namedtuple with fields ``Coverage_MAD``,
        ``Median_depth``, ``Total_CN``, ``Spacer_CN``, ``Total_CN_raw``,
        ``Spacer_CN_raw``, ``Variants_called``, ``CNV_group``, ``Genotype``,
        ``Filter``, ``Raw_star_allele``, ``Call_info``, ``Exon9_CN``,
        ``CNV_consensus``, ``d67_snp_call``, ``d67_snp_raw`` and
        ``Variant_raw_count``.
    """
    d6_call = namedtuple(
        "d6_call",
        [
            "Coverage_MAD",
            "Median_depth",
            "Total_CN",
            "Spacer_CN",
            "Total_CN_raw",
            "Spacer_CN_raw",
            "Variants_called",
            "CNV_group",
            "Genotype",
            "Filter",
            "Raw_star_allele",
            "Call_info",
            "Exon9_CN",
            "CNV_consensus",
            "d67_snp_call",
            "d67_snp_raw",
            "Variant_raw_count",
        ],
    )

    bamfile = open_alignment_file(bam, reference_fasta)
    # try/finally so the handle is released on every return path (including
    # the early no-call returns) and when a helper raises.
    try:
        # 1. Read counting and normalization
        if count_file is not None:
            read_length = get_read_length(bamfile.fetch())
            normalized_depth = get_normed_depth_from_count(
                count_file, call_parameters.region_dic, read_length
            )
        else:
            normalized_depth = get_normed_depth(
                bam, call_parameters.region_dic, threads, reference=reference_fasta
            )

        # Start from an all-None call and fill fields by name as they become
        # available.
        sample_call = d6_call(*([None] * len(d6_call._fields)))._replace(
            Coverage_MAD=normalized_depth.mad,
            Median_depth=normalized_depth.mediandepth,
        )

        # no-call after normalization
        if normalized_depth.normalized["d67"] is None:
            return sample_call

        # 2. GMM and CN call
        # There are two regions to call CN based on depth: total CYP2D6+CYP2D7,
        # and CYP2D7 spacer region
        cn_call = namedtuple("cn_call", "d67_cn d67_depth spacer_cn spacer_depth")
        gmm_d67 = Gmm()
        gmm_d67.set_gmm_par(call_parameters.gmm_parameter, "d67")
        gcall_d67 = gmm_d67.gmm_call(normalized_depth.normalized["d67"])
        gmm_spacer = Gmm()
        gmm_spacer.set_gmm_par(call_parameters.gmm_parameter, "spacer")
        gcall_spacer = gmm_spacer.gmm_call(normalized_depth.normalized["spacer"])

        high_cn_low_confidence = False
        d67_cn = gcall_d67.cn
        if d67_cn is None and gcall_d67.depth_value > HIGH_CN_DEPTH_THRESHOLD:
            # The GMM gave no call but depth is above the high-CN threshold:
            # fall back to the rounded depth and remember to flag the result.
            high_cn_low_confidence = True
            d67_cn = int(round(gcall_d67.depth_value))
        raw_cn_call = cn_call(
            d67_cn=d67_cn,
            d67_depth=gcall_d67.depth_value,
            spacer_cn=gcall_spacer.cn,
            spacer_depth=gcall_spacer.depth_value,
        )

        # 3. Get allele counts at D6/D7 SNP (base difference) sites and target
        # variant sites
        # D6/D7 base difference sites. Get read counts at both D6/D7 positions.
        snp_db = call_parameters.snp_db
        snp_d6, snp_d7 = get_supporting_reads(
            bam,
            snp_db.dsnp1,
            snp_db.dsnp2,
            snp_db.nchr,
            snp_db.dindex,
            reference=reference_fasta,
        )
        site42126938 = [snp_d6[VAR42126938_SITE], snp_d7[VAR42126938_SITE]]
        # Drop the entries at VAR42126938_SITE and the one before it from both
        # count lists; higher index first so the lower index is still valid.
        for site_counts in (snp_d6, snp_d7):
            site_counts.pop(VAR42126938_SITE)
            site_counts.pop(VAR42126938_SITE - 1)

        # Variants not in homology regions. Get read counts only at D6 positions.
        var_db = call_parameters.var_db
        var_alt, var_ref = get_supporting_reads_single_region(
            bam, var_db.dsnp1, var_db.nchr, var_db.dindex, reference=reference_fasta
        )

        # Look more carefully for insertions at 42128936 from reads
        var_list = call_parameters.var_list
        ref_read, long_ins_read, short_ins_read = get_allele_counts_42128936(
            bamfile, call_parameters.genome
        )
        long_ins_name = "g.42128936-42128937insGGGGCGAAAGGGGCGAAA"
        short_ins_name = "g.42128936-42128937insGGGGCGAAA"
        if long_ins_name in var_list:
            long_ins_index = var_list.index(long_ins_name)
            var_alt[long_ins_index] = long_ins_read
            var_ref[long_ins_index] = short_ins_read + ref_read
        if short_ins_name in var_list:
            short_ins_index = var_list.index(short_ins_name)
            var_alt[short_ins_index] = short_ins_read
            var_ref[short_ins_index] = long_ins_read + ref_read

        # Variants in homology regions. Get read counts at both D6/D7 positions.
        var_homo_db = call_parameters.var_homo_db
        var_homo_alt, var_homo_ref = get_supporting_reads(
            bam,
            var_homo_db.dsnp1,
            var_homo_db.dsnp2,
            var_homo_db.nchr,
            var_homo_db.dindex,
            reference=reference_fasta,
        )

        # This ordered dictionary is for final reporting.
        # The first len(var_alt) names in var_list take their counts from the
        # non-homology lists, the rest from the homology lists.
        raw_count = OrderedDict()
        non_homology_variant_count = len(var_alt)
        for i, var_name in enumerate(var_list):
            if i < non_homology_variant_count:
                raw_count.setdefault(var_name, "%i,%i" % (var_alt[i], var_ref[i]))
            else:
                homo_index = i - non_homology_variant_count
                raw_count.setdefault(
                    var_name,
                    "%i,%i" % (var_homo_alt[homo_index], var_homo_ref[homo_index]),
                )
        raw_count.setdefault(
            "g.42126938C>T", "%i,%i" % (site42126938[0], site42126938[1])
        )

        sample_call = sample_call._replace(
            Total_CN=raw_cn_call.d67_cn,
            Spacer_CN=raw_cn_call.spacer_cn,
            Total_CN_raw=raw_cn_call.d67_depth,
            Spacer_CN_raw=raw_cn_call.spacer_depth,
            Variant_raw_count=raw_count,
        )

        # no-call due to total copy number calling
        if raw_cn_call.d67_cn is None:
            return sample_call

        # 4. Call CNV and hybrids
        d6_fraction = get_fraction(snp_d6, snp_d7)
        raw_d6_cn = [round(raw_cn_call.d67_cn * a, 3) for a in d6_fraction]
        cn_call_snp = call_cn_snp(raw_cn_call.d67_cn, snp_d6, snp_d7)

        # exon9gc
        exon9gc_call_stringent = call_exon9gc(
            snp_d6[EXON9_SITE1], snp_d7[EXON9_SITE1], raw_cn_call.d67_cn
        )
        cnvtag, consensus = get_cnvtag(
            raw_cn_call.d67_cn,
            raw_d6_cn,
            cn_call_snp,
            exon9gc_call_stringent,
            raw_cn_call.spacer_cn,
        )

        sample_call = sample_call._replace(
            CNV_group=cnvtag,
            Exon9_CN=exon9gc_call_stringent,
            CNV_consensus=",".join(str(a) for a in consensus),
            d67_snp_call=",".join(str(a) for a in cn_call_snp),
            d67_snp_raw=",".join(str(a) for a in raw_d6_cn),
        )

        # no-call due to CNV group calling
        if cnvtag is None or cnvtag not in CNV_ACCEPTED:
            return sample_call

        # 5. Call variants
        # homology region
        cn_call_var_homo = call_cn_var_homo(
            raw_cn_call.d67_cn, var_homo_alt, var_homo_ref
        )
        # non-homology region
        cn_call_var = call_cn_var(cnvtag, var_alt, var_ref, var_list, var_db)
        # call g.42126938C>T
        if cnvtag in ["star5", "cn2"]:
            var42126938, G_haplotype = call_var42126938(
                bamfile,
                cnvtag,
                site42126938,
                snp_db,
                [VAR42126938_SITE - 2, VAR42126938_SITE - 1, VAR42126938_SITE],
            )
        else:
            var42126938 = []
            G_haplotype = False

        # 6. Call star allele
        total_callset = get_called_variants(var_list, cn_call_var)
        called_var_homo = get_called_variants(
            var_list, cn_call_var_homo, len(cn_call_var)
        )
        total_callset += called_var_homo
        total_callset += var42126938

        exon9_values = namedtuple(
            "exon9_values",
            "exon9_cn exon9cn_in_consensus exon9_raw_site1 exon9_raw_site2",
        )
        star_called = match_star(
            total_callset,
            cnvtag,
            raw_cn_call.spacer_cn,
            call_parameters.star_combinations,
            exon9_values(
                exon9gc_call_stringent,
                consensus.exon9_and_downstream,
                raw_d6_cn[EXON9_SITE1],
                raw_d6_cn[EXON9_SITE2],
            ),
        )

        genotype_filter = None
        if "no_match" in star_called[0]:
            # no-call due to star allele matching
            final_star_allele_call = None
        elif (
            star_called[0] == "more_than_one_match"
            and star_called[-1] == "*1/*32;*27/*41"
        ):
            # This specific ambiguity is resolved with the haplotype flag
            # returned by call_var42126938.
            genotype_filter = "PASS"
            final_star_allele_call = "*1/*32" if G_haplotype else "*27/*41"
        else:
            final_star_allele_call = star_called[-1]
            if ";" in final_star_allele_call:
                genotype_filter = "More_than_one_possible_genotype"
            elif "/" not in final_star_allele_call:
                genotype_filter = "Not_assigned_to_haplotypes"
            elif high_cn_low_confidence:
                genotype_filter = "LowQ_high_CN"
            else:
                genotype_filter = "PASS"

        return sample_call._replace(
            Variants_called=star_called.variants_called.split(),
            Genotype=final_star_allele_call,
            Filter=genotype_filter,
            Raw_star_allele=star_called.raw_call,
            Call_info=star_called.call_info,
        )
    finally:
        bamfile.close()
def d6_star_caller(
    bam, call_parameters, threads, count_file=None, reference_fasta=None, index_name=None
):
    """Return CYP2D6 star allele diplotype calls for each sample.

    The result is filled in stage by stage (depth normalization, copy-number
    call, raw read counts, CNV group, star allele match). When a stage cannot
    produce a call, the partially filled result is returned with the remaining
    fields left as None.

    Args:
        bam: Alignment file; opened with ``open_alignment_file`` and also
            passed to ``get_normed_depth`` when no count file is given.
        call_parameters: Object providing ``region_dic``, ``gmm_parameter``,
            ``snp_db``, ``var_db``, ``var_homo_db``, ``var_list``, ``genome``,
            ``haplotype_db`` and ``star_combinations``.
        threads: Forwarded to ``get_normed_depth``.
        count_file: Optional count file. When not None, normalized depth is
            computed by ``get_normed_depth_from_count`` using the read length
            taken from the alignment file.
        reference_fasta: Optional reference, forwarded to
            ``open_alignment_file`` and ``get_normed_depth``.
        index_name: Optional value forwarded to ``open_alignment_file`` as
            ``index_filename``.

    Returns:
        A ``d6_call`` namedtuple with fields ``Coverage_MAD``,
        ``Median_depth``, ``Total_CN``, ``Spacer_CN``, ``Total_CN_raw``,
        ``Spacer_CN_raw``, ``Variants_called``, ``CNV_group``, ``Genotype``,
        ``Filter``, ``Raw_star_allele``, ``Call_info``, ``Exon9_CN``,
        ``CNV_consensus``, ``d67_snp_call``, ``d67_snp_raw`` and
        ``Variant_raw_count``.
    """
    d6_call = namedtuple(
        "d6_call",
        [
            "Coverage_MAD",
            "Median_depth",
            "Total_CN",
            "Spacer_CN",
            "Total_CN_raw",
            "Spacer_CN_raw",
            "Variants_called",
            "CNV_group",
            "Genotype",
            "Filter",
            "Raw_star_allele",
            "Call_info",
            "Exon9_CN",
            "CNV_consensus",
            "d67_snp_call",
            "d67_snp_raw",
            "Variant_raw_count",
        ],
    )

    bamfile = open_alignment_file(bam, reference_fasta, index_filename=index_name)
    # try/finally so the handle is released on every return path (including
    # the early no-call returns) and when a helper raises.
    try:
        # 1. Read counting and normalization
        if count_file is not None:
            read_length = get_read_length(bamfile.fetch())
            normalized_depth = get_normed_depth_from_count(
                count_file, call_parameters.region_dic, read_length
            )
        else:
            normalized_depth = get_normed_depth(
                bam, call_parameters.region_dic, threads, reference=reference_fasta
            )

        # Start from an all-None call and fill fields by name as they become
        # available.
        sample_call = d6_call(*([None] * len(d6_call._fields)))._replace(
            Coverage_MAD=normalized_depth.mad,
            Median_depth=normalized_depth.mediandepth,
        )

        # no-call after normalization
        if normalized_depth.normalized["d67"] is None:
            return sample_call

        # 2. GMM and CN call
        # There are two regions to call CN based on depth: total CYP2D6+CYP2D7,
        # and CYP2D7 spacer region
        cn_call = namedtuple("cn_call", "d67_cn d67_depth spacer_cn spacer_depth")
        gmm_d67 = Gmm()
        gmm_d67.set_gmm_par(call_parameters.gmm_parameter, "d67")
        gcall_d67 = gmm_d67.gmm_call(normalized_depth.normalized["d67"])
        gmm_spacer = Gmm()
        gmm_spacer.set_gmm_par(call_parameters.gmm_parameter, "spacer")
        gcall_spacer = gmm_spacer.gmm_call(normalized_depth.normalized["spacer"])

        high_cn_low_confidence = False
        d67_cn = gcall_d67.cn
        if d67_cn is None and gcall_d67.depth_value > HIGH_CN_DEPTH_THRESHOLD:
            # The GMM gave no call but depth is above the high-CN threshold:
            # fall back to the rounded depth and remember to flag the result.
            high_cn_low_confidence = True
            d67_cn = int(round(gcall_d67.depth_value))
        raw_cn_call = cn_call(
            d67_cn=d67_cn,
            d67_depth=gcall_d67.depth_value,
            spacer_cn=gcall_spacer.cn,
            spacer_depth=gcall_spacer.depth_value,
        )

        # 3. Get allele counts at D6/D7 SNP (base difference) sites and target
        # variant sites
        # D6/D7 base difference sites. Get read counts at both D6/D7 positions.
        snp_db = call_parameters.snp_db
        snp_d6, snp_d7 = get_supporting_reads(
            bamfile, snp_db.dsnp1, snp_db.dsnp2, snp_db.nchr, snp_db.dindex
        )

        # Variants not in homology regions. Get read counts only at D6 positions.
        var_db = call_parameters.var_db
        (
            var_alt,
            var_ref,
            var_alt_forward,
            var_alt_reverse,
        ) = get_supporting_reads_single_region(
            bamfile, var_db.dsnp1, var_db.nchr, var_db.dindex
        )

        # Look more carefully for insertions at 42128936 from reads
        var_list = call_parameters.var_list
        ref_read, long_ins_read, short_ins_read = get_allele_counts_var42128936(
            bamfile, call_parameters.genome
        )
        var_alt, var_ref = update_var42128936(
            var_list, var_alt, var_ref, ref_read, long_ins_read, short_ins_read
        )

        # Variants in homology regions. Get read counts at both D6/D7 positions.
        var_homo_db = call_parameters.var_homo_db
        var_homo_alt, var_homo_ref = get_supporting_reads(
            bamfile,
            var_homo_db.dsnp1,
            var_homo_db.dsnp2,
            var_homo_db.nchr,
            var_homo_db.dindex,
        )

        # This ordered dictionary is for final reporting.
        # The first len(var_alt) names in var_list take their counts from the
        # non-homology lists, the rest from the homology lists.
        raw_count = OrderedDict()
        non_homology_variant_count = len(var_alt)
        for i, var_name in enumerate(var_list):
            if i >= non_homology_variant_count:
                homo_index = i - non_homology_variant_count
                raw_count.setdefault(
                    var_name,
                    "%i,%i" % (var_homo_alt[homo_index], var_homo_ref[homo_index]),
                )
            elif var_name in NOISY_VAR:
                # Variants listed in NOISY_VAR also report the forward and
                # reverse alt counts.
                raw_count.setdefault(
                    var_name,
                    "%i(%i:%i),%i"
                    % (var_alt[i], var_alt_forward[i], var_alt_reverse[i], var_ref[i]),
                )
            else:
                raw_count.setdefault(var_name, "%i,%i" % (var_alt[i], var_ref[i]))

        # raw_count is stored by reference, so the haplotype-site entries
        # added in step 5 are visible in the final result as well.
        sample_call = sample_call._replace(
            Total_CN=raw_cn_call.d67_cn,
            Spacer_CN=raw_cn_call.spacer_cn,
            Total_CN_raw=raw_cn_call.d67_depth,
            Spacer_CN_raw=raw_cn_call.spacer_depth,
            Variant_raw_count=raw_count,
        )

        # no-call due to total copy number calling
        if raw_cn_call.d67_cn is None:
            return sample_call

        # 4. Call CNV and hybrids
        d6_fraction = get_fraction(snp_d6, snp_d7)
        raw_d6_cn = [round(raw_cn_call.d67_cn * a, 3) for a in d6_fraction]
        cn_call_snp = call_cn_snp(raw_cn_call.d67_cn, snp_d6, snp_d7)

        # exon9gc
        exon9gc_call_stringent = call_exon9gc(
            snp_d6[EXON9_SITE1 : EXON9_SITE2 + 1],
            snp_d7[EXON9_SITE1 : EXON9_SITE2 + 1],
            raw_cn_call.d67_cn,
        )
        cnvtag, consensus = get_cnvtag(
            raw_cn_call.d67_cn,
            raw_d6_cn,
            cn_call_snp,
            exon9gc_call_stringent,
            raw_cn_call.spacer_cn,
        )

        sample_call = sample_call._replace(
            CNV_group=cnvtag,
            Exon9_CN=exon9gc_call_stringent,
            CNV_consensus=",".join(str(a) for a in consensus),
            d67_snp_call=",".join(str(a) for a in cn_call_snp),
            d67_snp_raw=",".join(str(a) for a in raw_d6_cn),
        )

        # no-call due to CNV group calling
        if cnvtag is None or cnvtag not in CNV_ACCEPTED:
            return sample_call

        # 5. Call variants
        # homology region
        cn_call_var_homo = call_cn_var_homo(
            raw_cn_call.d67_cn, var_homo_alt, var_homo_ref
        )
        # non-homology region
        cn_call_var = call_cn_var(
            cnvtag, var_alt, var_ref, var_alt_forward, var_alt_reverse, var_list, var_db
        )

        # call haplotypes
        haplotype_db = call_parameters.haplotype_db
        site42126938_count, var42126938, var42126938_G_haplotype = call_var42126938(
            bamfile, raw_cn_call.d67_cn, haplotype_db["g.42126938C>T"]
        )
        # The site count pairs are reported with element [1] first, then [0].
        raw_count.setdefault(
            "g.42126938C>T",
            "%i,%i" % (site42126938_count[1], site42126938_count[0]),
        )
        (
            site42127526_count,
            site42127556_count,
            var42127526,
        ) = call_var42127526_var42127556(
            bamfile, cnvtag, haplotype_db["g.42127526C>T_g.42127556T>C"]
        )
        raw_count.setdefault(
            "g.42127526C>T",
            "%i,%i" % (site42127526_count[1], site42127526_count[0]),
        )
        raw_count.setdefault(
            "g.42127556T>C",
            "%i,%i" % (site42127556_count[1], site42127556_count[0]),
        )
        var42127803_diff_haplotype = call_var42127803hap(
            bamfile, cnvtag, haplotype_db["g.42127803C>T"]
        )

        # 6. Call star allele
        total_callset = get_called_variants(var_list, cn_call_var)
        called_var_homo = get_called_variants(
            var_list, cn_call_var_homo, len(cn_call_var)
        )
        total_callset += called_var_homo
        total_callset += var42126938
        total_callset += var42127526

        # NOTE(review): exon9_values is not defined in this function; it is
        # presumably a module-level namedtuple -- confirm.
        star_called = match_star(
            total_callset,
            cnvtag,
            raw_cn_call.spacer_cn,
            call_parameters.star_combinations,
            exon9_values(
                exon9gc_call_stringent,
                consensus.exon9_and_downstream,
                raw_d6_cn[EXON9_SITE1],
                raw_d6_cn[EXON9_SITE2],
            ),
            var42126938_G_haplotype,
            var42127803_diff_haplotype,
        )

        genotype_filter = None
        if "no_match" in star_called[0]:
            # no-call due to star allele matching
            final_star_allele_call = None
        else:
            final_star_allele_call = star_called[-1]
            if ";" in final_star_allele_call:
                genotype_filter = "More_than_one_possible_genotype"
            elif "/" not in final_star_allele_call:
                genotype_filter = "Not_assigned_to_haplotypes"
            elif high_cn_low_confidence:
                genotype_filter = "LowQ_high_CN"
            else:
                genotype_filter = "PASS"

        return sample_call._replace(
            Variants_called=star_called.variants_called.split(),
            Genotype=final_star_allele_call,
            Filter=genotype_filter,
            Raw_star_allele=star_called.raw_call,
            Call_info=star_called.call_info,
        )
    finally:
        bamfile.close()