def doTDT(v, family, thresh):
    """
    Perform the Transmission Disequilibrium Test (TDT) on one VCF line.

    Parameters
    ----------
    v: line of the VCF, indexed both by the fixed column names
        ('CHROM', 'POS', 'ID', 'REF', 'ALT', 'FILTER', 'VQSLOD', 'AF', 'AC')
        and by individual id. A per-individual entry may be None; otherwise
        its 'GT' entry is read here.
    family is a hash table with:
        Key: individual id
        Value: [father id, mother id, sex]
    thresh: hash table of thresholds ('GQ_Kid_Thresh' and 'PL_Thresh' are
        read here; the whole table is also handed to the filter helpers)

    Returns
    -------
    None if `v` is falsy or if no transmission/untransmission was counted.
    Otherwise a tuple (row, indivs_T, indivs_U) where `row` is the summary
    list for the variant and indivs_T / indivs_U start with
    [CHROM, POS, ID, REF, ALT] followed by one flattened
    (proband id, sex, proband data, father id, father data, mother id,
    mother data) record per family.
    """
    # If filters on the line failed move on.
    if not v:
        return None

    TU = [0, 0]    # Array of [transmissions, untransmissions]
    TU_m = [0, 0]  # same as TU but for males
    TU_f = [0, 0]  # same as TU but for females
    mErr = 0       # Count mendelian error: Parents: ref, child: het
    mErr_o = 0     # Count other mendelian errors
    Nproband_alt = 0  # Number of homozygous alt probands that passed all thresholds
    AN = 0            # Number of families that passed all thresholds

    # Individuals who were transmitted / did not receive the variant.
    indivs_T = [v['CHROM'], v['POS'], v['ID'], v['REF'], v['ALT']]
    indivs_U = [v['CHROM'], v['POS'], v['ID'], v['REF'], v['ALT']]

    for indiv_id in family:  # loop through all the probands
        # indiv_data is their GT:AD:DP:GQ:PL stats
        indiv_data = v[indiv_id]
        if indiv_data is None:
            continue

        # Apply quality control filters on proband.
        if not filters.passFilters(indiv_data, thresh, thresh['GQ_Kid_Thresh']):
            continue

        # Apply PL filter to child.
        if not filters.PhredScaleFilter(indiv_data, thresh['PL_Thresh']):
            continue

        trio = family[indiv_id]
        father_id = trio[0]
        mother_id = trio[1]
        father = v[father_id]
        mother = v[mother_id]

        # Check if the parents have the alternate allele
        # so they can pass it on AND apply quality control filters.
        if not filters.TDT_Parent_Filters(father, mother, thresh):
            continue

        AN += 1  # all individuals in the nuclear family passed the filters
        sex = trio[2]

        # TDT operates differently in the hemizygous chromosomes.
        # PAR regions defined from
        # http://www.ncbi.nlm.nih.gov/projects/genome/assembly/grc/human/
        # A truthy result from check_Hemizgyous selects normal transmission
        # (flag False); otherwise the flag passed on is True.
        hemizygous = not filters.check_Hemizgyous(
            v['CHROM'], sex, filters.inPar(v['POS']))
        TU, TU_m, TU_f, mErr, mErr_o, transFlag = numberTransmissions(
            indiv_data['GT'], father['GT'], mother['GT'],
            TU, TU_m, TU_f, sex, hemizygous, mErr, mErr_o)

        if indiv_data['GT'] == 'homoAlt':
            Nproband_alt += 1

        record = (indiv_id, sex, indiv_data,
                  father_id, father, mother_id, mother)
        # transFlag is compared by equality on purpose: any value equal to
        # neither True nor False leaves the family out of both lists.
        if transFlag == True:  # noqa: E712 - the variant was transmitted
            indivs_T.extend(record)
        elif transFlag == False:  # noqa: E712
            indivs_U.extend(record)

    # Ignore the cases in which we have 0 transmissions and 0 untransmissions.
    if TU[0] + TU[1] == 0:
        return None

    # Fraction of mendelian errors among all counted events.
    # NOTE(review): relies on true division; under Python 2 without
    # `from __future__ import division` this would truncate - confirm.
    mendErrorPercent = (mErr + mErr_o) / (TU[0] + TU[1] + mErr + mErr_o)

    # vepFieldNames, vepA and args are module-level names defined outside
    # this function; annotation is skipped when vepFieldNames is falsy.
    if vepFieldNames:
        gene, anno, pph2, sift, lof = vepA.findVariantAnnotation(
            v, args, vepFieldNames)
    else:
        gene, anno, pph2, sift, lof = ('', '', '', '', '')

    return [
        v['CHROM'], v['POS'], v['ID'], v['REF'], v['ALT'],
        v['FILTER'], v['VQSLOD'],
        gene, anno, pph2, sift, lof,
        v['AF'], v['AC'], AN, Nproband_alt,
        TU[0], TU[1], TU_m[0], TU_m[1], TU_f[0], TU_f[1],
        mErr, mErr_o, mendErrorPercent
    ], indivs_T, indivs_U
def doTDT(v, family, thresh):
    """
    Perform the Transmission Disequilibrium Test (TDT) on one VCF line.

    Parameters
    ----------
    v: line of the VCF, indexed both by the fixed column names
        ('CHROM', 'POS', 'ID', 'REF', 'ALT', 'AF', 'AC') and by individual
        id. A per-individual entry may be None; otherwise its 'GT' entry is
        read here.
    family is a hash table with:
        Key: individual id
        Value: [father id, mother id, sex]
    thresh: hash table of thresholds ('GQ_Kid_Thresh' and 'PL_Thresh' are
        read here; the whole table is also handed to the filter helpers)

    Returns
    -------
    None if `v` is falsy or if no transmission/untransmission was counted.
    Otherwise a tuple (row, indivs_T, indivs_U) where `row` is the summary
    list for the variant and indivs_T / indivs_U start with
    [CHROM, POS, ID, REF, ALT] followed by one flattened
    (proband id, proband data, father id, father data, mother id,
    mother data) record per family.
    """
    # If filters on the line failed move on.
    if not v:
        return None

    TU = [0, 0]    # Array of [transmissions, untransmissions]
    TU_m = [0, 0]  # same as TU but for males
    TU_f = [0, 0]  # same as TU but for females
    mErr = 0       # Count mendelian error: Parents: ref, child: het
    mErr_o = 0     # Count other mendelian errors
    N_het = 0      # Number of heterozygous individuals that passed all thresholds
    Nproband_alt = 0  # Number of homozygous alt probands that passed all thresholds
    AN = 0            # Number of families that passed all thresholds
    DP_het = []  # Array of depth of all het individuals who passed filters
    DP = []      # Array of depth of all non-het individuals who passed filters
    AB = []      # Array of the allelic balance

    # Individuals who were transmitted / did not receive the variant.
    indivs_T = [v['CHROM'], v['POS'], v['ID'], v['REF'], v['ALT']]
    indivs_U = [v['CHROM'], v['POS'], v['ID'], v['REF'], v['ALT']]

    for indiv_id in family:  # loop through all the probands
        # indiv_data is their GT:AD:DP:GQ:PL stats
        indiv_data = v[indiv_id]
        if indiv_data is None:
            continue

        # Apply quality control filters on proband.
        if not filters.passFilters(indiv_data, thresh, thresh['GQ_Kid_Thresh']):
            continue

        # Apply PL filter to child.
        if not filters.PhredScaleFilter(indiv_data, thresh['PL_Thresh']):
            continue

        trio = family[indiv_id]
        father_id = trio[0]
        mother_id = trio[1]
        father = v[father_id]
        mother = v[mother_id]

        # Check if the parents have the alternate allele
        # so they can pass it on AND apply quality control filters.
        if not filters.TDT_Parent_Filters(father, mother, thresh):
            continue

        AN += 1  # all individuals in the nuclear family passed the filters
        sex = trio[2]

        # TDT operates differently in the hemizygous chromosomes.
        # PAR regions defined from
        # http://www.ncbi.nlm.nih.gov/projects/genome/assembly/grc/human/
        # A truthy result from check_Hemizgyous selects normal transmission
        # (flag False); otherwise the flag passed on is True.
        hemizygous = not filters.check_Hemizgyous(
            v['CHROM'], sex, filters.inPar(v['POS']))
        TU, TU_m, TU_f, mErr, mErr_o, transFlag = numberTransmissions(
            indiv_data['GT'], father['GT'], mother['GT'],
            TU, TU_m, TU_f, sex, hemizygous, mErr, mErr_o)

        # Update totals
        AB, N_het, Nproband_alt, DP, DP_het = updateTotals(
            AB, N_het, Nproband_alt, DP, DP_het, indiv_data, father, mother)

        record = (indiv_id, indiv_data, father_id, father, mother_id, mother)
        # transFlag is compared by equality on purpose: any value equal to
        # neither True nor False leaves the family out of both lists.
        if transFlag == True:  # noqa: E712 - the variant was transmitted
            indivs_T.extend(record)
        elif transFlag == False:  # noqa: E712
            indivs_U.extend(record)

    # Ignore the cases in which we have 0 transmissions and 0 untransmissions.
    if TU[0] + TU[1] == 0:
        return None

    # Fraction of mendelian errors among all counted events.
    # NOTE(review): relies on true division; under Python 2 without
    # `from __future__ import division` this would truncate - confirm.
    mendErrorPercent = (mErr + mErr_o) / (TU[0] + TU[1] + mErr + mErr_o)

    # Calculate averages for allelic balance (AB), depth (DP), and depth of
    # hets (DP_het). The overall depth pools the non-het and het depths.
    # NOTE(review): np.average of an empty list yields nan with a
    # RuntimeWarning; whether updateTotals always fills these is not
    # visible here - confirm.
    AB_mean = np.average(np.array(AB))
    DP_mean = np.average(np.concatenate((np.array(DP), np.array(DP_het))))
    DP_het_mean = np.average(np.array(DP_het))

    # vepA, args and vepFieldNames are module-level names defined outside
    # this function.
    gene, anno, pph2, sift, lof = vepA.findVariantAnnotation(
        v, args, vepFieldNames)

    return [
        v['CHROM'], v['POS'], v['ID'], v['REF'], v['ALT'],
        gene, anno, pph2, sift, lof,
        v['AF'], v['AC'], AN, AB_mean, DP_mean, DP_het_mean, Nproband_alt,
        TU[0], TU[1], TU_m[0], TU_m[1], TU_f[0], TU_f[1],
        mErr, mErr_o, mendErrorPercent
    ], indivs_T, indivs_U
def doCaseControl(v, cases, controls, thresh):
    """
    Analyzes case/control data, counting the number of reference and
    alternate alleles, split by sex.

    Parameters
    ----------
    v: line of the VCF, indexed by the fixed column names ('CHROM', 'POS',
        'ID', 'REF', 'ALT', 'FILTER', 'VQSLOD') and looked up by individual
        id with `.get`, so an individual that is absent from the line (or
        whose entry is None) is skipped.
    case and control are hash tables with:
        Key: individual id
        Value: gender ('male' / 'female'; any other value is not counted)
    thresh is a hash table with:
        Key: name of threshold
        Value: threshold

    Returns
    -------
    None if `v` is falsy, or if no alternate allele or no reference allele
    was counted over cases and controls together. Otherwise the summary
    list for the variant: position/annotation columns, AF, AC, AN, the
    pooled case/control ref/alt counts, then the male and female counts.
    """
    # If filters on the line failed move on.
    if not v:
        return None

    # Count the number of ref and alt alleles in cases and controls.
    caseRefs_m, caseAlts_m, caseRefs_f, caseAlts_f = _countGroupBySex(
        v, cases, thresh)
    controlRefs_m, controlAlts_m, controlRefs_f, controlAlts_f = _countGroupBySex(
        v, controls, thresh)

    caseAlts = caseAlts_f + caseAlts_m
    caseRefs = caseRefs_f + caseRefs_m
    controlAlts = controlAlts_f + controlAlts_m
    controlRefs = controlRefs_f + controlRefs_m

    # Nothing to report for a site with no alt or no ref allele counted.
    if caseAlts + controlAlts == 0 or caseRefs + controlRefs == 0:
        return None

    AC = caseAlts + controlAlts
    AN = AC + caseRefs + controlRefs
    AF = AC / AN  # AN > 0 here because of the early return above

    # vepFieldNames, vepA and args are module-level names defined outside
    # this function; annotation is skipped when vepFieldNames is falsy.
    if vepFieldNames:
        gene, anno, pph2, sift, lof = vepA.findVariantAnnotation(
            v, args, vepFieldNames)
    else:
        gene, anno, pph2, sift, lof = ('', '', '', '', '')

    return [
        v['CHROM'], v['POS'], v['ID'], v['REF'], v['ALT'],
        v['FILTER'], v['VQSLOD'],
        gene, anno, pph2, sift, lof,
        AF, AC, AN,
        caseRefs, caseAlts, controlRefs, controlAlts,
        caseRefs_m, caseAlts_m, controlRefs_m, controlAlts_m,
        caseRefs_f, caseAlts_f, controlRefs_f, controlAlts_f
    ]


def _countGroupBySex(v, group, thresh):
    """Count ref/alt alleles for one group (cases or controls), split by sex.

    Returns (refs_male, alts_male, refs_female, alts_female).
    """
    refs_m = alts_m = refs_f = alts_f = 0
    for indiv_id in group:
        # indiv_data is their GT:AD:DP:GQ:PL stats; .get so that an
        # individual missing from the line is skipped instead of raising.
        indiv_data = v.get(indiv_id)
        if indiv_data is None:
            continue

        # Apply filters and update counts. Note: group[indiv_id] is gender.
        if not ProcessCC(indiv_data, thresh):
            continue

        sex = group[indiv_id]
        parFlag = filters.check_Hemizgyous(
            v['CHROM'], sex, filters.inPar(v['POS']))
        if sex == 'male':
            refs_m, alts_m = Counts(indiv_data, refs_m, alts_m, parFlag)
        elif sex == 'female':
            refs_f, alts_f = Counts(indiv_data, refs_f, alts_f, parFlag)
    return refs_m, alts_m, refs_f, alts_f
def doCaseControl(v, cases, controls, thresh):
    """
    Analyzes case/control data, counting the number of reference and
    alternate alleles.

    Parameters
    ----------
    v: line of the VCF, indexed by the fixed column names ('CHROM', 'POS',
        'ID', 'REF', 'ALT') and looked up by individual id with `.get`, so
        an individual that is absent from the line (or whose entry is None)
        is skipped.
    case and control are hash tables with:
        Key: individual id
        Value: gender
    thresh is a hash table with:
        Key: name of threshold
        Value: threshold

    Returns
    -------
    None if `v` is falsy, or if no alternate allele or no reference allele
    was counted over cases and controls together. Otherwise the summary
    list for the variant: position/annotation columns, AF, AC, AN, then the
    case and control ref/alt counts.
    """
    # If filters on the line failed move on.
    if not v:
        return None

    # Count the number of ref and alt alleles in cases and controls.
    caseRefs, caseAlts = _countGroup(v, cases, thresh)
    controlRefs, controlAlts = _countGroup(v, controls, thresh)

    # Nothing to report for a site with no alt or no ref allele counted.
    if caseAlts + controlAlts == 0 or caseRefs + controlRefs == 0:
        return None

    AC = caseAlts + controlAlts
    AN = AC + caseRefs + controlRefs
    AF = AC / AN  # AN > 0 here because of the early return above

    # vepA, args and vepFieldNames are module-level names defined outside
    # this function.
    gene, anno, pph2, sift, lof = vepA.findVariantAnnotation(
        v, args, vepFieldNames)

    return [
        v['CHROM'], v['POS'], v['ID'], v['REF'], v['ALT'],
        gene, anno, pph2, sift, lof,
        AF, AC, AN,
        caseRefs, caseAlts, controlRefs, controlAlts
    ]


def _countGroup(v, group, thresh):
    """Count ref/alt alleles for one group (cases or controls).

    Returns (refs, alts).
    """
    refs = 0
    alts = 0
    for indiv_id in group:
        # indiv_data is their GT:AD:DP:GQ:PL stats; .get so that an
        # individual missing from the line is skipped instead of raising.
        indiv_data = v.get(indiv_id)
        if indiv_data is None:
            continue

        # Apply filters and update counts. Note: group[indiv_id] is gender.
        if ProcessCC(indiv_data, thresh):
            parFlag = filters.check_Hemizgyous(
                v['CHROM'], group[indiv_id], filters.inPar(v['POS']))
            refs, alts = Counts(indiv_data, refs, alts, parFlag)
    return refs, alts