print "Checking sample consistency and creating observation array..." for line in fin.readlines(): counter += 1 linelist = line.split("\t") chrom = linelist[headlist.index("CHROM")] posit = linelist[headlist.index("POS")] info = linelist[headlist.index("INFO")] #format = linelist[headlist.index("FORMAT")] mq, mq0 = vcfUtils.parse_info(info) if ("#" in line) == 0 and mq >= mapQual and mq0 <= mapQual0 and ( "PASS" in line): #stat_locations = vcfUtils.stat_grabber(linelist geno_father = vcfUtils.allele_coder(linelist[father], depth, altDepth, gtQual, pl, platform) geno_mother = vcfUtils.allele_coder(linelist[mother], depth, altDepth, gtQual, pl, platform) geno_offspring = vcfUtils.allele_coder(linelist[offspring], depth, altDepth, gtQual, pl, platform) alleles2 = geno_father + "," + geno_mother + "," + geno_offspring if ((("3" in alleles2) == 0) and (("4" in alleles2) == 0)): counter += 1 #checks for paternity, maternity if geno_father == "0" and geno_mother == "2": inform_parents += 1
non_maternal = 0 print "Checking sample consistency and creating observation array..." for line in fin.readlines(): counter+=1 linelist = line.split("\t") chrom = linelist[headlist.index("CHROM")] posit = linelist[headlist.index("POS")] info = linelist[headlist.index("INFO")] #format = linelist[headlist.index("FORMAT")] mq, mq0 = vcfUtils.parse_info(info) if ("#" in line) == 0 and mq >= mapQual and mq0 <= mapQual0 and ("PASS" in line): #stat_locations = vcfUtils.stat_grabber(linelist geno_father = vcfUtils.allele_coder(linelist[father], depth, altDepth, gtQual, pl, platform) geno_mother = vcfUtils.allele_coder(linelist[mother], depth, altDepth, gtQual, pl, platform) geno_offspring = vcfUtils.allele_coder(linelist[offspring], depth, altDepth, gtQual, pl, platform) alleles2 = geno_father+","+geno_mother+","+geno_offspring if ((("3" in alleles2) == 0) and (("4" in alleles2) == 0)): counter+=1 #checks for paternity, maternity if geno_father == "0" and geno_mother == "2": inform_parents += 1 if geno_offspring == "2": non_paternal += 1 if geno_offspring == "0": non_maternal += 1
def phase_quartet(fatherID, motherID, offspring1ID, offspring2ID, vcf_in_stem, depth, altDepth, gtQual, mapQual, mapQual0, pl, mat_path, error, compression, mie, selfprob, platform): nonselfprob = str((1-float(selfprob))/float(5)) #create matrices for HMM, all probabilities in log10 space to avoid underflow from floating point transmat, emitmat, startp, states, inmap = hmmUtils.make_hmm(mat_path, selfprob, compression, mie, error, "q") #temp file for inheritance state phase_out = open(".".join([vcf_in_stem, fatherID, motherID, offspring1ID, offspring2ID, "phase_out.txt"]), "w") #chrom_arr = ["21", "22"] chrom_arr = ["1","2","3","4","5","6","7","8","9","10","11","12","13","14","15","16","17","18","19","20","21","22","X"] start_time=time.clock() for chromh in chrom_arr: print "\nWorking on chromosome "+str(chromh)+"\n" fin = open(vcf_in_stem+"_chr"+chromh+".txt", "r") header = fin.readline() #grab proper columns from header for variants headlist = header.replace("\n", "").split("\t") try: offspring1 = headlist.index(offspring1ID) offspring2 = headlist.index(offspring2ID) father = headlist.index(fatherID) mother = headlist.index(motherID) except ValueError: return "Error in pedigreeUtils.phase_quartet: Invalid input for quartet "+":".join([fatherID, motherID, offspring1ID, offspring2ID])+" - Specified subject ID does not match vcf file header, consider malformed header or pedigree file" #arrays for observations and positions obs_seq = [] position_clean = [] position_all = [] res_dict = {} #counters for error modes (DN = de novo allele, HZGC = apparent gene conversions/hemizygosity counterDN1 = 0 counterHZGC1 = 0 counterDN2 = 0 counterHZGC2 = 0 counter = 0 #counters for number of informative allele assortments for determing paternity/maternity inform_parents = 0 non_paternal1 = 0 non_maternal1 = 0 non_paternal2 = 0 non_maternal2 = 0 for line in fin.readlines(): if ("#" in line) == 0: linelist = line.replace("\n", "").split("\t") chrom = linelist[headlist.index("CHROM")] posit = linelist[headlist.index("POS")] info = linelist[headlist.index("INFO")] position_all.append(posit) #temp variable to flag if line should be used, and also check for unsupported data format (currently supports CG and Illumina GATK-style) goline = 0 if ((platform == "ILLUMINA") or (platform == "ILL")): try: mq, mq0 = vcfUtils.parse_info(info) if mq >= mapQual and mq0 <= mapQual0 and ("PASS" in line): goline = 1 except UnboundLocalError: return "Error in pedigreeUtils.phase_quartet for quartet "+":".join([fatherID, motherID, offspring1ID, offspring2ID])+" - Invalid input format: Data does not appear to be in GATK style from Illumina platform" elif (platform == "COMPLETE") or (platform == "CG"): goline = 1 elif (platform == "RTG") or (platform == "rtg"): goline = 1 elif (platform != "COMPLETE") and (platform != "CG") and (platform != "ILLUMINA") and (platform != "ILL"): return "Error in pedigreeUtils.phase_quartet for quartet "+":".join([fatherID, motherID, offspring1ID, offspring2ID])+": Invalid input - unknown or unsupported data format: "+platform if goline == 1: geno_father = vcfUtils.allele_coder(linelist[father], depth, altDepth, gtQual, pl, platform) geno_mother = vcfUtils.allele_coder(linelist[mother], depth, altDepth, gtQual, pl, platform) geno_offspring1 = vcfUtils.allele_coder(linelist[offspring1], depth, altDepth, gtQual, pl, platform) geno_offspring2 = vcfUtils.allele_coder(linelist[offspring2], depth, altDepth, gtQual, pl, platform) alleles2 = geno_father+","+geno_mother+","+geno_offspring1+","+geno_offspring2 if ((("3" in alleles2) == 0) and (("4" in alleles2) == 0)): counter+=1 #checks for paternity, maternity if geno_father == "0" and geno_mother == "2": inform_parents += 1 if geno_offspring1 == "2": non_paternal1 += 1 if geno_offspring1 == "0": non_maternal1 += 1 if geno_offspring2 == "2": non_paternal2 += 1 if geno_offspring2 == "0": non_maternal2 += 1 elif geno_father == "2" and geno_mother == "0": inform_parents += 1 if geno_offspring1 == "0": non_paternal1 += 1 if geno_offspring1 == "2": non_maternal1 += 1 if geno_offspring2 == "0": non_paternal2 += 1 if geno_offspring2 == "2": non_maternal2 += 1 #simple de novo case if alleles2 == "0,0,1,1": counterDN1+=1 counterDN2+=1 elif alleles2 == "0,0,1,0": counterDN1+=1 elif alleles2 == "0,0,0,1": counterDN2+=1 #hemizygosity or gene conversion case elif (geno_father == "0" and geno_mother == "0" and geno_offspring1 == "2") or (geno_father == "1" and geno_mother == "0" and geno_offspring1 == "2") or (geno_father == "0" and geno_mother == "1" and geno_offspring1 == "2") or (geno_father == "1" and geno_mother == "2" and geno_offspring1 == "0") or (geno_father == "2" and geno_mother == "1" and geno_offspring1 == "0"): counterHZGC1+=1 elif (geno_father == "0" and geno_mother == "0" and geno_offspring2 == "2") or (geno_father == "1" and geno_mother == "0" and geno_offspring2 == "2") or (geno_father == "0" and geno_mother == "1" and geno_offspring2 == "2") or (geno_father == "1" and geno_mother == "2" and geno_offspring2 == "0") or (geno_father == "2" and geno_mother == "1" and geno_offspring2 == "0"): counterHZGC2+=1 #creates an array of clean observations for HMM if inmap.has_key(alleles2): obs_seq.append(int(inmap[alleles2])) position_clean.append(posit) fin.close() cent_np1 = float(non_paternal1)/float(inform_parents) cent_nm1 = float(non_maternal1)/float(inform_parents) cent_DN1 = float(counterDN1+counterHZGC1)/float(counter) cent_np2 = float(non_paternal2)/float(inform_parents) cent_nm2 = float(non_maternal2)/float(inform_parents) cent_DN2 = float(counterDN2+counterHZGC2)/float(counter) print "\nAllele QC summary for quartet: "+str(":".join([fatherID, motherID, offspring1ID, offspring2ID])+"\n") print "\nNon-transmitted paternal alleles for child 1 (%): "+str(cent_np1*float(100)) print "Non-transmitted maternal alleles for child 1 (%): "+str(cent_nm1*float(100)) print "Novel alleles for child 1 (%): "+str(cent_DN1*float(100)) print "\nNon-transmitted paternal alleles for child 2 (%): "+str(cent_np2*float(100)) print "Non-transmitted maternal alleles for child 2 (%): "+str(cent_nm2*float(100)) print "Novel alleles for child 2 (%): "+str(cent_DN2*float(100)) #warning for high rates of non-transmission, does not kill process if (cent_np1) > 0.05 and (chromh != 'X'): print "\nWARNING! High rate of non-transmission of paternal alleles in child "+offspring1ID+": consider sample mix-up or non-paternity...\n" if (cent_np2) > 0.05 and (chromh != 'X'): print "\nWARNING! High rate of non-transmission of paternal alleles in child "+offspring2ID+": consider sample mix-up or non-paternity...\n" if (cent_nm1) > 0.05 and (chromh != 'X'): print "\nWARNING! High rate of non-transmission of maternal alleles in child "+offspring1ID+": consider sample mix-up or non-maternity...\n" if (cent_nm2) > 0.05 and (chromh != 'X'): print "\nWARNING! High rate of non-transmission of maternal alleles in child "+offspring2ID+": consider sample mix-up or non-maternity...\n" if (cent_DN1) > 0.05 and (chromh != 'X'): print "\nWARNING! High rate of novel alleles in child "+offspring1ID+": consider sample mix-up or non-maternity or non-paternity...\n" if (cent_DN2) > 0.05 and (chromh != 'X'): print "\nWARNING! High rate of novel alleles in child "+offspring2ID+": consider sample mix-up or non-maternity or non-paternity...\n" #run Viterbi algorithm on HMM defined above vit = hmmUtils.viterbi(np.array(obs_seq), states, startp, transmat, emitmat) vit_path = vit[1] #bind viterbi path to position dict for annotation for i in range(0, len(vit_path)): res_dict[position_clean[i]] = vit_path[i] #temp variables to allow bridging alleles not used in HMM firstpath = lastpath = vit_path[0] #create output dict with state for every position, including those not used in HMM for i in position_all: if res_dict.has_key(i): phase_out.write("chr"+chromh+":"+str(i)+"\t"+str(res_dict[i])+"\n") lastpath = res_dict[i] else: phase_out.write("chr"+chromh+":"+str(i)+"\t"+str(lastpath)+"\n") end_time = time.clock() phase_out.close() return "Normal exit status for quartet "+":".join([fatherID, motherID, offspring1ID, offspring2ID])+". Processing time: "+str((float(end_time)-float(start_time))/float(60))+" minutes"
def phase_quartet(fatherID, motherID, offspring1ID, offspring2ID, vcf_in_stem, depth, altDepth, gtQual, mapQual, mapQual0, pl, mat_path, error, compression, mie, selfprob, platform): nonselfprob = str((1 - float(selfprob)) / float(5)) #create matrices for HMM, all probabilities in log10 space to avoid underflow from floating point transmat, emitmat, startp, states, inmap = hmmUtils.make_hmm( mat_path, selfprob, compression, mie, error, "q") #temp file for inheritance state phase_out = open( ".".join([ vcf_in_stem, fatherID, motherID, offspring1ID, offspring2ID, "phase_out.txt" ]), "w") #chrom_arr = ["21", "22"] chrom_arr = [ "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "20", "21", "22", "X" ] start_time = time.clock() for chromh in chrom_arr: print "\nWorking on chromosome " + str(chromh) + "\n" fin = open(vcf_in_stem + "_chr" + chromh + ".txt", "r") header = fin.readline() #grab proper columns from header for variants headlist = header.replace("\n", "").split("\t") try: offspring1 = headlist.index(offspring1ID) offspring2 = headlist.index(offspring2ID) father = headlist.index(fatherID) mother = headlist.index(motherID) except ValueError: return "Error in pedigreeUtils.phase_quartet: Invalid input for quartet " + ":".join( [fatherID, motherID, offspring1ID, offspring2ID] ) + " - Specified subject ID does not match vcf file header, consider malformed header or pedigree file" #arrays for observations and positions obs_seq = [] position_clean = [] position_all = [] res_dict = {} #counters for error modes (DN = de novo allele, HZGC = apparent gene conversions/hemizygosity counterDN1 = 0 counterHZGC1 = 0 counterDN2 = 0 counterHZGC2 = 0 counter = 0 #counters for number of informative allele assortments for determing paternity/maternity inform_parents = 0 non_paternal1 = 0 non_maternal1 = 0 non_paternal2 = 0 non_maternal2 = 0 for line in fin.readlines(): if ("#" in line) == 0: linelist = line.replace("\n", "").split("\t") chrom = linelist[headlist.index("CHROM")] posit = linelist[headlist.index("POS")] info = linelist[headlist.index("INFO")] position_all.append(posit) #temp variable to flag if line should be used, and also check for unsupported data format (currently supports CG and Illumina GATK-style) goline = 0 if ((platform == "ILLUMINA") or (platform == "ILL")): try: mq, mq0 = vcfUtils.parse_info(info) if mq >= mapQual and mq0 <= mapQual0 and ("PASS" in line): goline = 1 except UnboundLocalError: return "Error in pedigreeUtils.phase_quartet for quartet " + ":".join( [fatherID, motherID, offspring1ID, offspring2ID] ) + " - Invalid input format: Data does not appear to be in GATK style from Illumina platform" elif (platform == "COMPLETE") or (platform == "CG"): goline = 1 elif (platform == "RTG") or (platform == "rtg"): goline = 1 elif (platform != "COMPLETE") and (platform != "CG") and ( platform != "ILLUMINA") and (platform != "ILL"): return "Error in pedigreeUtils.phase_quartet for quartet " + ":".join( [fatherID, motherID, offspring1ID, offspring2ID] ) + ": Invalid input - unknown or unsupported data format: " + platform if goline == 1: geno_father = vcfUtils.allele_coder( linelist[father], depth, altDepth, gtQual, pl, platform) geno_mother = vcfUtils.allele_coder( linelist[mother], depth, altDepth, gtQual, pl, platform) geno_offspring1 = vcfUtils.allele_coder( linelist[offspring1], depth, altDepth, gtQual, pl, platform) geno_offspring2 = vcfUtils.allele_coder( linelist[offspring2], depth, altDepth, gtQual, pl, platform) alleles2 = geno_father + "," + geno_mother + "," + geno_offspring1 + "," + geno_offspring2 if ((("3" in alleles2) == 0) and (("4" in alleles2) == 0)): counter += 1 #checks for paternity, maternity if geno_father == "0" and geno_mother == "2": inform_parents += 1 if geno_offspring1 == "2": non_paternal1 += 1 if geno_offspring1 == "0": non_maternal1 += 1 if geno_offspring2 == "2": non_paternal2 += 1 if geno_offspring2 == "0": non_maternal2 += 1 elif geno_father == "2" and geno_mother == "0": inform_parents += 1 if geno_offspring1 == "0": non_paternal1 += 1 if geno_offspring1 == "2": non_maternal1 += 1 if geno_offspring2 == "0": non_paternal2 += 1 if geno_offspring2 == "2": non_maternal2 += 1 #simple de novo case if alleles2 == "0,0,1,1": counterDN1 += 1 counterDN2 += 1 elif alleles2 == "0,0,1,0": counterDN1 += 1 elif alleles2 == "0,0,0,1": counterDN2 += 1 #hemizygosity or gene conversion case elif (geno_father == "0" and geno_mother == "0" and geno_offspring1 == "2") or ( geno_father == "1" and geno_mother == "0" and geno_offspring1 == "2") or ( geno_father == "0" and geno_mother == "1" and geno_offspring1 == "2") or ( geno_father == "1" and geno_mother == "2" and geno_offspring1 == "0") or ( geno_father == "2" and geno_mother == "1" and geno_offspring1 == "0"): counterHZGC1 += 1 elif (geno_father == "0" and geno_mother == "0" and geno_offspring2 == "2") or ( geno_father == "1" and geno_mother == "0" and geno_offspring2 == "2") or ( geno_father == "0" and geno_mother == "1" and geno_offspring2 == "2") or ( geno_father == "1" and geno_mother == "2" and geno_offspring2 == "0") or ( geno_father == "2" and geno_mother == "1" and geno_offspring2 == "0"): counterHZGC2 += 1 #creates an array of clean observations for HMM if inmap.has_key(alleles2): obs_seq.append(int(inmap[alleles2])) position_clean.append(posit) fin.close() cent_np1 = float(non_paternal1) / float(inform_parents) cent_nm1 = float(non_maternal1) / float(inform_parents) cent_DN1 = float(counterDN1 + counterHZGC1) / float(counter) cent_np2 = float(non_paternal2) / float(inform_parents) cent_nm2 = float(non_maternal2) / float(inform_parents) cent_DN2 = float(counterDN2 + counterHZGC2) / float(counter) print "\nAllele QC summary for quartet: " + str( ":".join([fatherID, motherID, offspring1ID, offspring2ID]) + "\n") print "\nNon-transmitted paternal alleles for child 1 (%): " + str( cent_np1 * float(100)) print "Non-transmitted maternal alleles for child 1 (%): " + str( cent_nm1 * float(100)) print "Novel alleles for child 1 (%): " + str(cent_DN1 * float(100)) print "\nNon-transmitted paternal alleles for child 2 (%): " + str( cent_np2 * float(100)) print "Non-transmitted maternal alleles for child 2 (%): " + str( cent_nm2 * float(100)) print "Novel alleles for child 2 (%): " + str(cent_DN2 * float(100)) #warning for high rates of non-transmission, does not kill process if (cent_np1) > 0.05 and (chromh != 'X'): print "\nWARNING! High rate of non-transmission of paternal alleles in child " + offspring1ID + ": consider sample mix-up or non-paternity...\n" if (cent_np2) > 0.05 and (chromh != 'X'): print "\nWARNING! High rate of non-transmission of paternal alleles in child " + offspring2ID + ": consider sample mix-up or non-paternity...\n" if (cent_nm1) > 0.05 and (chromh != 'X'): print "\nWARNING! High rate of non-transmission of maternal alleles in child " + offspring1ID + ": consider sample mix-up or non-maternity...\n" if (cent_nm2) > 0.05 and (chromh != 'X'): print "\nWARNING! High rate of non-transmission of maternal alleles in child " + offspring2ID + ": consider sample mix-up or non-maternity...\n" if (cent_DN1) > 0.05 and (chromh != 'X'): print "\nWARNING! High rate of novel alleles in child " + offspring1ID + ": consider sample mix-up or non-maternity or non-paternity...\n" if (cent_DN2) > 0.05 and (chromh != 'X'): print "\nWARNING! High rate of novel alleles in child " + offspring2ID + ": consider sample mix-up or non-maternity or non-paternity...\n" #run Viterbi algorithm on HMM defined above vit = hmmUtils.viterbi(np.array(obs_seq), states, startp, transmat, emitmat) vit_path = vit[1] #bind viterbi path to position dict for annotation for i in range(0, len(vit_path)): res_dict[position_clean[i]] = vit_path[i] #temp variables to allow bridging alleles not used in HMM firstpath = lastpath = vit_path[0] #create output dict with state for every position, including those not used in HMM for i in position_all: if res_dict.has_key(i): phase_out.write("chr" + chromh + ":" + str(i) + "\t" + str(res_dict[i]) + "\n") lastpath = res_dict[i] else: phase_out.write("chr" + chromh + ":" + str(i) + "\t" + str(lastpath) + "\n") end_time = time.clock() phase_out.close() return "Normal exit status for quartet " + ":".join([ fatherID, motherID, offspring1ID, offspring2ID ]) + ". Processing time: " + str( (float(end_time) - float(start_time)) / float(60)) + " minutes"