#!/software/bin/python

## T. Carstensen (tc9), M.S. Sandhu (ms23), D. Gurdasani (dg11)
## Wellcome Trust Sanger Institute, 2012

import os, math
import sys
sys.path.append('/nfs/users/nfs_t/tc9/github/ms23/GATK_pipeline')
import GATK_pipeline
sys.path.append('/nfs/users/nfs_t/tc9/github/tc9/math')
import statistics

instance_statistics = statistics.tests()


def main():

    '''Driver: transpose one hard-coded dataset, then run the IMPUTE2 and
    BEAGLE tped steps on it.

    Chromosome lengths are obtained from a GATK_pipeline.main() instance and
    centromere ranges from parse_centromere_ranges(); both are handed on
    unchanged to IMPUTE2_tped() and BEAGLE_tped(), together with the dataset
    prefix and the mode string 'sep'.
    '''

    pipeline = GATK_pipeline.main()
    chromosome_lengths = pipeline.parse_chromosome_ranges()
    centromere_ranges = parse_centromere_ranges()

    ## prefix of the dataset that every step below operates on
    dataset_prefix = 'omni2.5-8_20120516_gwa_ugand_gtu_autosomes_postsampqc_postsnpqc_flipped'
    ## NOTE(review): 'sep' vs. 'join' presumably selects separate or joined
    ## handling in the tped helpers - confirm against their definitions
    mode = 'sep'

    transpose(dataset_prefix)

    IMPUTE2_tped(
        dataset_prefix, chromosome_lengths, mode, centromere_ranges,
        )
    BEAGLE_tped(
        dataset_prefix, chromosome_lengths, mode, centromere_ranges,
        )

    return
def singlevcf_vs_multiplevcfs(): '''count variant call differences between mpileup and unifiedgenotyper''' fp2 = 'out_GATK/join/ApplyRecalibration.recalibrated.filtered.vcf' fp1 = 'out_mp15/vqsr/$CHROMOSOME.vqsr.filt.vcf' fd2 = open(fp2,'r') ## ## parse first line of second file ## for line2 in fd2: l2, CHROM2, POS2, REF2, ALT2, FILTER2, bool_continue = parse_line_vcf(line2) if bool_continue == True: continue break ## count1 = 0 ## count2 = 0 ## count_intersect = 0 ## count_unique1 = 0 ## count_unique2 = 0 ## for chromosome in [str(i) for i in range(1,22+1,)]+['X','Y',]: ## print chromosome ## l_pos2 = [POS2] ## for line2 in fd2: ## l2, CHROM2, POS2, REF2, ALT2, FILTER2, bool_continue = parse_line_vcf(line2) ## if bool_continue == True: continue ## if CHROM2 != chromosome: break ## l_pos2 += [POS2] ## l_pos1 = [] ## fd1 = open('out_mp15/vqsr/%s.vqsr.filt.vcf' %(chromosome),'r') ## for line2 in fd1: ## l2, CHROM2, POS2, REF2, ALT2, FILTER2, bool_continue = parse_line_vcf(line2) ## if bool_continue == True: continue ## l_pos1 += [POS2] ## count1 += len(l_pos1) ## count2 += len(l_pos2) ## set1 = set(l_pos1) ## del l_pos1 ## set2 = set(l_pos2) ## del l_pos2 ## count_intersect += len(set1&set2) ## count_unique1 += len(set1-set2) ## count_unique2 += len(set2-set1) ## del set1 ## del set2 ## print count1 ## print count2 ## print count_intersect ## print count_unique1, count1-count_intersect ## print count_unique2, count2-count_intersect ## stop d_QUAL = {} count_intersect = 0 count1 = 0 count2 = 1 l_chromosomes = [str(i) for i in range(1,22+1,)]+['X','Y',] for chromosome in l_chromosomes: d_QUAL[chromosome] = [] fd1 = open('out_mp15/vqsr/%s.vqsr.filt.vcf' %(chromosome),'r') for line1 in fd1: l1, CHROM1, POS1, REF1, ALT1, FILTER1, bool_continue = parse_line_vcf(line1) if bool_continue == True: continue count1 += 1 ## if CHROM1 == '2' or CHROM2 == '2': ## break ## end of vcf2 if CHROM2 == None: continue if CHROM1 != CHROM2: ## loop over lines2 if l_chromosomes.index(CHROM1) 
> l_chromosomes.index(CHROM2): for line2 in fd2: l2, CHROM2, POS2, REF2, ALT2, FILTER2, bool_continue = parse_line_vcf(line2) if bool_continue == True: continue count2 += 1 if CHROM1 != CHROM2: continue ## bool_chromosome_diff = True break ## loop over lines1 else: ## bool_chromosome_diff = True continue if POS1 == POS2: if l1[5] != '999' and l2[5] != '999': d_QUAL[CHROM1] += ['%s %s\n' %(l1[5],l2[5],)] count_intersect += 1 for line2 in fd2: l2, CHROM2, POS2, REF2, ALT2, FILTER2, bool_continue = parse_line_vcf(line2) if bool_continue == True: continue count2 += 1 break continue ## loop over lines1 elif POS2 > POS1: if CHROM1 != CHROM2: print 'b', CHROM1, CHROM2 stop continue ## loop over lines2 ## elif POS1 > POS2: else: for line2 in fd2: l2, CHROM2, POS2, REF2, ALT2, FILTER2, bool_continue = parse_line_vcf(line2) if bool_continue == True: continue count2 += 1 if CHROM1 != CHROM2: print 'c', CHROM1, CHROM2 stop else: ## loop over lines1 if POS2 > POS1: break elif POS1 == POS2: if l1[5] != '999' and l2[5] != '999': d_QUAL[CHROM1] += ['%s %s\n' %(l1[5],l2[5],)] count_intersect += 1 for line2 in fd2: l2, CHROM2, POS2, REF2, ALT2, FILTER2, bool_continue = parse_line_vcf(line2) if bool_continue == True: continue count2 += 1 break break else: continue fd1.close() fd2.close() print count_intersect print count1 print count2 fd = open('QUAL.gnuplotdata','w') for chromosome in d_QUAL.keys(): print chromosome, len(d_QUAL[chromosome]) fd.writelines(d_QUAL[chromosome]) fd.close() gnuplot.scatter_plot_2d( 'QUAL',regression=True, xlabel='QUAL UnifiedGenotyper', ylabel='QUAL mpileup', ) l_QUAL1 = [] l_QUAL2 = [] for chromosome in d_QUAL.keys(): for line in d_QUAL[chromosome]: l = line.split() QUAL1 = float(l[0]) QUAL2 = float(l[1]) l_QUAL1 += [QUAL1] l_QUAL2 += [QUAL2] instance = statistics.tests() r = instance.correlation(l_QUAL1,l_QUAL2,) print r stop return