Example #1
0
def plot_length_distribution(
    pop,
    coverage,
    d_lengths,
):

    '''Write each INDEL length table to a file and plot it as a histogram.

    pop, coverage -- values interpolated into the data file name
    d_lengths     -- dict; each value is written verbatim to the file
                     'lengths_<pop><coverage>_<key>'

    The name of each data file is handed to gnuplot.histogram2 and the
    dictionary key is used as the plot title. Returns None.
    '''

    for key, data in d_lengths.items():

        fn = 'lengths_%s%s_%s' % (pop, coverage, key)
        ## the context manager closes (and thereby flushes) the file before
        ## it is handed to gnuplot, and also if the write itself fails
        with open(fn, 'w') as fd:
            fd.write(data)

        gnuplot.histogram2(
            fn,
            x_step=1,
            x_max=12,
            xlabel='INDEL length',
            ylabel='INDEL count',
            color='blue',
            title=key,
        )

    return
Example #2
0
def plot_length_distribution(pop,coverage,d_lengths,ts_filter_level,):

    '''Write each INDEL length table to a file and plot it as a histogram.

    pop, coverage   -- values interpolated into the data file name
    d_lengths       -- dict; each value is written verbatim to the file
                       'lengths_<pop><coverage>_<ts_filter_level>_<key>'
    ts_filter_level -- number formatted with '%4.1f' in the file name
                       (values below 10 are therefore padded with a
                       leading space)

    The name of each data file is handed to gnuplot.histogram2 and the
    dictionary key is used as the plot title. Returns None.
    '''

    for key, data in d_lengths.items():

        fn = 'lengths_%s%s_%4.1f_%s' %(pop,coverage,ts_filter_level,key)
        ## the context manager closes (and thereby flushes) the file before
        ## it is handed to gnuplot, and also if the write itself fails
        with open(fn,'w') as fd:
            fd.write(data)

        gnuplot.histogram2(
            fn,
            x_step=1,
            x_max=12,
            xlabel='INDEL length',
            ylabel='INDEL count',
            color='blue',
            title=key,
            )

    return
def frq_discordant():

    '''Plot histograms for discordant and concordant SNPs on each chip.

    For each chip ('quad', 'octo') the SNPs listed in the file
    discordant.SNPs are selected from (fgrep) or excluded from (fgrep -v)
    the SNPQC.frq and SNPQC.lmiss files of the chip. One column of the
    selected lines is written to '<suffix>.<chip>.<prefix>' and that file
    is plotted with gnuplot.histogram2.
    '''

    for chip in ['quad','octo',]:

        ## number of samples = number of lines in the .fam file
        cmd = 'cat pops/Baganda_%s/Baganda_%s.SNPQC.fam | wc -l' %(chip,chip,)
        n_samples = int(os.popen(cmd).read())

        for flag,prefix in [
            ['','discordant',],
            ['-v','concordant',],
            ]:

            ## the axis label is part of each data set description, because
            ## the lmiss column (1-$5, plotted from 0.90 to 1.00) is a call
            ## rate and not a MAF
            for suffix,column,x_min,x_max,xlabel in [
                ['frq','$5',0,0.5,'MAF after sample QC',],
                ['lmiss','1-$5',0.90,1.00,'SNP call rate after sample QC',],
                ]:

                cmd = 'fgrep %s -w -f discordant.SNPs pops/Baganda_%s/Baganda_%s.SNPQC.%s' %(
                    flag,chip,chip,suffix,)
                cmd += " | awk '{print %s}' > %s.%s.%s" %(
                    column,suffix,chip,prefix,)
                execmd(cmd)

                cmd = 'cat %s.%s.%s | wc -l' %(suffix,chip,prefix,)
                n_SNPs = int(os.popen(cmd).read())
                ## presumably the header line matches no SNP ID and so
                ## passes through fgrep -v; it is not counted as a SNP
                if flag == '-v':
                    n_SNPs -= 1

                gnuplot.histogram2(
                    '%s.%s.%s' %(suffix,chip,prefix,),
                    x_step = 0.01,
                    x_min = x_min, x_max = x_max,
                    xlabel=xlabel,
                    title='Baganda %s (n_{samples}=%i, n_{SNPs}=%i)' %(
                        chip,n_samples,n_SNPs,
                        ),
                    )

    return
Example #4
0
def count_and_plot():

    chromosome = '1'
    import time
    t1 = time.time()
    for l_fp_in in [
        ['out_GATK/join/CombineVariants.vcf'],
        ['out_GATK/join/ApplyRecalibration.recalibrated.filtered.vcf'],
        ['SelectVariants_discordance1.vcf'],
        ['SelectVariants_discordance2.vcf'],
        ['%s.vqsr.filt.vcf' %(chromosome) for chromosome in range(1,23)+['X','Y',]],
        ['SelectVariants_concordance.vcf'],
        ]:

        import time
        t1 = time.time()

        ##
        ## prepare scatter lists
        ##
        l_gnuplot_MAF = []
        l_gnuplot_DP = []
        l_gnuplot_CR = []

        ##
        ## prepare contour dic
        ##
        d_contour = {}
        for AF in xrange(100+1):
            d_contour[AF*0.01] = {}
            for DP in xrange(150+1):
                d_contour[AF*0.01][DP*10.] = 0
        d_contour_CR = {}
        for AF in xrange(100+1):
            d_contour_CR[round(AF*0.01,2)] = {}
            for CR in xrange(100+1):
                d_contour_CR[round(AF*0.01,2)][CR] = 0

        for fp_in in l_fp_in:
            print fp_in
            fd = open(fp_in,'r')

            print fp_in
            for line in fd:
                if line[0] == '#':
                    continue
##                if line.count('./.')+line.count('0/0')+line.count('0/1')+line.count('1/1') != 100:
##                    print line
##                    stop
                CHROM, d_INFO, bool_continue = parse_line(line)
                if bool_continue == True:
                    continue

                CR = 100-line.count('./.')

                DP = int(d_INFO['DP'])
                try:
                    AF = float(d_INFO['AF'])
                except:
                    d_INFO['AF']
                    AF = 'N/A'

                if AF < 0.5:
                    MAF = AF
                else:
                    MAF = 1-AF

                ##
                ## append to list
                ##
                l_gnuplot_DP += [DP]
                l_gnuplot_CR += [CR]
                if fp_in != 'mp15_vqsr.vcf':
                    if AF == 'N/A':
                        stop
                    l_gnuplot_MAF += [MAF]
                    if DP < 1500:
                        d_contour[0.01*round(MAF/0.01,0)][10.*round(DP/10.,0)] += 1

                ##
                ## append to dic
                ##
                d_contour_CR[round(MAF,2)][CR] += 1

                if chromosome != CHROM:
                    t2 = time.time()
                    print fp_in, '%-2s' %(chromosome), '%2is' %(int(t2-t1))
                    chromosome = CHROM
                    t1 = t2

##            if CHROM == '2':
##                break
##            if POS[-1] == '0' and POS[-2] == '0' and POS[-3] == '0' and POS[-4] == '0':
##                print '%2s %9s %6s %4s' %(CHROM, POS, AF, DP,), fp_in
##                break
##            if POS[-1] == '0' and POS[-2] == '0' and POS[-3] == '0' and POS[-4] == '0':
##                print '%2s %9s %6s %4s' %(CHROM, POS, AF, DP,), fp_in

        title = fp_in.replace('_','').replace('out_GATK/','').replace('.vcf','')
        suffix = fp_in.replace('out_GATK','').replace('/','').replace('.vcf','')
        
        gnuplot.histogram2(
            'DP_%s' %(suffix),
            l_data=l_gnuplot_DP,
            x_step=10,x_min=0,x_max=1000,tic_step=100,
            xlabel='DP from VCF',
            ylabel='SNP count',
            title= title,
            )
        gnuplot.histogram2(
            'CR_%s' %(suffix),
            l_data=l_gnuplot_CR,
            x_step=1,x_min=0,x_max=100,tic_step=10,
            xlabel='SNP Call Rate',
            ylabel='SNP count',
            title= title,
            )
        if fp_in != 'mp15_vqsr.vcf':
            gnuplot.histogram2(
                'AF_%s' %(suffix),
                l_data=l_gnuplot_MAF,
                x_min=0,x_max=.5,tic_step=0.05,x_step=0.01,
                xlabel='AF from VCF',
                ylabel='SNP count',
                title = title,
                )

            lines = []
            for AF in xrange(50+1):
                for DP in xrange(150+1):
                    lines += ['%s %s %s\n' %(AF*0.01,DP*10.,d_contour[AF*0.01][DP*10.],)]
                lines += ['\n']
            gnuplot.contour_plot(
                'AFvDP_%s' %(suffix),
                lines,
                title = title,
                xlabel = 'AF from VCF',
                ylabel = 'DP from VCF',
                zlabel = 'count',
                )

            lines = []
            for AF in xrange(50+1):
                for CR in xrange(100+1):
                    lines += ['%s %s %s\n' %(AF*0.01,CR,d_contour_CR[round(AF*0.01,2)][CR],)]
                lines += ['\n']
            gnuplot.contour_plot(
                'AFvCR_%s' %(suffix),
                lines,
                title = title,
                xlabel = 'AF from VCF',
                ylabel = 'Call Rate from VCF',
                zlabel = 'count',
                )

##        t2 = time.time()
##        print t2-t1
##        stop

    return
Example #5
0
def count_unique_and_intersect_vqsr(
    fp_vcf_individual,fp_vcf_combined,fp_map,
    suffix,
    bool_combined1 = False,
    bool_combined2 = True,
    bool_ignore_FILTER1 = False,
    ):

    '''Count markers unique to and shared between two VCF sets and a map file.

    this function assumes that markers in the VCFs are sorted

    fp_vcf_individual   -- path of input 1; '$CHROMOSOME' in the path is
                           replaced by a chromosome name
    fp_vcf_combined     -- path of input 2; same placeholder handling
    fp_map              -- path of input 3; column 1 is read as the
                           chromosome and column 4 as the position
    suffix              -- passed on to gnuplot.venn3
    bool_combined1/2    -- if True the input is read with loop_single_vcf,
                           otherwise with loop_multiple_vcf
    bool_ignore_FILTER1 -- passed to loop_multiple_vcf for input 1 only

    The three inputs are read in parallel, one record at a time, and the
    positions are compared to count the markers in each of the seven
    regions of a three set Venn diagram. The counts are printed and passed
    to gnuplot.venn3. An allele frequency is calculated for each map
    record from a hard-coded tped file; the frequencies are plotted with
    gnuplot.histogram2 and written to AF3.txt, AF13.txt, AF23.txt and
    AF123.txt.

    NOTE(review): the bare names stop, stop2 and stop3tmp_wegethereornot
    in the body are not defined in this function; presumably they are
    deliberate NameError halts for unexpected states - confirm.
    NOTE(review): none of the input files opened here is closed.
    '''

    ## set file paths
    fp2_template = fp2 = fp_vcf_combined
    fp1_template = fp1 = fp_vcf_individual
    fp3 = fp_map

    print fp1
    print fp2
    print fp3

    ## set list of chromosomes (genotype array only contains autosomal SNPs)
    ## the order of this list defines the order of chromosomes when records
    ## on different chromosomes are compared below
    l_chromosomes = [str(i) for i in range(1,22+1,)]+['X','Y',]

    ## set initial chromosome
    chromosome1 = l_chromosomes[0]
    chromosome2 = l_chromosomes[0]
##    chromosome1 = chromosome2 = '22'

    fp1 = fp1.replace('$CHROMOSOME',chromosome1,)
    fp2 = fp2.replace('$CHROMOSOME',chromosome2,)

    ## set booleans before loop
    ## bool_readN: read the next record of input N in the next iteration
    ## bool_EOFN: input N is exhausted
    bool_read1 = True
    bool_read2 = True
    bool_read3 = True
    bool_EOF1 = False
    bool_EOF2 = False
    bool_EOF3 = False

    ## set counters before loop
    ## count1-3 count records per input; count_intersectXY counts positions
    ## found in inputs X and Y only and count_intersect123 in all three
    count_intersect12 = 0
    count_intersect13 = 0
    count_intersect23 = 0
    count1 = 0
    count2 = 0
    count3 = 0
    count_intersect123 = 0

    ## allele frequencies of the map records, per Venn region
    l_AF13 = []
    l_AF23 = []
    l_AF3 = []
    l_AF123 = []

    ## open files before loop
    fd1 = open(fp1,'r')
    fd2 = open(fp2,'r')
    fd3 = open(fp3,'r')
    ## hard-coded tped file; one line is read for each line of the map file
    fd3b = open('../omni2.5-8_20120516_gwa_ugand_gtu_autosomes_postsampqc_postsnpqc_flipped.tped','r')
    
####    fd1.seek(1500000000000)
####    s = fd1.readline()
####    s = fd1.readline()
##    fd2.seek(65000000000)
##    s = fd2.readline()
##    s = fd2.readline()
##    fd3.seek(50000000)
##    s = fd3.readline()
##    s = fd3.readline()

    i = 0

    while True:

        i += 1

        ## progress output every 10000 iterations
        if i % 10000 == 0:
            print CHROM1, CHROM2, CHROM3, POS1, POS2, POS3, '|', bool_read1, bool_read2, bool_read3, '|', bool_EOF1, bool_EOF2, bool_EOF3, i/10000
####            if CHROM1 != None and CHROM2 != None and CHROM3 != None:
####                if abs(l_chromosomes.index(CHROM1)-l_chromosomes.index(CHROM2)) > 1:
####                    print CHROM1, CHROM2
####                    stop1
####                if abs(l_chromosomes.index(CHROM2)-l_chromosomes.index(CHROM3)) > 1:
####                    print CHROM2, CHROM3
####                    stop2
##        if bool_read2 == True and count2-(count_intersect12+count_intersect23+count_intersect123) > 0:
##            print
##            print 2, fp2
##            print CHROM2, '***', POS2, '***'
##            print count2-(count_intersect12+count_intersect23+count_intersect123)
##            print count2, count_intersect12, count_intersect23, count_intersect123
##            print fp1, chromosome1
##            print fp2, chromosome2
##            stop2
####        if bool_read1 == True and count1-(count_intersect12+count_intersect23+count_intersect123) > 0:
####            print
####            print 1, fp1
####            print CHROM1, '***', POS1, '***'
####            print count1-(count_intersect12+count_intersect13+count_intersect123)
####            print count1, count_intersect12, count_intersect13, count_intersect123
####            print fp1, chromosome1
####            print fp2, chromosome2
####            stop1

        ## advance input 1 if requested by the previous iteration
        if bool_read1 == True:
            if bool_combined1 == True:
                CHROM1, POS1, count1, bool_EOF1 = loop_single_vcf(fd1,count1,)
            else:
                ## loop_multiple_vcf also returns a file object and a
                ## chromosome, which replace the current ones
                (
                    CHROM1, POS1, count1, fd1, bool_EOF1,
                    chromosome1,
                    ) = loop_multiple_vcf(
                        fd1,count1,fp1_template,chromosome1,l_chromosomes,
                        bool_ignore_FILTER=bool_ignore_FILTER1,
                        )
            bool_read1 = False

        ## advance input 2 if requested by the previous iteration
        if bool_read2 == True:
            if bool_combined2 == True:
                CHROM2, POS2, count2, bool_EOF2 = loop_single_vcf(fd2,count2,)
            else:
                (
                    CHROM2, POS2, count2, fd2, bool_EOF2,
                    chromosome2,
                    ) = loop_multiple_vcf(
                        fd2,count2,fp2_template,chromosome2,l_chromosomes,
                        )
            bool_read2 = False

        ## advance input 3 (map file) if requested by the previous iteration
        if bool_read3 == True:
            line3 = fd3.readline()
            if line3 == '':
                bool_EOF3 = True
                CHROM3 = None
                POS3 = None
            else:
                l3 = line3.split()
                CHROM3 = l3[0]
                POS3 = int(l3[3])
                count3 += 1

                ## tmp AF
                ## fraction of the genotype fields (column 5 onwards) that
                ## equal the first of them, folded to a value <= 0.5
                ## NOTE(review): 184 is hard-coded; presumably the number
                ## of alleles per line of the tped file - confirm
                line3b = fd3b.readline()
                l = line3b.split()[4:]
                AF = l.count(l[0])/184.
                if AF > 0.5:
                    AF = 1-AF
                l_AF3 += [AF]

            bool_read3 = False


        ## stop when all three inputs are exhausted
        if bool_EOF1 == True and bool_EOF2 == True and bool_EOF3 == True:
            break

##        print POS1, POS2, POS3, CHROM1, CHROM2, CHROM3
##        stop

        ## doing nested if statements is the fastest method of comparison
        ## looping over lines simultaneously to avoid reading all markers into memory

        ##
        ## triple intersection
        ##
        ## NOTE(review): only the positions are compared here; the
        ## comparison of the chromosomes is commented out
##        if POS1 == POS2 == POS3 and CHROM1 == CHROM2 == CHROM3:
        if POS1 == POS2 == POS3:
            if POS1 % 1000 == 0:
                print CHROM1, '%6i' %(POS1/1000), '|',
                print '%8i' %(count_intersect12), '%8i' %(count_intersect13), '%8i' %(count_intersect23), '|',
                print '%8i' %(count_intersect123), '|',
                print '%8i' %(count1), '%8i' %(count2), '%8i' %(count3)
##            count_intersection12 += 1
##            count_intersection13 += 1
##            count_intersection23 += 1
            count_intersect123 += 1
            ## AF was calculated when the current map record was read
            l_AF123 += [AF]
            bool_read1 = True
            bool_read2 = True
            bool_read3 = True
        ##
        ## double intersection
        ##
        else:
            ## it is faster to do nesting of logical statements
            ## when comparing long integers
            ## inputs 1 and 2 agree; count the pair only if input 3 is
            ## exhausted or already past this position, else advance input 3
            if POS1 == POS2 != None:
                if bool_EOF3 == True:
                    count_intersect12 += 1
                    bool_read1 = True
                    bool_read2 = True
                elif CHROM1 == CHROM3 and POS1 < POS3:
                    count_intersect12 += 1
                    bool_read1 = True
                    bool_read2 = True
                elif CHROM1 == CHROM3:
                    bool_read3 = True
                else:
                    if l_chromosomes.index(CHROM3) < l_chromosomes.index(CHROM1):
                        bool_read3 = True
                    else:
                        count_intersect12 += 1
                        bool_read1 = True
                        bool_read2 = True
            ## inputs 1 and 3 agree; count the pair only if input 2 is
            ## exhausted or already past this position, else advance input 2
            elif POS1 == POS3 != None:
                if bool_EOF2 == True:
                    count_intersect13 += 1
                    l_AF13 += [AF]
                    bool_read1 = True
                    bool_read3 = True
                elif CHROM1 == CHROM2 and POS1 < POS2:
                    count_intersect13 += 1
                    l_AF13 += [AF]
                    bool_read1 = True
                    bool_read3 = True
                elif CHROM1 == CHROM2:
                    bool_read2 = True
                else:
                    ## inputs 1 and 2 on different chromosomes is not
                    ## handled here; the undefined name halts the run
                    print CHROM1, CHROM2
                    stop2
            ## inputs 2 and 3 agree; count the pair only if input 1 is
            ## exhausted or already past this position, else advance input 1
            elif POS2 == POS3 != None:
                if bool_EOF1 == True:
                    count_intersect23 += 1
                    l_AF23 += [AF]
                    bool_read2 = True
                    bool_read3 = True
                elif CHROM1 == CHROM2 and POS2 < POS1:
                    count_intersect23 += 1
                    l_AF23 += [AF]
                    bool_read2 = True
                    bool_read3 = True
                elif CHROM1 == CHROM2:
                    bool_read1 = True
                else:
                    if l_chromosomes.index(CHROM1) < l_chromosomes.index(CHROM2):
                        bool_read1 = True
                    else:
                        ## NOTE(review): unlike the branches above, AF is
                        ## not appended to l_AF23 here, and the undefined
                        ## name below halts the run when this is reached
                        count_intersect23 += 1
                        bool_read2 = True
                        bool_read3 = True
                        stop3tmp_wegethereornot
            ##
            ## no intersection
            ##
            else:
                ## different chromosomes
                ## advance every input that is not exhausted and that is on
                ## the earliest chromosome according to l_chromosomes
                if (
                    (CHROM1 != CHROM2)
                    or
                    (bool_EOF3 == False and CHROM2 != CHROM3)
                    ):
                    l_indexes= []
                    if bool_EOF1 == False:
                        index1 = l_chromosomes.index(CHROM1)
                        l_indexes += [index1]
                    if bool_EOF2 == False:
                        index2 = l_chromosomes.index(CHROM2)
                        l_indexes += [index2]
                    if bool_EOF3 == False:
                        index3 = l_chromosomes.index(CHROM3)
                        l_indexes += [index3]
                    min_index = min(l_indexes)
                    if bool_EOF1 == False and index1 == min_index:
                        bool_read1 = True
                    if bool_EOF2 == False and index2 == min_index:
                        bool_read2 = True
                    if bool_EOF3 == False and index3 == min_index:
                        bool_read3 = True
                ## same chromosome
                ## advance the input with the smallest position
                else:
                    ## either read 1 or 3
                    if CHROM1 == CHROM2 and POS1 < POS2:
                        if bool_EOF3 == True or POS1 < POS3:
                            bool_read1 = True
                        else:
                            bool_read3 = True
                    elif bool_EOF3 == True:
                        bool_read2 = True
                    ## either read 2 or 3
                    elif CHROM2 == CHROM3:
                        if bool_EOF3 == True or POS2 < POS3:
                            bool_read2 = True
                        else:
                            bool_read3 = True
                    else:
                        print CHROM1, CHROM2, CHROM3, POS1, POS2, POS3
                        stop

    ## totals per input and per intersection
    print count1
    print count2
    print count3
    print count_intersect12
    print count_intersect13
    print count_intersect23
    print count_intersect123
    print
    ## markers unique to each input
    print count1-count_intersect12-count_intersect13-count_intersect123
    print count2-count_intersect12-count_intersect23-count_intersect123
    print count3-count_intersect13-count_intersect23-count_intersect123
    print
    print fp1
    print fp2
    print fp3

    ## mean allele frequency per Venn region
    ## NOTE(review): raises ZeroDivisionError if one of the lists is empty,
    ## before any of the plots and text files below are written
    print 'AF3', sum(l_AF3)/len(l_AF3)
    print 'AF13', sum(l_AF13)/len(l_AF13)
    print 'AF23', sum(l_AF23)/len(l_AF23)
    print 'AF123', sum(l_AF123)/len(l_AF123)

    gnuplot.histogram2(
        'AF3',title='MAF distribution - 2.5M chip array',l_data=l_AF3,
        x_min=0,x_max=0.5,x_step=0.01,tic_step=0.05,xlabel='MAF',ylabel='SNP count',)
    gnuplot.histogram2(
        'AF13',title='MAF distribution - 2.5M chip array and HGI SNPs',l_data=l_AF13,
        x_min=0,x_max=0.5,x_step=0.01,tic_step=0.05,xlabel='MAF',ylabel='SNP count',)
    gnuplot.histogram2(
        'AF23',title='MAF distribution - 2.5M chip array and GATK SNPs',l_data=l_AF23,
        x_min=0,x_max=0.5,x_step=0.01,tic_step=0.05,xlabel='MAF',ylabel='SNP count',)
    gnuplot.histogram2(
        'AF123',title='MAF distribution - 2.5M chip array and HGI and GATK SNPs',l_data=l_AF123,
        x_min=0,x_max=0.5,x_step=0.01,tic_step=0.05,xlabel='MAF',ylabel='SNP count',)

    ## write the allele frequencies to text files, one value per line
    l_AF3 = [str(f) for f in l_AF3]
    l_AF13 = [str(f) for f in l_AF13]
    l_AF23 = [str(f) for f in l_AF23]
    l_AF123 = [str(f) for f in l_AF123]

    fd = open('AF3.txt','w')
    fd.write('\n'.join(l_AF3))
    fd.close()
    fd = open('AF13.txt','w')
    fd.write('\n'.join(l_AF13))
    fd.close()
    fd = open('AF23.txt','w')
    fd.write('\n'.join(l_AF23))
    fd.close()
    fd = open('AF123.txt','w')
    fd.write('\n'.join(l_AF123))
    fd.close()

    ## i1-i3: unique to inputs 1-3; i4-i6: pairwise only; i7: all three
    gnuplot.venn3(
        i1 = count1-count_intersect12-count_intersect13-count_intersect123,
        i2 = count2-count_intersect12-count_intersect23-count_intersect123,
        i3 = count3-count_intersect13-count_intersect23-count_intersect123,
        i4 = count_intersect12,
        i5 = count_intersect13,
        i6 = count_intersect23,
        i7 = count_intersect123,
        text1 = '%s' %(fp1),
        text2 = '%s' %(fp2),
        text3 = '%s' %(fp3),
        suffix = suffix,
        )

    return
Example #6
0
def main():

    '''Run the comparison selected by the last command line argument.

    The arguments '0' to '5' each select one pair of VCF inputs, which is
    compared with the genotype array map file by
    count_unique_and_intersect_vqsr. Any other argument runs
    count_unique_and_intersect_impute2.

    This function used to start with histogram calls on the undefined
    names l_AF3, l_AF13, l_AF23 and l_AF123, followed by a bare `stop`,
    so it always halted with a NameError before the argument was looked
    at. That leftover debugging code has been removed. The call of
    count_and_plot at the end was unreachable behind another `stop` and
    is still not made.
    '''

    ## bsub -J"count$count" -o count$count.out -e count$count.err python ~/github/ms23/analysis/count_passed_variants.py $count
    ## bsub  -M500000 -R'select[mem>500] rusage[mem=500]' -J"count$count" -o count$count.out -e count$count.err python ~/github/ms23/analysis/count_passed_variants.py $count

    ## the map file is the third input of every comparison
    fp3 = '../omni2.5-8_20120516_gwa_ugand_gtu_autosomes_postsampqc_postsnpqc_flipped.map'

    ## argument: (fp1, fp2, suffix, keyword arguments)
    d_comparisons = {
        ## 5) compare mp15 steps
        '5': (
            'out_mp15/beagle/03.merged.vcf',
            'out_mp15/impute2/$CHROMOSOME',
            'mp15_BEAGLE_vs_IMPUTE2',
            {'bool_combined1':True, 'bool_combined2':False,},
            ),
        ## 4) compare mp15 steps
        '4': (
            'out_mp15/vqsr/$CHROMOSOME.vqsr.filt.vcf',
            'out_mp15/beagle/03.merged.vcf',
            'mp15_VQSR_vs_BEAGLE',
            {'bool_combined1':False, 'bool_combined2':True,},
            ),
        ## 3) compare mp15 steps
        '3': (
            'out_mp15/pre-vqsr/$CHROMOSOME.vqsr.vcf',
            'out_mp15/vqsr/$CHROMOSOME.vqsr.filt.vcf',
            'mp15_pre-VQSR_vs_post-VQSR',
            {
                'bool_combined1':False, 'bool_combined2':False,
                'bool_ignore_FILTER1':True,
                },
            ),
        ## 2) compare tc9 steps
        '2': (
            'out_GATK/join/CombineVariants.vcf',
            'out_GATK/join/ApplyRecalibration.recalibrated.filtered.vcf',
            'tc9_pre-VQSR_vs_post-VQSR',
            {'bool_combined1':True, 'bool_combined2':True,},
            ),
        ## 1) compare post-VQSR
        '1': (
            'out_mp15/vqsr/$CHROMOSOME.vqsr.filt.vcf',
            'out_GATK/join/ApplyRecalibration.recalibrated.filtered.vcf',
            'post-VQSR_tc9_vs_mp15',
            {'bool_combined1':False, 'bool_combined2':True,},
            ),
        ## 0) compare pre-VQSR
        '0': (
            'out_mp15/pre-vqsr/$CHROMOSOME.vqsr.vcf',
            'out_GATK/join/CombineVariants.vcf',
            'pre-VQSR_tc9_vs_mp15',
            {
                'bool_combined1':False, 'bool_combined2':True,
                'bool_ignore_FILTER1':True,
                },
            ),
        }

    if sys.argv[-1] in d_comparisons:
        fp1, fp2, suffix, d_kwargs = d_comparisons[sys.argv[-1]]
        count_unique_and_intersect_vqsr(fp1,fp2,fp3,suffix,**d_kwargs)
        return

    ## no known comparison was selected
    count_unique_and_intersect_impute2()

    return