Exemple #1
0
#!/software/bin/python

## T. Carstensen (tc9), M.S. Sandhu (ms23), D. Gurdasani (dg11)
## Wellcome Trust Sanger Institute, 2012

import os, math
import sys
## extend the module search path so the GATK_pipeline module can be imported
sys.path.append('/nfs/users/nfs_t/tc9/github/ms23/GATK_pipeline')
import GATK_pipeline
## extend the module search path for the statistics module used below
sys.path.append('/nfs/users/nfs_t/tc9/github/tc9/math')
## NOTE(review): this is expected to be the module from the path appended
## above (it must provide tests()). Because the path is appended rather than
## prepended, an interpreter that ships its own "statistics" module
## (Python 3.4+) would import that one instead - confirm the interpreter used.
import statistics
## module-level instance of statistics.tests, created at import time
instance_statistics = statistics.tests()

def main():

    '''Run transpose, IMPUTE2_tped and BEAGLE_tped on the hard-coded
    ped data set.

    Chromosome lengths are obtained from a GATK_pipeline.main() object and
    centromere ranges from parse_centromere_ranges(); both are forwarded,
    together with the ped prefix and the sepjoin value, to the two tped steps.
    '''

    ## hard-coded input prefix and the sepjoin value forwarded to the tped steps
    sepjoin = 'sep'
    ped = 'omni2.5-8_20120516_gwa_ugand_gtu_autosomes_postsampqc_postsnpqc_flipped'

    ## look-up tables needed by both tped steps
    chromosome_lengths = GATK_pipeline.main().parse_chromosome_ranges()
    centromere_ranges = parse_centromere_ranges()

    transpose(ped)

    ## both tped steps take the same positional arguments
    tped_args = (ped, chromosome_lengths, sepjoin, centromere_ranges)
    IMPUTE2_tped(*tped_args)
    BEAGLE_tped(*tped_args)

Exemple #2
0
def singlevcf_vs_multiplevcfs():

    '''Count variant call differences between mpileup and UnifiedGenotyper.

    Reads the per-chromosome VCF files out_mp15/vqsr/<chromosome>.vqsr.filt.vcf
    ("file 1") side by side with the single VCF
    out_GATK/join/ApplyRecalibration.recalibrated.filtered.vcf ("file 2"),
    advancing whichever side is behind, and counts

    - count1: records read from the file 1 set
    - count2: records read from file 2
    - count_intersect: records whose position is equal in both

    For every shared position the pair (l1[5], l2[5]) is collected per
    chromosome, unless either value is '999'. The pairs are written to
    QUAL.gnuplotdata, plotted via gnuplot.scatter_plot_2d, and their
    correlation is computed with statistics.tests().correlation and printed.

    Assumptions (not verifiable from this function - TODO confirm):
    - both inputs are ordered by chromosome (1..22, X, Y) and by position
    - parse_line_vcf returns positions that compare numerically with > and ==
    - index 5 of the parsed line is the VCF QUAL column

    Lines for which parse_line_vcf sets bool_continue are skipped and not
    counted. The function takes no arguments; all paths are hard-coded.
    '''

    fp2 = 'out_GATK/join/ApplyRecalibration.recalibrated.filtered.vcf'
    ## NOTE: fp1 is not referenced again in this function; the per-chromosome
    ## paths are built inline further down
    fp1 = 'out_mp15/vqsr/$CHROMOSOME.vqsr.filt.vcf'

    ## file 2 stays open for the whole function and is consumed incrementally
    ## by several loops below (they all share the same iterator position)
    fd2 = open(fp2,'r')

    ##
    ## parse first line of second file
    ##
    ## (i.e. the first line that parse_line_vcf does not flag with bool_continue;
    ## if there is no such line, CHROM2/POS2 are never assigned)
    for line2 in fd2:
        l2, CHROM2, POS2, REF2, ALT2, FILTER2, bool_continue = parse_line_vcf(line2)
        if bool_continue == True:
            continue
        break

## disabled alternative implementation kept for reference:
## set-based comparison of the positions per chromosome
##    count1 = 0
##    count2 = 0
##    count_intersect = 0
##    count_unique1 = 0
##    count_unique2 = 0
##    for chromosome in [str(i) for i in range(1,22+1,)]+['X','Y',]:
##        print chromosome
##        l_pos2 = [POS2]
##        for line2 in fd2:
##            l2, CHROM2, POS2, REF2, ALT2, FILTER2, bool_continue = parse_line_vcf(line2)
##            if bool_continue == True: continue
##            if CHROM2 != chromosome: break
##            l_pos2 += [POS2]
##        l_pos1 = []
##        fd1 = open('out_mp15/vqsr/%s.vqsr.filt.vcf' %(chromosome),'r')
##        for line2 in fd1:
##            l2, CHROM2, POS2, REF2, ALT2, FILTER2, bool_continue = parse_line_vcf(line2)
##            if bool_continue == True: continue
##            l_pos1 += [POS2]
##        count1 += len(l_pos1)
##        count2 += len(l_pos2)
##        set1 = set(l_pos1)
##        del l_pos1
##        set2 = set(l_pos2)
##        del l_pos2
##        count_intersect += len(set1&set2)
##        count_unique1 += len(set1-set2)
##        count_unique2 += len(set2-set1)
##        del set1
##        del set2
##    print count1
##    print count2
##    print count_intersect
##    print count_unique1, count1-count_intersect
##    print count_unique2, count2-count_intersect
##    stop

    ## chromosome -> list of 'value1 value2\n' lines for shared positions
    d_QUAL = {}
    count_intersect = 0
    count1 = 0
    ## starts at 1, presumably to account for the first record of file 2,
    ## which was read above without incrementing the counter
    count2 = 1
    ## this list also defines the chromosome order used when comparing
    ## CHROM1 and CHROM2 below
    l_chromosomes = [str(i) for i in range(1,22+1,)]+['X','Y',]
    for chromosome in l_chromosomes:
        d_QUAL[chromosome] = []

        fd1 = open('out_mp15/vqsr/%s.vqsr.filt.vcf' %(chromosome),'r')
        for line1 in fd1:
            l1, CHROM1, POS1, REF1, ALT1, FILTER1, bool_continue = parse_line_vcf(line1)
            if bool_continue == True:
                continue

            count1 += 1

##            if CHROM1 == '2' or CHROM2 == '2':
##                break

            ## end of vcf2
            ## NOTE(review): nothing in this function assigns None to CHROM2
            ## when fd2 is exhausted; this only triggers if parse_line_vcf
            ## returns None for CHROM - confirm
            if CHROM2 == None:
                continue

            if CHROM1 != CHROM2:
                ## loop over lines2
                ## (file 2 is on an earlier chromosome: read it forward,
                ## counting every record, until it reaches CHROM1)
                if l_chromosomes.index(CHROM1) > l_chromosomes.index(CHROM2):
                    for line2 in fd2:
                        l2, CHROM2, POS2, REF2, ALT2, FILTER2, bool_continue = parse_line_vcf(line2)
                        if bool_continue == True:
                            continue
                        count2 += 1
                        if CHROM1 != CHROM2:
                            continue
    ##                    bool_chromosome_diff = True
                        break
                ## loop over lines1
                ## (file 1 is on an earlier chromosome: go to its next record)
                else:
    ##                bool_chromosome_diff = True
                    continue

            ## same position in both files: collect the value pair, count the
            ## match and advance file 2 by one record
            if POS1 == POS2:
                ## pairs in which either value is '999' are not collected
                ## (but still counted as intersecting)
                if l1[5] != '999' and l2[5] != '999':
                    d_QUAL[CHROM1] += ['%s %s\n' %(l1[5],l2[5],)]
                count_intersect += 1
                for line2 in fd2:
                    l2, CHROM2, POS2, REF2, ALT2, FILTER2, bool_continue = parse_line_vcf(line2)
                    if bool_continue == True:
                        continue
                    count2 += 1
                    break
                continue

            ## loop over lines1
            ## (file 2 is ahead: go to the next record of file 1)
            elif POS2 > POS1:
                if CHROM1 != CHROM2:
                    print 'b', CHROM1, CHROM2
                    ## NOTE(review): `stop` is not defined in the visible code;
                    ## presumably used to abort with a NameError - confirm
                    stop
                continue

            ## loop over lines2
            ## (file 2 is behind: read it forward until it reaches or passes POS1)
##            elif POS1 > POS2:
            else:
                for line2 in fd2:
                    l2, CHROM2, POS2, REF2, ALT2, FILTER2, bool_continue = parse_line_vcf(line2)
                    if bool_continue == True:
                        continue
                    count2 += 1
                    if CHROM1 != CHROM2:
                        print 'c', CHROM1, CHROM2
                        ## see the note on `stop` above
                        stop
                    else:
                        ## loop over lines1
                        ## (file 2 has passed POS1: resume with file 1)
                        if POS2 > POS1:
                            break
                        ## file 2 caught up exactly: handle as a match, then
                        ## advance file 2 by one more record
                        elif POS1 == POS2:
                            if l1[5] != '999' and l2[5] != '999':
                                d_QUAL[CHROM1] += ['%s %s\n' %(l1[5],l2[5],)]
                            count_intersect += 1
                            for line2 in fd2:
                                l2, CHROM2, POS2, REF2, ALT2, FILTER2, bool_continue = parse_line_vcf(line2)
                                if bool_continue == True:
                                    continue
                                count2 += 1
                                break
                            break
                        ## file 2 is still behind POS1: keep reading
                        else:
                            continue

        fd1.close()

    fd2.close()

    print count_intersect
    print count1
    print count2

    ## write all collected pairs to one data file; also print the number of
    ## pairs per chromosome
    fd = open('QUAL.gnuplotdata','w')
    for chromosome in d_QUAL.keys():
        print chromosome, len(d_QUAL[chromosome])
        fd.writelines(d_QUAL[chromosome])
    fd.close()
    ## NOTE(review): each data line is 'l1[5] l2[5]', where l1 comes from the
    ## out_mp15 files and l2 from the out_GATK file. If scatter_plot_2d plots
    ## the first column on the x axis, the two axis labels look swapped -
    ## confirm against the gnuplot helper.
    gnuplot.scatter_plot_2d(
        'QUAL',regression=True,
        xlabel='QUAL UnifiedGenotyper',
        ylabel='QUAL mpileup',
        )

    ## convert the collected text lines back to two parallel lists of floats
    ## and print their correlation
    l_QUAL1 = []
    l_QUAL2 = []
    for chromosome in d_QUAL.keys():
        for line in d_QUAL[chromosome]:
            l = line.split()
            QUAL1 = float(l[0])
            QUAL2 = float(l[1])
            l_QUAL1 += [QUAL1]
            l_QUAL2 += [QUAL2]
    instance = statistics.tests()
    r = instance.correlation(l_QUAL1,l_QUAL2,)
    print r

    ## see the note on `stop` above; if it raises, the return below is never reached
    stop

    return