def plot(pdb,):
    '''Draw a mode-vs-correlation scatter plot (with regression) for pdb.

    pdb -- passed unchanged to gnuplot.scatter_plot_2d as the plot prefix

    The y axis is fixed to the range [-1, 1].  Always returns None.
    '''

    import sys

    ## the gnuplot helper module is imported from this directory
    ## (presumably it is not on the default search path);
    ## register the directory only once so that repeated calls
    ## do not keep growing sys.path
    dn_misc = '/home/people/tc/svn/tc_sandbox/misc/'
    if dn_misc not in sys.path:
        sys.path.append(dn_misc)
    import gnuplot

    prefix = pdb
    gnuplot.scatter_plot_2d(
        prefix,regression=True,xlabel='mode',ylabel='correlation',ymin=-1,ymax=1,
        )

    return
## Example no. 2
## 0
def plot(pdb, ):
    '''Draw a mode-vs-correlation scatter plot (with regression) for pdb.

    pdb -- passed unchanged to gnuplot.scatter_plot_2d as the plot prefix

    The y axis is fixed to the range [-1, 1].  Always returns None.
    '''

    import sys

    ## the gnuplot helper module is imported from this directory
    ## (presumably it is not on the default search path);
    ## register the directory only once so that repeated calls
    ## do not keep growing sys.path
    dn_misc = '/home/people/tc/svn/tc_sandbox/misc/'
    if dn_misc not in sys.path:
        sys.path.append(dn_misc)
    import gnuplot

    prefix = pdb
    gnuplot.scatter_plot_2d(
        prefix,
        regression=True,
        xlabel='mode',
        ylabel='correlation',
        ymin=-1,
        ymax=1,
    )

    return
## Example no. 3
## 0
def main():
    '''Compute per-chromosome frequencies with vcftools and plot them.

    Pass 1: for chromosomes 1-22, X and Y a `bsub` job running
    `vcftools --freq` is submitted, unless out_VCFtools/freq<chrom>.frq
    already exists.
    Pass 2: every out_VCFtools/freq<chrom>.frq is read and the last
    column (first two characters dropped) is scatter-plotted against the
    position in Mbp.

    NOTE(review): pass 1 only submits jobs and does not wait for them, so
    pass 2 presumably relies on .frq files from an earlier run - confirm.

    Always returns None.
    '''

    ## list() keeps the concatenation valid when range() is not a list
    l_chromosomes = list(range(1,22+1,))+['X','Y',]

    for chromosome in l_chromosomes:
        fp_in = 'out_GATK/sep/ApplyRecalibration.recalibrated.filtered.%s.vcf' %(chromosome)
        fp_out = 'out_VCFtools/freq%s' %(chromosome)
        ## do not resubmit if the output is already present
        if os.path.isfile('%s.frq' %(fp_out)):
            continue
        s = 'bsub \
        -M4000000 -R\'select[mem>4000] rusage[mem=4000]\' \
        vcftools \
        --vcf %s \
        --freq \
        --out %s \
        ' %(fp_in, fp_out,)
        os.system(s)

    for chromosome in l_chromosomes:
        fp_out = 'out_VCFtools/freq%s.frq' %(chromosome)
        print(fp_out)
        l_pos = []
        l_MAF = []
        ## the context manager closes the handle even if a malformed line
        ## raises; previously one handle per chromosome was left open
        with open(fp_out,'r') as fd:
            ## discard the first line (presumably the column header)
            next(fd, None)
            for line in fd:
                l = line.split()
                ## position in Mbp
                l_pos += [float(l[1])/10**6]
                ## last column without its first two characters
                ## (presumably an "<allele>:" prefix - confirm);
                ## kept as a string, as before
                l_MAF += [l[-1][2:]]
        gnuplot.scatter_plot_2d(
            'MAF%s' %(chromosome),
            l_pos, l_MAF,
            xlabel = 'pos (Mbp)',
            ylabel = 'MAF',
            )

    return
        
    ## NOTE(review): orphaned fragment. The definitions of pdb, l_overlaps,
    ## l_dist_min, l_gnuplot and l_vicinal are not visible in this listing,
    ## and as indented here the loop follows a `return` and cannot execute.
    for i in range(len(l_overlaps)):
        overlap = l_overlaps[i]
        ## rescale so that the smallest overlap maps to 0 and an overlap of 1 stays 1
        overlap_normalized = (overlap-min(l_overlaps))/(1.-min(l_overlaps))
        min_dist = l_dist_min[i]
##        if overlap_normalized < 0.5 and min_dist > 10:
##            print pdb, overlap, overlap_normalized, min_dist
        ## report entries with a low normalized overlap and a large minimum distance
        if overlap_normalized < 0.5 and min_dist > 30:
            print pdb, overlap, overlap_normalized, min_dist
        ## one data row per entry: minimum distance, normalized overlap, pdb
        l_gnuplot += ['%s %s %s\n' %(min_dist,overlap_normalized,pdb,)]

## number of True entries, total number of entries, fraction of True entries
print l_vicinal.count(True), len(l_vicinal), l_vicinal.count(True)/float(len(l_vicinal))

##print minmindist

prefix = 'gnuplot'

## write the collected rows to gnuplot.gnuplotdata before plotting with the same prefix
fd = open('%s.gnuplotdata' %(prefix),'w')
fd.writelines(l_gnuplot)
fd.close()

gnuplot.scatter_plot_2d(
    prefix,
##    bool_regression_linear = True,
    xlabel='minimum distance to catalytic site residue(s)',
    ylabel='overlap between apo and holo eigenvectors',
    bool_remove = False,
    )

## same summary as printed before the plot
print l_vicinal.count(True), len(l_vicinal), l_vicinal.count(True)/float(len(l_vicinal))
def plot(d_mmCIF_main,d_rmsds,):
    '''Relate pairwise RMSDs to differences in crystallographic metadata.

    d_mmCIF_main -- mapping indexed by the first four characters of each
                    key of d_rmsds; every value is handed to
                    core.parse_mmCIF_item
    d_rmsds -- nested mapping; d_rmsds[pdb1][pdb2] is the RMSD of a pair,
               looked up with pdb1 sorted before pdb2

    For every pair the absolute temperature difference, the absolute pH
    difference and the "max" resolution are collected against the RMSD,
    and RMSDs are grouped by space group pair and starting model pair via
    append_to_dictionary.  The function then writes histogram, scatter
    and contour plots (gnuplot + convert) and prints the grouped
    dictionaries and the three correlations.  Always returns None.

    NOTE(review): method, spacegroup and n_mutations_max are read but
    never assigned in this function - presumably module-level globals.
    '''

    l_pdbs = d_rmsds.keys()
    l_pdbs.sort()

    ## "x y" text rows for the 2d scatter plots
    l_temperature = []
    l_ph = []
    l_resolution = []
    ## RMSDs grouped by property pair (filled by append_to_dictionary)
    d_spacegroup = {}
    d_starting_model = {}

    ## [x values, rmsd values] fed to statistics.correlation
    l_correl_T = [[],[],]
    l_correl_pH = [[],[],]
    l_correl_resol_max = [[],[],]

    ## value (or binned value) -> number of pairs
    d_histo_pH = {}
    d_histo_T = {}
    d_histo_resol = {}

    ## visit every unordered pair (pdb1 before pdb2 in sorted order)
    for i1 in range(len(l_pdbs)-1):
        pdb1 = l_pdbs[i1]
        spacegroup1 = core.parse_mmCIF_item(d_mmCIF_main[pdb1[:4]],'_symmetry.space_group_name_H-M',)
        T1 = core.parse_mmCIF_item(d_mmCIF_main[pdb1[:4]],'_diffrn.ambient_temp',)
        pH1 = core.parse_mmCIF_item(d_mmCIF_main[pdb1[:4]],'_exptl_crystal_grow.pH',)
        starting_model1 = core.parse_mmCIF_item(d_mmCIF_main[pdb1[:4]],'_refine.pdbx_starting_model',)
        resolution1 = core.parse_mmCIF_item(d_mmCIF_main[pdb1[:4]],'_refine.ls_d_res_high',)

        for i2 in range(i1+1,len(l_pdbs)):
            pdb2 = l_pdbs[i2]
            spacegroup2 = core.parse_mmCIF_item(d_mmCIF_main[pdb2[:4]],'_symmetry.space_group_name_H-M',)
            T2 = core.parse_mmCIF_item(d_mmCIF_main[pdb2[:4]],'_diffrn.ambient_temp',)
            pH2 = core.parse_mmCIF_item(d_mmCIF_main[pdb2[:4]],'_exptl_crystal_grow.pH',)
            starting_model2 = core.parse_mmCIF_item(d_mmCIF_main[pdb2[:4]],'_refine.pdbx_starting_model',)
            resolution2 = core.parse_mmCIF_item(d_mmCIF_main[pdb2[:4]],'_refine.ls_d_res_high',)

            rmsd = d_rmsds[pdb1][pdb2]
            ## report pairs with an RMSD above 1
            if rmsd > 1:
                print pdb1, pdb2, rmsd

            ## only pairs where both temperatures are truthy contribute
            if T1 and T2:
                T_diff = abs(float(T2)-float(T1))
                l_temperature += ['%s %s\n' %(T_diff,rmsd),]
                l_correl_T[0] += [T_diff]
                l_correl_T[1] += [rmsd]

                ## histogram bin = difference rounded to the nearest multiple of 10
                print T_diff, 10*round(T_diff/10.,0)
                if not 10*round(T_diff/10.,0) in d_histo_T.keys():
                    d_histo_T[10*round(T_diff/10.,0)] = 0
                d_histo_T[10*round(T_diff/10.,0)] += 1

            ## only pairs where both pH values are truthy contribute
            if pH1 and pH2:
                pH_diff = abs(float(pH2)-float(pH1))
                l_ph += ['%s %s\n' %(pH_diff,rmsd),]
                l_correl_pH[0] += [pH_diff]
                l_correl_pH[1] += [rmsd]

                ## histogram keyed on the exact (unbinned) difference
                if not pH_diff in d_histo_pH.keys():
                    d_histo_pH[pH_diff] = 0
                d_histo_pH[pH_diff] += 1

            ## NOTE(review): max() compares whatever parse_mmCIF_item returns;
            ## if those are strings the comparison is lexicographic
            ## (e.g. '10.0' < '2.5', and 'N/A' sorts above digits) - confirm
            resolution_max = max(resolution1,resolution2,)
            l_resolution += ['%s %s\n' %(resolution_max,rmsd),]
            if resolution_max != 'N/A':
                l_correl_resol_max[0] += [float(resolution_max)]
                l_correl_resol_max[1] += [rmsd]

                ## histogram bin = resolution rounded to zero decimals
                if not round(float(resolution_max),0) in d_histo_resol.keys():
                    d_histo_resol[round(float(resolution_max),0)] = 0
                d_histo_resol[round(float(resolution_max),0)] += 1

            d_spacegroup = append_to_dictionary(d_spacegroup,spacegroup1,spacegroup2,rmsd,)
            d_starting_model = append_to_dictionary(d_starting_model,starting_model1,starting_model2,rmsd,)

    r1 = statistics.correlation(l_correl_T[0],l_correl_T[1],)
    r2 = statistics.correlation(l_correl_pH[0],l_correl_pH[1],)
    r3 = statistics.correlation(l_correl_resol_max[0],l_correl_resol_max[1],)

    ##
    ## plot histograms
    ##
    for prefix,d in [
        ['deltapH',d_histo_pH,],
        ['deltaT',d_histo_T,],
        ['maxresolution',d_histo_resol,],
        ]:

        ## data file: one "bin count" row per key, in sorted key order
        l = []
        l_diffs = d.keys()
        l_diffs.sort()
        for diff in l_diffs:
            l += ['%s %s\n' %(diff,d[diff],)]
        fd = open('histo_%s.txt' %(prefix),'w')
        fd.writelines(l)
        fd.close()

        ## gnuplot script; tmp.txt and gnuplot.ps are overwritten on every iteration
        l = [
            'set terminal postscript eps enhanced color "Helvetica"\n',
            'set output "gnuplot.ps"\n',
            'set size 3,3\n',
            'set style data histogram\n',
            'set xtics rotate\n',
            'set xlabel "%s\n' %(prefix),
            'set ylabel "count\n',
            'plot "histo_%s.txt" u 2:xtic(1) t ""\n' %(prefix)
            ]
        fd = open('tmp.txt','w')
        fd.writelines(l)
        fd.close()

        os.system('gnuplot tmp.txt')
        os.system('convert gnuplot.ps histo_%s.png' %(prefix))

    ##
    ## plot rmsd as a function of each property (2d)
    ##
    for prefix,data,xlabel in [
        ['pH',l_ph,'pH diff',],
        ['Temperature',l_temperature,'T diff',],
        ['resolution',l_resolution,'maximum resolution',],
        ]:
        ## method is not assigned in this function (see docstring)
        prefix += method
        fd = open('%s.gnuplotdata' %(prefix),'w')
        fd.writelines(data)
        fd.close()
        gnuplot.scatter_plot_2d(
            prefix,xlabel=xlabel,ylabel='RMSD %s' %(method,),
##            averages=True,
            regression=True,
            )

    ##
    ## plot rmsd as a function of each property (contour)
    ##
    for d,prefix in [
        [d_spacegroup,'spacegroup',],
        [d_starting_model,'startingmodel',],
        ]:

        ## tic position of each key: index + 0.5
        d_tics = {}
        l_tics = d.keys()
        l_tics.sort()
        for i in range(len(l_tics)):
            d_tics[l_tics[i]] = i+.5
        ## z1/z2 track the smallest/largest pair average;
        ## 9 doubles as the value written for missing combinations
        z1 = 9
        z2 = 0

        l_data = []
        for x in range(len(l_tics)):
            k1 = l_tics[x]
            for y in range(len(l_tics)):
                k2 = l_tics[y]
                if not k2 in d[k1].keys():
                    average = 9
                else:
                    l_rmsds = d[k1][k2]
                    average = sum(l_rmsds)/len(l_rmsds)
                    if average < z1:
                        z1 = average
                    if average > z2:
                        z2 = average
                l_data += ['%s %s %s\n' %(x,y,average,)]
            ## one extra point (value 1) past the last y of every row
            l_data += ['%s %s %s\n' %(x,y+1,1,)]
            l_data += ['\n']
        ## one extra row (value 1) past the last x
        ## NOTE(review): x and y are unbound here if d has no keys
        for y in range(len(l_tics)):
            l_data += ['%s %s %s\n' %(x+1,y,1,)]
        l_data += ['%s %s %s\n' %(x+1,y+1,1,)]
        l_data += ['\n']
        gnuplot.contour_plot(
            prefix,l_data,
            title='%s %s' %(prefix,method,),zlabel='RMSD %s' %(method),
            d_xtics = d_tics, d_ytics = d_tics,
            palette = '0 1 0 0, 0.9999 0 0 1, 0.9999 1 1 1, 1 1 1 1',
            z1 = z1, z2 = z2+0.1,
            bool_remove = False,
            )
        ## spacegroup and n_mutations_max are not assigned in this function (see docstring)
        os.system('convert %s.ps %s_spacegroup%s_mutations%s_atoms%s.png' %(prefix,prefix,spacegroup.replace(' ',''),n_mutations_max,method,))
##        os.remove('%s.ps' %(prefix,))

    print d_spacegroup
    print d_starting_model

    print r1
    print r2
    print r3

    return
## Example no. 6
## 0
def singlevcf_vs_multiplevcfs():

    '''count variant call differences between mpileup and unifiedgenotyper

    Walks the single joined VCF (fd2, under out_GATK/) and the
    per-chromosome VCFs (fd1, under out_mp15/) side by side, advancing
    whichever side is behind.  It counts the records read from each side
    (count1, count2) and the positions present in both (count_intersect).
    For shared positions where field index 5 is not '999' on either side,
    the two values are collected per chromosome, written to
    QUAL.gnuplotdata, scatter-plotted and correlated.

    NOTE(review): relies on parse_line_vcf, gnuplot, statistics and the
    name `stop`, none of which are defined in this function.  Unless
    `stop` is defined elsewhere, evaluating it raises NameError and halts
    the run - this includes the unconditional `stop` just before the
    final return.  The walk assumes both inputs are ordered by chromosome
    (in l_chromosomes order) and by position - confirm.
    '''

    fp2 = 'out_GATK/join/ApplyRecalibration.recalibrated.filtered.vcf'
    ## NOTE(review): fp1 is never used below; the per-chromosome paths are rebuilt inside the loop
    fp1 = 'out_mp15/vqsr/$CHROMOSOME.vqsr.filt.vcf'

    fd2 = open(fp2,'r')

    ##
    ## parse first line of second file
    ##
    ## (i.e. read until the first line that parse_line_vcf does not flag for skipping)
    for line2 in fd2:
        l2, CHROM2, POS2, REF2, ALT2, FILTER2, bool_continue = parse_line_vcf(line2)
        if bool_continue == True:
            continue
        break

##    count1 = 0
##    count2 = 0
##    count_intersect = 0
##    count_unique1 = 0
##    count_unique2 = 0
##    for chromosome in [str(i) for i in range(1,22+1,)]+['X','Y',]:
##        print chromosome
##        l_pos2 = [POS2]
##        for line2 in fd2:
##            l2, CHROM2, POS2, REF2, ALT2, FILTER2, bool_continue = parse_line_vcf(line2)
##            if bool_continue == True: continue
##            if CHROM2 != chromosome: break
##            l_pos2 += [POS2]
##        l_pos1 = []
##        fd1 = open('out_mp15/vqsr/%s.vqsr.filt.vcf' %(chromosome),'r')    
##        for line2 in fd1:
##            l2, CHROM2, POS2, REF2, ALT2, FILTER2, bool_continue = parse_line_vcf(line2)
##            if bool_continue == True: continue
##            l_pos1 += [POS2]
##        count1 += len(l_pos1)
##        count2 += len(l_pos2)
##        set1 = set(l_pos1)
##        del l_pos1
##        set2 = set(l_pos2)
##        del l_pos2
##        count_intersect += len(set1&set2)
##        count_unique1 += len(set1-set2)
##        count_unique2 += len(set2-set1)
##        del set1
##        del set2
##    print count1
##    print count2
##    print count_intersect
##    print count_unique1, count1-count_intersect
##    print count_unique2, count2-count_intersect
##    stop

    ## chromosome -> list of "value1 value2" rows for shared positions
    d_QUAL = {}
    count_intersect = 0
    count1 = 0
    ## starts at 1: the first record of file 2 was already consumed above
    count2 = 1
    l_chromosomes = [str(i) for i in range(1,22+1,)]+['X','Y',]
    for chromosome in l_chromosomes:
        d_QUAL[chromosome] = []

        fd1 = open('out_mp15/vqsr/%s.vqsr.filt.vcf' %(chromosome),'r')
        for line1 in fd1:
            l1, CHROM1, POS1, REF1, ALT1, FILTER1, bool_continue = parse_line_vcf(line1)
            if bool_continue == True:
                continue

            count1 += 1

##            if CHROM1 == '2' or CHROM2 == '2':
##                break

            ## end of vcf2
            ## (file 1 records are still counted, but no longer compared)
            if CHROM2 == None:
                continue

            if CHROM1 != CHROM2:
                ## loop over lines2
                ## file 2 is on an earlier chromosome: read it forward
                ## until it reaches the chromosome of file 1
                if l_chromosomes.index(CHROM1) > l_chromosomes.index(CHROM2):
                    for line2 in fd2:
                        l2, CHROM2, POS2, REF2, ALT2, FILTER2, bool_continue = parse_line_vcf(line2)
                        if bool_continue == True:
                            continue
                        count2 += 1
                        if CHROM1 != CHROM2:
                            continue
    ##                    bool_chromosome_diff = True
                        break
                ## loop over lines1
                ## file 1 is on an earlier chromosome: skip this record
                else:
    ##                bool_chromosome_diff = True
                    continue

            ## same position on both sides: record the pair and advance file 2 by one record
            if POS1 == POS2:
                if l1[5] != '999' and l2[5] != '999':
                    d_QUAL[CHROM1] += ['%s %s\n' %(l1[5],l2[5],)]
                count_intersect += 1
                for line2 in fd2:
                    l2, CHROM2, POS2, REF2, ALT2, FILTER2, bool_continue = parse_line_vcf(line2)
                    if bool_continue == True:
                        continue
                    count2 += 1
                    break
                continue

            ## loop over lines1
            ## file 2 is ahead: let file 1 catch up
            elif POS2 > POS1:
                if CHROM1 != CHROM2:
                    print 'b', CHROM1, CHROM2
                    stop
                continue

            ## loop over lines2
##            elif POS1 > POS2:
            ## file 1 is ahead: read file 2 forward until it catches up or passes
            else:
                for line2 in fd2:
                    l2, CHROM2, POS2, REF2, ALT2, FILTER2, bool_continue = parse_line_vcf(line2)
                    if bool_continue == True:
                        continue
                    count2 += 1
                    if CHROM1 != CHROM2:
                        print 'c', CHROM1, CHROM2
                        stop
                    else:
                        ## loop over lines1
                        if POS2 > POS1:
                            break
                        ## caught up exactly: record the pair and advance file 2 once more
                        elif POS1 == POS2:
                            if l1[5] != '999' and l2[5] != '999':
                                d_QUAL[CHROM1] += ['%s %s\n' %(l1[5],l2[5],)]
                            count_intersect += 1
                            for line2 in fd2:
                                l2, CHROM2, POS2, REF2, ALT2, FILTER2, bool_continue = parse_line_vcf(line2)
                                if bool_continue == True:
                                    continue
                                count2 += 1
                                break
                            break
                        else:
                            continue

        fd1.close()

    fd2.close()

    print count_intersect
    print count1
    print count2

    fd = open('QUAL.gnuplotdata','w')
    for chromosome in d_QUAL.keys():
        print chromosome, len(d_QUAL[chromosome])
        fd.writelines(d_QUAL[chromosome])
    fd.close()
    ## NOTE(review): each row is "l1[5] l2[5]", i.e. the out_mp15 value first
    ## and the out_GATK value second; if the first column is plotted on x,
    ## the two axis labels below look swapped - confirm
    gnuplot.scatter_plot_2d(
        'QUAL',regression=True,
        xlabel='QUAL UnifiedGenotyper',
        ylabel='QUAL mpileup',
        )

    ## parse the collected rows back into floats for the correlation
    l_QUAL1 = []
    l_QUAL2 = []
    for chromosome in d_QUAL.keys():
        for line in d_QUAL[chromosome]:
            l = line.split()
            QUAL1 = float(l[0])
            QUAL2 = float(l[1])
            l_QUAL1 += [QUAL1]
            l_QUAL2 += [QUAL2]
    instance = statistics.tests()
    r = instance.correlation(l_QUAL1,l_QUAL2,)
    print r

    stop

    return
        ## NOTE(review): orphaned fragment. The enclosing loop and the
        ## definitions of pdb, overlap, overlap_normalized, min_dist,
        ## l_gnuplot and l_vicinal are not visible in this listing.
        ##            print pdb, overlap, overlap_normalized, min_dist
        ## report entries with a low normalized overlap and a large minimum distance
        if overlap_normalized < 0.5 and min_dist > 30:
            print pdb, overlap, overlap_normalized, min_dist
        ## one data row per entry: minimum distance, normalized overlap, pdb
        l_gnuplot += ['%s %s %s\n' % (
            min_dist,
            overlap_normalized,
            pdb,
        )]

## number of True entries, total number of entries, fraction of True entries
print l_vicinal.count(True), len(l_vicinal), l_vicinal.count(True) / float(
    len(l_vicinal))

##print minmindist

prefix = 'gnuplot'

## write the collected rows to gnuplot.gnuplotdata before plotting with the same prefix
fd = open('%s.gnuplotdata' % (prefix), 'w')
fd.writelines(l_gnuplot)
fd.close()

gnuplot.scatter_plot_2d(
    prefix,
    ##    bool_regression_linear = True,
    xlabel='minimum distance to catalytic site residue(s)',
    ylabel='overlap between apo and holo eigenvectors',
    bool_remove=False,
)

## same summary as printed before the plot
print l_vicinal.count(True), len(l_vicinal), l_vicinal.count(True) / float(
    len(l_vicinal))
## Example no. 8
## 0
def plot_MDS(d_options):
    '''Prepare a per-population scatter plot of MDS columns 4 and 5.

    d_options -- dict; 'bfile' is the input file prefix (its basename
                 locates <basename>.mds in the working directory) and
                 'remove' is None or the path of a sample file that is
                 joined against <bfile>.fam

    Returns None; returns immediately if <basename>.mds does not exist.

    Raises RuntimeError if joining the .mds rows with samples2pops.dic
    changes the number of rows.  (This condition used to halt the run
    through the undefined name `stop`, i.e. a NameError.)

    Side effects: writes <basename>.mds.sorted, samples2pops.dic.sorted,
    <basename>.mds.joined and one <population>.mds file per population
    and, when 'remove' is given, <basename>.fam.joined.  The plot itself
    is handed to gnuplot.scatter_plot_2d with bool_execute and
    bool_remove both False.
    '''

    bfile_in = d_options['bfile']
    bfile_out = os.path.split(bfile_in)[1]

    if not os.path.isfile('%s.mds' %(bfile_out)):
        return

    ## count number of samples
    ## NOTE(review): n_samples is not used further down in this function;
    ## the computation is kept because it creates <basename>.fam.joined
    n_samples = int(os.popen('cat %s.fam | wc -l' %(bfile_in)).read())
    if d_options['remove'] is not None:
        execmd('cat %s | sort -k1,1 > remove.sorted' %(d_options['remove']))
        execmd('cat %s.fam | sort -k1,1 > fam.sorted' %(d_options['bfile']))
        execmd('join fam.sorted remove.sorted > %s.fam.joined' %(bfile_out))
        n_samples -= int(os.popen('cat %s.fam.joined | wc -l' %(bfile_out)).read())
        os.remove('remove.sorted')
        os.remove('fam.sorted')

    ## sort (the first line of the .mds file is dropped by awk NR>1)
    execmd('cat %s.mds | awk \'NR>1\' | sort -k1,1 > %s.mds.sorted' %(
        bfile_out,bfile_out,))
    execmd('sort -k1,1 samples2pops.dic > samples2pops.dic.sorted')
    ## join samples; rows without a match get "Unknown" in column 8
    cmd = 'join -a1 -e "Unknown" -o 1.1,1.2,1.3,1.4,1.5,1.6,1.7,2.2'
    cmd += ' %s.mds.sorted samples2pops.dic.sorted' %(bfile_out)
    cmd += ' > %s.mds.joined' %(bfile_out)
    execmd(cmd)

    ## sanity check: the join must keep exactly one row per .mds data row
    ## (+1 accounts for the dropped first line); count each file only once
    n_joined = int(os.popen('cat %s.mds.joined | wc -l' %(bfile_out)).read())
    n_mds = int(os.popen('cat %s.mds | wc -l' %(bfile_out)).read())
    if n_joined+1 != n_mds:
        print(n_joined+1)
        print(n_mds)
        raise RuntimeError(
            'row count mismatch after join: %s.mds.joined has %i rows (+1), %s.mds has %i' %(
                bfile_out,n_joined,bfile_out,n_mds,))

    ## split the joined table into one <population>.mds file per population
    cmd = 'cat %s.mds.joined' %(bfile_out)
    cmd += " | awk '{"
    cmd += 'print $1,$2,$3,$4,$5,$6,$7 > $8".mds"'
    cmd += "}'"
    execmd(cmd)
    l_pops = os.popen(
        "cat %s.mds.joined | awk '{print $8}' | sort -u" %(bfile_out)
        ).read().strip().split('\n')

    ## define colors for set of populations
    l_colors = [
        [255,0,0,],
        [255,85,0,],
        [255,170,0,],
        [255,255,0,],
        [170,255,0,],
        [85,255,0,],
        [0,255,0,],
        [0,255,85,],
        [0,255,170,],
        [0,255,255,],
        [0,170,255,],
        [0,85,255,],
        [0,0,255,],
        [85,0,255,],
        [170,0,255,],
        [255,0,255,],
        [255,0,170,],
        [255,0,85,],
        [0,0,0,],
        [85,85,85,],
        [170,170,170,],
        ]

    l_pts = [5,7,9,11]

    ## one plot clause per population; colors and point types are reused
    ## cyclically when there are more populations than entries
    l_clauses = []
    for i, pop in enumerate(l_pops):
        ## rrggbb in lowercase hex, same text as the former
        ## "".join(map(chr, rgb)).encode('hex')
        color = '%02x%02x%02x' %tuple(l_colors[i%len(l_colors)])
        pt = l_pts[i%len(l_pts)]
        l_clauses += ['"%s.mds" u 4:5 pt %i ps 2 lc rgb "#%s" t "%s"' %(
            pop,pt,color,pop)]
    line_plot = 'set key out\n' + 'plot ' + ', '.join(l_clauses)

    c1 = 1
    c2 = 2
    gnuplot.scatter_plot_2d(
        '%s.mds' %(bfile_out),
        line_plot = line_plot,
        xlabel = 'C%i' %(c1),
        ylabel = 'C%i' %(c2),
        title='%s' %(bfile_out),
        prefix_out='mds.2D.%s.%i.%i' %(bfile_out,c1,c2),
        bool_execute = False,
        bool_remove = False,
        )

    return
## Example no. 9
## 0
    def main(self,):
        '''Summarise CPU time and peak memory from the log files in stdout/.

        Each file name is split at its dots: the text before the first dot
        is taken as the step and the text between the first and second
        dot (if there is a second dot) as the chromosome; '23' and '24'
        are renamed to 'X' and 'Y'.  From every file the '    CPU time   :'
        and '    Max Memory :' lines are parsed, the largest CPU value and
        the memory value at the same list index are kept, and files with
        less than 60 CPU seconds are ignored.

        Per step the method then prints total CPU hours / mean memory and,
        for steps with more than three chromosome keys, plots the usage
        against chromosome length (Mbp) with gnuplot.scatter_plot_2d.

        NOTE(review): `stop` is not defined in this method; unless it is
        defined elsewhere, evaluating it raises NameError and halts the
        run.  Always returns None otherwise.
        '''

        instance_GATK = GATK_pipeline.main()
        d_chromosome_lengths = instance_GATK.parse_chromosome_ranges()

        l_fn = os.listdir('stdout')

##        for fn in l_fn:
##            print fn
##            if (
##                fn[:len('UnifiedGenotyper')] == 'UnifiedGenotyper'
##                and
##                fn[len('UnifiedGenotyper')] != '.'
##                ):
##                old = os.path.join('stdout',fn)
##                new = os.path.join('stdout',fn.replace('UnifiedGenotyper','UnifiedGenotyper.'))
##                os.rename(old,new)
##            if '99' in fn:
##                fn_new = '.'.join([fn.split('.')[0],fn.split('.')[1],fn.split('.')[3],fn.split('.')[2],])
##                old = os.path.join('stdout',fn)
##                new = os.path.join('stdout',fn.replace(fn_new,''))
##                print old
##                print new
##                stop
##                os.rename(old,new)
##                continue
##            if (
##                fn[:len('UnifiedGenotyper')] == 'UnifiedGenotyper'
##                and
##                fn[-1] != 't'
##                ):
##                old = os.path.join('stdout',fn)
##                fn_new = '.'.join([fn.split('.')[0],fn.split('.')[1],fn.split('.')[3],fn.split('.')[2],])
##                new = os.path.join('stdout',fn_new)
##                os.rename(old,new)
##        stop

        ## resource -> step -> chromosome -> list of values (one per log file)
        d_resources = {'CPU':{},'Memory':{},}
        l_fn.sort()
        for fn in l_fn:
            if os.path.isdir(os.path.join('stdout',fn)):
                continue
##            print fn
            ## step = text before the first dot (fn.index raises ValueError if there is no dot)
            index1 = fn.index('.')
            step = fn[:index1]
            ## chromosome = text between the first and second dot, if any
            if '.' in fn[index1+1:]:
                index2 = index1+fn[index1+1:].index('.')+1
                chromosome = fn[index1+1:index2]
            else:
                chromosome = ''

            if chromosome == '23': chromosome = 'X'
            if chromosome == '24': chromosome = 'Y'

            fd = open('stdout/%s' %(fn),'r')
            lines = fd.readlines()
            fd.close()

            ## it would be faster to do rindex instead of regex,
            ## but in a few cases rubbish was appended to the farm log files
            ## NOTE(review): both patterns are recompiled for every file
            keyword1 = re.compile(r'    Max Memory :')
            keyword2 = re.compile(r'    CPU time   :')
            l_mem = []
            l_cpu = []
            for line in lines:
                result1 = keyword1.search(line)
                result2 = keyword2.search(line)
                if result1 or result2:
                    ## number between the first and second colon, unit text removed
                    v = float(line.split(':')[1].replace('sec.','').replace('MB',''))
                    if result1: l_mem += [v]
                    else: l_cpu += [v]

            ## keep the largest CPU value and the memory value at the same list index
            ## NOTE(review): max() raises ValueError if no CPU line was found
            cpu = max(l_cpu)
            mem = l_mem[l_cpu.index(cpu)]
##            if 'ApplyRecalibration' in fn and cpu > 1:
            if 'VariantRecalibrator' in fn and cpu > 1:
                print '%4i %4i %s' %(int(mem), int(cpu), chromosome), fn

            ## ignore if took less than a minute
            if cpu < 60:
                if os.path.getsize(os.path.join('stdout',fn)) < 2200:
                    print os.path.getsize(os.path.join('stdout',fn)), fn
                    ## the removal below is only reached if `stop` is a defined name
                    stop
                    os.remove(os.path.join('stdout',fn))
                    continue
                continue

            for k_resource, v_resource in [
                ['CPU',cpu,],
                ['Memory',mem,],
                ]:
                if not step in d_resources[k_resource].keys():
                    d_resources[k_resource][step] = {}
                if not chromosome in d_resources[k_resource][step].keys():
                    d_resources[k_resource][step][chromosome] = []
                ## a second log for the same step and chromosome is reported,
                ## except for the two steps listed here
                elif step not in ['UnifiedGenotyper','IMPUTE2',]:
                    print step, chromosome, k_resource, v_resource
                d_resources[k_resource][step][chromosome] += [v_resource]

        ## per-step totals: CPU summed (in hours), memory averaged
        for k_resource in d_resources.keys():
            for step in d_resources[k_resource].keys():
                if 'Downsample' in step: continue
                if 'samtools' in step: continue
                l_y = []
                for chromosome in d_resources[k_resource][step].keys():
                    y = usage = d_resources[k_resource][step][chromosome]
                    if k_resource == 'CPU':
                        y = sum(y)/3600.
                    elif k_resource == 'Memory':
                        y = (sum(y)/len(y))
                    else:
                        print k_resource
                        stop
##                    lines += ['%s %s\n' %(x,y,)]
                    l_y += [y]
                if k_resource == 'CPU':
                    print k_resource, step, sum(l_y), len(l_y)
                else:
                    print k_resource, step, sum(l_y)/len(l_y), len(l_y)

        ## per-step plots of usage against chromosome length
        d_labels = {'Memory':'Mb','CPU':'hours'}
        for k_resource in d_resources.keys():
            for step in d_resources[k_resource].keys():
                l_x = []
                l_y = []
                ## skip steps with three or fewer chromosome keys
                if len(d_resources[k_resource][step].keys()) <= 3:
                    continue
                for chromosome in d_resources[k_resource][step].keys():
                    if chromosome == '':
                        continue
                    if chromosome[0] == '_':
                        continue
                    ## NOTE(review): if the lengths are ints this is an integer division - confirm
                    x = chromosome_length = d_chromosome_lengths[chromosome]/(10**6)
                    y = usage = d_resources[k_resource][step][chromosome]
                    if k_resource == 'CPU':
                        y = sum(y)/3600.
                    elif k_resource == 'Memory':
                        y = (sum(y)/len(y))
                    else:
                        print k_resource
                        stop
##                    lines += ['%s %s\n' %(x,y,)]
                    l_x += [x]
                    l_y += [y]
                ## nothing to plot for this step
                if len(l_x) == 0:
                    print step
                    continue
##                fd = open('gnuplot_%s_%s.data' %(k_resource,step,),'w')
##                fd.writelines(lines)
##                fd.close()
                prefix = '%s_%s' %(k_resource,step,)
                print 'plotting', k_resource, step
                if k_resource == 'CPU':
                    print k_resource, step, sum(l_y), len(l_y)
                else:
                    print k_resource, step, sum(l_y)/len(l_y), len(l_y)
                gnuplot.scatter_plot_2d(
                    prefix,l1=l_x,l2=l_y,
                    ylabel='%s (%s)' %(k_resource,d_labels[k_resource]),
                    xlabel='chromosome length (Mbp)',
                    title=prefix.replace('_',' '),
                    )

        return
## Example no. 10
## 0
    ## NOTE(review): orphaned call. prefix, xmax, ymax, title, l_columns,
    ## l_colors, l_pointtypes and l_pointsizes are defined outside this listing.
    gnuplot.scatter_plot_2d(
        prefix,
        ## axis labels use gnuplot enhanced-text markup ({/Symbol ...}, octal character codes)
        xlabel = 'RMSD_C_{/Symbol a} / @^{/Symbol \ \260}A', ## {\305} is Angstrom if iso encoding
        ylabel = '<{/Symbol Dc_1}> / {/Symbol \260}',
        ## both axes start at 0; the upper limits come from the caller's xmax/ymax
        xmin = 0, ymin = 0,
        xmax = xmax,
        ymax = ymax,

    ##    ylabel = 'heavy atom RMSD',
    ##    ymax = ymax,
    ##    function = 'x',

        key_vert_pos = 'left',
##        title = 'alpha carbon and heavy atom RMSD between %s wt structures' %(protein),
        title = title,

        ## per-series settings are passed as parallel lists
        bool_multiple_columns = True,
    ##    d_columns = d_columns,
        l_columns = l_columns,
        l_colors = l_colors,
        l_pointtypes = l_pointtypes,
        l_pointsizes = l_pointsizes,

        pointsize = 1,

##        bool_title = False,

        bool_remove = False,
        )
def _count_lines(path):
    """Return the number of lines in *path* as reported by `cat path | wc -l`.

    The shell pipeline is kept (instead of reading the file in Python) so
    that a missing file still yields 0 rather than raising. The pipe is
    closed explicitly so the child process is reaped and the descriptor is
    not left open until garbage collection.
    """

    pipe = os.popen('cat %s | wc -l' %(path))
    try:
        return int(pipe.read())
    finally:
        pipe.close()


def MDS():
    """Run the PLINK MDS pipeline for the selected data set and plot C1 vs C2.

    The data set is chosen by the hard-coded `prefix` below. Steps, each run
    through execmd and each skipped when its output file already exists:

    1. build <prefix>.extract (SNPs to keep)
    2. build the .fam sample lists (always re-run)
    3. plink --make-bed       -> <prefix>.bed
    4. plink --indep-pairwise -> <prefix>.prune.in
    5. plink --genome         -> <prefix>.genome
    6. plink --cluster --mds-plot 4 -> <prefix>.mds
    7. split the .mds file into quad/octo subsets and plot them with gnuplot

    Takes no arguments and returns None. Exits the interpreter with status 0
    if <prefix>.mds is still missing after step 6. Relies on the module-level
    names os, sys and execmd.
    """

    import gnuplot

    ## Data set selection. Only one prefix may be active; the alternatives
    ## are kept commented out so they can be switched back on.
##    prefix = 'Baganda29_quad29_octo200'
##    prefix = 'Baganda29_quad29_octo200_excldiscordant'
    prefix = 'Baganda29_quad29_octo200_SNPQCtogether'
##    prefix = 'Ga-Adangbe'
##    prefix = 'Zulu'
##    prefix = 'Ga-Adangbeexcldiscordant'
##    prefix = 'Zuluexcldiscordant'

    bool_exclude_discordant = 'excldiscordant' in prefix

    ## These data sets are read from a single existing bfile;
    ## every other prefix is built by merging the quad and octo data sets.
    bool_merge = prefix not in [
        'Baganda29_quad29_octo200_SNPQCtogether',
        'Ga-Adangbe','Zulu',
        'Ga-Adangbeexcldiscordant','Zuluexcldiscordant',
        ]

    ## bfile is only used when no merge is done
    bfile = None
    if not bool_merge:
        if prefix == 'Baganda29_quad29_octo200_SNPQCtogether':
            bfile = 'pops/Baganda_quad29octo200/Baganda_quad29octo200.SNPQC'
        else:
            ## replace() is a no-op for prefixes without the marker,
            ## so one expression covers both kinds of prefix
            population = prefix.replace('excldiscordant','')
            bfile = 'pops/%s/%s.SNPQC' %(population,population,)

    fn_ld_regions = 'pops/Baganda_octo/Baganda_octo.ldregions.SNPs'

    ##
    ## find common SNPs post QC
    ##
    if not os.path.isfile('%s.extract' %(prefix)):
        if not bool_merge:
            cmd = 'cat %s.bim > %s.extract' %(bfile,prefix)
            execmd(cmd)
        else:
            cmd = "cat pops/Baganda_quad/Baganda_quad.SNPQC.bim | awk '{print $2}' | sort > Baganda_quad.SNPs"
            execmd(cmd)
            cmd = "cat pops/Baganda_octo/Baganda_octo.SNPQC.bim | awk '{print $2}' | sort > Baganda_octo.SNPs"
            execmd(cmd)
            ## comm -12: lines common to both sorted inputs
            cmd = 'comm -12 Baganda_quad.SNPs Baganda_octo.SNPs > %s.extract' %(prefix)
            execmd(cmd)
            os.remove('Baganda_quad.SNPs')
            os.remove('Baganda_octo.SNPs')

    ##
    ## find common samples post QC
    ##
    cmd = 'cat pops/Baganda_quad/Baganda_quad.SNPQC.fam | sort > Baganda_quad.fam'
    execmd(cmd)
    cmd = 'cat pops/Baganda_octo/Baganda_octo.SNPQC.fam | sort > Baganda_octo.fam'
    execmd(cmd)

    ## samples are matched on a 10 character substring of the family ID
    cmd = "cat Baganda_quad.fam | awk '{print substr($1,12,10)}' | sort > Baganda_quad.samples"
    execmd(cmd)
    cmd = "cat Baganda_octo.fam | awk '{print substr($1,12,10)}' | sort > Baganda_octo.samples"
    execmd(cmd)
    cmd = 'comm -12 Baganda_quad.samples Baganda_octo.samples > Baganda29.samples'
    execmd(cmd)
    os.remove('Baganda_quad.samples')
    os.remove('Baganda_octo.samples')
    cmd = 'fgrep -f Baganda29.samples Baganda_quad.fam | sort > Baganda29_quad29_octo0.fam'
    execmd(cmd)
    cmd = 'fgrep -f Baganda29.samples Baganda_octo.fam | sort > Baganda29_quad0_octo29.fam'
    execmd(cmd)
    os.remove('Baganda29.samples')

    ## comm -23: lines only in the first input, i.e. the non-shared samples
    cmd = 'comm -23 Baganda_quad.fam Baganda29_quad29_octo0.fam > Baganda29_quad71_octo0.fam'
    execmd(cmd)
    cmd = 'comm -23 Baganda_octo.fam Baganda29_quad0_octo29.fam > Baganda29_quad0_octo200.fam'
    execmd(cmd)
    cmd = 'cat Baganda29_quad0_octo200.fam Baganda29_quad29_octo0.fam > Baganda29_quad29_octo200.fam'
    execmd(cmd)
    cmd = 'cat Baganda29_quad0_octo200.fam Baganda29_quad29_octo0.fam > Baganda29_quad29_octo200_excldiscordant.fam'
    execmd(cmd)
    if not bool_merge:
        cmd = 'cat %s.fam > %s.fam' %(bfile,prefix,)
        execmd(cmd)

    ##
    ## --bmerge
    ##
    if not os.path.isfile('%s.bed' %(prefix)):
        cmd = 'plink \\\n'
        if not bool_merge:
            cmd += '--bfile %s \\\n' %(bfile)
        else:
            cmd += '--bfile pops/Baganda_quad/Baganda_quad.SNPQC \\\n'
            cmd += '--bmerge \\\n'
            cmd += 'pops/Baganda_octo/Baganda_octo.SNPQC.bed \\\n'
            cmd += 'pops/Baganda_octo/Baganda_octo.SNPQC.bim \\\n'
            cmd += 'pops/Baganda_octo/Baganda_octo.SNPQC.fam \\\n'
        cmd += '--keep %s.fam \\\n' %(prefix)
        cmd += '--extract %s.extract \\\n' %(prefix)
##        cmd += '--exclude 26diff_and_monomorphic.SNPs \\\n' ## tmp
        if bool_exclude_discordant:
            cmd += '--exclude discordant.SNPs \\\n' ## tmp
        cmd += '--make-bed --out %s \\\n' %(prefix)
        execmd(cmd)

    ##
    ## --indep-pairwise
    ##
    if not os.path.isfile('%s.prune.in' %(prefix)):
        cmd = 'plink \\\n'
        cmd += '--bfile %s \\\n' %(prefix)
        cmd += '--out %s \\\n' %(prefix)
        ## settings
        cmd += '--indep-pairwise 50 5 0.2 \\\n'
        cmd += '--maf 0.05 \\\n'
        ## SNP exclusion
        cmd += '--exclude %s \\\n' %(fn_ld_regions)
        execmd(cmd)

    ##
    ## --genome
    ##
    if not os.path.isfile('%s.genome' %(prefix)):
        cmd = 'plink \\\n'
        cmd += '--bfile %s \\\n' %(prefix)
        cmd += '--out %s \\\n' %(prefix)
        cmd += '--genome \\\n'
        ## SNP exclusion
        cmd += '--extract %s.prune.in \\\n' %(prefix)
        cmd += '--exclude %s \\\n' %(fn_ld_regions)
        execmd(cmd)

    ##
    ## --cluster
    ##
    if not os.path.isfile('%s.mds' %(prefix)):
        cmd = 'plink \\\n'
        cmd += '--bfile %s \\\n' %(prefix)
        cmd += '--out %s \\\n' %(prefix)
        cmd += '--cluster \\\n'
        cmd += '--mds-plot 4 \\\n'
        cmd += '--read-genome %s.genome \\\n' %(prefix)
        ## SNP exclusion
        cmd += '--extract %s.prune.in \\\n' %(prefix)
        cmd += '--exclude %s \\\n' %(fn_ld_regions)
        execmd(cmd)

##    ##
##    ## EIGENSOFT
##    ##
##    eigensoft(prefix,fn_ld_regions,)

    ##
    ## plot
    ##
    ## nothing to plot if the cluster step did not produce its output
    ## (presumably because execmd submits jobs asynchronously - TODO confirm)
    if not os.path.isfile('%s.mds' %(prefix)):
        sys.exit(0)

    if bool_merge:
        cmd = "cat Baganda29_quad29_octo0.fam | awk '{print $1}' > Baganda29_quad29.samples"
        execmd(cmd)
        cmd = "cat Baganda29_quad0_octo29.fam | awk '{print $1}' > Baganda29_octo29.samples"
        execmd(cmd)
        for suffix in ['quad','octo',]:
            ## shared samples of this chip: short sample ID, C1, C2
            cmd = 'fgrep -f Baganda29_%s29.samples %s.mds' %(suffix,prefix,)
            cmd += " | awk '{print substr($1,12,10),$4,$5}'"
            cmd += ' | sort -k1,1'
            cmd += ' > %s_%s29.mds' %(prefix,suffix)
            execmd(cmd)
            if suffix == 'quad':
                continue
            ## octo only: all samples of the chip
            ## NOTE(review): this reads Baganda_octo.fam, whereas the sample
            ## lists above are built from Baganda_octo.SNPQC.fam - confirm intended
            cmd = "cat pops/Baganda_%s/Baganda_%s.fam | awk '{print $1}' > Baganda29_%s.samples" %(
                suffix,suffix,suffix,)
            execmd(cmd)
            cmd = 'fgrep -f Baganda29_%s.samples %s.mds' %(suffix,prefix,)
            cmd += " | awk '{print substr($1,12,10),$4,$5}'"
            cmd += ' | sort -k1,1'
            cmd += ' > %s_%s.mds' %(prefix,suffix)
            execmd(cmd)

##    lines_extra = ['set key out\n']
##    fd = open('%s_quad29.mds' %(prefix))
##    lines4 = fd.readlines()
##    fd.close()
##    fd = open('%s_octo29.mds' %(prefix))
##    lines8 = fd.readlines()
##    fd.close()
##    for i in xrange(len(lines4)):
##        l4 = lines4[i].split()
##        l8 = lines8[i].split()
##        x4 = float(l4[1])
##        y4 = float(l4[2])
##        x8 = float(l8[1])
##        y8 = float(l8[2])
##        lines_extra += ['set arrow from %f,%f to %f,%f\n' %(x4,y4,x8,y8,)]

    ## the .mds file has one header line
    n_samples = _count_lines('%s.mds' %(prefix))-1
    ## SNP count with pruning (the unpruned count would be the
    ## number of lines in <prefix>.bim)
    n_SNPs = _count_lines('%s.prune.in' %(prefix))

    if not bool_merge:
        ## split the single .mds file by chip using the genotyping sample lists
        execmd("cat omni2.5-4_20120904_agv_gtu.fam | awk '{print $2}' > quad.samples")
        execmd("cat omni2.5-8_agv_20120910_gtu.fam | awk '{print $2}' > octo.samples")
        execmd('fgrep -w -f quad.samples %s.mds > %s_quad.mds' %(prefix,prefix,))
        execmd('fgrep -w -f octo.samples %s.mds > %s_octo.mds' %(prefix,prefix,))
        ## columns 4 and 5 of the plink .mds file are plotted
        line_plot = 'plot '
        line_plot += '"%s_quad.mds" u 4:5 ps 2 pt 7 lc 1 t "quad",' %(prefix)
        line_plot += '"%s_octo.mds" u 4:5 ps 2 pt 7 lc rgb "#0000FF" t "octo",' %(prefix)
    else:
        ## the awk step above reduced the files to: sample ID, C1, C2
        line_plot = 'plot '
        line_plot += '"%s_quad29.mds" u 2:3 ps 2 pt 7 lc 1 t "quad",' %(prefix)
    ##    line_plot += '"%s_octo29.mds" u 2:3 ps 3 pt 7 lc 3 t "octo",' %(prefix)
        line_plot += '"%s_octo.mds" u 2:3 ps 2 pt 7 lc rgb "#0000FF" t "octo",' %(prefix)
    ## drop the trailing comma after the last plot element
    line_plot = line_plot[:-1]

    ## NOTE(review): the title always says 'Baganda29', also for the
    ## Ga-Adangbe and Zulu prefixes - confirm intended
    gnuplot.scatter_plot_2d(
        '%s.mds' %(prefix),
        line_plot = line_plot,
##        column1 = 2, column2 = 3,
        xlabel = 'C1',
        ylabel = 'C2',
        title='%s (n_{samples}=%i, n_{SNPs}=%i)' %(
            'Baganda29',n_samples,n_SNPs,
            ),
        prefix_out='%s.mds' %(prefix),
##        lines_extra=lines_extra,
        bool_remove=False,
        )

    return
        if pdb in ['2d4i_b','2d4k_n',]:
            res_no -= 200
        res_symbol1 = mutation[-2]
        res_symbol2 = mutation[-1]
        if pdb in d_mutants.keys():
            mutation = '%1s%i%1s' %(res_symbol2,res_no,res_symbol1,)
            d_ddG[mutation]['backward'] += [ddG]
        else:
            mutation = '%1s%i%1s' %(res_symbol1,res_no,res_symbol2,)
            d_ddG[mutation]['forward'] += [ddG]

## Pair every forward ddG of a mutation with every backward ddG of the same
## mutation. Each data line is "<negated backward ddG> <forward ddG>".
l = [
    '%s %s\n' %(-ddG_backward,ddG_forward,)
    for mutation in d_ddG
    for ddG_forward in d_ddG[mutation]['forward']
    for ddG_backward in d_ddG[mutation]['backward']
    ]

prefix = 'UFFBAPS'
xlabel = 'ddG backward'
ylabel = 'ddG forward'

## The data file name is built from the prefix so the two cannot drift apart
## (presumably gnuplot.scatter_plot_2d reads <prefix>.gnuplotdata - TODO confirm).
## The with block closes the file even if writing fails.
with open('%s.gnuplotdata' %(prefix),'w') as fd:
    fd.writelines(l)

## function = 'x' presumably overlays the line y = x - verify in gnuplot.py
gnuplot.scatter_plot_2d(
    prefix, xlabel=xlabel, ylabel=ylabel,
    bool_multiple_columns = False,
    function = 'x',
    bool_remove = False,
    )