def plot(pdb,):

    '''Draw the mode/correlation scatter plot for one structure.

    pdb is passed unchanged to gnuplot.scatter_plot_2d as the prefix.
    The y axis is fixed to the range -1 to 1 and a regression is requested.
    '''

    import sys

    ## make the sandbox directory importable,
    ## but do not append another copy of it to sys.path on every call
    dn_sandbox = '/home/people/tc/svn/tc_sandbox/misc/'
    if dn_sandbox not in sys.path:
        sys.path.append(dn_sandbox)
    import gnuplot

    gnuplot.scatter_plot_2d(
        pdb,
        regression=True,
        xlabel='mode',ylabel='correlation',
        ymin=-1,ymax=1,
        )

    return
def main():

    '''Submit vcftools --freq jobs and plot MAF along each chromosome.

    For chromosomes 1-22, X and Y a bsub job is submitted when
    out_VCFtools/freq<chromosome>.frq does not exist.
    Every table that does exist is then parsed and passed to
    gnuplot.scatter_plot_2d as position (Mbp) against MAF.
    '''

    l_chromosomes = list(range(1,22+1,))+['X','Y',]

    ##
    ## submit a job for each chromosome without a frequency table
    ##
    for chromosome in l_chromosomes:
        fp_in = 'out_GATK/sep/ApplyRecalibration.recalibrated.filtered.%s.vcf' %(chromosome)
        fp_out = 'out_VCFtools/freq%s' %(chromosome)
        if os.path.isfile('%s.frq' %(fp_out)):
            continue
        s = 'bsub'
        s += " -M4000000 -R'select[mem>4000] rusage[mem=4000]'"
        s += ' vcftools --vcf %s --freq --out %s' %(fp_in, fp_out,)
        os.system(s)

    ##
    ## plot the tables that are available
    ##
    for chromosome in l_chromosomes:
        fp_out = 'out_VCFtools/freq%s.frq' %(chromosome)
        ## bsub returns before the submitted job has run,
        ## so a table requested above need not exist yet
        if not os.path.isfile(fp_out):
            print('%s not found, skipping' %(fp_out))
            continue
        print(fp_out)
        l_pos = []
        l_MAF = []
        ## the context manager closes the table even if parsing fails
        with open(fp_out,'r') as fd:
            ## skip the first line
            next(fd,None)
            for line in fd:
                l = line.split()
                pos = float(l[1])/10**6
                ## last column without its first two characters
                ## NOTE(review): presumably strips an "<allele>:" prefix - confirm
                MAF = l[-1][2:]
                l_pos += [pos]
                l_MAF += [MAF]
        gnuplot.scatter_plot_2d(
            'MAF%s' %(chromosome), l_pos, l_MAF,
            xlabel = 'pos (Mbp)', ylabel = 'MAF',
            )

    return
## normalize each overlap to the interval between the smallest overlap and 1;
## points with a normalized overlap below 0.5 and a minimum distance above 30
## are reported and written to the gnuplot data
## NOTE(review): block structure restored from a whitespace-collapsed line
if l_overlaps:
    ## loop invariant; determined once instead of twice per point
    overlap_min = min(l_overlaps)
for i, overlap in enumerate(l_overlaps):
    overlap_normalized = (overlap-overlap_min)/(1.-overlap_min)
    min_dist = l_dist_min[i]
    if overlap_normalized < 0.5 and min_dist > 30:
        print('%s %s %s %s' %(pdb, overlap, overlap_normalized, min_dist,))
        l_gnuplot += ['%s %s %s\n' %(min_dist,overlap_normalized,pdb,)]

## count, total and fraction of True entries in l_vicinal
n_vicinal = l_vicinal.count(True)
line_vicinal = '%s %s %s' %(
    n_vicinal, len(l_vicinal), n_vicinal/float(len(l_vicinal)),
    )
print(line_vicinal)

prefix = 'gnuplot'
## the context manager closes the data file even if writing fails
with open('%s.gnuplotdata' %(prefix),'w') as fd:
    fd.writelines(l_gnuplot)

gnuplot.scatter_plot_2d(
    prefix,
    xlabel='minimum distance to catalytic site residue(s)',
    ylabel='overlap between apo and holo eigenvectors',
    bool_remove = False,
    )

## repeated after the plot
print(line_vicinal)
def plot(d_mmCIF_main,d_rmsds,):

    '''Relate pairwise RMSDs to differences in crystallographic properties.

    d_mmCIF_main is indexed by the first four characters of each key of
    d_rmsds; d_rmsds[pdb1][pdb2] is read for every pair with pdb1 sorting
    before pdb2.

    Written output: histo_<property>.txt/.png histograms,
    <property><method>.gnuplotdata scatter plots and one contour plot each
    for space group and starting model.
    The names method, spacegroup and n_mutations_max are not parameters;
    they are read from the enclosing scope.

    NOTE(review): block structure restored from whitespace-collapsed lines.
    '''

    l_items = [
        '_symmetry.space_group_name_H-M',
        '_diffrn.ambient_temp',
        '_exptl_crystal_grow.pH',
        '_refine.pdbx_starting_model',
        '_refine.ls_d_res_high',
        ]

    def parse_properties(pdb):
        '''Return space group, temperature, pH, starting model, resolution.'''
        d_mmCIF = d_mmCIF_main[pdb[:4]]
        return [core.parse_mmCIF_item(d_mmCIF,item,) for item in l_items]

    l_pdbs = sorted(d_rmsds.keys())

    l_temperature = []
    l_ph = []
    l_resolution = []
    d_spacegroup = {}
    d_starting_model = {}

    l_correl_T = [[],[],]
    l_correl_pH = [[],[],]
    l_correl_resol_max = [[],[],]

    d_histo_pH = {}
    d_histo_T = {}
    d_histo_resol = {}

    for i1 in range(len(l_pdbs)-1):

        pdb1 = l_pdbs[i1]
        (
            spacegroup1, T1, pH1, starting_model1, resolution1,
            ) = parse_properties(pdb1)

        for pdb2 in l_pdbs[i1+1:]:

            (
                spacegroup2, T2, pH2, starting_model2, resolution2,
                ) = parse_properties(pdb2)

            rmsd = d_rmsds[pdb1][pdb2]

            if rmsd > 1:
                print('%s %s %s' %(pdb1, pdb2, rmsd,))

            if T1 and T2:
                T_diff = abs(float(T2)-float(T1))
                l_temperature += ['%s %s\n' %(T_diff,rmsd),]
                l_correl_T[0] += [T_diff]
                l_correl_T[1] += [rmsd]
                ## histogram bins are 10 wide
                T_bin = 10*round(T_diff/10.,0)
                print('%s %s' %(T_diff, T_bin,))
                d_histo_T[T_bin] = d_histo_T.get(T_bin,0)+1

            if pH1 and pH2:
                pH_diff = abs(float(pH2)-float(pH1))
                l_ph += ['%s %s\n' %(pH_diff,rmsd),]
                l_correl_pH[0] += [pH_diff]
                l_correl_pH[1] += [rmsd]
                d_histo_pH[pH_diff] = d_histo_pH.get(pH_diff,0)+1

            ## NOTE(review): the comparison is done on the values as parsed;
            ## if they are strings, '10.0' sorts before '2.0' - confirm
            resolution_max = max(resolution1,resolution2,)
            l_resolution += ['%s %s\n' %(resolution_max,rmsd),]
            if resolution_max != 'N/A':
                l_correl_resol_max[0] += [float(resolution_max)]
                l_correl_resol_max[1] += [rmsd]
                resol_bin = round(float(resolution_max),0)
                d_histo_resol[resol_bin] = d_histo_resol.get(resol_bin,0)+1

            d_spacegroup = append_to_dictionary(
                d_spacegroup,spacegroup1,spacegroup2,rmsd,)
            d_starting_model = append_to_dictionary(
                d_starting_model,starting_model1,starting_model2,rmsd,)

    r1 = statistics.correlation(l_correl_T[0],l_correl_T[1],)
    r2 = statistics.correlation(l_correl_pH[0],l_correl_pH[1],)
    r3 = statistics.correlation(l_correl_resol_max[0],l_correl_resol_max[1],)

    ##
    ## plot histograms
    ##
    for prefix,d in [
        ['deltapH',d_histo_pH,],
        ['deltaT',d_histo_T,],
        ['maxresolution',d_histo_resol,],
        ]:

        lines_histo = ['%s %s\n' %(diff,d[diff],) for diff in sorted(d.keys())]
        with open('histo_%s.txt' %(prefix),'w') as fd:
            fd.writelines(lines_histo)

        ## gnuplot writes histo.ps, which convert then turns into the png;
        ## previously no output file was set and convert was given a single
        ## file name, so no png could be written
        lines_script = [
            'set terminal postscript eps enhanced color "Helvetica"\n',
            'set output "histo.ps"\n',
            'set size 3,3\n',
            'set style data histogram\n',
            'set xtics rotate\n',
            'set xlabel "%s"\n' %(prefix),
            'set ylabel "count"\n',
            'plot "histo_%s.txt" u 2:xtic(1) t ""\n' %(prefix)
            ]
        with open('tmp.txt','w') as fd:
            fd.writelines(lines_script)

        os.system('gnuplot tmp.txt')
        os.system('convert histo.ps histo_%s.png' %(prefix))

    ##
    ## plot rmsd as a function of each property (2d)
    ##
    for prefix,data,xlabel in [
        ['pH',l_ph,'pH diff',],
        ['Temperature',l_temperature,'T diff',],
        ['resolution',l_resolution,'maximum resolution',],
        ]:
        prefix += method
        with open('%s.gnuplotdata' %(prefix),'w') as fd:
            fd.writelines(data)
        gnuplot.scatter_plot_2d(
            prefix,xlabel=xlabel,ylabel='RMSD %s' %(method,),
            regression=True,
            )

    ##
    ## plot rmsd as a function of each property (contour)
    ##
    for d,prefix in [
        [d_spacegroup,'spacegroup',],
        [d_starting_model,'startingmodel',],
        ]:

        l_tics = sorted(d.keys())
        n_tics = len(l_tics)
        ## nothing to plot; the grid below needs at least one category
        if n_tics == 0:
            continue

        d_tics = {}
        for i in range(n_tics):
            d_tics[l_tics[i]] = i+.5

        z1 = 9
        z2 = 0
        l_data = []
        for x in range(n_tics):
            k1 = l_tics[x]
            for y in range(n_tics):
                k2 = l_tics[y]
                if not k2 in d[k1].keys():
                    ## placeholder for pairs without data
                    average = 9
                else:
                    l_rmsds = d[k1][k2]
                    average = sum(l_rmsds)/len(l_rmsds)
                    if average < z1:
                        z1 = average
                    if average > z2:
                        z2 = average
                l_data += ['%s %s %s\n' %(x,y,average,)]
            ## extra point closing the column
            l_data += ['%s %s %s\n' %(x,n_tics,1,)]
            l_data += ['\n']
        ## extra column closing the grid
        for y in range(n_tics):
            l_data += ['%s %s %s\n' %(n_tics,y,1,)]
        l_data += ['%s %s %s\n' %(n_tics,n_tics,1,)]
        l_data += ['\n']

        gnuplot.contour_plot(
            prefix,l_data,
            title='%s %s' %(prefix,method,),zlabel='RMSD %s' %(method),
            d_xtics = d_tics, d_ytics = d_tics,
            palette = '0 1 0 0, 0.9999 0 0 1, 0.9999 1 1 1, 1 1 1 1',
            z1 = z1, z2 = z2+0.1,
            bool_remove = False,
            )

        ## the format previously had four placeholders for five arguments,
        ## which raises TypeError
        ## NOTE(review): assumes contour_plot leaves <prefix>.ps - confirm
        os.system(
            'convert %s.ps %s_spacegroup%s_mutations%s_atoms%s.png' %(
                prefix,prefix,spacegroup.replace(' ',''),n_mutations_max,method,
                )
            )

    print(d_spacegroup)
    print(d_starting_model)
    print(r1)
    print(r2)
    print(r3)

    return
def singlevcf_vs_multiplevcfs():

    '''count variant call differences between mpileup and unifiedgenotyper

    Walks the joined vcf (fp2) and the per chromosome vcfs
    (out_mp15/vqsr/<chromosome>.vqsr.filt.vcf) in parallel.
    Positions present in both are counted, and their sixth columns are
    written to QUAL.gnuplotdata, unless either of them is '999'.
    The paired values are plotted and their correlation is printed.

    The walk relies on both inputs being ordered by chromosome
    (1-22, X, Y) and position.
    The function ends on the undefined name stop, i.e. it deliberately
    halts with a NameError after printing the correlation.

    NOTE(review): the nesting below was restored from whitespace-collapsed
    lines - confirm against the original layout.
    '''

    fp2 = 'out_GATK/join/ApplyRecalibration.recalibrated.filtered.vcf'
    ## fp1 is not used; the per chromosome paths are built in the loop below
    fp1 = 'out_mp15/vqsr/$CHROMOSOME.vqsr.filt.vcf'

    fd2 = open(fp2,'r')

    ##
    ## parse first line of second file
    ##
    for line2 in fd2:
        l2, CHROM2, POS2, REF2, ALT2, FILTER2, bool_continue = parse_line_vcf(line2)
        if bool_continue == True:
            continue
        break

    ## alternative implementation (disabled): count via sets of positions
##    count1 = 0
##    count2 = 0
##    count_intersect = 0
##    count_unique1 = 0
##    count_unique2 = 0
##    for chromosome in [str(i) for i in range(1,22+1,)]+['X','Y',]:
##        print chromosome
##        l_pos2 = [POS2]
##        for line2 in fd2:
##            l2, CHROM2, POS2, REF2, ALT2, FILTER2, bool_continue = parse_line_vcf(line2)
##            if bool_continue == True: continue
##            if CHROM2 != chromosome: break
##            l_pos2 += [POS2]
##        l_pos1 = []
##        fd1 = open('out_mp15/vqsr/%s.vqsr.filt.vcf' %(chromosome),'r')
##        for line2 in fd1:
##            l2, CHROM2, POS2, REF2, ALT2, FILTER2, bool_continue = parse_line_vcf(line2)
##            if bool_continue == True: continue
##            l_pos1 += [POS2]
##        count1 += len(l_pos1)
##        count2 += len(l_pos2)
##        set1 = set(l_pos1)
##        del l_pos1
##        set2 = set(l_pos2)
##        del l_pos2
##        count_intersect += len(set1&set2)
##        count_unique1 += len(set1-set2)
##        count_unique2 += len(set2-set1)
##        del set1
##        del set2
##    print count1
##    print count2
##    print count_intersect
##    print count_unique1, count1-count_intersect
##    print count_unique2, count2-count_intersect
##    stop

    d_QUAL = {}
    count_intersect = 0
    count1 = 0
    ## starts at 1, because the first record of file 2 was read above
    count2 = 1
    l_chromosomes = [str(i) for i in range(1,22+1,)]+['X','Y',]
    for chromosome in l_chromosomes:
        d_QUAL[chromosome] = []
        fd1 = open('out_mp15/vqsr/%s.vqsr.filt.vcf' %(chromosome),'r')
        for line1 in fd1:
            l1, CHROM1, POS1, REF1, ALT1, FILTER1, bool_continue = parse_line_vcf(line1)
            if bool_continue == True:
                continue
            count1 += 1
##            if CHROM1 == '2' or CHROM2 == '2':
##                break
            ## end of vcf2
            ## NOTE(review): presumably parse_line_vcf yields None here - confirm
            if CHROM2 == None:
                continue
            if CHROM1 != CHROM2:
                ## loop over lines2
                ## (file 2 is behind; read it until it reaches the chromosome of file 1)
                if l_chromosomes.index(CHROM1) > l_chromosomes.index(CHROM2):
                    for line2 in fd2:
                        l2, CHROM2, POS2, REF2, ALT2, FILTER2, bool_continue = parse_line_vcf(line2)
                        if bool_continue == True:
                            continue
                        count2 += 1
                        if CHROM1 != CHROM2:
                            continue
##                        bool_chromosome_diff = True
                        break
                ## loop over lines1
                ## (file 1 is behind; go to its next record)
                else:
##                    bool_chromosome_diff = True
                    continue
            if POS1 == POS2:
                ## shared position; keep the pair unless a value is '999'
                if l1[5] != '999' and l2[5] != '999':
                    d_QUAL[CHROM1] += ['%s %s\n' %(l1[5],l2[5],)]
                count_intersect += 1
                ## advance file 2 by one record
                for line2 in fd2:
                    l2, CHROM2, POS2, REF2, ALT2, FILTER2, bool_continue = parse_line_vcf(line2)
                    if bool_continue == True:
                        continue
                    count2 += 1
                    break
                continue
            ## loop over lines1
            elif POS2 > POS1:
                if CHROM1 != CHROM2:
                    print 'b', CHROM1, CHROM2
                    ## undefined name; halts with a NameError
                    stop
                continue
            ## loop over lines2
##            elif POS1 > POS2:
            else:
                for line2 in fd2:
                    l2, CHROM2, POS2, REF2, ALT2, FILTER2, bool_continue = parse_line_vcf(line2)
                    if bool_continue == True:
                        continue
                    count2 += 1
                    if CHROM1 != CHROM2:
                        print 'c', CHROM1, CHROM2
                        ## undefined name; halts with a NameError
                        stop
                    else:
                        ## loop over lines1
                        if POS2 > POS1:
                            break
                        elif POS1 == POS2:
                            if l1[5] != '999' and l2[5] != '999':
                                d_QUAL[CHROM1] += ['%s %s\n' %(l1[5],l2[5],)]
                            count_intersect += 1
                            ## advance file 2 by one record
                            for line2 in fd2:
                                l2, CHROM2, POS2, REF2, ALT2, FILTER2, bool_continue = parse_line_vcf(line2)
                                if bool_continue == True:
                                    continue
                                count2 += 1
                                break
                            break
                        else:
                            continue
        fd1.close()
    fd2.close()

    print count_intersect
    print count1
    print count2

    fd = open('QUAL.gnuplotdata','w')
    for chromosome in d_QUAL.keys():
        print chromosome, len(d_QUAL[chromosome])
        fd.writelines(d_QUAL[chromosome])
    fd.close()

    ## NOTE(review): the first data column comes from the out_mp15 files and
    ## the second from the out_GATK file - confirm the axis labels are not swapped
    gnuplot.scatter_plot_2d(
        'QUAL',regression=True,
        xlabel='QUAL UnifiedGenotyper',
        ylabel='QUAL mpileup',
        )

    l_QUAL1 = []
    l_QUAL2 = []
    for chromosome in d_QUAL.keys():
        for line in d_QUAL[chromosome]:
            l = line.split()
            QUAL1 = float(l[0])
            QUAL2 = float(l[1])
            l_QUAL1 += [QUAL1]
            l_QUAL2 += [QUAL2]
    instance = statistics.tests()
    r = instance.correlation(l_QUAL1,l_QUAL2,)
    print r
    ## undefined name; halts with a NameError before the return
    stop

    return
## normalize each overlap to the interval between the smallest overlap and 1;
## points with a normalized overlap below 0.5 and a minimum distance above 30
## are reported and written to the gnuplot data
if l_overlaps:
    ## loop invariant; determined once instead of twice per point
    overlap_min = min(l_overlaps)
for i, overlap in enumerate(l_overlaps):
    overlap_normalized = (overlap-overlap_min)/(1.-overlap_min)
    min_dist = l_dist_min[i]
    if overlap_normalized < 0.5 and min_dist > 30:
        print('%s %s %s %s' %(pdb, overlap, overlap_normalized, min_dist,))
        l_gnuplot += ['%s %s %s\n' %(min_dist,overlap_normalized,pdb,)]

## count, total and fraction of True entries in l_vicinal
n_vicinal = l_vicinal.count(True)
line_vicinal = '%s %s %s' %(
    n_vicinal, len(l_vicinal), n_vicinal/float(len(l_vicinal)),
    )
print(line_vicinal)

prefix = 'gnuplot'
## the context manager closes the data file even if writing fails
with open('%s.gnuplotdata' %(prefix),'w') as fd:
    fd.writelines(l_gnuplot)

gnuplot.scatter_plot_2d(
    prefix,
    xlabel='minimum distance to catalytic site residue(s)',
    ylabel='overlap between apo and holo eigenvectors',
    bool_remove = False,
    )

## repeated after the plot
print(line_vicinal)
def plot_MDS(d_options):

    '''Plot columns 4 and 5 of an .mds file, one point style per population.

    d_options['bfile'] is the input file prefix; the .mds file is looked
    up under its last path component in the working directory and nothing
    is done if it does not exist.
    d_options['remove'] is None or a file that is joined against the .fam
    file.
    Samples are joined with samples2pops.dic; samples without an entry
    are assigned to "Unknown". One <population>.mds file is written per
    population.

    Raises RuntimeError if the joined file does not have one line per
    sample of the .mds file.

    NOTE(review): block structure restored from whitespace-collapsed lines.
    '''

    def count_lines(fp):
        '''Return the number of lines of a file as reported by wc.'''
        return int(os.popen('cat %s | wc -l' %(fp)).read())

    bfile_in = d_options['bfile']
    bfile_out = os.path.split(bfile_in)[1]

    if not os.path.isfile('%s.mds' %(bfile_out)):
        return

    ## count number of samples
    ## (the count is not used by the plot; the joined file is kept on disk)
    n_samples = count_lines('%s.fam' %(bfile_in))
    if d_options['remove'] is not None:
        execmd('cat %s | sort -k1,1 > remove.sorted' %(d_options['remove']))
        execmd('cat %s.fam | sort -k1,1 > fam.sorted' %(d_options['bfile']))
        execmd('join fam.sorted remove.sorted > %s.fam.joined' %(bfile_out))
        n_samples -= count_lines('%s.fam.joined' %(bfile_out))
        os.remove('remove.sorted')
        os.remove('fam.sorted')

    ## sort
    execmd('cat %s.mds | awk \'NR>1\' | sort -k1,1 > %s.mds.sorted' %(
        bfile_out,bfile_out,))
    execmd('sort -k1,1 samples2pops.dic > samples2pops.dic.sorted')

    ## join samples
    cmd = 'join -a1 -e "Unknown" -o 1.1,1.2,1.3,1.4,1.5,1.6,1.7,2.2'
    cmd += ' %s.mds.sorted samples2pops.dic.sorted' %(bfile_out)
    cmd += ' > %s.mds.joined' %(bfile_out)
    execmd(cmd)

    ## the .mds file has one header line more than the joined file;
    ## each file is counted once and an explicit error replaces
    ## the undefined name previously used to abort
    n_joined = count_lines('%s.mds.joined' %(bfile_out))
    n_mds = count_lines('%s.mds' %(bfile_out))
    if n_joined+1 != n_mds:
        raise RuntimeError(
            'line counts differ: %s.mds.joined+1=%i, %s.mds=%i' %(
                bfile_out,n_joined+1,bfile_out,n_mds,
                )
            )

    ## write one file per population (column 8)
    cmd = 'cat %s.mds.joined' %(bfile_out)
    cmd += " | awk '{"
    cmd += 'print $1,$2,$3,$4,$5,$6,$7 > $8".mds"'
    cmd += "}'"
    execmd(cmd)

    l_pops = os.popen(
        "cat %s.mds.joined | awk '{print $8}' | sort -u" %(bfile_out)
        ).read().strip().split('\n')

    ## define colors for set of populations
    l_colors = [
        [255,0,0,],
        [255,85,0,],
        [255,170,0,],
        [255,255,0,],
        [170,255,0,],
        [85,255,0,],
        [0,255,0,],
        [0,255,85,],
        [0,255,170,],
        [0,255,255,],
        [0,170,255,],
        [0,85,255,],
        [0,0,255,],
        [85,0,255,],
        [170,0,255,],
        [255,0,255,],
        [255,0,170,],
        [255,0,85,],
        [0,0,0,],
        [85,85,85,],
        [170,170,170,],
        ]
    l_pts = [5,7,9,11]

    ## colors and point types are cycled independently
    l_plots = []
    for i, pop in enumerate(l_pops):
        ## two lower case hex digits per channel
        color = '%02x%02x%02x' %tuple(l_colors[i%len(l_colors)])
        pt = l_pts[i%len(l_pts)]
        l_plots += ['"%s.mds" u 4:5 pt %i ps 2 lc rgb "#%s" t "%s"' %(
            pop,pt,color,pop)]
    line_plot = 'set key out\n'
    line_plot += 'plot '
    line_plot += ', '.join(l_plots)

    c1 = 1
    c2 = 2

    gnuplot.scatter_plot_2d(
        '%s.mds' %(bfile_out),
        line_plot = line_plot,
        xlabel = 'C%i' %(c1),
        ylabel = 'C%i' %(c2),
        title='%s' %(bfile_out),
        prefix_out='mds.2D.%s.%i.%i' %(bfile_out,c1,c2),
        bool_execute = False,
        bool_remove = False,
        )

    return
def main(self,):

    '''Summarize CPU time and memory of the farm logs in stdout/ and plot them.

    Log names are read as <step>.<chromosome>.<...>; 23 and 24 are
    renamed to X and Y. From each log the largest "CPU time" value and
    the "Max Memory" value at the same index are taken.
    Logs with less than 60 CPU seconds are ignored.
    Per resource and step the totals (CPU, hours) or averages (Memory)
    are printed, and usage is plotted against chromosome length for
    steps with more than three chromosomes.

    Raises RuntimeError for a log below 60 CPU seconds that is smaller
    than 2200 bytes.

    NOTE(review): block structure restored from whitespace-collapsed lines.
    '''

    def summarize(k_resource,l_usage):
        '''Return CPU hours in total or the average memory.'''
        if k_resource == 'CPU':
            return sum(l_usage)/3600.
        if k_resource == 'Memory':
            return sum(l_usage)/len(l_usage)
        raise ValueError('unknown resource %s' %(k_resource))

    def report(k_resource,step,l_y):
        '''Print the total (CPU) or the average (Memory) and the count.'''
        if k_resource == 'CPU':
            print('%s %s %s %s' %(k_resource, step, sum(l_y), len(l_y),))
        else:
            print('%s %s %s %s' %(
                k_resource, step, sum(l_y)/len(l_y), len(l_y),))

    instance_GATK = GATK_pipeline.main()
    d_chromosome_lengths = instance_GATK.parse_chromosome_ranges()

    ## it would be faster to do rindex instead of regex,
    ## but in a few cases rubbish was appended to the farm log files
    ## the patterns tolerate the padding that aligns the colons of the report
    ## and are compiled once instead of once per log
    keyword1 = re.compile(r'\s+Max Memory\s*:')
    keyword2 = re.compile(r'\s+CPU time\s*:')

    d_resources = {'CPU':{},'Memory':{},}

    for fn in sorted(os.listdir('stdout')):

        fp = os.path.join('stdout',fn)
        if os.path.isdir(fp):
            continue

        ## step and chromosome are the first two dot separated fields
        index1 = fn.index('.')
        step = fn[:index1]
        if '.' in fn[index1+1:]:
            index2 = index1+fn[index1+1:].index('.')+1
            chromosome = fn[index1+1:index2]
        else:
            chromosome = ''
        if chromosome == '23':
            chromosome = 'X'
        if chromosome == '24':
            chromosome = 'Y'

        with open('stdout/%s' %(fn),'r') as fd:
            lines = fd.readlines()

        l_mem = []
        l_cpu = []
        for line in lines:
            ## these two searches were missing, which left a syntax error
            result1 = keyword1.search(line)
            result2 = keyword2.search(line)
            if result1 or result2:
                v = float(
                    line.split(':')[1].replace('sec.','').replace('MB',''))
                if result1:
                    l_mem += [v]
                else:
                    l_cpu += [v]

        ## a log without a usage summary cannot be evaluated
        if not l_cpu or not l_mem:
            print('no resource usage found in %s' %(fn))
            continue

        cpu = max(l_cpu)
        mem = l_mem[l_cpu.index(cpu)]

        if 'VariantRecalibrator' in fn and cpu > 1:
            print('%4i %4i %s %s' %(int(mem), int(cpu), chromosome, fn,))

        ## ignore if took less than a minute
        if cpu < 60:
            size = os.path.getsize(fp)
            if size < 2200:
                ## halt instead of deleting the log;
                ## an undefined name was used for this before
                print('%s %s' %(size, fn,))
                raise RuntimeError('unexpectedly small log file %s' %(fn))
            continue

        for k_resource, v_resource in [
            ['CPU',cpu,],
            ['Memory',mem,],
            ]:
            d_steps = d_resources[k_resource]
            if not step in d_steps.keys():
                d_steps[step] = {}
            if not chromosome in d_steps[step].keys():
                d_steps[step][chromosome] = []
            ## more than one log per chromosome is only expected for these steps
            elif step not in ['UnifiedGenotyper','IMPUTE2',]:
                print('%s %s %s %s' %(
                    step, chromosome, k_resource, v_resource,))
            d_steps[step][chromosome] += [v_resource]

    ##
    ## print usage per step
    ##
    for k_resource in d_resources.keys():
        for step in d_resources[k_resource].keys():
            if 'Downsample' in step:
                continue
            if 'samtools' in step:
                continue
            l_y = []
            for chromosome in d_resources[k_resource][step].keys():
                usage = d_resources[k_resource][step][chromosome]
                l_y += [summarize(k_resource,usage)]
            report(k_resource,step,l_y)

    ##
    ## plot usage against chromosome length
    ##
    d_labels = {'Memory':'Mb','CPU':'hours'}
    for k_resource in d_resources.keys():
        for step in d_resources[k_resource].keys():

            if len(d_resources[k_resource][step].keys()) <= 3:
                continue

            l_x = []
            l_y = []
            for chromosome in d_resources[k_resource][step].keys():
                if chromosome == '':
                    continue
                if chromosome[0] == '_':
                    continue
                ## chromosome length in Mbp
                x = d_chromosome_lengths[chromosome]/(10**6)
                usage = d_resources[k_resource][step][chromosome]
                l_x += [x]
                l_y += [summarize(k_resource,usage)]

            if len(l_x) == 0:
                print(step)
                continue

            prefix = '%s_%s' %(k_resource,step,)

            print('plotting %s %s' %(k_resource, step,))
            report(k_resource,step,l_y)

            gnuplot.scatter_plot_2d(
                prefix,l1=l_x,l2=l_y,
                ylabel='%s (%s)' %(k_resource,d_labels[k_resource]),
                xlabel='chromosome length (Mbp)',
                title=prefix.replace('_',' '),
                )

    return
## options of the scatter plot, collected before the call
d_plot_options = {
    ## {\305} is Angstrom if iso encoding
    'xlabel':'RMSD_C_{/Symbol a} / @^{/Symbol \ \260}A',
    'ylabel':'<{/Symbol Dc_1}> / {/Symbol \260}',
    ## both axes start at the origin
    'xmin':0, 'ymin':0,
    'xmax':xmax, 'ymax':ymax,
    'key_vert_pos':'left',
    'title':title,
    ## one data series per column
    'bool_multiple_columns':True,
    'l_columns':l_columns,
    'l_colors':l_colors,
    'l_pointtypes':l_pointtypes,
    'l_pointsizes':l_pointsizes,
    'pointsize':1,
    'bool_remove':False,
    }
gnuplot.scatter_plot_2d(prefix, **d_plot_options)
def MDS():

    '''Run the plink steps up to --mds-plot for a hard coded dataset and plot C1/C2.

    Each step is skipped if its output file exists:
    SNP list (.extract), sample list (.fam), merged/filtered bed,
    LD pruning (.prune.in), --genome and --cluster --mds-plot.
    The points of the plot are split by chip (quad/octo).
    Exits via sys.exit(0) if no .mds file is available for plotting.

    NOTE(review): block structure restored from whitespace-collapsed lines.
    '''

    import gnuplot

    ## the dataset is selected by editing the prefix
##    prefix = 'Baganda29_quad29_octo200'
##    prefix = 'Baganda29_quad29_octo200_excldiscordant'
    prefix = 'Baganda29_quad29_octo200_SNPQCtogether'
##    prefix = 'Ga-Adangbe'
##    prefix = 'Zulu'
##    prefix = 'Ga-Adangbeexcldiscordant'
##    prefix = 'Zuluexcldiscordant'

    bool_exclude_discordant = 'excldiscordant' in prefix

    bool_merge = prefix not in [
        'Baganda29_quad29_octo200_SNPQCtogether',
        'Ga-Adangbe','Zulu',
        'Ga-Adangbeexcldiscordant','Zuluexcldiscordant',
        ]

    if prefix == 'Baganda29_quad29_octo200_SNPQCtogether':
        bfile = 'pops/Baganda_quad29octo200/Baganda_quad29octo200.SNPQC'
    else:
        pop = prefix.replace('excldiscordant','')
        bfile = 'pops/%s/%s.SNPQC' %(pop,pop,)

    fn_ld_regions = 'pops/Baganda_octo/Baganda_octo.ldregions.SNPs'

    ## list of SNPs written by --indep-pairwise and read by the later steps;
    ## the name was missing from four format strings,
    ## each of which raised TypeError
    fn_prune_in = '%s.prune.in' %(prefix)

    ##
    ## find common SNPs post QC
    ##
    if not os.path.isfile('%s.extract' %(prefix)):
        if not bool_merge:
            cmd = 'cat %s.bim > %s.extract' %(bfile,prefix)
            execmd(cmd)
        else:
            cmd = "cat pops/Baganda_quad/Baganda_quad.SNPQC.bim | awk '{print $2}' | sort > Baganda_quad.SNPs"
            execmd(cmd)
            cmd = "cat pops/Baganda_octo/Baganda_octo.SNPQC.bim | awk '{print $2}' | sort > Baganda_octo.SNPs"
            execmd(cmd)
            cmd = 'comm -12 Baganda_quad.SNPs Baganda_octo.SNPs > %s.extract' %(prefix)
            execmd(cmd)
            os.remove('Baganda_quad.SNPs')
            os.remove('Baganda_octo.SNPs')

    ##
    ## find common samples post QC
    ##
    cmd = 'cat pops/Baganda_quad/Baganda_quad.SNPQC.fam | sort > Baganda_quad.fam'
    execmd(cmd)
    cmd = 'cat pops/Baganda_octo/Baganda_octo.SNPQC.fam | sort > Baganda_octo.fam'
    execmd(cmd)
    cmd = "cat Baganda_quad.fam | awk '{print substr($1,12,10)}' | sort > Baganda_quad.samples"
    execmd(cmd)
    cmd = "cat Baganda_octo.fam | awk '{print substr($1,12,10)}' | sort > Baganda_octo.samples"
    execmd(cmd)
    cmd = 'comm -12 Baganda_quad.samples Baganda_octo.samples > Baganda29.samples'
    execmd(cmd)
    os.remove('Baganda_quad.samples')
    os.remove('Baganda_octo.samples')
    cmd = 'fgrep -f Baganda29.samples Baganda_quad.fam | sort > Baganda29_quad29_octo0.fam'
    execmd(cmd)
    cmd = 'fgrep -f Baganda29.samples Baganda_octo.fam | sort > Baganda29_quad0_octo29.fam'
    execmd(cmd)
    os.remove('Baganda29.samples')
    cmd = 'comm -23 Baganda_quad.fam Baganda29_quad29_octo0.fam > Baganda29_quad71_octo0.fam'
    execmd(cmd)
    cmd = 'comm -23 Baganda_octo.fam Baganda29_quad0_octo29.fam > Baganda29_quad0_octo200.fam'
    execmd(cmd)
    cmd = 'cat Baganda29_quad0_octo200.fam Baganda29_quad29_octo0.fam > Baganda29_quad29_octo200.fam'
    execmd(cmd)
    cmd = 'cat Baganda29_quad0_octo200.fam Baganda29_quad29_octo0.fam > Baganda29_quad29_octo200_excldiscordant.fam'
    execmd(cmd)
    if not bool_merge:
        cmd = 'cat %s.fam > %s.fam' %(bfile,prefix,)
        execmd(cmd)

    ##
    ## --bmerge
    ##
    if not os.path.isfile('%s.bed' %(prefix)):
        cmd = 'plink \\\n'
        if not bool_merge:
            cmd += '--bfile %s \\\n' %(bfile)
        else:
            cmd += '--bfile pops/Baganda_quad/Baganda_quad.SNPQC \\\n'
            cmd += '--bmerge \\\n'
            cmd += 'pops/Baganda_octo/Baganda_octo.SNPQC.bed \\\n'
            cmd += 'pops/Baganda_octo/Baganda_octo.SNPQC.bim \\\n'
            cmd += 'pops/Baganda_octo/Baganda_octo.SNPQC.fam \\\n'
        cmd += '--keep %s.fam \\\n' %(prefix)
        cmd += '--extract %s.extract \\\n' %(prefix)
        if bool_exclude_discordant:
            cmd += '--exclude discordant.SNPs \\\n' ## tmp
        cmd += '--make-bed --out %s \\\n' %(prefix)
        execmd(cmd)

    ##
    ## --indep-pairwise
    ##
    if not os.path.isfile(fn_prune_in):
        cmd = 'plink \\\n'
        cmd += '--bfile %s \\\n' %(prefix)
        cmd += '--out %s \\\n' %(prefix)
        ## settings
        cmd += '--indep-pairwise 50 5 0.2 \\\n'
        cmd += '--maf 0.05 \\\n'
        ## SNP exclusion
        cmd += '--exclude %s \\\n' %(fn_ld_regions)
        execmd(cmd)

    ##
    ## --genome
    ##
    if not os.path.isfile('%s.genome' %(prefix)):
        cmd = 'plink \\\n'
        cmd += '--bfile %s \\\n' %(prefix)
        cmd += '--out %s \\\n' %(prefix)
        cmd += '--genome \\\n'
        ## SNP exclusion
        cmd += '--extract %s \\\n' %(fn_prune_in)
        cmd += '--exclude %s \\\n' %(fn_ld_regions)
        execmd(cmd)

    ##
    ## --cluster
    ##
    if not os.path.isfile('%s.mds' %(prefix)):
        cmd = 'plink \\\n'
        cmd += '--bfile %s \\\n' %(prefix)
        cmd += '--out %s \\\n' %(prefix)
        cmd += '--cluster \\\n'
        cmd += '--mds-plot 4 \\\n'
        cmd += '--read-genome %s.genome \\\n' %(prefix)
        ## SNP exclusion
        cmd += '--extract %s \\\n' %(fn_prune_in)
        cmd += '--exclude %s \\\n' %(fn_ld_regions)
        execmd(cmd)

##    ##
##    ## EIGENSOFT
##    ##
##    eigensoft(prefix,fn_ld_regions,)

    ##
    ## plot
    ##
    if not os.path.isfile('%s.mds' %(prefix)):
        sys.exit(0)

    if bool_merge:
        cmd = "cat Baganda29_quad29_octo0.fam | awk '{print $1}' > Baganda29_quad29.samples"
        execmd(cmd)
        cmd = "cat Baganda29_quad0_octo29.fam | awk '{print $1}' > Baganda29_octo29.samples"
        execmd(cmd)
        for suffix in ['quad','octo',]:
            ## samples genotyped on both chips
            cmd = 'fgrep -f Baganda29_%s29.samples %s.mds' %(suffix,prefix,)
            cmd += " | awk '{print substr($1,12,10),$4,$5}'"
            cmd += ' | sort -k1,1'
            cmd += ' > %s_%s29.mds' %(prefix,suffix)
            execmd(cmd)
            if suffix == 'quad':
                continue
            ## all samples of the chip
            cmd = "cat pops/Baganda_%s/Baganda_%s.fam | awk '{print $1}' > Baganda29_%s.samples" %(
                suffix,suffix,suffix,)
            execmd(cmd)
            cmd = 'fgrep -f Baganda29_%s.samples %s.mds' %(suffix,prefix,)
            cmd += " | awk '{print substr($1,12,10),$4,$5}'"
            cmd += ' | sort -k1,1'
            cmd += ' > %s_%s.mds' %(prefix,suffix)
            execmd(cmd)

    ## the .mds file has a header line
    n_samples = int(os.popen('cat %s.mds | wc -l' %(prefix)).read())-1
    ## with pruning
    ## (the count of the .bim file, i.e. without pruning, was computed before
    ## and immediately overwritten; it is no longer computed)
    n_SNPs = int(os.popen('cat %s | wc -l' %(fn_prune_in)).read())

    if not bool_merge:
        execmd("cat omni2.5-4_20120904_agv_gtu.fam | awk '{print $2}' > quad.samples")
        execmd("cat omni2.5-8_agv_20120910_gtu.fam | awk '{print $2}' > octo.samples")
        execmd('fgrep -w -f quad.samples %s.mds > %s_quad.mds' %(prefix,prefix,))
        execmd('fgrep -w -f octo.samples %s.mds > %s_octo.mds' %(prefix,prefix,))
        line_plot = 'plot '
        line_plot += '"%s_quad.mds" u 4:5 ps 2 pt 7 lc 1 t "quad",' %(prefix)
        line_plot += '"%s_octo.mds" u 4:5 ps 2 pt 7 lc rgb "#0000FF" t "octo",' %(prefix)
        line_plot = line_plot[:-1]
    else:
        line_plot = 'plot '
        line_plot += '"%s_quad29.mds" u 2:3 ps 2 pt 7 lc 1 t "quad",' %(prefix)
        line_plot += '"%s_octo.mds" u 2:3 ps 2 pt 7 lc rgb "#0000FF" t "octo",' %(prefix)
        line_plot = line_plot[:-1]

    gnuplot.scatter_plot_2d(
        '%s.mds' %(prefix),
        line_plot = line_plot,
        xlabel = 'C1',
        ylabel = 'C2',
        title='%s (n_{samples}=%i, n_{SNPs}=%i)' %(
            'Baganda29',n_samples,n_SNPs,
            ),
        prefix_out='%s.mds' %(prefix),
        bool_remove=False,
        )

    return
if pdb in ['2d4i_b','2d4k_n',]: res_no -= 200 res_symbol1 = mutation[-2] res_symbol2 = mutation[-1] if pdb in d_mutants.keys(): mutation = '%1s%i%1s' %(res_symbol2,res_no,res_symbol1,) d_ddG[mutation]['backward'] += [ddG] else: mutation = '%1s%i%1s' %(res_symbol1,res_no,res_symbol2,) d_ddG[mutation]['forward'] += [ddG] l = [] for mutation in d_ddG.keys(): for ddG_forward in d_ddG[mutation]['forward']: for ddG_backward in d_ddG[mutation]['backward']: l += ['%s %s\n' %(-ddG_backward,ddG_forward,)] fd = open('UFFBAPS.gnuplotdata','w') fd.writelines(l) fd.close() prefix = 'UFFBAPS' xlabel = 'ddG backward' ylabel = 'ddG forward' gnuplot.scatter_plot_2d( prefix, xlabel=xlabel, ylabel=ylabel, bool_multiple_columns = False, function = 'x', bool_remove = False, )