def get_out_search():
    """Repeatedly extract the best patch and write everything that was found.

    The WDSP file named by the last command-line argument is parsed and a
    ``PatchSearchSpecific`` is run on the proteins that are still unassigned.
    Each round stores the patch returned by ``get_best()`` under its shape
    and removes that patch's proteins from the working data.  The loop stops
    after the first round whose best patch covers ``CUTOFF`` proteins or
    fewer; that last patch is still recorded.

    The file is then parsed a second time, because the first parse has been
    mutated by the removals, and ``write_results`` receives the fresh data.
    """
    CUTOFF = 20
    wdsp_path = sys.argv[-1]

    with open(wdsp_path) as wdsp_f:
        working = Wdsp(wdsp_f)
        pros = working.pros
        seqs = working.seqs
        wdsps = working.wdsps
        hotspots = working.hotspots

        best = {}
        pro_num = 1000000  # sentinel: guarantees at least one search round
        while pro_num > CUTOFF:
            searcher = PatchSearchSpecific(pros, seqs, wdsps, hotspots, CUTOFF)
            searcher.get_patches()
            searcher.classify_patches()
            shape, patch, pro_list, pro_num = searcher.get_best()
            best.setdefault(shape, {})[patch] = pro_list

            # Drop the proteins just assigned so the next round ignores them.
            for pro in pro_list:
                if pro not in seqs:
                    continue
                pros.remove(pro)
                seqs.pop(pro)
                wdsps.pop(pro)
                hotspots.pop(pro)

    with open(wdsp_path) as wdsp_f:
        full = Wdsp(wdsp_f)
        write_results(best, full.pros, full.seqs, full.wdsps, full.hotspots,
                      CUTOFF)
def main():
    """Match template hotspots against all hotspots and write the result.

    The second-to-last command-line argument names the template WDSP file,
    the last one names the WDSP file holding all proteins.
    """
    template_path, all_path = sys.argv[-2], sys.argv[-1]

    with open(template_path) as template_f:
        tem_hots = Wdsp(template_f).hotspots
    with open(all_path) as all_f:
        all_hots = Wdsp(all_f).hotspots

    write_result(get_similar_hots(tem_hots, all_hots), tem_hots, all_hots)
def main():
    """Plot and regress sequence vs. topface similarity for large clusters.

    Hotspots from the WDSP file given as last command-line argument are
    clustered with ``cluster_topface``.  Every cluster with more than 10
    members gets a scatter plot, a logo plot and a ``linregress`` fit of
    hotspot score against sequence score.  All fits are written to
    ``regressions.txt``, one ';'-separated line per cluster.

    NOTE(review): ``cutoff`` is not assigned inside this function; it has to
    exist as a module-level name, otherwise building the filename raises
    NameError.
    """
    with open(sys.argv[-1]) as wdsp_f:
        all_wdsp = Wdsp(wdsp_f)
        all_hots = all_wdsp.hotspots
        all_seqs = all_wdsp.seqs

    regressions = []
    for members in cluster_topface(all_hots):
        size = len(members)
        if size <= 10:
            continue

        filename = str(size) + '_' + members[0] + '_' + str(cutoff)
        hots = [[pro, all_hots[pro]] for pro in members]
        seqs = [[pro, all_seqs[pro]] for pro in members]

        hots_score = align_hots([hot for _, hot in hots])
        seqs_score = align_seqs([seq for _, seq in seqs])
        regressions.append(linregress(seqs_score, hots_score))
        plot_scatter(seqs_score, hots_score, filename + '_scatter')

        logo_rows = [(pro, ''.join(hot)) for pro, hot in adjust_hots(hots)]
        plotlogo(logo_rows, filename + '_logo')

    # Column order of each line: slop,intercept,r-value,p-value,stderr
    with open('regressions.txt', 'w') as w_f:
        for fit in regressions:
            print >> w_f, ';'.join(str(value) for value in fit)
def main():
    """Run a ``PatchSearch`` with cutoff 1 on ``test.wdsp`` and write results."""
    with open('test.wdsp') as wdsp_f:
        data = Wdsp(wdsp_f)
        searcher = PatchSearch(
            data.pros,
            data.seqs,
            data.wdsps,
            data.hotspots,
            1,
        )
        searcher.get_patches()
        searcher.classify_patches()
        searcher.write_results()
def main():
    """Classify patches of the given WDSP file and hand them to write_results.

    The WDSP file is the first command-line argument; the search uses a
    cutoff of 20, which is also forwarded to ``write_results``.
    """
    CUTOFF = 20
    with open(sys.argv[1]) as wdsp_f:
        data = Wdsp(wdsp_f)
        searcher = PatchSearch(
            data.pros, data.seqs, data.wdsps, data.hotspots, CUTOFF)
        searcher.get_patches()
        searcher.classify_patches()
        write_results(
            searcher.shape_patch_pros,
            data.pros,
            data.seqs,
            data.wdsps,
            data.hotspots,
            CUTOFF,
        )
def main():
    """Plot topface vs. sequence similarity for several hotspot cutoffs.

    The second-to-last command-line argument names the template WDSP file,
    the last one the WDSP file with all proteins.  For every cutoff from 0.3
    to 0.9 and every template returned by ``get_similar_hots``, three images
    are produced: a scatter plot, a logo plot and a density plot comparing
    the two similarity distributions.
    """
    with open(sys.argv[-2]) as wdsp_f:
        tem_wdsp = Wdsp(wdsp_f)
        tem_hots = tem_wdsp.hotspots
    with open(sys.argv[-1]) as wdsp_f:
        all_wdsp = Wdsp(wdsp_f)
        all_hots = all_wdsp.hotspots

    # Alternative percent-style scale that was tried: [30,40,50,60,70,80,90]
    for cutoff in [0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]:
        tem_all_hots = get_similar_hots(tem_hots, all_hots, cutoff)
        for tem_pro, all_pros in tem_all_hots.iteritems():
            label = tem_pro + '_' + str(cutoff)
            hots = [[pro, all_hots[pro]] for pro, _ in all_pros]
            seqs = [[pro, all_wdsp.seqs[pro]] for pro, _ in all_pros]

            hots_score = get_hot_similarity(hots)
            seqs_score = get_seq_similarity(seqs)
            plot_scatter(seqs_score, hots_score, label)

            hots = adjust_hots(hots)
            hots_len = len(hots)
            hots = [(pro, ''.join(hot)) for pro, hot in hots]
            plotlogo(hots, str(hots_len) + '_' + label)

            # One figure per plot.  The extra ``plt.subplots()`` figure that
            # used to be created here was never drawn on or saved.
            fig = plt.figure(figsize=(5, 4))
            ax = fig.add_subplot(111)
            # distplot draws on the current axes, i.e. ``ax`` created above.
            sns.distplot(hots_score, hist=False, label='Topface',
                         kde_kws={'marker': ' '})
            sns.distplot(seqs_score, hist=False, label='Sequence',
                         kde_kws={'marker': '*'})
            ax.set(xlabel='Similarity', ylabel='Frequency',
                   title='WD40 Protein Topface and Sequence Similarity')
            fig.savefig(label + 'hot_seq_similarity_dist.png', dpi=300)
            # Release the figure; otherwise every iteration leaves figures
            # registered with pyplot and memory grows with each plot.
            plt.close(fig)
def main():
    """Run ``PatchSearchSpecific`` (cutoff 10) on the first argument's file.

    Patches are collected, classified and written by the searcher itself.
    The ``deredundant_patches`` step is intentionally not run here.
    """
    CUTOFF = 10
    with open(sys.argv[1]) as wdsp_f:
        data = Wdsp(wdsp_f)
        searcher = PatchSearchSpecific(
            data.pros, data.seqs, data.wdsps, data.hotspots, CUTOFF)
        searcher.get_patches()
        searcher.classify_patches()
        # searcher.deredundant_patches() is left disabled, as before.
        searcher.write_results()
def main():
    """Write the repeat similarity of every protein to ``sims.txt``.

    The WDSP file is the last command-line argument.  For each protein,
    ``repeat_similarity`` is applied to its repeats and the first element of
    the result is written next to the protein name (name left-aligned in a
    20-character column).
    """
    with open(sys.argv[-1]) as o_f:
        parsed = Wdsp(o_f)
        sims = OrderedDict(
            (pro, repeat_similarity(repeats))
            for pro, repeats in parsed.repeats.iteritems()
        )

    with open('sims.txt', 'w') as w_f:
        for name, similarity in sims.iteritems():
            print >> w_f, '{0:<20}{1:<}'.format(name, similarity[0])
def main():
    """Compare template and full WDSP data and write the combined report.

    The second-to-last command-line argument names the template WDSP file,
    the last one the WDSP file with all proteins.  Repeat similarities are
    computed for both files, sequence similarity between them, and then
    ``write_result`` is called once per hotspot cutoff (currently only 10).
    """
    with open(sys.argv[-2]) as template_f:
        tem_wdsp = Wdsp(template_f)
        tem_hots = tem_wdsp.hotspots
        tem_repeats_similarity = wdsp_repeat_similarity(tem_wdsp.repeats)

    with open(sys.argv[-1]) as all_f:
        all_wdsp = Wdsp(all_f)
        all_hots = all_wdsp.hotspots
        all_repeats_similarity = wdsp_repeat_similarity(all_wdsp.repeats)

    tem_all_seq_similarity = seq_similarity(tem_wdsp, all_wdsp)

    # Other cutoffs that were tried: [30,40,50,60,70,80,90]
    cutoffs = [10]
    for cutoff in cutoffs:
        similar = get_similar_hots(tem_hots, all_hots, cutoff)
        write_result(
            similar,
            tem_hots,
            tem_wdsp,
            all_hots,
            all_wdsp,
            tem_repeats_similarity,
            all_repeats_similarity,
            tem_all_seq_similarity,
            cutoff,
        )
def main():
    """Align every template sequence against every sequence and save scores.

    The second-to-last command-line argument names the template WDSP file,
    the last one the WDSP file with all proteins.  All alignments are
    computed first; afterwards one output file per template is opened through
    ``lt.open_file`` and receives a ``name  score`` line per aligned protein,
    in the iteration order of the full data set (results are not sorted).
    """
    with open(sys.argv[-2]) as o_f:
        tem_seq = Wdsp(o_f).seqs
    with open(sys.argv[-1]) as o_f:
        all_seq = Wdsp(o_f).seqs

    similarity = OrderedDict()
    for t_name, t_seq in tem_seq.iteritems():
        similarity[t_name] = [
            (a_name, align(t_seq, a_seq))
            for a_name, a_seq in all_seq.iteritems()
        ]

    line = '{0:<15}{1:<}'
    for t_name, scores in similarity.iteritems():
        with lt.open_file(t_name) as w_f:
            for a_name, a_identity in scores:
                print >> w_f, line.format(a_name, a_identity)
def main():
    """Run ``PatchSearchSpecific`` with cutoff 1 on the last argument's file."""
    with open(sys.argv[-1]) as wdsp_f:
        data = Wdsp(wdsp_f)
        searcher = PatchSearchSpecific(
            data.pros,
            data.seqs,
            data.wdsps,
            data.hotspots,
            cutoff=1,
        )
        searcher.get_patches()
        searcher.classify_patches()
        searcher.write_results()
def main():
    """Debug run: search ``test.wdsp`` with cutoff 2 and dump both mappings.

    After ``write_results`` has been called, the searcher's
    ``shape_pro_patches`` and ``shape_patch_pros`` attributes are printed,
    each preceded by its own name.
    """
    with open('test.wdsp') as wdsp_f:
        data = Wdsp(wdsp_f)
        searcher = PatchSearch(
            data.pros, data.seqs, data.wdsps, data.hotspots, 2)
        searcher.get_patches()
        searcher.classify_patches()
        write_results(searcher.shape_patch_pros, data.pros, data.seqs,
                      data.wdsps, data.hotspots)

        # A single parenthesised value prints exactly like the bare
        # ``print value`` statement form.
        for attr in ('shape_pro_patches', 'shape_patch_pros'):
            print(attr)
            print(getattr(searcher, attr))
def classify_blade(wdsp_f):
    """Count how often each three-letter hotspot occurs in a WDSP file.

    Args:
        wdsp_f: path of the WDSP file to open.

    Returns:
        A list of ``(count, triple)`` tuples covering every three-letter
        combination of the alphabet below, sorted in descending order
        (ties are therefore ordered by triple, also descending).

    Raises:
        KeyError: if a hotspot token is not one of those combinations.
    """
    with open(wdsp_f) as o_f:
        wdsp = Wdsp(o_f)

    # Flatten all per-protein hotspot lists into one list of tokens.
    tokens = ' '.join(
        ' '.join(hots) for _, hots in wdsp.hotspots.iteritems()
    ).split()

    alphabet = 'KRHDEFWYSTNQVLIMACPG*XB'
    counts = dict.fromkeys(
        (''.join(triple) for triple in itertools.product(alphabet, repeat=3)),
        0,
    )
    for token in tokens:
        counts[token] += 1

    ranked = [(count, triple) for triple, count in counts.iteritems()]
    ranked.sort(reverse=True)
    return ranked
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
write 100 small FASTA files of randomly chosen proteins

reads 'wd648_uniprot_select_cd-hit_90_7.wdsp' and writes random_0.fa ...
random_99.fa; each file holds two proteins drawn at random (the same
protein may be drawn twice), with the sequence wrapped at 80 characters
"""
from wdsp import Wdsp
import numpy as np
from numpy.random import randint

with open('wd648_uniprot_select_cd-hit_90_7.wdsp') as wdsp_f:
    wdsp = Wdsp(wdsp_f)

pro_num = len(wdsp.pros)

for i in range(100):
    with open('random_' + str(i) + '.fa', 'w') as w_f:
        for j in range(2):
            # randint's upper bound is exclusive, so this is a valid index.
            pro = wdsp.pros[randint(0, pro_num)]
            seq = wdsp.seqs[pro]
            # NOTE(review): this emits '>  <name>' (whitespace right after
            # '>'); confirm downstream FASTA readers accept that header.
            print >> w_f, '> ', pro
            # The chunk offset has its own name: the previous list
            # comprehension reused ``i``, which in Python 2 rebinds the
            # outer file index.
            for start in range(0, len(seq), 80):
                print >> w_f, seq[start:start + 80]
def read_hots(wdsp_f):
    """Return the hotspots parsed from a WDSP file.

    Previously the argument was shadowed by the ``with`` target, so the
    function always read ``sys.argv[-2]`` no matter what the caller passed.
    It now reads what it is given.

    Args:
        wdsp_f: path of the WDSP file, or an already-open file object.

    Returns:
        The ``hotspots`` attribute of the parsed ``Wdsp`` object.
    """
    # An open handle is parsed directly and left open for the caller.
    if hasattr(wdsp_f, 'read'):
        return Wdsp(wdsp_f).hotspots

    with open(wdsp_f) as handle:
        return Wdsp(handle).hotspots
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
usage: python get_hotspot.py *.wdsp

write the hotspots of every protein, one protein per line:
pro xxx xxx xxx xxx xxx xxx
"""
import os
import sys
import lt
from wdsp import Wdsp

with open(sys.argv[-1]) as wdsp_f:
    w = Wdsp(wdsp_f)

# Protein name left-aligned in 25 columns, then the space-joined hotspots.
row = '{0:<25}{1:<}'.format

with lt.open_file(file_suffix='hotspots') as w_f:
    for pro, hots in w.hotspots.iteritems():
        joined = ' '.join(hots)
        print >> w_f, row(pro, joined)
# visual_style['vertex_label'] = labels # visual_style['vertex_label_size'] = 2 visual_style['layout'] = graph.layout('kk') filename = 'sim_top_seq_' + str(cluster_num) + '_' + str(label) igraph.plot(graph, filename + '_graph.png', **visual_style) # slope,intercept,rvalue,pvalue,stderr = linregress(seqs_score,hots_score) # return [slope,intercept,rvalue,pvalue,stderr] # plot_scatter(seqs_score,hots_score,filename+'_scatter') # hots = adjust_hots(nr_hots) # hots = [(pro,''.join(hot)) for pro,hot in hots] # plotlogo(hots,filename+'_logo') with open(sys.argv[-1]) as wdsp_f: all_wdsp = Wdsp(wdsp_f) all_hots = all_wdsp.hotspots all_seqs = all_wdsp.seqs pros = all_wdsp.pros import lt @lt.run_time def main(): for cluster_num in range(10, 100, 50): clusters = [] for i in range(10): clusters.append( [[pros[randint(0, len(pros))] for i in range(cluster_num)], i])
def main():
    """Compare topface and sequence similarity over all protein pairs.

    The WDSP file given as last command-line argument is parsed and every
    unordered pair of proteins is scored by ``top_seq_align`` in a pool of
    six worker processes.  The scores are cached in
    ``hots_seqs_score.pickle``.  The outputs are a scatter plot, a logo plot,
    a regression summary (``<name>_regression.txt``) and a density plot of
    both similarity distributions, where ``<name>`` is the input file name up
    to its first dot.
    """
    fname = os.path.split(sys.argv[-1])[1].split('.')[0]
    with open(sys.argv[-1]) as wdsp_f:
        wdsp = Wdsp(wdsp_f)
        pros = wdsp.pros
        hots = wdsp.hotspots
        seqs = wdsp.seqs

    # Every unordered pair (pro1 listed before pro2) exactly once.
    parameters = [
        [pro1, pro2, hots[pro1], hots[pro2], seqs[pro1], seqs[pro2]]
        for i1, pro1 in enumerate(pros)
        for i2, pro2 in enumerate(pros)
        if i2 > i1
    ]

    p = Pool(6)
    try:
        result = p.map(top_seq_align, parameters)
    finally:
        # Shut the workers down even when a task raised.
        p.close()
        p.join()

    hots_score = [r[2][1] for r in result]
    seqs_score = [r[3][1] for r in result]

    # Close the cache file explicitly so the data is flushed to disk before
    # it is read back (this used to rely on garbage collection).
    with open('hots_seqs_score.pickle', 'w') as pickle_f:
        pickle.dump([hots_score, seqs_score], pickle_f)
    # Only a file written just above is unpickled here; never point this at
    # untrusted data.
    with open('hots_seqs_score.pickle') as pickle_f:
        hots_score, seqs_score = pickle.load(pickle_f)

    plot_scatter(seqs_score, hots_score, fname + '_scatter')

    hots = [[pro, hot] for pro, hot in hots.iteritems()]
    hots = adjust_hots(hots)
    hots = [(pro, ''.join(hot)) for pro, hot in hots]
    plotlogo(hots, fname + '_logo')

    regression = linregress(seqs_score, hots_score)
    # Field order of the line: slop,intercept,r-value,p-value,stderr
    with open(fname + '_regression.txt', 'w') as w_f:
        print >> w_f, ';'.join(map(str, regression))

    # A single figure is enough; the extra ``plt.subplots()`` figure that
    # used to be created here was never drawn on or saved.
    fig = plt.figure(figsize=(5, 4))
    ax = fig.add_subplot(111)
    # distplot draws on the current axes, i.e. ``ax`` created above.
    sns.distplot(hots_score, hist=False, label='Topface',
                 kde_kws={'marker': ' '})
    sns.distplot(seqs_score, hist=False, label='Sequence',
                 kde_kws={'marker': '*'})
    ax.set(xlabel='Similarity', ylabel='Frequency',
           title='WD40 Protein Topface and Sequence Similarity')
    fig.savefig(fname + 'hot_seq_similarity_dist.png', dpi=300)
    plt.close(fig)
#!/usr/bin/env python # -*- coding: utf-8 -*- """ calculate statistics for WDSP output file usage: python wdsp_sta.py wdsp_f """ import lt import sys import os from wdsp import Wdsp with open(sys.argv[-1]) as o_f: wdsp = Wdsp(o_f) scores_sta = lt.lis_sta(wdsp.scores.values()) with lt.open_file(file_suffix='total_score_sta') as w_f: for num, freq in scores_sta: print >> w_f, '{0:<10}{1}'.format(num, freq) tetrad_sta = [ len([vi for vi in v if vi >= 44.0]) for k, v in wdsp.blade_scores.iteritems() ] tetrad_sta = lt.lis_sta(tetrad_sta) with lt.open_file(file_suffix='tetrad_num_sta') as w_f: for num, freq in tetrad_sta: print >> w_f, '{0:<5}{1}'.format(num, freq) blades_sta = [len(blades) for pro, blades in wdsp.blades.iteritems()] blades_sta = lt.lis_sta(blades_sta)