def main():
    """Load the phosphate-residue neighbor triples and write them out as a
    formatted text table, then pickle them for downstream steps.

    Each record is (pro, phos_res, neighbor_res) where neighbor_res is an
    iterable of neighbor identifier strings.
    """
    res_neighbors = read_pickle()
    with lt.open_file(file_name='hbplus_salt_combine_initial') as w_f:
        for pro, phos_res, neighbor_res in res_neighbors:
            # w_f.write works under both Python 2 and 3, unlike the
            # Python-2-only `print >> w_f` statement used before.
            w_f.write('{0:<8}{1:<15}{2}\n'.format(
                pro, phos_res, ', '.join(neighbor_res)))
    lt.pickle_dump(res_neighbors, 'hbplus_salt_combine')
def main():
    """Gather hydrogen bonds from every .hb2 file in the directory given on
    the command line, write an initial report, then pickle the
    (pro, phos_res, neighbor_res) triples (dropping the last field)."""
    bonds = []
    for candidate in lt.files_in_dir(sys.argv[-1]):
        if candidate.endswith('.hb2'):
            bonds.extend(read_hb2(candidate))
    write_initial(bonds)
    # Keep only the first three fields of each record before pickling.
    bonds = [(pro, phos, neigh) for pro, phos, neigh, _ in bonds]
    lt.pickle_dump(bonds, 'hbplus')
def mds(scores_matrix):
    """Embed a precomputed dissimilarity matrix in 2-D with metric MDS,
    pickle the scaled coordinates and save a scatter plot.

    scores_matrix: square (n x n) precomputed dissimilarity matrix.
    Side effects: pickles 'pos_x'/'pos_y' and writes 'cluster_mds.png'.
    """
    seed = np.random.RandomState(seed=3)
    # Bug fix: `seed` was created but never handed to MDS, so the embedding
    # was not reproducible; pass it via random_state.  Also renamed the
    # local so it no longer shadows this function's own name.
    model = manifold.MDS(n_components=2, max_iter=3000000, eps=1e-9,
                         random_state=seed, dissimilarity='precomputed',
                         n_jobs=1)
    pos = model.fit(scores_matrix).embedding_
    # Scale by 100 purely for nicer plot axes.
    x = [p[0] * 100 for p in pos]
    y = [p[1] * 100 for p in pos]
    lt.pickle_dump(x, 'pos_x')
    lt.pickle_dump(y, 'pos_y')
    fig = plt.figure(1)
    plt.scatter(x, y)
    plt.savefig('cluster_mds.png')
    plt.close()
def rcsb_uniprot():
    """Sweep (beta, chain_len) parameter pairs over an RCSB search seeded by
    UniProt WD40 accessions gathered from nine annotation sources.

    For each parameter pair the per-PDB report is scored by how many sources
    annotate its accession, pickled and written as text.  Afterwards a bar
    plot of structure counts per parameter pair is saved, and the structures
    gained when moving between adjacent (by count) parameter pairs are
    written out.
    """
    keywords = [
        'pfam', 'smart', 'supfam', 'uniprot_repeat', 'uniprot_keyword',
        'prosite1', 'prosite2', 'prosite3'
    ]
    # Query the eight annotation sources in parallel.
    p = Pool(8)
    result = p.map(uniprot_wd40, keywords)
    p.close()
    # Re-order the (key, accessions) results to match `keywords`.
    wd40s = []
    for k in keywords:
        for r, v in result:
            if r == k:
                wd40s.append(v)
    wdsp = get_wdsp_acc()
    wd40s.append(wdsp)
    keywords.append('wdsp')
    total = set.union(*map(set, wd40s))

    # If an entry appears in n different queries, its score is n.
    wd40s_score = [[] for i in range(9)]

    def acc_score(acc):
        # Number of sources (out of 9) that annotate this accession.
        i = 0
        for w in wd40s:
            if acc in w:
                i += 1
        return i

    for acc in total:
        num = acc_score(acc)
        wd40s_score[num - 1].append(acc)

    # Use the accessions to search RCSB over the parameter grid.
    resolution = 30.0
    row_fmt = '{0:<15}{1:<10}{2:<8}{3:<8}{4:<15}{5:<8}{6:<18}{7:<8}\n'
    results = []
    for beta in range(24):
        for chain_len in range(100, 240, 10):
            filename = str(beta) + '_' + str(chain_len)
            uniprot_pdbids, report = rcsb_acc_customreport(
                total, beta, chain_len, resolution)
            pdb_scores = []
            for p in report[1:]:  # report[0] is the header row
                pdb_scores.append(p + [acc_score(p[2])])
            pdb_scores = sorted(pdb_scores, key=lambda x: x[-1], reverse=True)
            lt.pickle_dump(pdb_scores, filename + '_pdb_scores')
            with open(filename + '_uniprot_pdb_scores.txt', 'w') as w_f:
                # w_f.write works under both Python 2 and 3; the previous
                # `print >> w_f` statement was Python-2-only.
                w_f.write(row_fmt.format(
                    'acc', 'pdb', 'chain', 'entity', 'resolution',
                    'chain_len', 'release', 'score'))
                for p in pdb_scores:
                    w_f.write(row_fmt.format(
                        p[2], p[0], p[1], p[3], p[4], p[5], p[6], p[7]))
            results.append([str(beta) + '_' + str(chain_len), pdb_scores])

    # Plot trending barplot of structure counts per parameter pair.
    keys = [r[0] for r in results]
    data = [len(r[1]) for r in results]
    df = pd.DataFrame({'Parameters': keys, 'Num': data})
    df = df.sort_values('Num', ascending=True)
    sns.set_color_codes('pastel')
    # Bug fix: the barplot previously referenced undefined names `wd` and
    # `ax` (NameError); plot from `df` and label through the returned axes.
    h = sns.barplot(y='Parameters', x='Num', data=df, color='b')
    h.figure.subplots_adjust(top=0.9, bottom=0.05, left=0.05, right=0.95)
    # Bug fix: labels were swapped — the x axis holds Num, y holds
    # Parameters.
    h.set(xlabel='Num', ylabel='Parameters',
          title='WD40 Structures got with Different Parameters')
    plt.savefig('wd40_structures_got_by_different_parameters', dpi=300)
    plt.close('all')

    # Report structures gained when moving to the next (stricter-by-count)
    # parameter pair.
    results = sorted(results, key=lambda x: len(x[1]))
    for r1, r2 in zip(results[:-1], results[1:]):
        if len(r1[1]) < len(r2[1]):
            # Bug fix: report rows are lists (unhashable, so set() raised
            # TypeError); compare them as tuples instead.
            added = set(map(tuple, r2[1])).difference(set(map(tuple, r1[1])))
            filename = r2[0] + '_minus_' + r1[0]
            with open(filename + '.txt', 'w') as w_f:
                w_f.write(row_fmt.format(
                    'acc', 'pdb', 'chain', 'entity', 'resolution',
                    'chain_len', 'release', 'score'))
                for p in added:
                    w_f.write(row_fmt.format(
                        p[2], p[0], p[1], p[3], p[4], p[5], p[6], p[7]))
def main():
    """Compare WD40 structure searches from four annotation routes
    (UniProt, SCOP, Pfam, text search): plot their pairwise overlap as a
    heatmap, draw Venn diagrams, and summarize counts in a bar plot."""
    beta, chain_len, resolution = 16, 160, 30.0
    uniprot_pdbids, pdb_scores = rcsb_uniprot(beta, chain_len, resolution)
    scop_pdbids = rcsb_scop(beta, chain_len, resolution)
    pfam_pdbids = rcsb_pfam(beta, chain_len, resolution)
    # NOTE(review): the "Text" route reuses rcsb_pfam — this looks like a
    # copy-paste slip (a text-search helper was probably intended).
    # Behavior kept as-is; confirm the correct helper before changing.
    txt_pdbids = rcsb_pfam(beta, chain_len, resolution)
    # Each *_pdbids string is comma-separated "pdb:chain" tokens with a
    # trailing separator; keep only the pdb ids.
    uniprot = set([u.split(':')[0] for u in uniprot_pdbids.split(',')[:-1]])
    scop = set([u.split(':')[0] for u in scop_pdbids.split(',')[:-1]])
    pfam = set([u.split(':')[0] for u in pfam_pdbids.split(',')[:-1]])
    txt = set([u.split(':')[0] for u in txt_pdbids.split(',')[:-1]])

    # Heatmap: fraction of each method's hits shared with every other.
    sns.set_color_codes('pastel')
    table = []
    keys = ['Uniprot', 'SCOP', 'Pfam', 'Text']
    total = [uniprot, scop, pfam, txt]
    for w in total:
        row = [len(w.intersection(wr)) * 1.0 / len(w) for wr in total]
        table.append(row)
    data = pd.DataFrame(table, columns=keys, index=keys)
    fig = plt.figure()
    ax = fig.add_subplot(111)
    h = sns.heatmap(data, annot=True, fmt='.2f', cmap='Blues')
    h.figure.subplots_adjust(top=0.9, bottom=0.13, left=0.13, right=0.9)
    ax.set_xticklabels(keys, rotation=90)
    # NOTE(review): reversed y labels assume seaborn draws rows
    # bottom-to-top; verify against the installed seaborn version.
    ax.set_yticklabels(keys[::-1], rotation=0)
    # Typo fixed in the displayed title; the output file keeps its original
    # (misspelled) name so any downstream references still resolve.
    ax.set_title('Comparison of Different Annotation Methods')
    plt.savefig('Comaration_of Different_Annotation_Methods.png', dpi=300)
    plt.close('all')

    sns.set_color_codes('bright')
    venn2([uniprot, scop], ['UniProt', 'SCOP'])
    plt.savefig('uniprot_scop.png', dpi=300)
    plt.close('all')
    venn2([uniprot, pfam], ['UniProt', 'Pfam'])
    plt.savefig('uniprot_pfam.png', dpi=300)
    plt.close('all')
    venn2([uniprot, txt], ['UniProt', 'Text'])
    # Bug fix: this diagram previously overwrote uniprot_pfam.png.
    plt.savefig('uniprot_txt.png', dpi=300)
    plt.close('all')
    venn2([uniprot, set.union(*[scop, pfam, txt])],
          ['UniProt', 'Pfam_SCOP_Text'])
    plt.savefig('uniprot_pfam_scop_txt.png', dpi=300)
    plt.close('all')
    venn2([pfam, txt], ['Pfam', 'Text'])
    plt.savefig('pfam_txt.png', dpi=300)
    plt.close('all')
    venn2([pfam, scop], ['Pfam', 'SCOP'])
    plt.savefig('pfam_scop.png', dpi=300)
    plt.close('all')
    venn3([pfam, scop, txt], ['Pfam', 'SCOP', 'Text'])
    plt.savefig('pfam_scop_txt.png', dpi=300)
    plt.close('all')

    lt.pickle_dump([uniprot, scop, pfam, txt], 'search_method')
    write_lis_lis([uniprot, scop, pfam, txt], 'rcsb_wd40_pdb',
                  ['uniprot', 'scop', 'pfam', 'txt'])

    # Bar plot: the union of all methods as a pale background bar, each
    # method's own count as a saturated foreground bar.
    f, ax = plt.subplots()
    total = set.union(*map(set, [uniprot, scop, pfam, txt]))
    sns.set_color_codes('pastel')
    methods = ['UniProt', 'Pfam', 'Text', 'SCOP']
    # list(...) so the DataFrame receives a concrete sequence under
    # Python 3, where map() returns a lazy iterator.
    wd = pd.DataFrame({
        'Search Method': methods,
        'Num': list(map(len, [total, total, total, total]))
    })
    sns.barplot(x='Search Method', y='Num', data=wd, color='b')
    sns.set_color_codes('muted')
    wd = pd.DataFrame({
        'Search Method': methods,
        'Num': list(map(len, [uniprot, pfam, txt, scop]))
    })
    sns.barplot(x='Search Method', y='Num', data=wd, color='b')
    ax.set(xlabel='Search Method', ylabel='Num',
           title='WD40 Structures in RCSB')
    plt.savefig('wd40_in_RCSB_pdbs', dpi=300)
    plt.close('all')
def rcsb_uniprot(beta=15, chain_len=150, resolution=3.5):
    """Search RCSB for WD40 structures via UniProt accessions collected from
    nine annotation sources, score each hit by how many sources agree, and
    plot per-database and per-score summaries.

    beta: parameter forwarded to rcsb_acc_customreport.
    chain_len: chain-length filter forwarded to rcsb_acc_customreport.
    resolution: resolution cutoff forwarded to rcsb_acc_customreport.
    Returns (uniprot_pdbids, pdb_scores).
    """
    keywords = [
        'pfam', 'smart', 'supfam', 'uniprot_repeat', 'uniprot_keyword',
        'prosite1', 'prosite2', 'prosite3'
    ]
    # Query the eight annotation sources in parallel.
    p = Pool(8)
    result = p.map(uniprot_wd40, keywords)
    p.close()
    # Re-order the (key, accessions) results to match `keywords`.
    wd40s = []
    for k in keywords:
        for r, v in result:
            if r == k:
                wd40s.append(v)
    wdsp = get_wdsp_acc()
    wd40s.append(wdsp)
    keywords.append('wdsp')
    total = set.union(*map(set, wd40s))

    # If an entry appears in n different queries, its score is n.
    wd40s_score = [[] for i in range(9)]

    def acc_score(acc):
        # Number of sources (out of 9) that annotate this accession.
        i = 0
        for w in wd40s:
            if acc in w:
                i += 1
        return i

    for acc in total:
        num = acc_score(acc)
        wd40s_score[num - 1].append(acc)

    # Use the accessions to search RCSB.
    uniprot_pdbids, report = rcsb_acc_customreport(total, beta, chain_len,
                                                   resolution)
    pdb_scores = []
    for p in report[1:]:  # report[0] is the header row
        pdb_scores.append(p + [acc_score(p[2])])
    pdb_scores = sorted(pdb_scores, key=lambda x: x[-1], reverse=True)
    lt.pickle_dump(pdb_scores, 'pdb_scores')
    row_fmt = '{0:<15}{1:<10}{2:<8}{3:<8}{4:<15}{5:<8}{6:<18}{7:<8}\n'
    with open('uniprot_pdb_scores.txt', 'w') as w_f:
        # w_f.write works under both Python 2 and 3; the previous
        # `print >> w_f` statement was Python-2-only.
        w_f.write(row_fmt.format('acc', 'pdb', 'chain', 'entity',
                                 'resolution', 'chain_len', 'release',
                                 'score'))
        for p in pdb_scores:
            w_f.write(row_fmt.format(p[2], p[0], p[1], p[3], p[4], p[5],
                                     p[6], p[7]))

    # Plot WD40 structures annotated by each database.
    # NOTE: despite its name, total_pdb holds the accession column (p[2])
    # of the structures found, so wd40s_pdb filters accessions, not pdb ids.
    total_pdb = set([p[2] for p in pdb_scores])
    wd40s_pdb = [[a for a in w if a in total_pdb] for w in wd40s]
    lt.pickle_dump(wd40s_pdb, 'pdb_acc_databases')
    f, ax = plt.subplots()
    keys = [
        'Pfam', 'SMART', 'Superfamily', 'UniProt_repeat', 'UniProt_keyword',
        'Prosite1', 'Prosite2', 'Prosite3', 'WDSP'
    ]
    # list(...) so the DataFrame receives a concrete sequence under
    # Python 3, where map() returns a lazy iterator.
    wd = pd.DataFrame({'Database': keys, 'Num': list(map(len, wd40s_pdb))})
    wd = wd.sort_values('Num', ascending=False)
    sns.set_color_codes('pastel')
    h = sns.barplot(y='Database', x='Num', data=wd, color='b')
    h.figure.subplots_adjust(top=0.9, bottom=0.05, left=0.14, right=0.95)
    ax.set(xlabel='Database', ylabel='Num',
           title='WD40 Structures Annotated by Different Database')
    # plt.xticks(roation=90)
    plt.savefig('wd40_structures_accs_annotated_by_different_database',
                dpi=300)
    plt.close('all')
    write_lis_lis(wd40s_pdb,
                  'wd40_structures_accs_annotated_by_different_database',
                  keys)

    # Plot the annotation-score distribution of the structures found.
    pdb_acc_scores = [[] for i in range(9)]
    for p in pdb_scores:
        pdb_acc_scores[p[-1] - 1].append(p[2])
    # list(...) because the result is consumed twice below (pickled and
    # measured); a bare map() iterator would be exhausted after first use.
    pdb_acc_scores = list(map(set, pdb_acc_scores))
    lt.pickle_dump(pdb_acc_scores, 'pdb_acc_scores')
    f, ax = plt.subplots()
    wd = pd.DataFrame({
        'Database Score': range(1, 10),
        'Num': list(map(len, pdb_acc_scores))
    })
    sns.set_color_codes('pastel')
    sns.barplot(x='Database Score', y='Num', data=wd, color='b')
    ax.set(xlabel='Database Score', ylabel='Num',
           title='Annotation Score of WD40 Structures')
    plt.savefig('wd40_structures_annotation_score_accs', dpi=300)
    plt.close('all')
    write_lis_lis(pdb_acc_scores, 'wd40_structures_annotation_score_accs',
                  [str(i) for i in range(1, 10)])
    # print(...) form is valid in both Python 2 and 3.
    print('uniprot search is finished')
    return uniprot_pdbids, pdb_scores
def main():
    """Progressively filter phosphate-residue neighbor lists loaded from a
    pickle: strip waters, hetero residues and self main-chain contacts,
    drop same-chain-only entries, then derive a main-chain-as-GLY variant
    and a no-main-chain variant.  Each stage is written out and pickled.

    Records are (pdb, res, neighbors); res and each neighbor are
    underscore-joined fields where index 1 is the residue id, index 2 the
    chain, and a neighbor's index 3 flags main-chain ('M') interactions.
    """
    res_neighbors = lt.pickle_load(sys.argv[-1])

    # Stage 1: delete water contacts, dropping entries left empty.
    res_neighbors_dw = []
    for pdb, res, neighbors in res_neighbors:
        kept = [n for n in neighbors if 'HOH' not in n]
        if kept:
            res_neighbors_dw.append((pdb, res, kept))
    write_result(res_neighbors_dw, '1_delete_water')

    # Stage 2: remove hetero residues (trailing '_H' field), dropping
    # entries left empty.
    res_neighbors_fh = []
    for pdb, res, neighbors in res_neighbors_dw:
        kept = [n for n in neighbors if n.split('_')[-1] != 'H']
        if kept:
            res_neighbors_fh.append((pdb, res, kept))
    write_result(res_neighbors_fh, '2_dw_filter_hetero')

    # Stage 3: a main-chain contact made by the phospho-residue itself is
    # not considered — drop such neighbors (empties are kept here, matching
    # the original pipeline).
    res_neighbors_pm = []
    for pdb, res, neighbors in res_neighbors_fh:
        res_id, res_chain = res.split('_')[1:3]

        def is_self_main_chain(n):
            parts = n.split('_')
            return (parts[3] == 'M' and parts[2] == res_chain
                    and parts[1] == res_id)

        kept = [n for n in neighbors if not is_self_main_chain(n)]
        res_neighbors_pm.append((pdb, res, kept))
    write_result(res_neighbors_pm, '3_dw_fh_pm')

    # Stage 4: discard entries whose neighbors all lie on the same chain
    # as the phospho-residue.
    res_neighbors_fs = []
    for pdb, res, neighbors in res_neighbors_pm:
        neighbor_chains = set(n.split('_')[2] for n in neighbors)
        if neighbor_chains != set([res.split('_')[2]]):
            res_neighbors_fs.append((pdb, res, neighbors))
    write_result(res_neighbors_fs, '4_dw_fh_pm_fs')

    # Stage 5: recode main-chain contacts as 'GLY' and strip the
    # interaction-type field from every neighbor.
    res_neighbors_cm = []
    for pdb, res, neighbors in res_neighbors_fs:
        recoded = []
        for n in neighbors:
            parts = n.split('_')
            if parts[3] == 'M':
                recoded.append('_'.join(['GLY'] + parts[1:3]))
            else:
                recoded.append('_'.join(parts[0:3]))
        res_neighbors_cm.append((pdb, res, recoded))
    write_result(res_neighbors_cm, '4.0_dw_fh_fs_cm')
    write_sta_result(res_neighbors_cm, '4.0_dw_fh_pm_cm')

    # Stage 6 (from stage 4): drop entries with any main-chain interaction.
    res_neighbors_fm = []
    for pdb, res, neighbors in res_neighbors_fs:
        if 'M' not in [n.split('_')[3] for n in neighbors]:
            res_neighbors_fm.append((pdb, res, neighbors))
    write_result(res_neighbors_fm, '5_dw_fh_pm_fs_fm')
    write_sta_result(res_neighbors_fm, '5_dw_fh_pm_fs_fm')

    # Persist every stage for downstream scripts.
    lt.pickle_dump(res_neighbors_dw, 'hbplus_salt_combine_1_dw')
    lt.pickle_dump(res_neighbors_fh, 'hbplus_salt_combine_2_dw_fh')
    lt.pickle_dump(res_neighbors_pm, 'hbplus_salt_combine_3_dw_fh_pm')
    lt.pickle_dump(res_neighbors_fs, 'hbplus_salt_combine_4_dw_fh_pm_fs')
    lt.pickle_dump(res_neighbors_cm, 'hbplus_salt_combine_4.0_dw_fh_fs_cm')
    lt.pickle_dump(res_neighbors_fm, 'hbplus_salt_combine_5_dw_fh_pm_fs_fm')