def observed_lengths(args):
    """
    Draw histograms of observed TM window lengths and inter-helix loop lengths.

    :param args: run arguments; ``args['data_base']`` selects which field of
                 every database entry is analysed and appears in the x label
    :return: None. Shows one figure with both histograms overlaid.
    """
    rost_db = parse_rostlab_db()
    lengths = []
    between_helices_lengths = []
    for name, entry in rost_db.items():
        topc = spc_parser(name)
        # Number of 'S'/'s' characters in the TOPCONS string; handed on to
        # pdbtm_opm_loc_list as the signal-peptide value.
        signal_peptide = topc['topcons'].count('S') + topc['topcons'].count('s')
        obs_loc_list = pdbtm_opm_loc_list(entry[args['data_base']],
                                          signal_peptide)
        for win in obs_loc_list:
            lengths.append(win[1] - win[0])
        # Gap between each window and its successor. A plain index bound
        # replaces the former "i + 1 in range(...)" test, which built and
        # scanned a whole list for every window on Python 2.
        for idx in range(len(obs_loc_list) - 1):
            between_helices_lengths.append(
                obs_loc_list[idx + 1][0] - obs_loc_list[idx][1])
    # NOTE(review): `normed` was deprecated in matplotlib 2.1 and removed in
    # 3.1; switch to density=True once the minimum matplotlib version allows.
    plt.hist(lengths, 30, normed=1, facecolor='green', alpha=0.75)
    plt.hist(between_helices_lengths, 100, normed=1, facecolor='blue',
             alpha=0.5)
    plt.xlabel('Window lengths in %s dataset' % args['data_base'])
    plt.ylabel('Frequency')
    plt.xlim([0, 100])
    plt.grid(True)
    plt.show()
def observed_lengths(args):
    """
    Draw histograms of observed TM window lengths and inter-helix loop lengths.

    :param args: run arguments; ``args['data_base']`` selects which field of
                 every database entry is analysed and appears in the x label
    :return: None. Shows one figure with both histograms overlaid.
    """
    rost_db = parse_rostlab_db()
    lengths = []
    between_helices_lengths = []
    for name, entry in rost_db.items():
        topc = spc_parser(name)
        # Number of 'S'/'s' characters in the TOPCONS string; handed on to
        # pdbtm_opm_loc_list as the signal-peptide value.
        signal_peptide = topc['topcons'].count('S') + topc['topcons'].count('s')
        obs_loc_list = pdbtm_opm_loc_list(entry[args['data_base']],
                                          signal_peptide)
        for win in obs_loc_list:
            lengths.append(win[1] - win[0])
        # Gap between each window and its successor. A plain index bound
        # replaces the former "i + 1 in range(...)" test, which built and
        # scanned a whole list for every window on Python 2.
        for idx in range(len(obs_loc_list) - 1):
            between_helices_lengths.append(
                obs_loc_list[idx + 1][0] - obs_loc_list[idx][1])
    # NOTE(review): `normed` was deprecated in matplotlib 2.1 and removed in
    # 3.1; switch to density=True once the minimum matplotlib version allows.
    plt.hist(lengths, 30, normed=1, facecolor='green', alpha=0.75)
    plt.hist(between_helices_lengths, 100, normed=1, facecolor='blue',
             alpha=0.5)
    plt.xlabel('Window lengths in %s dataset' % args['data_base'])
    plt.ylabel('Frequency')
    plt.xlim([0, 100])
    plt.grid(True)
    plt.show()
def check_all_aa_points_as_boxplot():
    """
    Box-plot the PsiPred values of membrane versus non-membrane positions.

    Every position of each entry's 'pdbtm' string goes to the membrane group
    when its character is 'h'/'H' and to the other group otherwise. The 'c',
    'e' and 'h' values of both groups are then drawn as six boxes.

    :return: None. Shows the figure.
    """
    from TMpredict_WinGrade import parse_rostlab_db
    import matplotlib.pyplot as plt
    import numpy as np

    in_membrane = []
    out_of_membrane = []
    for entry_name, entry in parse_rostlab_db().items():
        per_residue = parse_psipred(entry_name)
        # The PsiPred data is looked up 1-based: string index i -> key i + 1.
        for position, state in enumerate(entry['pdbtm'], 1):
            if state.lower() == 'h':
                bucket = in_membrane
            else:
                bucket = out_of_membrane
            bucket.append(per_residue[position])

    box_labels = ['MM Coil', 'MM sheet', 'MM Helix',
                  'no Coil', 'no sheet', 'no Helix']
    # Same order as the labels: coil, sheet, helix for each group in turn.
    box_data = [[point[ss] for point in group]
                for group in (in_membrane, out_of_membrane)
                for ss in ('c', 'e', 'h')]

    plt.subplot(111)
    plt.boxplot(box_data, labels=box_labels, positions=np.arange(6))
    plt.ylim([-0.5, 1.5])
    plt.show()
def check_all_aa_points_as_boxplot():
    """
    Box-plot the PsiPred values of membrane versus non-membrane positions.

    Every position of each entry's 'pdbtm' string goes to the membrane group
    when its character is 'h'/'H' and to the other group otherwise. The 'c',
    'e' and 'h' values of both groups are then drawn as six boxes.

    :return: None. Shows the figure.
    """
    from TMpredict_WinGrade import parse_rostlab_db
    import matplotlib.pyplot as plt
    import numpy as np

    in_membrane = []
    out_of_membrane = []
    for entry_name, entry in parse_rostlab_db().items():
        per_residue = parse_psipred(entry_name)
        # The PsiPred data is looked up 1-based: string index i -> key i + 1.
        for position, state in enumerate(entry['pdbtm'], 1):
            if state.lower() == 'h':
                bucket = in_membrane
            else:
                bucket = out_of_membrane
            bucket.append(per_residue[position])

    box_labels = ['MM Coil', 'MM sheet', 'MM Helix',
                  'no Coil', 'no sheet', 'no Helix']
    # Same order as the labels: coil, sheet, helix for each group in turn.
    box_data = [[point[ss] for point in group]
                for group in (in_membrane, out_of_membrane)
                for ss in ('c', 'e', 'h')]

    plt.subplot(111)
    plt.boxplot(box_data, labels=box_labels, positions=np.arange(6))
    plt.ylim([-0.5, 1.5])
    plt.show()
def main():
    """
    Count observed TM segments that carry many sheet-like PsiPred positions.

    A position counts as sheet-like when its 'e' value is at least
    IS_BETA_CUTOFF; a segment is reported as missed when it holds at least
    BETA_NUM_CUTOFF such positions.

    :return: None. Prints every missed segment and the overall count.
    """
    from TMpredict_WinGrade import parse_rostlab_db

    IS_BETA_CUTOFF = 0.3
    BETA_NUM_CUTOFF = 5
    missed_h = 0
    for name, entry in parse_rostlab_db().items():
        psipred = parse_psipred(name)
        for tm in ts2hp_seq(entry['seq'], entry['pdbtm']):
            # Collect the scores once; they feed both the count and the report.
            sheet_scores = [psipred[pos]['e']
                            for pos in range(tm[0], tm[1] + 1)]
            are_beta = sum(1 for score in sheet_scores
                           if score >= IS_BETA_CUTOFF)
            if are_beta >= BETA_NUM_CUTOFF:
                missed_h += 1
                print('MISSED ME!!!! %s %s %s %s'
                      % (name, tm, sheet_scores, are_beta))
    print('total misses (helices) %s' % (missed_h,))
def compare_just_one():
    """
    Compare a single prediction with its database entry and print the outcome.

    Reads the module-level ``args``: ``args['path']`` (path to the .prd) and
    ``args['name']`` (protein id).

    :return: None. Prints, per predictor and for the prediction itself,
             whether it agrees with the PDBTM and/or OPM topology.
    """
    from TMpredict_WinGrade import parse_rostlab_db
    from topcons_result_parser import topcons2rostlab_ts_format

    overlap = 10
    non_predictor_keys = ['name', 'seq']

    database = parse_rostlab_db()
    prediction = prd_parser(args['path'], args['name'])
    observed = database[prediction['name']]
    topcons_entry = spc_parser(prediction['name'])

    predictors = {}
    predictors_results = {}
    for key, raw_ts in topcons_entry.items():
        if key in non_predictor_keys:
            continue
        predictors[key] = topcons2rostlab_ts_format(raw_ts)
        predictors_results[key] = None

    def against_both(topo_string):
        # PDBTM is compared first, then OPM.
        return [comparer(observed[source], topo_string, overlap,
                         predictors['topcons'], prediction['seq'])
                for source in ('pdbtm', 'opm')]

    for predictor in predictors:
        print('predictor %s' % (predictor,))
        by_pdbtm, by_opm = against_both(predictors[predictor])
        predictors_results[predictor] = (by_pdbtm['overlapM_ok']
                                         or by_opm['overlapM_ok'])

    comp_pdbtm, comp_opm = against_both(prediction['pred_ts'])
    opm_ok = comp_opm['overlapM_ok']
    pdbtm_ok = comp_pdbtm['overlapM_ok']
    if opm_ok and pdbtm_ok:
        print('TopoGraph is correct by both')
    elif opm_ok:
        print('TopoGraph is correct ONLY by OPM')
    elif pdbtm_ok:
        print('TopoGraph is correct ONLY by PDBTM')
    print('com pdbtm %s' % (comp_pdbtm,))
    print('com opm %s' % (comp_opm,))
    print(predictors_results)
def blast2fasta():
    '''
    Turn one BLAST .xml result from rost_msa_prep/blast into a multi-FASTA
    file in the same folder, and append the name to rost_msa_prep/names.txt.

    Reads the module-level ``args['name']``; the database entry is looked up
    by the part of the name before the first underscore.

    :return: None
    :raises KeyError: when that database key is missing (the previous
                      filter-and-index form raised IndexError instead)
    '''
    from TMpredict_WinGrade import parse_rostlab_db

    name = args['name']
    msa_prep = '/home/labs/fleishman/jonathaw/membrane_prediciton/data_sets/rostlab_db/rost_msa_prep/'
    path_bl = msa_prep + 'blast/'
    seq_dict = ncbiXML_parser(path_bl + name + '_blast.xml')
    print(name)
    # Direct lookup instead of filtering the whole dict and taking values()[0].
    query = parse_rostlab_db()[name.split('_')[0]]
    # 'w' replaces the invalid 'wr+' mode string: the file is only written.
    with open(path_bl + name + '_blast.fa', 'w') as o:
        # write() for single strings; writelines() would iterate characters.
        o.write('>%s\n' % name)
        o.write('%s\n' % query['seq'])
        for hit_name, hit in seq_dict.items():
            o.write('>%s\n' % hit_name)
            o.write('%s\n' % hit['hit_seq'])
    with open(msa_prep + 'names.txt', 'a') as o:
        o.write(name + '\n')
def blast2fasta():
    '''
    Turn one BLAST .xml result from rost_msa_prep/blast into a multi-FASTA
    file in the same folder, and append the name to rost_msa_prep/names.txt.

    Reads the module-level ``args['name']``; the database entry is looked up
    by the part of the name before the first underscore.

    :return: None
    :raises KeyError: when that database key is missing (the previous
                      filter-and-index form raised IndexError instead)
    '''
    from TMpredict_WinGrade import parse_rostlab_db

    name = args['name']
    msa_prep = '/home/labs/fleishman/jonathaw/membrane_prediciton/data_sets/rostlab_db/rost_msa_prep/'
    path_bl = msa_prep + 'blast/'
    seq_dict = ncbiXML_parser(path_bl + name + '_blast.xml')
    print(name)
    # Direct lookup instead of filtering the whole dict and taking values()[0].
    query = parse_rostlab_db()[name.split('_')[0]]
    # 'w' replaces the invalid 'wr+' mode string: the file is only written.
    with open(path_bl + name + '_blast.fa', 'w') as o:
        # write() for single strings; writelines() would iterate characters.
        o.write('>%s\n' % name)
        o.write('%s\n' % query['seq'])
        for hit_name, hit in seq_dict.items():
            o.write('>%s\n' % hit_name)
            o.write('%s\n' % hit['hit_seq'])
    with open(msa_prep + 'names.txt', 'a') as o:
        o.write(name + '\n')
def main_rost():
    """
    Score every .prd prediction in the current directory against the Rost
    database and print the summary tables.

    Uses the module-level ``predictors`` collection for the reference
    predictors that are scored next to TopGraph.

    :return: None
    """
    prd_files = []
    for fname in os.listdir('./'):
        if '.prd' in fname and '_msa' not in fname:
            prd_files.append(fname)
    rost_db = parse_rostlab_db()
    new_old = rost_new_old()
    topgraph_none = []
    follow = 'q8dkp6'
    old_new_totals = {'new': 0, 'old': 0}
    results = {}

    for prd_file in prd_files:
        name = prd_file.split('.')[0].lower()
        best_wgp, sec_wgp = parse_prd(prd_file)
        if best_wgp is None:
            # No best path in the .prd; remember the name and move on.
            topgraph_none.append(name)
            continue

        topc = spc_parser(name)
        topcons_ts = topc['topcons']
        signal_peptide = topcons_ts.count('S') + topcons_ts.count('s')
        best_wgp_loc_list = wgp_to_loc_list(best_wgp, signal_peptide)
        sec_wgp_loc_list = wgp_to_loc_list(sec_wgp, signal_peptide)
        old_new_totals[new_old[name]] += 1

        is_followed = name == follow
        if is_followed:
            print('at %s found loc list %r' % (name, best_wgp_loc_list))

        entry = rost_db[name]
        best_tgr_qok, best_tgr_ovm = qok_pdbtm_opm(
            entry, best_wgp_loc_list, signal_peptide, verbose=is_followed)
        sec_tgr_qok, sec_tgr_ovm = qok_pdbtm_opm(
            entry, sec_wgp_loc_list, signal_peptide)

        summary = {'old_new': new_old[name]}
        summary['tm_num'] = len(
            pdbtm_opm_loc_list(entry['pdbtm'], signal_peptide))
        summary['topgraph'] = {'qok': best_tgr_qok, 'ovm': best_tgr_ovm}
        summary['best_or_sec'] = {'qok': best_tgr_qok or sec_tgr_qok,
                                  'ovm': best_tgr_ovm or sec_tgr_ovm}
        results[name] = summary

        for predictor in predictors:
            prd_qok, prd_ovm = qok_pdbtm_opm(
                entry, ts_loc_list(topc[predictor], signal_peptide),
                signal_peptide)
            summary[predictor] = {'qok': prd_qok, 'ovm': prd_ovm}

    # Results split by old/new entries.
    print_results_by_old_new(results, predictors, old_new_totals)
    # Results split by number of TMHs.
    print_results_by_tm_num(results)
    # Names TopGraph got wrong.
    print_names_topgraph_got_wrong(results)
    # Names TopGraph got wrong with both the best and second-best path.
    print_names_topgraph_got_wrong_best_and_sec(results)
    # Overall percentages.
    print_total_results(results)
def main():
    """
    Write a .prd file next to every TOPCONS .txt result.

    Each .prd holds the lower-cased entry name and the entry's
    'rost_format_scampi' topology string.

    :return: None
    """
    import os
    import re

    topcons_path = '/home/labs/fleishman/jonathaw/membrane_topcons/topo_VH_topcons/all_results/'
    # Raw string so the backslash reaches the regex engine untouched. The
    # pattern itself is unchanged: any name containing '.txt' still matches.
    file_list = [x for x in os.listdir(topcons_path)
                 if re.match(r'.*\.txt', x)]
    for file_i in file_list:
        entry = topcons_parser(topcons_path, file_i)
        entry_name = entry['name'].lower()
        # 'w' replaces the invalid 'wr+' mode string: the file is only written.
        with open(topcons_path + entry_name + '.prd', 'w') as o:
            # write() for single strings; writelines() would iterate characters.
            o.write('name %s\n' % entry_name)
            o.write('top %s\n' % entry['rost_format_scampi'])
def main():
    """
    Write a .prd file next to every TOPCONS .txt result.

    Each .prd holds the lower-cased entry name and the entry's
    'rost_format_scampi' topology string.

    :return: None
    """
    import os
    import re

    topcons_path = '/home/labs/fleishman/jonathaw/membrane_topcons/topo_VH_topcons/all_results/'
    # Raw string so the backslash reaches the regex engine untouched. The
    # pattern itself is unchanged: any name containing '.txt' still matches.
    file_list = [x for x in os.listdir(topcons_path)
                 if re.match(r'.*\.txt', x)]
    for file_i in file_list:
        entry = topcons_parser(topcons_path, file_i)
        entry_name = entry['name'].lower()
        # 'w' replaces the invalid 'wr+' mode string: the file is only written.
        with open(topcons_path + entry_name + '.prd', 'w') as o:
            # write() for single strings; writelines() would iterate characters.
            o.write('name %s\n' % entry_name)
            o.write('top %s\n' % entry['rost_format_scampi'])
def download_pdbs():
    """
    Try to download the PDB file of every Rost lab database entry into 'PDB'.

    :return: None. Prints every entry as it is processed and, at the end,
             the list of PDB ids whose download raised an error.
    """
    from Bio.PDB import PDBList
    from TMpredict_WinGrade import parse_rostlab_db

    rost_db = parse_rostlab_db()
    pdbl = PDBList()
    failed = []
    for k, v in rost_db.items():
        print('%s %s' % (k, v))
        pdb_id = v['pdb']
        try:
            pdbl.retrieve_pdb_file(pdb_id, pdir='PDB')
        except Exception:
            # Best effort: record the failure and carry on. `Exception`
            # rather than a bare except, so Ctrl-C and SystemExit still stop
            # the run.
            failed.append(pdb_id)
    print(failed)
def main():
    """
    Count observed TM segments that carry many sheet-like PsiPred positions.

    A position counts as sheet-like when its 'e' value is at least
    IS_BETA_CUTOFF; a segment is reported as missed when it holds at least
    BETA_NUM_CUTOFF such positions.

    :return: None. Prints every missed segment and the overall count.
    """
    from TMpredict_WinGrade import parse_rostlab_db

    IS_BETA_CUTOFF = 0.3
    BETA_NUM_CUTOFF = 5
    missed_h = 0
    for name, entry in parse_rostlab_db().items():
        psipred = parse_psipred(name)
        for tm in ts2hp_seq(entry['seq'], entry['pdbtm']):
            # Collect the scores once; they feed both the count and the report.
            sheet_scores = [psipred[pos]['e']
                            for pos in range(tm[0], tm[1] + 1)]
            are_beta = sum(1 for score in sheet_scores
                           if score >= IS_BETA_CUTOFF)
            if are_beta >= BETA_NUM_CUTOFF:
                missed_h += 1
                print('MISSED ME!!!! %s %s %s %s'
                      % (name, tm, sheet_scores, are_beta))
    print('total misses (helices) %s' % (missed_h,))
def main():
    """
    Scan PsiPred helix thresholds over all observed helices.

    For every (psi_helix, psi_res_num) pair, counts the PDBTM and OPM helices
    accepted by pass_helix, prints the accepted fraction and draws it as a
    point in a 3D scatter plot.

    :return: None. Shows the figure.
    """
    from TMpredict_WinGrade import parse_rostlab_db
    import matplotlib.pyplot as plt
    from mpl_toolkits.mplot3d import Axes3D
    import operator

    psi_helix = [0.001, 0.005, 0.01, 0.1, 0.2, 0.3, 0.4]
    psi_res_num = [1, 2, 3, 4]
    grid = [(ph, prn) for ph in psi_helix for prn in psi_res_num]

    total = 0
    results = {}
    for combo in grid:
        results[combo] = 0

    for name, entry in parse_rostlab_db().items():
        psipred = PsiReaderHelix('/home/labs/fleishman/jonathaw/membrane_prediciton/data_sets/rostlab_db/psipred/' + name + '.ss2')
        assert len(psipred) == len(entry['seq']) == len(entry['pdbtm']) == len(entry['opm']), 'length unequal %s' % name
        for typ in ['pdbtm', 'opm']:
            for h_seq, h_ss2 in split_to_helices(entry['seq'], entry[typ], psipred):
                total += 1
                for ph, prn in grid:
                    if pass_helix(h_ss2, ph, prn):
                        results[(ph, prn)] += 1

    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    # Lowest pass count first.
    for par, res in sorted(results.items(), key=operator.itemgetter(1)):
        fraction = float(res) / float(total)
        print('psi_helix: %f psi_res_num %i result: %f'
              % (par[0], par[1], fraction))
        ax.scatter(par[0], par[1], fraction)
    ax.set_xlabel('psi_helix')
    ax.set_ylabel('psi_res_num')
    ax.set_zlabel('percent')
    plt.show()
def main():
    """
    Count how many observed TM segments pass the PsiPred helicity check.

    Segments rejected by is_not_helical are given a second chance through
    try_to_pass; those that still fail are printed with their mean 'e', 'c'
    and 'h' values followed by the per-position values.

    :return: None. Prints the three totals at the end.
    """
    from TMpredict_WinGrade import parse_rostlab_db
    import matplotlib.pyplot as plt
    import numpy as np

    passed, didnt_pass, could_pass = [], [], []
    for name, entry in parse_rostlab_db().items():
        seq = entry['seq']
        psipred = parse_psipred(name, seq)
        for tm in ts2hp_seq(seq, entry['pdbtm']):
            if not is_not_helical(seq, tm, psipred):
                passed.append(tm)
                continue
            didnt_pass.append(tm)
            if try_to_pass(tm, seq, psipred):
                could_pass.append(tm)
                continue
            # NOTE(review): the source arrived with its indentation collapsed;
            # the per-position dump is assumed to belong to this failure
            # branch only. Confirm against the original file.
            span = range(tm[0], tm[1] + 1)
            mean_e, mean_c, mean_h = [
                np.mean([psipred[a][ss] for a in span])
                for ss in ('e', 'c', 'h')]
            print('%s %s %s %s %s'
                  % (entry['name'], tm, mean_e, mean_c, mean_h))
            for i in span:
                print('c: %f h: %f e: %f'
                      % (psipred[i]['c'], psipred[i]['h'], psipred[i]['e']))
    print('failed %i times' % len(didnt_pass))
    print('could have passed %i' % len(could_pass))
    print('succeeded %i times' % len(passed))
res = 0 for i in range(ind, ind+20): if ts[i].lower() == 'h': res += 1 return res / 20 >= 0.9 def is_not_helical(pos, psi): import numpy as np return False if (np.mean([psi[a]['e'] for a in range(pos[0], pos[1])]) <= 0.3 and np.mean([psi[a]['c'] for a in range(pos[0], pos[1])]) <= 0.48 and np.mean([psi[a]['h'] for a in range(pos[0], pos[1])]) >= 0.3) else True from psipred_vs_mm_nomm import parse_psipred, psipred_avg from TMpredict_WinGrade import parse_rostlab_db import matplotlib.pyplot as plt rost_db = parse_rostlab_db() pdb_name = 'p02722' psi = parse_psipred(pdb_name) avg_b = [] avg_c = [] avg_h = [] indices = [] tmh = [] passed = [] for i in range(len(psi)-20): avg_b.append(psipred_avg(range(i+1, i+21), psi, 'e')) avg_c.append(psipred_avg(range(i+1, i+21), psi, 'c')) avg_h.append(psipred_avg(range(i+1, i+21), psi, 'h')) tmh.append(-0.1 if is_tmh(i, rost_db[pdb_name]['pdbtm']) else None)
def prd_directory(dir_path):
    """
    Compare every .prd prediction in a directory with the observed PDBTM/OPM
    topologies, next to the predictors returned by spc_parser.

    Reads the module-level ``args``: ``args['num_prd']`` (minimal number of
    .prd files the directory must hold) and ``args['mode']``.

    :param dir_path: path to directory to analyse
    :return: if in ROC mode returns prediction results: the 'pred_ts' counts
             and the totals, both keyed by 'tm_1', 'tm_2_5' and 'tm_5' (two
             all-zero dicts when fewer than args['num_prd'] files are found).
             if in single mode, prints the statistics and shows a bar graph
             of the results
    """
    import re, os
    # from TMpredict_WinGrade import parse_rostlab_db
    from topcons_result_parser import topcons2rostlab_ts_format
    import matplotlib.pyplot as plt
    import matplotlib
    import numpy as np
    # M is passed as the third argument of every comparer call
    M = 10
    file_list = [x for x in os.listdir(dir_path) if re.match('.*\.prd', x) and '_msa' not in x]
    # Too few predictions in this directory: report zeros for every TM range
    if len(file_list) < args['num_prd']:
        return {'tm_1': 0, 'tm_2_5': 0, 'tm_5': 0}, {'tm_1': 0, 'tm_2_5': 0, 'tm_5': 0}
    rostlab_db_dict = parse_rostlab_db()
    # This list only seeds the keys of `results`; the name is rebound to a
    # dict for every file inside the loop below
    predictors = ['polyphobius', 'topcons', 'spoctopus', 'philius', 'octopus', 'scampi', 'pred_ts']
    results = {a: {'tm_1': 0, 'tm_2_5': 0, 'tm_5': 0} for a in predictors}
    totals = {'tm_1': 0, 'tm_2_5': 0, 'tm_5': 0}
    errors = {'over': 0, 'miss': 0, 'exact': 0, 'total': 0}
    we_got_wrong = []
    we_got_right = []
    topcons_got_right = []
    topgraph_c_term, topcons_c_term, c_term_total = 0, 0, 0
    for file_name in file_list:
        pred = prd_parser(dir_path, file_name)
        # Try the name as written first, then lower-cased
        # NOTE(review): the bare except also hides errors unrelated to the
        # name's case (and KeyboardInterrupt) - consider narrowing it
        try:
            obse = rostlab_db_dict[pred['name']]
            topc = spc_parser(pred['name'])
        except:
            obse = rostlab_db_dict[pred['name'].lower()]
            topc = spc_parser(pred['name'].lower())
        predictors = {k: topcons2rostlab_ts_format(v) for k, v in topc.items() if k not in ['name', 'seq']}
        predictors['pred_ts'] = pred['pred_ts']
        # True until the first predictor of this file has updated `totals`,
        # so each file is added to the totals at most once
        first_passage = True
        topgraph_c_term += 1 if test_c_term(obse['pdbtm'], obse['opm'], predictors['pred_ts']) else 0
        topcons_c_term += 1 if test_c_term(obse['pdbtm'], obse['opm'], predictors['topcons']) else 0
        c_term_total += 1
        for predictor in predictors:
            comp_pdbtm = comparer(obse['pdbtm'], predictors[predictor], M, predictors['topcons'], pred['seq'])
            comp_opm = comparer(obse['opm'], predictors[predictor], M, predictors['topcons'], pred['seq'])
            # A prediction counts as right when it agrees with either source
            overM = comp_pdbtm['overlapM_ok'] or comp_opm['overlapM_ok']
            if predictor == 'pred_ts' and overM:
                we_got_right.append(pred['name'])
            if predictor == 'pred_ts' and not overM:
                we_got_wrong.append(pred['name'])
                # Classify the miss by comparing observed and predicted TM counts
                if comp_pdbtm['obse_tm_num'] > comp_pdbtm['pred_tm_num']:
                    print 'MISS', pred['name'], comp_pdbtm['obse_tm_num']
                    errors['miss'] += 1
                elif comp_pdbtm['obse_tm_num'] < comp_pdbtm['pred_tm_num']:
                    print 'OVER', pred['name'], comp_pdbtm['obse_tm_num']
                    errors['over'] += 1
                else:
                    errors['exact'] += 1
                errors['total'] += 1
                print pred['name'], obse['pdb']
                print 'pred_ts', predictors['pred_ts']
                print 'AA seq ', pred['seq']
                print 'pdbtm ', obse['pdbtm']
            if predictor == 'topcons' and overM:
                topcons_got_right.append(pred['name'])
            # Entries without observed TMs in either source are not counted
            # (note that this also skips the first_passage reset below)
            if comp_pdbtm['obse_tm_num'] == 0 or comp_opm['obse_tm_num'] == 0:
                continue
            # NOTE(review): this block arrived with its indentation collapsed;
            # the if/elif/else pairing below is the most plausible reading -
            # confirm against the original file
            if comp_pdbtm['obse_tm_num'] != comp_opm['obse_tm_num']:
                # The sources disagree on the TM count: credit the bin of the
                # source the prediction agrees with
                if comp_pdbtm['overlapM_ok']:
                    results[predictor][tm_num2range(comp_pdbtm['obse_tm_num'])] += 1
                    if first_passage:
                        totals[tm_num2range(comp_pdbtm['obse_tm_num'])] += 1
                elif comp_opm['overlapM_ok']:
                    results[predictor][tm_num2range(comp_opm['obse_tm_num'])] += 1
                    if first_passage:
                        totals[tm_num2range(comp_opm['obse_tm_num'])] += 1
            else:
                results[predictor][tm_num2range(comp_opm['obse_tm_num'])] += 1 \
                    if (comp_pdbtm['overlapM_ok'] or comp_opm['overlapM_ok']) else 0
                if first_passage:
                    totals[tm_num2range(comp_pdbtm['obse_tm_num'])] += 1
            first_passage = False
    if args['mode'] == 'ROC':
        data = {k: v for k, v in results['pred_ts'].items()}
        return data, totals
    else:
        print 'these are the names we got right:', we_got_right
        print 'results', results
        print 'totals', totals
        print 'errors', errors
        # NOTE(review): the ratios below divide by sum(totals.values()),
        # c_term_total and totals[k]; none of these is guarded against zero
        print 'at total topgrph got right', float(sum(results['pred_ts'].values())) / float(sum(totals.values()))
        print 'at total topcons got right', float(sum(results['topcons'].values())) / float(sum(totals.values()))
        print 'topcons got c_term right', topcons_c_term, 100.*topcons_c_term/c_term_total
        print 'topgraph got c_term right', topgraph_c_term, 100.*topgraph_c_term/c_term_total
        print 'total c_term tested', c_term_total
        print 'ASSAF!!!! TOPCONS GOT THESE RIGHT:', topcons_got_right
        plt.figure()
        # Per predictor: percentage of correct entries in every TM range
        data = {}
        for predictor, results_d in results.items():
            data[predictor] = {k: 100*float(v)/float(totals[k]) for k, v in results_d.items()}
        print 'pps', results['polyphobius']
        # font = {'family': 'normal', 'size': 22}
        # matplotlib.rc('font', **font)
        print data
        print 'range', np.arange(0, 1./3., 1./(7.*3.)), len(np.arange(0, 1./3., 1./(7.*3.)))
        # Three groups of bars (one per TM range); within a group each of the
        # seven predictors gets its own offset, width and colour
        ind = np.arange(3)
        width = 1./3. * (1./7.)
        incs = np.arange(0, 1./3., 1./(7.*3.))
        colors = ['red', 'blue', 'green', 'black', 'orange', 'pink', 'grey']
        print ind
        plots = {}
        for predictor, details, inc, col in zip(data.keys(), data.values(), incs, colors):
            plots[predictor] = plt.bar(ind + inc, details.values(), width, color=col)
        plt.ylim((0, 105))
        plt.xlim((-0.15, 3.7))
        plt.xticks(np.arange(3)+0.15, ['1', '2-5', '5<'])
        plt.xlabel('Number of TMH')
        plt.ylabel('Overlap 10 Accuracy (%)')
        plt.title('TMH prediction comparison')
        names = [k for k in plots.keys()]
        # NOTE(review): this relabels whichever key happens to come first in
        # `plots`; with unordered dicts that is not guaranteed to be 'pred_ts'
        names[0] = 'TopoGraph'
        plt.legend(plots.values(), names, loc='upper right')
        plt.show()
def check_beta_average():
    '''
    main function here. Counts how many windows of the Rost data base fail
    pass_thresholds: every observed TM, every 20-step window inside a non-TM
    segment, and every 20-residue window of each sequence (split by whether
    it overlaps an observed TM).

    Sets the module globals IS_BETA_CUTOFF, IS_COIL_CUTOFF and
    IS_HELIX_CUTOFF before counting (presumably read by pass_thresholds -
    verify).

    :return: None. Prints the counters.
    '''
    from TMpredict_WinGrade import parse_rostlab_db

    global IS_BETA_CUTOFF, IS_COIL_CUTOFF, IS_HELIX_CUTOFF
    IS_BETA_CUTOFF = 0.3
    IS_COIL_CUTOFF = 0.48
    IS_HELIX_CUTOFF = 0.3

    tm_missed_h = 0
    non_ym_missed_h = 0
    how_many_non_tms = 0
    tot_passed_of_tmh = 0
    tot_NOT_passed_of_tmh = 0
    tot_passed_of_NON_tmh = 0
    tot_NOT_passed_of_NON_tmh = 0

    for name, entry in parse_rostlab_db().items():
        psipred = parse_psipred(name)
        tms = ts2hp_seq(entry['seq'], entry['pdbtm'])
        for tm in tms:
            if not pass_thresholds(psipred, tm[0], tm[1]):
                tm_missed_h += 1

        # How many non-TM windows would be cancelled by the thresholds.
        for non_tm in ts2non_tms(entry['seq'], entry['pdbtm']):
            # Start positions i whose end i + 20 still lies inside the
            # segment. Exact bounds replace the former "i + 20 in rng" test,
            # which scanned a list for every position on Python 2.
            for i in range(non_tm[0], non_tm[1] - 19):
                how_many_non_tms += 1
                if not pass_thresholds(psipred, i, i + 20):
                    non_ym_missed_h += 1

        # Every 20-residue window of the sequence, 1-based.
        for i in range(len(entry['seq']) - 20):
            overlaps_tm = do_range_overlap_ranges(range(i + 1, i + 21), tms)
            window_passes = pass_thresholds(psipred, i + 1, i + 21)
            if overlaps_tm:
                if window_passes:
                    tot_passed_of_tmh += 1
                else:
                    tot_NOT_passed_of_tmh += 1
            elif window_passes:
                tot_passed_of_NON_tmh += 1
            else:
                tot_NOT_passed_of_NON_tmh += 1

    print('TM totalt misses (helices) %s' % (tm_missed_h,))
    print('NON TM total misses (helices) %s' % (non_ym_missed_h,))
    print("overall %i non tms examined" % how_many_non_tms)
    print("\nTotal helices passed and are TMHs %i, Total helices not pass and are TMHs %i" % (tot_passed_of_tmh, tot_NOT_passed_of_tmh))
    print("Total helices passed and are not TMHs %i, Total helices not pass and not TMHs %i" % (tot_passed_of_NON_tmh, tot_NOT_passed_of_NON_tmh))
def check_beta_average():
    '''
    main function here. Counts how many windows of the Rost data base fail
    pass_thresholds: every observed TM, every 20-step window inside a non-TM
    segment, and every 20-residue window of each sequence (split by whether
    it overlaps an observed TM).

    Sets the module globals IS_BETA_CUTOFF, IS_COIL_CUTOFF and
    IS_HELIX_CUTOFF before counting (presumably read by pass_thresholds -
    verify).

    :return: None. Prints the counters.
    '''
    from TMpredict_WinGrade import parse_rostlab_db

    global IS_BETA_CUTOFF, IS_COIL_CUTOFF, IS_HELIX_CUTOFF
    IS_BETA_CUTOFF = 0.3
    IS_COIL_CUTOFF = 0.48
    IS_HELIX_CUTOFF = 0.3

    tm_missed_h = 0
    non_ym_missed_h = 0
    how_many_non_tms = 0
    tot_passed_of_tmh = 0
    tot_NOT_passed_of_tmh = 0
    tot_passed_of_NON_tmh = 0
    tot_NOT_passed_of_NON_tmh = 0

    for name, entry in parse_rostlab_db().items():
        psipred = parse_psipred(name)
        tms = ts2hp_seq(entry['seq'], entry['pdbtm'])
        for tm in tms:
            if not pass_thresholds(psipred, tm[0], tm[1]):
                tm_missed_h += 1

        # How many non-TM windows would be cancelled by the thresholds.
        for non_tm in ts2non_tms(entry['seq'], entry['pdbtm']):
            # Start positions i whose end i + 20 still lies inside the
            # segment. Exact bounds replace the former "i + 20 in rng" test,
            # which scanned a list for every position on Python 2.
            for i in range(non_tm[0], non_tm[1] - 19):
                how_many_non_tms += 1
                if not pass_thresholds(psipred, i, i + 20):
                    non_ym_missed_h += 1

        # Every 20-residue window of the sequence, 1-based.
        for i in range(len(entry['seq']) - 20):
            overlaps_tm = do_range_overlap_ranges(range(i + 1, i + 21), tms)
            window_passes = pass_thresholds(psipred, i + 1, i + 21)
            if overlaps_tm:
                if window_passes:
                    tot_passed_of_tmh += 1
                else:
                    tot_NOT_passed_of_tmh += 1
            elif window_passes:
                tot_passed_of_NON_tmh += 1
            else:
                tot_NOT_passed_of_NON_tmh += 1

    print('TM totalt misses (helices) %s' % (tm_missed_h,))
    print('NON TM total misses (helices) %s' % (non_ym_missed_h,))
    print("overall %i non tms examined" % how_many_non_tms)
    print("\nTotal helices passed and are TMHs %i, Total helices not pass and are TMHs %i" % (tot_passed_of_tmh, tot_NOT_passed_of_tmh))
    print("Total helices passed and are not TMHs %i, Total helices not pass and not TMHs %i" % (tot_passed_of_NON_tmh, tot_NOT_passed_of_NON_tmh))
def is_not_helical(pos, psi):
    """
    Tell whether a window fails the PsiPred helix criteria.

    :param pos: pair (start, end); positions start .. end - 1 are averaged
    :param psi: per-position PsiPred values, indexed by position and then by
                'e', 'c' or 'h'
    :return: False when mean 'e' <= 0.3 and mean 'c' <= 0.48 and
             mean 'h' >= 0.3, True otherwise
    """
    import numpy as np

    def mean_of(state):
        return np.mean([psi[a][state] for a in range(pos[0], pos[1])])

    # The checks short-circuit in this order: sheet, coil, helix.
    looks_helical = (mean_of('e') <= 0.3
                     and mean_of('c') <= 0.48
                     and mean_of('h') >= 0.3)
    return not looks_helical


from psipred_vs_mm_nomm import parse_psipred, psipred_avg
from TMpredict_WinGrade import parse_rostlab_db
import matplotlib.pyplot as plt

# Sliding 20-residue window averages of the PsiPred values of one protein.
rost_db = parse_rostlab_db()
pdb_name = 'p02722'
psi = parse_psipred(pdb_name)
avg_b, avg_c, avg_h = [], [], []
indices, tmh, passed = [], [], []
for i in range(len(psi) - 20):
    avg_b += [psipred_avg(range(i + 1, i + 21), psi, 'e')]
    avg_c += [psipred_avg(range(i + 1, i + 21), psi, 'c')]
    avg_h += [psipred_avg(range(i + 1, i + 21), psi, 'h')]
    # -0.1 where is_tmh accepts the window start, None elsewhere.
    if is_tmh(i, rost_db[pdb_name]['pdbtm']):
        tmh.append(-0.1)
    else:
        tmh.append(None)
def analyse():
    """
    Collect accessibility, grade and prediction status of observed TM windows.

    For every database entry (five names are skipped) the observed PDBTM
    windows that begin after the signal peptide are stored as dicts with the
    keys 'access', 'predicted' and 'grade'. The list is pickled to
    'pickled.obj'.

    :return: None
    """
    import pickle
    import os
    import sys
    import operator
    import random
    import matplotlib.pyplot as plt
    from TMpredict_WinGrade import parse_rostlab_db
    from WinGrade import topo_string_to_WGP
    from topo_strings_comparer import prd_parser, spc_parser

    skipped_names = ['p01730', 'p19054', 'e1c9k9', 'q9qug3', 'p07471']
    total_sasa_dict = parse_standard_data()
    rost_db = parse_rostlab_db()
    neighbors = 0
    single = 0
    without_neighbours = 0
    accessible_vs_ddg = []

    for k, v in rost_db.items():
        if v['name'] in skipped_names:
            continue

        is_single = os.path.isfile('single_chains/%s_1.pdb' % v['pdb'])
        is_neighbor = os.path.isfile(
            'with_neighbours/%s_%s_with_neighbors.pdb'
            % (v['pdb'], v['chain'].upper()))
        is_no_neighbour = os.path.isfile(
            'without_neighbours/%s_%s_without_neighbours.pdb'
            % (v['pdb'], v['chain'].upper()))
        if is_neighbor:
            neighbors += 1
        if is_no_neighbour:
            without_neighbours += 1
        if is_single:
            single += 1

        # Prediction set in use: "after cutting out spring, without MSA".
        prediction = prd_parser('/home/labs/fleishman/elazara/length_21/',
                                v['name'] + '.prd')
        wgp_pred = topo_string_to_WGP(prediction['best_path_ts'], v['seq'])

        spoc = spc_parser(v['name'])['spoctopus']
        signal = [0, spoc.count('s') + spoc.count('S')]

        # Prefer the single-chain accessibility file when it exists.
        if is_single:
            rsa_path = 'single_chains/%s_1.rsa' % v['pdb']
        else:
            rsa_path = ('with_neighbours/%s_%s_with_neighbors.rsa'
                        % (v['pdb'], v['chain'].upper()))
        naccess = parse_rsa(rsa_path)

        wgp_pdbtm = topo_string_to_WGP(v['pdbtm'], v['seq'])
        naccess_seq = ''.join(
            [a['type'] for a in naccess[v['chain']].values()])
        rost_aln, naccess_aln, score, beg, end = \
            pair_wise_aln_from_seqs(v['seq'], naccess_seq)

        for w in wgp_pdbtm.path:
            if w.begin <= signal[1]:
                # The window starts inside the signal-peptide stretch.
                print('signes %s %s %s' % (w.begin, w.end, signal))
                continue
            naccess_win = nacces_for_win(naccess, w, naccess_aln, rost_aln,
                                         total_sasa_dict, v['chain'])
            predicted = observed_found_in_prediction(w, wgp_pred)
            accessible_vs_ddg.append({'access': naccess_win,
                                      'predicted': predicted,
                                      'grade': w.grade})

    with open('pickled.obj', 'wb') as pkl:
        pickle.dump(accessible_vs_ddg, pkl)
    print('its pickled')
''' result = {} print file_name with open(file_name, 'r') as f: cont = f.read().split('\n') for item in cont: split = item.split() if len(split) > 1: result[split[0]] = split[1] return result if __name__ == '__main__': import argparse import os from TMpredict_WinGrade import parse_rostlab_db global args parser = argparse.ArgumentParser() parser.add_argument('-name', type=str) parser.add_argument('-path', default=os.getcwd(), type=str) parser.add_argument('-tech', default='opm', type=str) args = vars(parser.parse_args()) # pymol_mark_segments('4k1c', 'a', '222222222222222222222222222222222222222HHHHHHHHHHHHHHHHHHH1HHHHHHHHHHHHHHHHHHHHHH22222222222222HHHHHHHHHHHHHHHHHHHHH11HHHHHHHHHHHHHHHHHHHHHH22222222222222222HHHHHHHHHHHHHHHHHHHHHHHHH1111111111111111HHHHHHHHHHHHHHHHHHHHH2222222222222222222222222222HHHHHHHHHHHHHHHHHHHHHHHHH111111111111111111111111111111111111111111111111111111111111111111111111111111111111HHHHHHHHHHHHHHHHHH222222HHHHHHHHHHHHHHHHHH1111111111111', # 'u111111111111111111111111111111111hhhhhhhhhhhhhhhhhh222222222222hhhhhhhhhhhhhhhhhhhhhh111111hhhhhhhhhhhhhhhhhhhh2222222222hhhhhhhhhhhhhhhhhhhhhh111111111111111hhhhhhhhhhhhhhhhh2222222222222222222222222hhhhhhhhhhhhhhhhhh1111111111111111111111111111111hhhhhhhhhhhhhhhhhhh222222222222222222222hhhhhhhhhhhhhhhhhh1111111111hhhhhhhhhhhhhhhhhhhhhh2222222222222222hhhhhhhhhhhhhhhhh111hhhhhhhhhhhhhhhhhh22222222222222222', # 'MDATTPLLTVANSHPARNPKHTAWRAAVYDLQYILKASPLNFLLVFVPLGLIWGHFQLSHTLTFLFNFLAIIPLAAILANATEELADKAGNTIGGLLNATFGNAVELIVSIIALKKGQVRIVQASMLGSLLSNLLLVLGLCFIFGGYNRVQQTFNQTAAQTMSSLLAIACASLLIPAAFRATLPHGKEDHFIDGKILELSRGTSIVILIVYVLFLYFQLGSHHALFEQQEEETDEVMSTISRNPHHSLSVKSSLVILLGTTVIISFCADFLVGTIDNVVESTGLSKTFIGLIVIPIVGNAAEHVTSVLVAMKDKMDLALGVAIGSSLQVALFVTPFMVLVGWMIDVPMTLNFSTFETATLFIAVFLSNYLILDGESNWLEGVMSLAMYILIAMAFFYYPDEKTLDSIGNSL') # entry = TMpredict_reader('/home/labs/fleishman/jonathaw/membrane_prediction_DBs/ROC_6.4.2015/ROC_-3.0_18_0.2_2/p00423.prd') entry = TMpredict_reader(args['path']+'/'+args['name']+'.prd') 
# print entry rostlab_data = parse_rostlab_db()[args['name']] # print 'aaa', rostlab_data pymol_mark_segments(rostlab_data['pdb'], rostlab_data['chain'], entry['pred_ts'], rostlab_data[args['tech']], rostlab_data['seq'], args['tech'])
def main_rost():
    """
    Score every .prd prediction in the current directory against the Rost
    database and print the summary tables.

    Uses the module-level ``predictors`` collection for the reference
    predictors that are scored next to TopGraph.

    :return: None
    """
    prd_files = []
    for fname in os.listdir('./'):
        if '.prd' in fname and '_msa' not in fname:
            prd_files.append(fname)
    rost_db = parse_rostlab_db()
    new_old = rost_new_old()
    topgraph_none = []
    follow = 'q8dkp6'
    old_new_totals = {'new': 0, 'old': 0}
    results = {}

    for prd_file in prd_files:
        name = prd_file.split('.')[0].lower()
        best_wgp, sec_wgp = parse_prd(prd_file)
        if best_wgp is None:
            # No best path in the .prd; remember the name and move on.
            topgraph_none.append(name)
            continue

        topc = spc_parser(name)
        topcons_ts = topc['topcons']
        signal_peptide = topcons_ts.count('S') + topcons_ts.count('s')
        best_wgp_loc_list = wgp_to_loc_list(best_wgp, signal_peptide)
        sec_wgp_loc_list = wgp_to_loc_list(sec_wgp, signal_peptide)
        old_new_totals[new_old[name]] += 1

        is_followed = name == follow
        if is_followed:
            print('at %s found loc list %r' % (name, best_wgp_loc_list))

        entry = rost_db[name]
        best_tgr_qok, best_tgr_ovm = qok_pdbtm_opm(
            entry, best_wgp_loc_list, signal_peptide, verbose=is_followed)
        sec_tgr_qok, sec_tgr_ovm = qok_pdbtm_opm(
            entry, sec_wgp_loc_list, signal_peptide)

        summary = {'old_new': new_old[name]}
        summary['tm_num'] = len(
            pdbtm_opm_loc_list(entry['pdbtm'], signal_peptide))
        summary['topgraph'] = {'qok': best_tgr_qok, 'ovm': best_tgr_ovm}
        summary['best_or_sec'] = {'qok': best_tgr_qok or sec_tgr_qok,
                                  'ovm': best_tgr_ovm or sec_tgr_ovm}
        results[name] = summary

        for predictor in predictors:
            prd_qok, prd_ovm = qok_pdbtm_opm(
                entry, ts_loc_list(topc[predictor], signal_peptide),
                signal_peptide)
            summary[predictor] = {'qok': prd_qok, 'ovm': prd_ovm}

    # Results split by old/new entries.
    print_results_by_old_new(results, predictors, old_new_totals)
    # Results split by number of TMHs.
    print_results_by_tm_num(results)
    # Names TopGraph got wrong.
    print_names_topgraph_got_wrong(results)
    # Names TopGraph got wrong with both the best and second-best path.
    print_names_topgraph_got_wrong_best_and_sec(results)
    # Overall percentages.
    print_total_results(results)