def get_ppv_hbond(fasta_filename, c_filename, hbond_filename, factor=1.0,
                  min_score=-1.0, sep=' ', outfilename=''):
    """Evaluate predicted contacts against a hydrogen-bond reference map.

    The top ranked predicted contacts that are at least five residues
    apart in sequence are collected and passed to ``get_ppv_helper``
    together with a square reference map built from ``hbond_filename``.

    Parameters:
        fasta_filename: FASTA file; the first element of the first record
            returned by ``parse_fasta.read_fasta`` is used as the sequence.
        c_filename: predicted contact file read by ``parse_contacts.parse``;
            entries are used as ``(score, res_i, res_j)`` with 1-based
            residue numbers.
        hbond_filename: whitespace separated text file with one
            ``<res_i> <res_j> <value>`` entry per line (1-based residues).
            The negated value is stored symmetrically in the reference map.
        factor: when ``min_score`` is left at -1.0, selection stops once
            ``ref_len * factor`` contacts have been kept.
        min_score: selection stops as soon as a score below this value has
            been seen.
        sep: field separator handed to ``parse_contacts.parse``.
        outfilename: unused; kept for interface compatibility.

    Returns:
        The tuple ``(hbond_filename, PPV, TP, FP)``; the same values are
        printed to stdout.
    """
    # get sequence
    with open(fasta_filename, 'r') as fasta_file:
        seq = list(parse_fasta.read_fasta(fasta_file).values())[0][0]
    ref_len = len(seq)

    # get top "factor" * "ref_len" predicted contacts
    with open(c_filename, 'r') as contact_file:
        contacts = parse_contacts.parse(contact_file, sep)

    contacts_x = []
    contacts_y = []
    count = 0
    for contact in contacts:
        score = contact[0]
        # convert 1-based residue numbers to 0-based indices
        c_x = contact[1] - 1
        c_y = contact[2] - 1

        if abs(c_x - c_y) >= 5:
            contacts_x.append(c_x)
            contacts_y.append(c_y)
            count += 1

        # NOTE(review): the stop conditions are evaluated for every parsed
        # contact, after it was (possibly) kept, so the first contact that
        # falls below min_score is still part of the selection.
        if min_score == -1.0 and count >= ref_len * factor:
            break
        if score < min_score:
            break

    # build the symmetric reference map from the hydrogen-bond list
    ref_contact_map = np.zeros((ref_len, ref_len))
    with open(hbond_filename) as hbond_file:
        for line in hbond_file:
            fields = line.split()
            if not fields:
                # tolerate blank lines instead of failing on int('')
                continue
            i = int(fields[0]) - 1
            j = int(fields[1]) - 1
            val = float(fields[2])
            ref_contact_map[i, j] = -val
            ref_contact_map[j, i] = -val

    PPV, TP, FP = get_ppv_helper(contacts_x, contacts_y, ref_contact_map,
                                 ref_len, factor)

    print('%s %s %s %s' % (hbond_filename, PPV, TP, FP))
    return (hbond_filename, PPV, TP, FP)
def get_scores_from_contacts(c_filename, min_dist, factor_value, min_score, ref_len):
    """Return three parallel lists: ``contacts_x``, ``contacts_y``, ``scores``.

    The separator of ``c_filename`` is obtained from ``get_separator`` and
    the file is parsed with ``parse_contacts.parse``, whose entries are
    used as ``(score, res_a, res_b)`` with 1-based residue numbers.

    A contact is kept when its two residues are at least ``min_dist``
    positions apart in sequence.  Residue numbers are returned 0-based.

    Selection stops once ``ref_len * factor_value`` contacts have been
    kept (only while ``min_score`` equals -1.0), or as soon as an entry
    with a score below ``min_score`` has been processed.
    """
    separator = get_separator(c_filename)
    ranked = parse_contacts.parse(c_filename, separator, min_dist)

    contacts_x, contacts_y, scores = [], [], []
    kept = 0
    for entry in ranked:
        score = entry[0]
        # The file holds biological (1-based) positions; Python counts
        # from zero.
        res_a = entry[1] - 1
        res_b = entry[2] - 1

        seq_gap = abs(res_a - res_b)
        if not seq_gap < min_dist:
            contacts_x.append(res_a)
            contacts_y.append(res_b)
            scores.append(score)
            kept += 1

        # NOTE(review): both stop conditions are checked for every parsed
        # entry, after it was (possibly) kept.
        if min_score == -1.0 and kept >= ref_len * factor_value:
            break
        if score < min_score:
            break

    return contacts_x, contacts_y, scores
# MD optimisation: 50 iterations at temperature 300; the two actions are
# configured with a step of 10 (structure output and trace to trcfil).
md.optimize(atmsel, temperature=300, max_iterations=50,
            actions=[actions.write_structure(10, query_id + '.D9998%04d.pdb'),
                     actions.trace(10, trcfil)])
# Finish off with some more CG, and write stats every 5 steps
cg.optimize(atmsel, max_iterations=20, actions=[actions.trace(5, trcfil)])

mpdf = atmsel.energy()

mdl.write(file=query_id + '.D00000001.pdb')

# Read the predicted contacts and close the file again; the result is
# materialised inside the ``with`` block so the loop below never touches a
# closed handle.
with open(contact_filename, 'r') as contact_file:
    contacts = list(parse_contacts.parse(contact_file))

count = 0
seq_len = len(aln[query_id])
for (score, i, j) in contacts:
    # Gaussian distance restraint (mean 10.0, stdev 2) between the CA atoms
    # of the two residues of this contact.
    rsr.add(forms.gaussian(group=physical.xy_distance,
                           feature=features.distance(mdl.atoms['CA:%d' % i],
                                                     mdl.atoms['CA:%d' % j]),
                           mean=10.0, stdev=2))
    #rsr.add(MyFade(group=physical.xy_distance,
    #               feature=features.distance(mdl.atoms['CA:%d' % i],
    #                                         mdl.atoms['CA:%d' % j]),
    #               cutoff_lower=-100, cutoff_upper=100, fade_zone=92, well_depth=-150))
    # residues are numbered from 1 in the contact list, hence the "- 1"
    is_gly_a = aln[query_id].residues[i - 1].code == 'G'
    is_gly_b = aln[query_id].residues[j - 1].code == 'G'
def plot_map(fasta_filename, c_filename, factor, c2_filename='', ss_fname='',
             psipred_horiz_fname='', psipred_vert_fname='', pdb_filename='',
             is_heavy=False, chain='', sep=',', outfilename=''):
    """Plot a predicted contact map, optionally against a reference structure.

    Parameters:
        fasta_filename: FASTA file of the target; its sequence length is
            the reference length ``ref_len``.
        c_filename: predicted contact file.  The best ranked contacts with
            a sequence separation of at least 5 are plotted until
            ``ref_len * factor`` contacts have been collected.
        factor: multiplier of ``ref_len`` that limits the number of contacts.
        c2_filename: optional second contact file; it is drawn mirrored
            (x and y swapped) relative to the first map.
        ss_fname, psipred_horiz_fname, psipred_vert_fname: optional
            secondary structure files; the first one given (horizontal
            PSIPRED, vertical PSIPRED, then ``ss_fname``) is drawn on the
            diagonal ('H' and 'E' states only).
        pdb_filename: optional reference structure.  Its contacts are drawn
            in grey and predictions are coloured via ``get_tp_colors``.
        is_heavy: use ``get_heavy_contacts`` with a cutoff of 5 instead of
            ``get_cb_contacts`` with a cutoff of 8 for the reference map.
        chain: chain identifier handed to the ``parse_pdb`` functions.
        sep: field separator of the contact file(s).
        outfilename: output file.  ``.pdf`` and ``.png/.jpg/.jpeg`` names
            are used as given, any other name gets ``.pdf`` appended; when
            empty, ``<c_filename>_ContactMap.pdf`` is written.

    Raises:
        AssertionError: if the secondary structure length differs from the
            sequence length.
    """

    def _parse(parser, path, *args):
        # Run ``parser`` on an open handle of ``path`` and close it again.
        with open(path, 'r') as handle:
            return parser(handle, *args)

    def _top_contacts(path):
        # Collect the top ranked, sufficiently separated contacts of ``path``.
        xs, ys, vals = [], [], []
        for contact in _parse(parse_contacts.parse, path, sep):
            c_x = contact[1] - 1
            c_y = contact[2] - 1
            # Also checked here although the parser may filter as well.
            if abs(c_x - c_y) >= 5:
                xs.append(c_x)
                ys.append(c_y)
                vals.append(contact[0])
            if len(xs) >= ref_len * factor:
                break
        return xs, ys, vals

    acc = fasta_filename.split('.')[0][:4]

    # get sequence
    seq = list(_parse(parse_fasta.read_fasta, fasta_filename).values())[0][0]
    ref_len = len(seq)

    # get top "factor" * "ref_len" predicted contacts
    contacts_x, contacts_y, scores = _top_contacts(c_filename)

    # start plotting
    fig = plt.figure()
    ax = fig.add_subplot(111)

    # plot secondary structure on the diagonal if given
    if psipred_horiz_fname or psipred_vert_fname or ss_fname:
        if psipred_horiz_fname:
            ss = _parse(parse_psipred.horizontal, psipred_horiz_fname)
        elif psipred_vert_fname:
            ss = _parse(parse_psipred.vertical, psipred_vert_fname)
        else:
            ss = _parse(parse_ss.parse, ss_fname)

        assert len(ss) == ref_len

        for i, state in enumerate(ss):
            if state == 'H':
                plt.plot(i, i, 'o', c='#8B0043', mec="#8B0043", markersize=2)
            elif state == 'E':
                plt.plot(i, i, 'D', c='#0080AD', mec="#0080AD", markersize=2)

    # plot reference contacts in the background if given
    if pdb_filename:
        res_lst = _parse(parse_pdb.get_coordinates, pdb_filename, chain)
        cb_lst = _parse(parse_pdb.get_cb_coordinates, pdb_filename, chain)
        atom_seq = _parse(parse_pdb.get_atom_seq, pdb_filename, chain)

        align = pairwise2.align.globalms(atom_seq, seq, 2, -1, -0.5, -0.1)
        atom_seq_ali = align[-1][0]
        seq_ali = align[-1][1]

        # Re-index the structure onto the target sequence: gaps in the
        # structure become '-' placeholders, structure residues aligned to
        # a gap in the target sequence are dropped.
        j = 0
        gapped_res_lst = []
        gapped_cb_lst = []
        for atom_char, seq_char in zip(atom_seq_ali, seq_ali):
            if atom_char == '-':
                gapped_res_lst.append('-')
                gapped_cb_lst.append('-')
            elif seq_char == '-':
                j += 1
            else:
                gapped_res_lst.append(res_lst[j])
                gapped_cb_lst.append(cb_lst[j])
                j += 1

        if is_heavy:
            dist_mat = get_heavy_contacts(gapped_res_lst)
            ref_cutoff = 5
        else:
            dist_mat = get_cb_contacts(gapped_cb_lst)
            ref_cutoff = 8
        ref_contact_map = dist_mat < ref_cutoff
        ref_contacts = np.where(ref_contact_map)

        ref_contacts_x = ref_contacts[0]
        ref_contacts_y = ref_contacts[1]

        PPVs, TPs, FPs = get_ppvs(contacts_x, contacts_y, ref_contact_map,
                                  atom_seq_ali, ref_len, factor)
        tp_colors = get_tp_colors(contacts_x, contacts_y, ref_contact_map,
                                  atom_seq_ali)

        print('%s %s %s %s' % (pdb_filename, PPVs[-1], TPs[-1], FPs[-1]))

        ax.scatter(ref_contacts_x, ref_contacts_y, marker='o', c='#CCCCCC',
                   lw=0, edgecolor='#CCCCCC')

    if c2_filename:
        # plot predicted contacts from second contact map if given
        contacts2_x, contacts2_y, scores2 = _top_contacts(c2_filename)

        if pdb_filename:
            # use TP/FP color coding if reference contacts given
            PPVs2, TPs2, FPs2 = get_ppvs(contacts2_x, contacts2_y,
                                         ref_contact_map, atom_seq_ali,
                                         ref_len, factor)
            tp2_colors = get_tp_colors(contacts2_x, contacts2_y,
                                       ref_contact_map, atom_seq_ali)
            print('%s %s %s %s' % (pdb_filename, PPVs2[-1], TPs2[-1], FPs2[-1]))

            # The title needs three values (accession and both PPVs); the
            # previous two-part format supplied too few arguments and
            # raised TypeError.
            fig.suptitle('%s\nPPV (upper left) = %.2f | PPV (lower right) = %.2f'
                         % (acc, PPVs[-1], PPVs2[-1]))
            sc = ax.scatter(contacts2_y[::-1], contacts2_x[::-1], marker='o',
                            c=tp2_colors[::-1], s=6, alpha=0.75, linewidths=0.0)
            sc = ax.scatter(contacts_x[::-1], contacts_y[::-1], marker='o',
                            c=tp_colors[::-1], s=6, alpha=0.75, linewidths=0.0)
        else:
            sc = ax.scatter(contacts2_y[::-1], contacts2_x[::-1], marker='o',
                            c='#D70909', edgecolor='#D70909', s=4, linewidths=0.5)
            sc = ax.scatter(contacts_x[::-1], contacts_y[::-1], marker='o',
                            c='#004F9D', edgecolor='#004F9D', s=4, linewidths=0.5)
    else:
        # plot predicted contacts from first contact map on both triangles
        # if no second contact map given
        if pdb_filename:
            fig.suptitle('%s\nPPV = %.2f' % (acc, PPVs[-1]))
            sc = ax.scatter(contacts_x[::-1], contacts_y[::-1], marker='o',
                            c=tp_colors[::-1], s=6, alpha=0.75, linewidths=0.0)
            sc = ax.scatter(contacts_y[::-1], contacts_x[::-1], marker='o',
                            c=tp_colors[::-1], s=6, alpha=0.75, linewidths=0.0)
        else:
            sc = ax.scatter(contacts_x[::-1], contacts_y[::-1], marker='o',
                            c=scores[::-1], s=4, alpha=0.75, cmap=cm.jet,
                            linewidths=0.1)
            sc = ax.scatter(contacts_y[::-1], contacts_x[::-1], marker='o',
                            c=scores[::-1], s=4, alpha=0.75, cmap=cm.jet,
                            linewidths=0.1)
            # NOTE(review): colour bar placed in the score-coloured branch,
            # the only one that uses a colour map; confirm against the
            # original indentation.
            plt.colorbar(sc)

    plt.gca().set_xlim([0, ref_len])
    plt.gca().set_ylim([0, ref_len])

    if outfilename and outfilename.endswith(('.png', '.jpg', '.jpeg')):
        plt.savefig(outfilename)
    else:
        if not outfilename:
            pdf_name = '%s_ContactMap.pdf' % c_filename
        elif outfilename.endswith('.pdf'):
            pdf_name = outfilename
        else:
            pdf_name = '%s.pdf' % outfilename
        pp = PdfPages(pdf_name)
        pp.savefig(fig)
        pp.close()

    # NOTE(review): shown unconditionally after saving; the flattened
    # source does not reveal whether this call was meant only for the
    # "no outfilename" case.
    plt.show()
sys.path.append("/home/x_arnel/git/bioinfo-toolbox/")

from parsing import parse_fasta
from parsing import parse_contacts

# Fail with a usage message instead of a bare IndexError when arguments
# are missing.
if len(sys.argv) < 8:
    sys.stderr.write("Incorrect number of command line arguments.\n")
    sys.stderr.write("Usage: " + sys.argv[0] + " <sequence file> <contact file> <CASP target ID> <server> <output filename> <minsep> <minscore>\n\n")
    sys.exit(1)

sfile = sys.argv[1]
cfile = sys.argv[2]
target = sys.argv[3]
server = sys.argv[4]
ofilepath = sys.argv[5]
# kept as the raw command line strings
minsep = sys.argv[6]
minscore = sys.argv[7]

# first sequence of the first FASTA record
with open(sfile) as seq_file:
    seq = list(parse_fasta.read_fasta(seq_file).items())[0][1][0]
# min_dist=0: no sequence separation filter while parsing
with open(cfile) as contact_file:
    contacts = parse_contacts.parse(contact_file, min_dist=0)

print(len(contacts))
print(contacts[0])
print(seq)

# The output handle is left open: only the header is written here.
ofile = open(ofilepath, 'w')
# NOTE(review): no header is written for any other server name.
if server == "Pcons-net":
    ofile.write(
        "PFRMAT RR\nTARGET %s\nAUTHOR 5450-4562-0389\nMETHOD Pcons-net\nREMARK PconsC3\nMETHOD Improved contact predictions on\nMETHOD small protein families.\nMODEL 1\n"
        % target)
elif server == "PconsC2":
    ofile.write(
        "PFRMAT RR\nTARGET %s\nAUTHOR 4146-6019-9011\nMETHOD PconsC2\nREMARK PconsC2\nMETHOD Improved contact predictions using the\nMETHOD recognition of protein like contact\nMETHOD patterns.\nMODEL 1\n"
        % target)
def get_ppv(fasta_filename, c_filename, pdb_filename, factor=1.0,
            min_score=-1.0, chain='', sep=' ', outfilename='', name='',
            noalign=False, min_dist=5, print_dist=False):
    """Compute the PPV of predicted contacts against a PDB structure.

    Parameters:
        fasta_filename: FASTA file of the target sequence.
        c_filename: predicted contact file; entries are used as
            ``(score, res_i, res_j)`` with 1-based residue numbers.
        pdb_filename: reference structure.
        factor: while ``min_score`` is -1.0, selection stops once
            ``ref_len * factor`` contacts have been kept.
        min_score: selection stops as soon as a score below this value
            has been seen.
        chain: chain identifier handed to the ``parse_pdb`` functions.
        sep: field separator of the contact file.
        outfilename: forwarded to ``print_distances`` as ``outfile``.
        name: label used in the printed result line; when empty the FASTA
            and contact file names are printed instead.
        noalign: skip the sequence/structure alignment and index the
            C-beta list directly.
        min_dist: minimum sequence separation of a kept contact.
        print_dist: additionally call ``print_distances`` (alignment mode
            only).

    Returns:
        The tuple ``(pdb_filename, PPV, TP, FP)``.
    """

    def _parse(parser, path, *args, **kwargs):
        # Run ``parser`` on an open handle of ``path`` and close it again.
        with open(path, 'r') as handle:
            return parser(handle, *args, **kwargs)

    # get sequence
    seq = list(_parse(parse_fasta.read_fasta, fasta_filename).values())[0][0]
    ref_len = len(seq)

    # get top ranked predicted contacts
    contacts = _parse(parse_contacts.parse, c_filename, sep, min_dist=min_dist)

    contacts_x = []
    contacts_y = []
    scores = []
    count = 0
    for contact in contacts:
        score = contact[0]
        c_x = contact[1] - 1
        c_y = contact[2] - 1

        if abs(c_x - c_y) >= min_dist:
            contacts_x.append(c_x)
            contacts_y.append(c_y)
            scores.append(score)
            count += 1

        # NOTE(review): the stop conditions are evaluated for every parsed
        # contact, after it was (possibly) kept.
        if min_score == -1.0 and count >= ref_len * factor:
            break
        if score < min_score:
            break

    cb_lst = _parse(parse_pdb.get_cb_coordinates, pdb_filename, chain)
    cb_cutoff = 8

    if noalign:
        dist_mat = get_cb_contacts(cb_lst)
        ref_contact_map = dist_mat < cb_cutoff
        PPV, TP, FP = get_ppv_helper(contacts_x, contacts_y, ref_contact_map,
                                     ref_len, factor)
    else:
        atom_seq = _parse(parse_pdb.get_atom_seq, pdb_filename, chain)

        align = pairwise2.align.globalms(atom_seq, seq, 2, -1, -0.5, -0.1)
        atom_seq_ali = align[-1][0]
        seq_ali = align[-1][1]

        # Re-index the C-beta list onto the target sequence.  Positions
        # missing in the structure get a ['-'] placeholder and the index
        # -9999; structure residues aligned to a gap in the target
        # sequence are skipped.
        gapped_cb_lst = []
        ali_lst = []
        j = 0
        for atom_char, seq_char in zip(atom_seq_ali, seq_ali):
            if atom_char == '-':
                gapped_cb_lst.append(['-'])
                ali_lst.append(-9999)
            elif seq_char == '-':
                j += 1
            else:
                ali_lst.append(j)
                gapped_cb_lst.append(cb_lst[j])
                j += 1

        dist_mat = get_cb_contacts(gapped_cb_lst)

        if print_dist:
            # area and surface distance are only needed for this report,
            # so the structure is parsed for them on demand
            area = _parse(parse_pdb.get_area, pdb_filename, chain)
            surf = _parse(parse_pdb.get_dist_to_surface, pdb_filename, chain)
            print_distances(contacts_x, contacts_y, scores, dist_mat, area,
                            surf, ref_len, ref_len, seq, ali_lst=ali_lst,
                            atom_seq=atom_seq, outfile=outfilename)

        ref_contact_map = dist_mat < cb_cutoff
        PPV, TP, FP = get_ppv_helper(contacts_x, contacts_y, ref_contact_map,
                                     ref_len, factor,
                                     atom_seq_ali=atom_seq_ali)

    if name:
        print('%s %s %s %s' % (name, PPV, TP, FP))
    else:
        print('%s %s %s %s %s' % (fasta_filename, c_filename, PPV, TP, FP))
    return (pdb_filename, PPV, TP, FP)
def get_ppv(fasta_filenameA, c_filename, pdb_filenameA, fasta_filenameB,
            pdb_filenameB, factor=1.0, min_score=-1.0, chainA='', chainB='',
            sep=' ', outfilename='', name='', noalign=False, min_dist=5,
            interfacelen=10, print_dist=False, cutoff=0.25):
    """Evaluate predicted contacts for a two-chain complex.

    PPV and Z-score values are computed and printed for chain A, chain B,
    both chains together, the interface, and the exposed interface.

    Parameters:
        fasta_filenameA, fasta_filenameB: FASTA files of the two chains.
        c_filename: predicted contact file; entries are used as
            ``(score, res_i, res_j)`` with 1-based residue numbers.
        pdb_filenameA, pdb_filenameB: reference structures of the chains.
        factor: forwarded to ``get_interface_contacts`` and
            ``get_ppv_helper``.
        min_score: accepted for interface compatibility; the score cut-off
            is not applied in this variant.
        chainA, chainB: chain identifiers for the ``parse_pdb`` functions.
        sep: field separator of the contact file.
        outfilename: forwarded to ``print_distances`` as ``outfile``.
        name: label for the per-chain result lines; when empty the FASTA
            and contact file names are printed instead.
        noalign: not supported, see Raises.
        min_dist: minimum sequence separation of a kept contact.
        interfacelen: forwarded to ``get_ppv_helper`` (per chain) and to
            ``get_ppv_helper_interface``.
        print_dist: additionally call ``print_distances``.
        cutoff: forwarded to ``get_ppv_helper_interface``.

    Returns:
        The tuple ``(pdb_filenameA, PPV, TP, FP)`` for both chains together.

    Raises:
        NotImplementedError: if ``noalign`` is true.  The evaluation below
            needs the aligned structure sequences, so this mode previously
            always ended in an UnboundLocalError.
    """

    def _parse(parser, path, *args, **kwargs):
        # Run ``parser`` on an open handle of ``path`` and close it again.
        with open(path, 'r') as handle:
            return parser(handle, *args, **kwargs)

    def _gap_cb(atom_ali, target_ali, cb_coords):
        # Re-index C-beta coordinates onto the aligned target sequence.
        # Positions missing in the structure get a ['-'] placeholder and
        # the index -9999; structure residues aligned to a gap in the
        # target sequence are skipped.
        gapped, index_map = [], []
        j = 0
        for atom_char, target_char in zip(atom_ali, target_ali):
            if atom_char == '-':
                gapped.append(['-'])
                index_map.append(-9999)
            elif target_char == '-':
                j += 1
            else:
                index_map.append(j)
                gapped.append(cb_coords[j])
                j += 1
        return gapped, index_map

    # get sequence
    seqA = list(_parse(parse_fasta.read_fasta, fasta_filenameA).values())[0][0]
    seqB = list(_parse(parse_fasta.read_fasta, fasta_filenameB).values())[0][0]
    # Actually the contact map sequence is just two copies of seqA
    seq = seqA + seqA
    ref_lenA = len(seqA)
    ref_lenB = len(seqB)
    ref_len = len(seq)

    # get top ranked predicted contacts
    contacts = _parse(parse_contacts.parse, c_filename, sep, min_dist=min_dist)

    contacts_x = []
    contacts_y = []
    scores = []
    for contact in contacts:
        c_x = contact[1] - 1
        c_y = contact[2] - 1
        if abs(c_x - c_y) >= min_dist:
            contacts_x.append(c_x)
            contacts_y.append(c_y)
            scores.append(contact[0])

    # Chain A and chain B are both scored with the full list of parsed
    # contacts.  Independent copies are taken before the interface
    # contacts are added to the combined list further down.
    contactsA_x, contactsA_y, scoresA = (list(contacts_x), list(contacts_y),
                                         list(scores))
    contactsB_x, contactsB_y, scoresB = (list(contacts_x), list(contacts_y),
                                         list(scores))

    cb_lstA = _parse(parse_pdb.get_cb_coordinates, pdb_filenameA, chainA)
    cb_lstB = _parse(parse_pdb.get_cb_coordinates, pdb_filenameB, chainB)
    cb_lst = cb_lstA + cb_lstB
    bfactorA = _parse(parse_pdb.get_area, pdb_filenameA, chainA)
    bfactorB = _parse(parse_pdb.get_area, pdb_filenameB, chainB)
    bfactor = bfactorA + bfactorB
    surfA = _parse(parse_pdb.get_dist_to_surface, pdb_filenameA, chainA)
    surfB = _parse(parse_pdb.get_dist_to_surface, pdb_filenameB, chainB)
    surf = surfA + surfB

    if noalign:
        raise NotImplementedError(
            'noalign=True is not supported for the two-chain evaluation')

    atom_seqA = _parse(parse_pdb.get_atom_seq, pdb_filenameA, chainA)
    atom_seqB = _parse(parse_pdb.get_atom_seq, pdb_filenameB, chainB)
    atom_seq = atom_seqA + atom_seqB

    align = pairwise2.align.globalms(atom_seq, seq, 2, -1, -0.5, -0.1)
    alignA = pairwise2.align.globalms(atom_seqA, seqA, 2, -1, -0.5, -0.1)
    # Align to seq A
    alignB = pairwise2.align.globalms(atom_seqB, seqA, 2, -1, -0.5, -0.1)

    atom_seq_ali = align[-1][0]
    seq_ali = align[-1][1]
    atom_seq_aliA = alignA[-1][0]
    seq_aliA = alignA[-1][1]
    atom_seq_aliB = alignB[-1][0]
    seq_aliB = alignB[-1][1]

    gapped_cb_lst, ali_lst = _gap_cb(atom_seq_ali, seq_ali, cb_lst)
    gapped_cb_lstA, _ = _gap_cb(atom_seq_aliA, seq_aliA, cb_lstA)
    gapped_cb_lstB, _ = _gap_cb(atom_seq_aliB, seq_aliB, cb_lstB)

    dist_mat = get_cb_contacts(gapped_cb_lst)
    dist_matA = get_cb_contacts(gapped_cb_lstA)
    dist_matB = get_cb_contacts(gapped_cb_lstB)

    cb_cutoff = 8

    # This routine adds all interface and B chain contacts
    contacts_x, contacts_y, scores = get_interface_contacts(
        contacts_x, contacts_y, scores, dist_mat, ref_lenA, factor,
        cb_cutoff + 4, atom_seq_ali=atom_seq_ali)

    ref_contact_map = dist_mat < cb_cutoff
    ref_contact_mapA = dist_matA < cb_cutoff
    ref_contact_mapB = dist_matB < cb_cutoff

    if print_dist:
        print_distances(contacts_x, contacts_y, scores, dist_mat, bfactor,
                        surf, ref_lenA, ref_lenB, seq, ali_lst=ali_lst,
                        atom_seq=atom_seq, outfile=outfilename)

    Zscore = get_Zscore(contacts_x, contacts_y, ref_contact_map, scores,
                        atom_seq_ali=atom_seq_ali)
    ZscoreA = get_Zscore(contactsA_x, contactsA_y, ref_contact_mapA, scoresA,
                         atom_seq_ali=atom_seq_aliA)
    ZscoreB = get_Zscore(contactsB_x, contactsB_y, ref_contact_mapB, scoresB,
                         atom_seq_ali=atom_seq_aliB)
    ZscoreI = get_Zscore_interface(contacts_x, contacts_y, ref_contact_map,
                                   ref_lenA, ref_lenB, scores,
                                   atom_seq_ali=atom_seq_ali)

    PPV, TP, FP = get_ppv_helper(contacts_x, contacts_y, ref_contact_map,
                                 ref_len, factor, atom_seq_ali=atom_seq_ali)
    PPVa, TPa, FPa = get_ppv_helper(contactsA_x, contactsA_y,
                                    ref_contact_mapA, interfacelen, factor,
                                    atom_seq_ali=atom_seq_aliA)
    PPVb, TPb, FPb = get_ppv_helper(contactsB_x, contactsB_y,
                                    ref_contact_mapB, interfacelen, factor,
                                    atom_seq_ali=atom_seq_aliB)
    PPVi, TPi, FPi, PPViE, TPiE, FPiE = get_ppv_helper_interface(
        contacts_x, contacts_y, ref_contact_map, bfactor, ref_lenA, ref_lenB,
        interfacelen, cutoff, atom_seq_ali=atom_seq_ali)

    if name:
        print('%s %s %s %s %s' % (name, PPVa, TPa, FPa, ZscoreA))
        print('%s %s %s %s %s' % (name, PPVb, TPb, FPb, ZscoreB))
        print('%s %s %s %s %s' % ("BOTH", PPV, TP, FP, Zscore))
        print('%s %s %s %s %s' % ("Interface", PPVi, TPi, FPi, ZscoreI))
        print('%s %s %s %s' % ("Interface-Exposed", PPViE, TPiE, FPiE))
    else:
        print('%s %s %s %s %s %s' % (fasta_filenameA, c_filename, PPVa, TPa, FPa, ZscoreA))
        print('%s %s %s %s %s %s' % (fasta_filenameB, c_filename, PPVb, TPb, FPb, ZscoreB))
        print('%s %s %s %s %s %s' % ("BOTH", c_filename, PPV, TP, FP, Zscore))
        print('%s %s %s %s %s %s' % ("Interface", c_filename, PPVi, TPi, FPi, ZscoreI))
        print('%s %s %s %s %s' % ("Interface-Exposed", c_filename, PPViE, TPiE, FPiE))

    # NOTE(review): the two summary lines are printed in both cases; the
    # flattened source does not reveal whether they belonged to the
    # ``else`` branch only.
    print('PPV %s %s %s %s %s %s' % (c_filename, PPV, PPVa, PPVb, PPVi, PPViE))
    print('Zscore %s %s %s %s %s' % (c_filename, Zscore, ZscoreA, ZscoreB, ZscoreI))
    return (pdb_filenameA, PPV, TP, FP)
import sys
sys.path.append("/home/mircomic/toolbox")

from parsing import parse_contacts

# command line input
infile_name = sys.argv[1]

# guessing separator of constraint file from its first line:
# comma first, then a single space, otherwise tab
with open(infile_name, 'r') as infile:
    test_line = infile.readline()
if ',' in test_line:
    sep = ','
elif ' ' in test_line:
    sep = ' '
else:
    sep = '\t'

# parse constraint file with the guessed separator (it used to be guessed
# but never handed to the parser); the result is materialised before the
# file is closed
with open(infile_name, 'r') as infile:
    c_list = list(parse_contacts.parse(infile, sep))

# write one "<res_i> <res_j> <score>" line per contact, in parser order
for c in c_list:
    print('%s %s %s' % (c[1], c[2], c[0]))
def get_dist(fasta_filename, c_filename, pdb_filename, chain='', sep='',
             outfilename='', noalign=False, dist_type='CB'):
    """Look up the structural distance of every predicted contact.

    Parameters:
        fasta_filename: FASTA file of the target sequence.
        c_filename: predicted contact file, parsed with ``min_dist=5``;
            entries are used as ``(score, res_i, res_j)`` with 1-based
            residue numbers.
        pdb_filename: reference structure.
        chain: chain identifier handed to the ``parse_pdb`` functions.
        sep: field separator handed to ``parse_contacts.parse``.
        outfilename: when given, one ``<x> <y> <score> <distance>`` line
            per contact is written to this file.
        noalign: skip the sequence/structure alignment and use the
            structure coordinates as they are.
        dist_type: ``'CB'`` or ``'CA'`` select the corresponding atom
            coordinates; any other value uses ``get_dist_mat_heavy`` on
            the full residue coordinates.

    Returns:
        The tuple ``(contacts_x, contacts_y, scores, contacts_dist)`` with
        0-based residue indices.

    Raises:
        AssertionError: if ``get_dist_helper`` does not return one
            distance per contact.
    """

    def _parse(parser, path, *args, **kwargs):
        # Run ``parser`` on an open handle of ``path`` and close it again.
        with open(path, 'r') as handle:
            return parser(handle, *args, **kwargs)

    def _dist_matrix(res_coords, cb_coords, ca_coords):
        # Build the distance matrix for the requested atom type.
        if dist_type == 'CB':
            return get_dist_mat(cb_coords)
        if dist_type == 'CA':
            return get_dist_mat(ca_coords)
        return get_dist_mat_heavy(res_coords)

    # get sequence
    seq = list(_parse(parse_fasta.read_fasta, fasta_filename).values())[0][0]

    # get all predicted contacts returned by the parser
    contacts = _parse(parse_contacts.parse, c_filename, sep, min_dist=5)

    # convert 1-based residue numbers to 0-based indices
    contacts_x = [contact[1] - 1 for contact in contacts]
    contacts_y = [contact[2] - 1 for contact in contacts]
    scores = [contact[0] for contact in contacts]

    res_lst = _parse(parse_pdb.get_coordinates, pdb_filename, chain)
    cb_lst = _parse(parse_pdb.get_cb_coordinates, pdb_filename, chain)
    ca_lst = _parse(parse_pdb.get_ca_coordinates, pdb_filename, chain)

    if noalign:
        dist_mat = _dist_matrix(res_lst, cb_lst, ca_lst)
        contacts_dist = get_dist_helper(contacts_x, contacts_y, dist_mat)
    else:
        atom_seq = _parse(parse_pdb.get_atom_seq, pdb_filename, chain)

        align = pairwise2.align.globalms(atom_seq, seq, 2, -1, -0.5, -0.1)
        atom_seq_ali = align[-1][0]
        seq_ali = align[-1][1]

        # Re-index the structure onto the target sequence: gaps in the
        # structure become '-' placeholders, structure residues aligned to
        # a gap in the target sequence are dropped.
        j = 0
        gapped_res_lst = []
        gapped_cb_lst = []
        gapped_ca_lst = []
        for atom_char, seq_char in zip(atom_seq_ali, seq_ali):
            if atom_char == '-':
                gapped_res_lst.append('-')
                gapped_cb_lst.append('-')
                gapped_ca_lst.append('-')
            elif seq_char == '-':
                j += 1
            else:
                gapped_res_lst.append(res_lst[j])
                gapped_cb_lst.append(cb_lst[j])
                gapped_ca_lst.append(ca_lst[j])
                j += 1

        dist_mat = _dist_matrix(gapped_res_lst, gapped_cb_lst, gapped_ca_lst)
        contacts_dist = get_dist_helper(contacts_x, contacts_y, dist_mat,
                                        atom_seq_ali=atom_seq_ali)

    assert (len(contacts_dist) == len(contacts_x) == len(contacts_y)
            == len(scores))

    if outfilename:
        with open(outfilename, 'w') as outfile:
            for c_x, c_y, score, dist in zip(contacts_x, contacts_y, scores,
                                             contacts_dist):
                outfile.write('%s %s %f %f\n' % (c_x, c_y, score, dist))

    return (contacts_x, contacts_y, scores, contacts_dist)
from parsing import parse_fasta
from parsing import parse_contacts

if len(sys.argv) != 5:
    sys.stderr.write("Incorrect number of command line arguments.\n")
    sys.stderr.write("Usage: " + sys.argv[0] + " <sequence file> <contact file> <CASP target ID> <output filename>\n\n")
    # a usage error is a failure: report it with a non-zero exit status
    sys.exit(1)

sfile = sys.argv[1]
cfile = sys.argv[2]
target = sys.argv[3]

# first sequence of the first FASTA record
with open(sfile) as seq_file:
    seq = list(parse_fasta.read_fasta(seq_file).items())[0][1][0]
# min_dist=0: no sequence separation filter while parsing
with open(cfile) as contact_file:
    contacts = parse_contacts.parse(contact_file, min_dist=0)

print(len(contacts))
print(contacts[0])
print(seq)

# The output handle is left open: only the header and the sequence are
# written here.
ofile = open(sys.argv[4], "w")
ofile.write(
    "PFRMAT RR\nTARGET %s\nAUTHOR 6685-2065-9124\nMETHOD Pcons-net\nREMARK PconsC2\nMETHOD Improved contact predictions using the\nMETHOD recognition of protein like contact\nMETHOD patterns.\nMODEL 1\n"
    % target)

tmp_i = 1
for aa in seq:
    ofile.write(aa)
def plot_map(fasta_filename, c_filename, factor, c2_filename='',
             psipred_horiz_fname='', psipred_vert_fname='', pdb_filename='',
             is_heavy=False, chain='', sep=',', outfilename=''):
    """Plot a predicted contact map, optionally against a reference structure.

    Parameters:
        fasta_filename: FASTA file of the target; its sequence length is
            the reference length ``ref_len``.
        c_filename: predicted contact file.  The best ranked contacts with
            a sequence separation of at least 5 are plotted until
            ``ref_len * factor`` contacts have been collected.
        factor: multiplier of ``ref_len`` that limits the number of contacts.
        c2_filename: optional second contact file; it is drawn mirrored
            (x and y swapped) relative to the first map.
        psipred_horiz_fname, psipred_vert_fname: optional PSIPRED files;
            the horizontal one takes precedence.  'H' and 'E' states are
            drawn on the diagonal.
        pdb_filename: optional reference structure.  Its contacts are drawn
            in grey and predictions are coloured via ``get_tp_colors``.
        is_heavy: use ``get_heavy_contacts`` with a cutoff of 5 instead of
            ``get_cb_contacts`` with a cutoff of 8 for the reference map.
        chain: chain identifier handed to the ``parse_pdb`` functions.
        sep: field separator of the contact file(s).
        outfilename: output file.  ``.pdf`` and ``.png/.jpg/.jpeg`` names
            are used as given, any other name gets ``.pdf`` appended; when
            empty, ``<c_filename>_ContactMap.pdf`` is written.

    Raises:
        AssertionError: if the secondary structure length differs from the
            sequence length.
    """

    def _parse(parser, path, *args):
        # Run ``parser`` on an open handle of ``path`` and close it again.
        with open(path, 'r') as handle:
            return parser(handle, *args)

    def _top_contacts(path):
        # Collect the top ranked, sufficiently separated contacts of ``path``.
        xs, ys, vals = [], [], []
        for contact in _parse(parse_contacts.parse, path, sep):
            c_x = contact[1] - 1
            c_y = contact[2] - 1
            if abs(c_x - c_y) >= 5:
                xs.append(c_x)
                ys.append(c_y)
                vals.append(contact[0])
            if len(xs) >= ref_len * factor:
                break
        return xs, ys, vals

    acc = fasta_filename.split('.')[0][:4]

    ### get sequence
    seq = list(_parse(parse_fasta.read_fasta, fasta_filename).values())[0][0]
    ref_len = len(seq)

    ### get top "factor" * "ref_len" predicted contacts
    contacts_x, contacts_y, scores = _top_contacts(c_filename)

    ### start plotting
    fig = plt.figure()
    ax = fig.add_subplot(111)

    ### plot secondary structure on the diagonal if given
    if psipred_horiz_fname or psipred_vert_fname:
        if psipred_horiz_fname:
            ss = _parse(parse_psipred.horizontal, psipred_horiz_fname)
        else:
            ss = _parse(parse_psipred.vertical, psipred_vert_fname)

        assert len(ss) == ref_len

        for i, state in enumerate(ss):
            if state == 'H':
                plt.plot(i, i, 'o', c='#8B0043', mec="#8B0043", markersize=2)
            elif state == 'E':
                plt.plot(i, i, 'D', c='#0080AD', mec="#0080AD", markersize=2)

    ### plot reference contacts in the background if given
    if pdb_filename:
        res_lst = _parse(parse_pdb.get_coordinates, pdb_filename, chain)
        cb_lst = _parse(parse_pdb.get_cb_coordinates, pdb_filename, chain)
        atom_seq = _parse(parse_pdb.get_atom_seq, pdb_filename, chain)

        align = pairwise2.align.globalms(atom_seq, seq, 2, -1, -0.5, -0.1)
        atom_seq_ali = align[-1][0]
        seq_ali = align[-1][1]

        # Re-index the structure onto the target sequence: gaps in the
        # structure become '-' placeholders, structure residues aligned to
        # a gap in the target sequence are dropped.
        j = 0
        gapped_res_lst = []
        gapped_cb_lst = []
        for atom_char, seq_char in zip(atom_seq_ali, seq_ali):
            if atom_char == '-':
                gapped_res_lst.append('-')
                gapped_cb_lst.append('-')
            elif seq_char == '-':
                j += 1
            else:
                gapped_res_lst.append(res_lst[j])
                gapped_cb_lst.append(cb_lst[j])
                j += 1

        if is_heavy:
            dist_mat = get_heavy_contacts(gapped_res_lst)
            ref_cutoff = 5
        else:
            dist_mat = get_cb_contacts(gapped_cb_lst)
            ref_cutoff = 8
        ref_contact_map = dist_mat < ref_cutoff
        ref_contacts = np.where(ref_contact_map)

        ref_contacts_x = ref_contacts[0]
        ref_contacts_y = ref_contacts[1]

        PPVs, TPs, FPs = get_ppvs(contacts_x, contacts_y, ref_contact_map,
                                  atom_seq_ali, ref_len, factor)
        tp_colors = get_tp_colors(contacts_x, contacts_y, ref_contact_map,
                                  atom_seq_ali)

        print('%s %s %s %s' % (pdb_filename, PPVs[-1], TPs[-1], FPs[-1]))

        ax.scatter(ref_contacts_x, ref_contacts_y, marker='o', c='#CCCCCC',
                   lw=0, edgecolor='#CCCCCC')

    if c2_filename:
        ### plot predicted contacts from second contact map if given
        contacts2_x, contacts2_y, scores2 = _top_contacts(c2_filename)

        if pdb_filename:
            ### use TP/FP color coding if reference contacts given
            PPVs2, TPs2, FPs2 = get_ppvs(contacts2_x, contacts2_y,
                                         ref_contact_map, atom_seq_ali,
                                         ref_len, factor)
            tp2_colors = get_tp_colors(contacts2_x, contacts2_y,
                                       ref_contact_map, atom_seq_ali)
            print('%s %s %s %s' % (pdb_filename, PPVs2[-1], TPs2[-1], FPs2[-1]))

            fig.suptitle('%s\nPPV (upper left) = %.2f | PPV (lower right) = %.2f' % (acc, PPVs[-1], PPVs2[-1]))
            sc = ax.scatter(contacts2_y[::-1], contacts2_x[::-1], marker='o',
                            c=tp2_colors[::-1], s=6, alpha=0.75, linewidths=0.0)
            sc = ax.scatter(contacts_x[::-1], contacts_y[::-1], marker='o',
                            c=tp_colors[::-1], s=6, alpha=0.75, linewidths=0.0)
        else:
            sc = ax.scatter(contacts2_y[::-1], contacts2_x[::-1], marker='o',
                            c='#D70909', edgecolor='#D70909', s=4, linewidths=0.5)
            sc = ax.scatter(contacts_x[::-1], contacts_y[::-1], marker='o',
                            c='#004F9D', edgecolor='#004F9D', s=4, linewidths=0.5)
    else:
        ### plot predicted contacts from first contact map on both triangles
        ### if no second contact map given
        if pdb_filename:
            fig.suptitle('%s\nPPV = %.2f' % (acc, PPVs[-1]))
            sc = ax.scatter(contacts_x[::-1], contacts_y[::-1], marker='o',
                            c=tp_colors[::-1], s=6, alpha=0.75, linewidths=0.0)
            sc = ax.scatter(contacts_y[::-1], contacts_x[::-1], marker='o',
                            c=tp_colors[::-1], s=6, alpha=0.75, linewidths=0.0)
        else:
            sc = ax.scatter(contacts_x[::-1], contacts_y[::-1], marker='o',
                            c=scores[::-1], s=4, alpha=0.75, cmap=cm.jet,
                            linewidths=0.1)
            sc = ax.scatter(contacts_y[::-1], contacts_x[::-1], marker='o',
                            c=scores[::-1], s=4, alpha=0.75, cmap=cm.jet,
                            linewidths=0.1)
            # NOTE(review): colour bar placed in the score-coloured branch,
            # the only one that uses a colour map; confirm against the
            # original indentation.
            plt.colorbar(sc)

    plt.gca().set_xlim([0, ref_len])
    plt.gca().set_ylim([0, ref_len])

    if outfilename and outfilename.endswith(('.png', '.jpg', '.jpeg')):
        plt.savefig(outfilename)
    else:
        if not outfilename:
            pdf_name = '%s_ContactMap.pdf' % c_filename
        elif outfilename.endswith('.pdf'):
            pdf_name = outfilename
        else:
            pdf_name = '%s.pdf' % outfilename
        pp = PdfPages(pdf_name)
        pp.savefig(fig)
        pp.close()
def plot_map(fasta_filename, c_filename, factor, c2_filename='', psipred_filename='', pdb_filename='', is_heavy=False, chain='', sep='', sep2='', outfilename=''): acc = c_filename.split('.')[0] ### get sequence #seq = parse_fasta.read_fasta(open(fasta_filename, 'r')).values()[0][0] #ref_len = len(seq) ### get id f = open(fasta_filename,"rU") seqs = SeqIO.parse(f,"fasta") # we assume there is only one record for record in seqs: seq = str(record.seq) protein_id = record.id ref_len = len(seq) # guessing separator of constraint file if sep == '': line = open(c_filename,'r').readline() if len(line.split(',')) != 1: sep = ',' elif len(line.split(' ')) != 1: sep = ' ' else: sep = '\t' ### get top "factor" * "ref_len" predicted contacts contacts = parse_contacts.parse(open(c_filename, 'r'), sep) contacts_x = [] contacts_y = [] scores = [] contact_dict = {} count = 0 for i in range(len(contacts)): score = contacts[i][0] c_x = contacts[i][1] - 1 c_y = contacts[i][2] - 1 pos_diff = abs(c_x - c_y) too_close = pos_diff < 5 if not too_close: contacts_x.append(c_x) contacts_y.append(c_y) scores.append(score) count += 1 if count >= ref_len * factor: break ### start plotting fig = plt.figure() plt.title('Contact map for ' + protein_id) ax = fig.add_subplot(111) ### plot secondary structure on the diagonal if given if psipred_filename: ss = parse_psipred.horizontal(open(psipred_filename, 'r')) for i in range(len(ss)): if ss[i] == 'H': plt.plot(i, i, 'o', c='#8B0043', mec="#8B0043", markersize=2) if ss[i] == 'E': plt.plot(i, i, 'D', c='#0080AD', mec="#0080AD", markersize=2) if ss[i] == 'C': continue ### plot reference contacts in the background if given if pdb_filename: res_lst = parse_pdb.get_coordinates(open(pdb_filename, 'r'), chain) cb_lst = parse_pdb.get_cb_coordinates(open(pdb_filename, 'r'), chain) atom_seq = parse_pdb.get_atom_seq(open(pdb_filename, 'r'), chain) align = pairwise2.align.globalms(atom_seq, seq, 2, -1, -0.5, -0.1) if (len(res_lst)==0) or (len(cb_lst)==0): print 
"Could not parse the PDB file, res_list or cb_list is empty" return try: atom_seq_ali = align[-1][0] seq_ali = align[-1][1] except Exception,ex: print "Could not parse the PDB file:", ex return j = 0 gapped_res_lst = [] gapped_cb_lst = [] for i in xrange(len(atom_seq_ali)): if atom_seq_ali[i] == '-': gapped_res_lst.append('-') gapped_cb_lst.append('-') elif seq_ali[i] == '-': j += 1 continue else: gapped_res_lst.append(res_lst[j]) gapped_cb_lst.append(cb_lst[j]) j += 1 if is_heavy: dist_mat = get_heavy_contacts(gapped_res_lst) heavy_cutoff = 5 ref_contact_map = dist_mat < heavy_cutoff ref_contacts = np.where(dist_mat < heavy_cutoff) else: dist_mat = get_cb_contacts(gapped_cb_lst) cb_cutoff = 8 ref_contact_map = dist_mat < cb_cutoff ref_contacts = np.where(dist_mat < cb_cutoff) ref_contacts_x = ref_contacts[0] ref_contacts_y = ref_contacts[1] PPVs = get_ppvs(contacts_x, contacts_y, ref_contact_map, atom_seq_ali, ref_len, factor) tp_colors = get_tp_colors(contacts_x, contacts_y, ref_contact_map, atom_seq_ali) print '%s\t%s' % (acc, PPVs[-1]) ax.scatter(ref_contacts_x, ref_contacts_y, marker='o', c='#CCCCCC', lw=0, edgecolor='#CCCCCC')
### plot predicted contacts from second contact map if given if c2_filename: # guessing separator of constraint file if sep2 == '': line = open(c_filename,'r').readline() if len(line.split(',')) != 1: sep2 = ',' elif len(line.split(' ')) != 1: sep2 = ' ' else: sep2 = '\t' contacts2 = parse_contacts.parse(open(c2_filename, 'r'), sep2) contacts2_x = [] contacts2_y = [] scores2 = [] contact_dict2 = {} count = 0 for i in range(len(contacts2)): score = contacts2[i][0] c_x = contacts2[i][1] - 1 c_y = contacts2[i][2] - 1 pos_diff = abs(c_x - c_y) too_close = pos_diff < 5