def get_ppv_hbond(fasta_filename, c_filename, hbond_filename, factor=1.0,
                  min_score=-1.0, sep=' ', outfilename=''):
    """Compute PPV/TP/FP of predicted contacts against a hydrogen-bond list.

    Top-ranked predicted contacts with a sequence separation of at least 5
    are collected and handed, together with a reference map built from
    ``hbond_filename``, to ``get_ppv_helper``.

    Args:
        fasta_filename: FASTA file; the first sequence of the first record
            defines the reference length.
        c_filename: Predicted-contact file read by ``parse_contacts.parse``.
        hbond_filename: Text file with one space-separated ``i j value``
            triple per line (1-based residue indices). Blank lines are
            ignored.
        factor: With the default ``min_score`` of -1.0, selection stops once
            ``factor * len(sequence)`` contacts have been collected.
        min_score: Selection stops at the first contact scoring below this.
        sep: Column separator passed to ``parse_contacts.parse``.
        outfilename: Unused; kept for interface compatibility.

    Returns:
        Tuple ``(hbond_filename, PPV, TP, FP)`` with the values returned by
        ``get_ppv_helper``; the same four values are printed to stdout.
    """
    # list(...) keeps this working on Python 3, where dict.values() is a
    # view that cannot be indexed.
    with open(fasta_filename, 'r') as fasta_file:
        seq = list(parse_fasta.read_fasta(fasta_file).values())[0][0]
    ref_len = len(seq)

    with open(c_filename, 'r') as contact_file:
        contacts = parse_contacts.parse(contact_file, sep)

    contacts_x = []
    contacts_y = []
    count = 0
    for contact in contacts:
        score = contact[0]
        # Contact files are 1-based; the reference map is 0-based.
        c_x = contact[1] - 1
        c_y = contact[2] - 1
        if abs(c_x - c_y) >= 5:
            contacts_x.append(c_x)
            contacts_y.append(c_y)
            count += 1
        # NOTE(review): both stop conditions are evaluated for every
        # contact, including the ones that were too close to be kept.
        if min_score == -1.0 and count >= ref_len * factor:
            break
        if score < min_score:
            break

    # Symmetric reference map holding the negated value of each listed pair.
    ref_contact_map = np.zeros((ref_len, ref_len))
    with open(hbond_filename) as hbond_file:
        for line in hbond_file:
            fields = line.strip().split(' ')
            if fields == ['']:
                continue  # tolerate blank lines instead of failing in int()
            i = int(fields[0]) - 1
            j = int(fields[1]) - 1
            val = float(fields[2])
            ref_contact_map[i, j] = -val
            ref_contact_map[j, i] = -val

    PPV, TP, FP = get_ppv_helper(contacts_x, contacts_y, ref_contact_map,
                                 ref_len, factor)
    print('%s %s %s %s' % (hbond_filename, PPV, TP, FP))
    return (hbond_filename, PPV, TP, FP)
def realign(fasta_filename, pdb_filename, outfilename='', chain='*'):
    """Renumber the ATOM records of a PDB file to match a FASTA sequence.

    The residue sequence read from the PDB file is globally aligned to the
    first sequence in ``fasta_filename``. Every ATOM record of the selected
    chain is then given the position of its residue in the FASTA sequence
    and passed to ``parse_pdb.write_pdb_atm_record``. Residues that align
    to a gap in the FASTA sequence are dropped.

    Args:
        fasta_filename: FASTA file with the target sequence.
        pdb_filename: PDB file to renumber.
        outfilename: Unused; kept for interface compatibility.
        chain: Chain identifier; ``'*'`` accepts every chain. An empty value
            selects the chain returned by ``get_first_chain``.

    Returns:
        None.
    """
    with open(fasta_filename, 'r') as fasta_file:
        seq = list(parse_fasta.read_fasta(fasta_file).values())[0][0]
    with open(pdb_filename, 'r') as pdb_file:
        atom_seq = parse_pdb.get_atom_seq(pdb_file, chain)

    align = pairwise2.align.globalms(atom_seq, seq, 2, -1, -0.5, -0.1)
    atom_seq_ali = align[-1][0]
    seq_ali = align[-1][1]

    # Map the running index of each PDB residue (1-based) to its position
    # in the FASTA sequence (1-based); -9999 marks residues without partner.
    resno = {}
    atompos = 0
    seqpos = 0
    for atom_char, seq_char in zip(atom_seq_ali, seq_ali):
        if atom_char == '-':
            seqpos += 1
        elif seq_char == '-':
            atompos += 1
            resno[atompos] = -9999
        else:
            atompos += 1
            seqpos += 1
            resno[atompos] = seqpos

    with open(pdb_filename, 'r') as pdbfile:
        if not chain:
            chain = get_first_chain(pdbfile)
            pdbfile.seek(0)

        previous_res_no = -9999
        residue_index = 0
        for line in pdbfile:
            if not line.startswith('ATOM'):
                continue
            atm_record = parse_pdb.parse_atm_record(line)
            record_chain = atm_record['chain']
            if record_chain != ' ' and record_chain != chain and chain != '*':
                continue
            # A change of residue number marks the start of the next residue.
            if atm_record['res_no'] != previous_res_no:
                residue_index += 1
                previous_res_no = atm_record['res_no']
            atm_record['res_no'] = resno[residue_index]
            if resno[residue_index] > 0:
                parse_pdb.write_pdb_atm_record(atm_record)
    return
def _read_pdb(parser, pdb_filename, chain):
    """Run a ``parse_pdb`` reader on ``pdb_filename`` and close the file."""
    with open(pdb_filename, 'r') as pdb_file:
        return parser(pdb_file, chain)


def _top_contacts_for_plot(contacts, limit, min_sep=5):
    """Collect leading contacts whose residues are at least ``min_sep`` apart.

    Returns three parallel lists ``(xs, ys, scores)`` with 0-based indices.
    Collection stops as soon as ``limit`` contacts have been gathered.
    """
    xs, ys, scores = [], [], []
    for contact in contacts:
        c_x = contact[1] - 1
        c_y = contact[2] - 1
        if abs(c_x - c_y) >= min_sep:
            xs.append(c_x)
            ys.append(c_y)
            scores.append(contact[0])
        # NOTE(review): checked for every contact, also the skipped ones.
        if len(xs) >= limit:
            break
    return xs, ys, scores


def plot_map(fasta_filename, c_filename, factor, c2_filename='', ss_fname='',
             psipred_horiz_fname='', psipred_vert_fname='', pdb_filename='',
             is_heavy=False, chain='', sep=',', outfilename=''):
    """Plot a predicted contact map, optionally against a reference structure.

    Args:
        fasta_filename: FASTA file; its first sequence sets the axis length
            and the first four characters of the file name title the plot.
        c_filename: Predicted-contact file.
        factor: ``factor * len(sequence)`` top-ranked contacts are plotted.
        c2_filename: Optional second contact file, drawn in the other
            triangle of the map.
        ss_fname: Secondary-structure file read by ``parse_ss.parse``.
        psipred_horiz_fname: PSIPRED horizontal file (takes precedence).
        psipred_vert_fname: PSIPRED vertical file.
        pdb_filename: Optional reference structure; enables TP/FP colouring
            and printing of PPV/TP/FP.
        is_heavy: Use heavy-atom distances (cutoff 5) instead of CB
            distances (cutoff 8) for the reference contacts.
        chain: Chain identifier passed to the ``parse_pdb`` readers.
        sep: Column separator of the contact files.
        outfilename: Output file. ``.pdf`` is written through ``PdfPages``,
            ``.png``/``.jpg``/``.jpeg`` through ``plt.savefig``; any other
            name gets ``.pdf`` appended. When empty,
            ``<c_filename>_ContactMap.pdf`` is written.

    Returns:
        None.
    """
    acc = fasta_filename.split('.')[0][:4]

    with open(fasta_filename, 'r') as fasta_file:
        seq = list(parse_fasta.read_fasta(fasta_file).values())[0][0]
    ref_len = len(seq)

    with open(c_filename, 'r') as contact_file:
        contacts = parse_contacts.parse(contact_file, sep)
    contacts_x, contacts_y, scores = _top_contacts_for_plot(
        contacts, ref_len * factor)

    fig = plt.figure()
    ax = fig.add_subplot(111)

    # Secondary structure on the diagonal, if any source was given.
    if psipred_horiz_fname or psipred_vert_fname or ss_fname:
        if psipred_horiz_fname:
            with open(psipred_horiz_fname, 'r') as ss_file:
                ss = parse_psipred.horizontal(ss_file)
        elif psipred_vert_fname:
            with open(psipred_vert_fname, 'r') as ss_file:
                ss = parse_psipred.vertical(ss_file)
        else:
            with open(ss_fname, 'r') as ss_file:
                ss = parse_ss.parse(ss_file)

        assert len(ss) == ref_len
        for i, state in enumerate(ss):
            if state == 'H':
                plt.plot(i, i, 'o', c='#8B0043', mec="#8B0043", markersize=2)
            elif state == 'E':
                plt.plot(i, i, 'D', c='#0080AD', mec="#0080AD", markersize=2)

    # Reference contacts in the background, if a structure was given.
    if pdb_filename:
        res_lst = _read_pdb(parse_pdb.get_coordinates, pdb_filename, chain)
        cb_lst = _read_pdb(parse_pdb.get_cb_coordinates, pdb_filename, chain)
        atom_seq = _read_pdb(parse_pdb.get_atom_seq, pdb_filename, chain)

        align = pairwise2.align.globalms(atom_seq, seq, 2, -1, -0.5, -0.1)
        atom_seq_ali = align[-1][0]
        seq_ali = align[-1][1]

        # Re-index the coordinates by FASTA position: positions missing in
        # the structure become '-', residues missing in the FASTA sequence
        # are dropped.
        gapped_res_lst = []
        gapped_cb_lst = []
        j = 0
        for atom_char, seq_char in zip(atom_seq_ali, seq_ali):
            if atom_char == '-':
                gapped_res_lst.append('-')
                gapped_cb_lst.append('-')
            elif seq_char == '-':
                j += 1
            else:
                gapped_res_lst.append(res_lst[j])
                gapped_cb_lst.append(cb_lst[j])
                j += 1

        if is_heavy:
            dist_mat = get_heavy_contacts(gapped_res_lst)
            ref_contact_map = dist_mat < 5
        else:
            dist_mat = get_cb_contacts(gapped_cb_lst)
            ref_contact_map = dist_mat < 8
        ref_contacts = np.where(ref_contact_map)
        ref_contacts_x = ref_contacts[0]
        ref_contacts_y = ref_contacts[1]

        PPVs, TPs, FPs = get_ppvs(contacts_x, contacts_y, ref_contact_map,
                                  atom_seq_ali, ref_len, factor)
        tp_colors = get_tp_colors(contacts_x, contacts_y, ref_contact_map,
                                  atom_seq_ali)
        print('%s %s %s %s' % (pdb_filename, PPVs[-1], TPs[-1], FPs[-1]))

        ax.scatter(ref_contacts_x, ref_contacts_y, marker='o', c='#CCCCCC',
                   lw=0, edgecolor='#CCCCCC')

    if c2_filename:
        # Second contact map: drawn mirrored, first map in the other triangle.
        with open(c2_filename, 'r') as contact_file:
            contacts2 = parse_contacts.parse(contact_file, sep)
        contacts2_x, contacts2_y, _ = _top_contacts_for_plot(
            contacts2, ref_len * factor)

        if pdb_filename:
            PPVs2, TPs2, FPs2 = get_ppvs(contacts2_x, contacts2_y,
                                         ref_contact_map, atom_seq_ali,
                                         ref_len, factor)
            tp2_colors = get_tp_colors(contacts2_x, contacts2_y,
                                       ref_contact_map, atom_seq_ali)
            print('%s %s %s %s' % (pdb_filename, PPVs2[-1], TPs2[-1],
                                   FPs2[-1]))
            # The title has three placeholders and needs all three values.
            fig.suptitle(
                '%s\nPPV (upper left) = %.2f | PPV (lower right) = %.2f'
                % (acc, PPVs[-1], PPVs2[-1]))
            ax.scatter(contacts2_y[::-1], contacts2_x[::-1], marker='o',
                       c=tp2_colors[::-1], s=6, alpha=0.75, linewidths=0.0)
            ax.scatter(contacts_x[::-1], contacts_y[::-1], marker='o',
                       c=tp_colors[::-1], s=6, alpha=0.75, linewidths=0.0)
        else:
            ax.scatter(contacts2_y[::-1], contacts2_x[::-1], marker='o',
                       c='#D70909', edgecolor='#D70909', s=4, linewidths=0.5)
            ax.scatter(contacts_x[::-1], contacts_y[::-1], marker='o',
                       c='#004F9D', edgecolor='#004F9D', s=4, linewidths=0.5)
    elif pdb_filename:
        # Single map with a reference: TP/FP colours on both triangles.
        fig.suptitle('%s\nPPV = %.2f' % (acc, PPVs[-1]))
        ax.scatter(contacts_x[::-1], contacts_y[::-1], marker='o',
                   c=tp_colors[::-1], s=6, alpha=0.75, linewidths=0.0)
        ax.scatter(contacts_y[::-1], contacts_x[::-1], marker='o',
                   c=tp_colors[::-1], s=6, alpha=0.75, linewidths=0.0)
    else:
        # Single map without reference: colour by prediction score.
        ax.scatter(contacts_x[::-1], contacts_y[::-1], marker='o',
                   c=scores[::-1], s=4, alpha=0.75, cmap=cm.jet,
                   linewidths=0.1)
        sc = ax.scatter(contacts_y[::-1], contacts_x[::-1], marker='o',
                        c=scores[::-1], s=4, alpha=0.75, cmap=cm.jet,
                        linewidths=0.1)
        plt.colorbar(sc)

    plt.gca().set_xlim([0, ref_len])
    plt.gca().set_ylim([0, ref_len])

    if outfilename.endswith(('.png', '.jpg', '.jpeg')):
        plt.savefig(outfilename)
    else:
        if not outfilename:
            pdf_name = '%s_ContactMap.pdf' % c_filename
        elif outfilename.endswith('.pdf'):
            pdf_name = outfilename
        else:
            pdf_name = '%s.pdf' % outfilename
        pp = PdfPages(pdf_name)
        try:
            pp.savefig(fig)
        finally:
            pp.close()
        if not outfilename:
            # NOTE(review): the figure is shown only on the default-output
            # path; confirm this is the intended placement of plt.show().
            plt.show()
def get_ppv(
    fasta_filename,
    contact_filename,
    pdb_filename,
    factor_value,
    cb_cutoff,
    min_score,
    chain1,
    chain2,
    outfilename,
    name,
    noalign,
    min_dist,
    print_dist
):
    """
    Return the tuple (pdb_filename, PPV, TP, FP) for a set of predicted
    contacts.

    Three modes are handled:

    * ``noalign``: the CB coordinates of ``chain1`` are used as they are.
    * PPI (``chain2`` is non-empty and differs from ``chain1``): the
      reference map holds the CB distances between ``chain1`` and ``chain2``.
    * monomer: the reference map holds the CB distances within ``chain1``.

    In every mode a residue pair counts as a reference contact when its
    distance is below ``cb_cutoff``. PPV, TP and FP are whatever
    ``get_ppv_helper`` returns.
    """
    print("In get_ppv function")

    # read_fasta yields a dictionary keyed by header; the first sequence of
    # the first entry is the query.
    fasta_records = parse_fasta.read_fasta(fasta_filename)
    seq = list(fasta_records.values())[0][0]
    ref_len = len(seq)

    # Predicted contacts that satisfy the selection arguments.
    contacts_x, contacts_y, scores = get_scores_from_contacts(
        contact_filename, min_dist, factor_value, min_score, ref_len
    )

    # CB coordinates of chain1 as read from the PDB file.
    cb_chain1_lst = parse_pdb.get_cb_coordinates(pdb_filename, chain1)

    if noalign:
        # No sequence alignment: distances straight from the coordinates.
        dist_mat = get_cb_contacts(cb_chain1_lst)
        ref_contact_map = dist_mat < cb_cutoff
        PPV, TP, FP = get_ppv_helper(
            contacts_x, contacts_y, ref_contact_map, atom_seq_ali=[]
        )
    elif chain2 != "" and chain2 != chain1:
        print("In the PPI branch of get_ppv")
        cb_chain2_lst = parse_pdb.get_cb_coordinates(pdb_filename, chain2)

        gapped_cb_chain1_lst = get_gapped_cb_lts(
            pdb_filename, chain1, seq, cb_chain1_lst
        )
        gapped_cb_chain2_lst = get_gapped_cb_lts(
            pdb_filename, chain2, seq, cb_chain2_lst
        )

        # Inter-chain distances; the boolean map marks reference contacts.
        dist_mat_chain1_vs_chain2 = get_cb_contacts_PPI(
            gapped_cb_chain1_lst, gapped_cb_chain2_lst
        )
        ref_contact_map = dist_mat_chain1_vs_chain2 < cb_cutoff

        # Aligned atom sequence (a string) of each chain.
        alignments_chain1 = get_global_align_from_pdb(pdb_filename, chain1, seq)
        atom_seq_ali_chain1 = alignments_chain1[-1][0]
        alignments_chain2 = get_global_align_from_pdb(pdb_filename, chain2, seq)
        atom_seq_ali_chain2 = alignments_chain2[-1][0]

        # NOTE(review): only the chain1 alignment is handed to
        # get_ppv_helper; whether chain2 should be considered as well is an
        # open question.
        print("Getting PPV, TP and FP values...")
        PPV, TP, FP = get_ppv_helper(
            contacts_x, contacts_y, ref_contact_map, atom_seq_ali_chain1
        )

        if print_dist:
            print("Printing PPI's distance results...")
            print_distances_PPI(
                contacts_x,
                contacts_y,
                scores,
                dist_mat_chain1_vs_chain2,
                atom_seq_ali_chain1,
                atom_seq_ali_chain2,
                outfilename
            )
    else:
        print("In the monomer branch of get_ppv")
        gapped_cb_chain1_lst = get_gapped_cb_lts(
            pdb_filename, chain1, seq, cb_chain1_lst
        )

        # Intra-chain distances; the boolean map marks reference contacts.
        dist_mat = get_cb_contacts(gapped_cb_chain1_lst)
        ref_contact_map = dist_mat < cb_cutoff

        alignments_chain1 = get_global_align_from_pdb(pdb_filename, chain1, seq)
        atom_seq_ali_chain1 = alignments_chain1[-1][0]

        PPV, TP, FP = get_ppv_helper(
            contacts_x, contacts_y, ref_contact_map, atom_seq_ali_chain1
        )

        if print_dist:
            print_distances(
                contacts_x,
                contacts_y,
                scores,
                dist_mat,
                atom_seq_ali_chain1,
                outfilename
            )

    # The numeric results are only printed when a display name was given.
    if not name:
        print("Finished")
    else:
        print("%s\n" % ("----------------------------------"))
        print("%s %s %s %s" % (name, PPV, TP, FP))

    return (pdb_filename, PPV, TP, FP)
#!/usr/bin/env python import sys sys.path.append("/home/x_arnel/git/bioinfo-toolbox/") from parsing import parse_fasta from parsing import parse_contacts sfile = sys.argv[1] cfile = sys.argv[2] target = sys.argv[3] server = sys.argv[4] ofilepath = sys.argv[5] minsep = sys.argv[6] minscore = sys.argv[7] seq = parse_fasta.read_fasta(open(sfile)).items()[0][1][0] contacts = parse_contacts.parse(open(cfile), min_dist=0) print len(contacts) print contacts[0] print seq ofile = open(ofilepath, 'w') if server == "Pcons-net": ofile.write( "PFRMAT RR\nTARGET %s\nAUTHOR 5450-4562-0389\nMETHOD Pcons-net\nREMARK PconsC3\nMETHOD Improved contact predictions on\nMETHOD small protein families.\nMODEL 1\n" % target) elif server == "PconsC2": ofile.write(
import sys
import os
import shutil
import errno
import stat

from parsing import parse_fasta


if __name__ == '__main__':
    # Split a multi-record FASTA file into one "<accession>.fa" file per
    # record, written to the current directory. The accession is the first
    # whitespace-delimited token of the header; only the first sequence
    # stored for a header is written.
    if len(sys.argv) < 2:
        sys.stderr.write('Usage: %s <fasta file>\n' % sys.argv[0])
        sys.exit(1)

    with open(sys.argv[1], 'r') as infile:
        seq_dict = parse_fasta.read_fasta(infile)

    # .items() works on Python 2 and 3; .iteritems() exists only on 2.
    for header, seq_lst in seq_dict.items():
        acc = header.split()[0]
        with open(acc + '.fa', 'w') as outfile:
            outfile.write('>%s\n%s\n' % (header, seq_lst[0]))
def _read_pdb(parser, pdb_filename, chain):
    """Run a ``parse_pdb`` reader on ``pdb_filename`` and close the file."""
    with open(pdb_filename, 'r') as pdb_file:
        return parser(pdb_file, chain)


def get_ppv(fasta_filename, c_filename, pdb_filename, factor=1.0,
            min_score=-1.0, chain='', sep=' ', outfilename='', name='',
            noalign=False, min_dist=5, print_dist=False):
    """Compute PPV/TP/FP of predicted contacts against a PDB structure.

    A residue pair is a reference contact when its CB distance (as computed
    by ``get_cb_contacts``) is below 8.

    Args:
        fasta_filename: FASTA file; its first sequence is the reference.
        c_filename: Predicted-contact file.
        pdb_filename: Reference structure.
        factor: With the default ``min_score`` of -1.0, selection stops once
            ``factor * len(sequence)`` contacts have been collected.
        min_score: Selection stops at the first contact scoring below this.
        chain: Chain identifier passed to the ``parse_pdb`` readers.
        sep: Column separator of the contact file.
        outfilename: Passed to ``print_distances`` as ``outfile``.
        name: Label used in the printed result line; when empty the FASTA
            and contact file names are printed instead.
        noalign: Use the PDB coordinates as they are instead of aligning
            the PDB sequence to the FASTA sequence.
        min_dist: Minimum sequence separation of a contact.
        print_dist: Call ``print_distances`` (aligned mode only).

    Returns:
        Tuple ``(pdb_filename, PPV, TP, FP)``.
    """
    with open(fasta_filename, 'r') as fasta_file:
        seq = list(parse_fasta.read_fasta(fasta_file).values())[0][0]
    ref_len = len(seq)

    with open(c_filename, 'r') as contact_file:
        contacts = parse_contacts.parse(contact_file, sep, min_dist=min_dist)

    contacts_x = []
    contacts_y = []
    scores = []
    count = 0
    for contact in contacts:
        score = contact[0]
        # Contact files are 1-based; everything below is 0-based.
        c_x = contact[1] - 1
        c_y = contact[2] - 1
        if abs(c_x - c_y) >= min_dist:
            contacts_x.append(c_x)
            contacts_y.append(c_y)
            scores.append(score)
            count += 1
        # NOTE(review): both stop conditions are evaluated for every
        # contact, including the ones that were too close to be kept.
        if min_score == -1.0 and count >= ref_len * factor:
            break
        if score < min_score:
            break

    cb_lst = _read_pdb(parse_pdb.get_cb_coordinates, pdb_filename, chain)
    cb_cutoff = 8

    if noalign:
        dist_mat = get_cb_contacts(cb_lst)
        ref_contact_map = dist_mat < cb_cutoff
        PPV, TP, FP = get_ppv_helper(contacts_x, contacts_y, ref_contact_map,
                                     ref_len, factor)
    else:
        atom_seq = _read_pdb(parse_pdb.get_atom_seq, pdb_filename, chain)
        align = pairwise2.align.globalms(atom_seq, seq, 2, -1, -0.5, -0.1)
        atom_seq_ali = align[-1][0]
        seq_ali = align[-1][1]

        # Re-index the CB coordinates by FASTA position. Positions missing
        # in the structure get a ['-'] placeholder and index -9999; PDB
        # residues missing in the FASTA sequence are dropped.
        gapped_cb_lst = []
        ali_lst = []
        j = 0
        for atom_char, seq_char in zip(atom_seq_ali, seq_ali):
            if atom_char == '-':
                gapped_cb_lst.append(['-'])
                ali_lst.append(-9999)
            elif seq_char == '-':
                j += 1
            else:
                ali_lst.append(j)
                gapped_cb_lst.append(cb_lst[j])
                j += 1

        dist_mat = get_cb_contacts(gapped_cb_lst)

        if print_dist:
            # Only print_distances consumes these, so the PDB file is
            # re-read for them only when they are needed.
            area = _read_pdb(parse_pdb.get_area, pdb_filename, chain)
            surf = _read_pdb(parse_pdb.get_dist_to_surface, pdb_filename,
                             chain)
            print_distances(contacts_x, contacts_y, scores, dist_mat, area,
                            surf, ref_len, ref_len, seq, ali_lst=ali_lst,
                            atom_seq=atom_seq, outfile=outfilename)

        ref_contact_map = dist_mat < cb_cutoff
        PPV, TP, FP = get_ppv_helper(contacts_x, contacts_y, ref_contact_map,
                                     ref_len, factor,
                                     atom_seq_ali=atom_seq_ali)

    if name:
        print('%s %s %s %s' % (name, PPV, TP, FP))
    else:
        print('%s %s %s %s %s' % (fasta_filename, c_filename, PPV, TP, FP))
    return (pdb_filename, PPV, TP, FP)
def _read_pdb(parser, pdb_filename, chain):
    """Run a ``parse_pdb`` reader on ``pdb_filename`` and close the file."""
    with open(pdb_filename, 'r') as pdb_file:
        return parser(pdb_file, chain)


def _gapped_cb_and_index(atom_seq_ali, seq_ali, cb_lst):
    """Re-index CB coordinates by position in the aligned FASTA sequence.

    Returns ``(gapped_cb_lst, ali_lst)``. Positions missing in the structure
    get a ``['-']`` placeholder and index -9999; PDB residues that align to
    a gap in the FASTA sequence are dropped.
    """
    gapped_cb_lst = []
    ali_lst = []
    j = 0
    for atom_char, seq_char in zip(atom_seq_ali, seq_ali):
        if atom_char == '-':
            gapped_cb_lst.append(['-'])
            ali_lst.append(-9999)
        elif seq_char == '-':
            j += 1
        else:
            ali_lst.append(j)
            gapped_cb_lst.append(cb_lst[j])
            j += 1
    return gapped_cb_lst, ali_lst


def get_ppv(fasta_filenameA, c_filename, pdb_filenameA, fasta_filenameB,
            pdb_filenameB, factor=1.0, min_score=-1.0, chainA='', chainB='',
            sep=' ', outfilename='', name='', noalign=False, min_dist=5,
            interfacelen=10, print_dist=False, cutoff=0.25):
    """Compute PPV and Z-scores of predicted contacts for a two-chain complex.

    Results are reported for chain A, chain B, both chains together, the
    interface and the exposed interface; all of them are printed to stdout.

    Args:
        fasta_filenameA: FASTA file of chain A.
        c_filename: Predicted-contact file.
        pdb_filenameA: Structure file of chain A.
        fasta_filenameB: FASTA file of chain B (only its length is used).
        pdb_filenameB: Structure file of chain B.
        factor: Passed on to the scoring helpers.
        min_score: Currently unused.
        chainA: Chain identifier for ``pdb_filenameA``.
        chainB: Chain identifier for ``pdb_filenameB``.
        sep: Column separator of the contact file.
        outfilename: Passed to ``print_distances`` as ``outfile``.
        name: Label for the per-chain result lines; when empty the file
            names are printed instead.
        noalign: Not supported, see Raises.
        min_dist: Minimum sequence separation of a contact.
        interfacelen: Passed on to the per-chain and interface helpers.
        print_dist: Call ``print_distances``.
        cutoff: Passed to ``get_ppv_helper_interface``.

    Returns:
        Tuple ``(pdb_filenameA, PPV, TP, FP)`` for both chains together.

    Raises:
        NotImplementedError: If ``noalign`` is true. All scoring helpers are
            called with the alignment, which was never defined on that path.
    """
    if noalign:
        raise NotImplementedError(
            'noalign=True is not supported: the interface scoring requires '
            'the alignment of the PDB sequences to the FASTA sequence')

    with open(fasta_filenameA, 'r') as fasta_file:
        seqA = list(parse_fasta.read_fasta(fasta_file).values())[0][0]
    with open(fasta_filenameB, 'r') as fasta_file:
        seqB = list(parse_fasta.read_fasta(fasta_file).values())[0][0]
    # The contact map sequence is two copies of seqA.
    seq = seqA + seqA
    ref_lenA = len(seqA)
    ref_lenB = len(seqB)
    ref_len = len(seq)

    with open(c_filename, 'r') as contact_file:
        contacts = parse_contacts.parse(contact_file, sep, min_dist=min_dist)

    # Every contact with sufficient sequence separation is kept; there is
    # no top-N or score cut in this variant.
    contacts_x = []
    contacts_y = []
    scores = []
    for contact in contacts:
        c_x = contact[1] - 1
        c_y = contact[2] - 1
        if abs(c_x - c_y) >= min_dist:
            contacts_x.append(c_x)
            contacts_y.append(c_y)
            scores.append(contact[0])

    # The per-chain evaluations use the contacts as parsed, before
    # get_interface_contacts extends the combined set. Independent copies
    # keep them unaffected by anything done to the combined lists.
    contactsA_x, contactsA_y, scoresA = (list(contacts_x), list(contacts_y),
                                         list(scores))
    contactsB_x, contactsB_y, scoresB = (list(contacts_x), list(contacts_y),
                                         list(scores))

    cb_lstA = _read_pdb(parse_pdb.get_cb_coordinates, pdb_filenameA, chainA)
    cb_lstB = _read_pdb(parse_pdb.get_cb_coordinates, pdb_filenameB, chainB)
    cb_lst = cb_lstA + cb_lstB
    bfactor = (_read_pdb(parse_pdb.get_area, pdb_filenameA, chainA)
               + _read_pdb(parse_pdb.get_area, pdb_filenameB, chainB))
    surf = (_read_pdb(parse_pdb.get_dist_to_surface, pdb_filenameA, chainA)
            + _read_pdb(parse_pdb.get_dist_to_surface, pdb_filenameB, chainB))

    atom_seqA = _read_pdb(parse_pdb.get_atom_seq, pdb_filenameA, chainA)
    atom_seqB = _read_pdb(parse_pdb.get_atom_seq, pdb_filenameB, chainB)
    atom_seq = atom_seqA + atom_seqB

    align = pairwise2.align.globalms(atom_seq, seq, 2, -1, -0.5, -0.1)
    alignA = pairwise2.align.globalms(atom_seqA, seqA, 2, -1, -0.5, -0.1)
    # Chain B is aligned to seqA as well.
    alignB = pairwise2.align.globalms(atom_seqB, seqA, 2, -1, -0.5, -0.1)
    atom_seq_ali, seq_ali = align[-1][0], align[-1][1]
    atom_seq_aliA, seq_aliA = alignA[-1][0], alignA[-1][1]
    atom_seq_aliB, seq_aliB = alignB[-1][0], alignB[-1][1]

    gapped_cb_lst, ali_lst = _gapped_cb_and_index(atom_seq_ali, seq_ali,
                                                  cb_lst)
    gapped_cb_lstA, _ = _gapped_cb_and_index(atom_seq_aliA, seq_aliA, cb_lstA)
    gapped_cb_lstB, _ = _gapped_cb_and_index(atom_seq_aliB, seq_aliB, cb_lstB)

    dist_mat = get_cb_contacts(gapped_cb_lst)
    dist_matA = get_cb_contacts(gapped_cb_lstA)
    dist_matB = get_cb_contacts(gapped_cb_lstB)
    cb_cutoff = 8

    # This routine adds all interface and B chain contacts.
    contacts_x, contacts_y, scores = get_interface_contacts(
        contacts_x, contacts_y, scores, dist_mat, ref_lenA, factor,
        cb_cutoff + 4, atom_seq_ali=atom_seq_ali)

    ref_contact_map = dist_mat < cb_cutoff
    ref_contact_mapA = dist_matA < cb_cutoff
    ref_contact_mapB = dist_matB < cb_cutoff

    if print_dist:
        print_distances(contacts_x, contacts_y, scores, dist_mat, bfactor,
                        surf, ref_lenA, ref_lenB, seq, ali_lst=ali_lst,
                        atom_seq=atom_seq, outfile=outfilename)

    Zscore = get_Zscore(contacts_x, contacts_y, ref_contact_map, scores,
                        atom_seq_ali=atom_seq_ali)
    ZscoreA = get_Zscore(contactsA_x, contactsA_y, ref_contact_mapA, scoresA,
                         atom_seq_ali=atom_seq_aliA)
    ZscoreB = get_Zscore(contactsB_x, contactsB_y, ref_contact_mapB, scoresB,
                         atom_seq_ali=atom_seq_aliB)
    ZscoreI = get_Zscore_interface(contacts_x, contacts_y, ref_contact_map,
                                   ref_lenA, ref_lenB, scores,
                                   atom_seq_ali=atom_seq_ali)

    PPV, TP, FP = get_ppv_helper(contacts_x, contacts_y, ref_contact_map,
                                 ref_len, factor, atom_seq_ali=atom_seq_ali)
    PPVa, TPa, FPa = get_ppv_helper(contactsA_x, contactsA_y,
                                    ref_contact_mapA, interfacelen, factor,
                                    atom_seq_ali=atom_seq_aliA)
    PPVb, TPb, FPb = get_ppv_helper(contactsB_x, contactsB_y,
                                    ref_contact_mapB, interfacelen, factor,
                                    atom_seq_ali=atom_seq_aliB)
    PPVi, TPi, FPi, PPViE, TPiE, FPiE = get_ppv_helper_interface(
        contacts_x, contacts_y, ref_contact_map, bfactor, ref_lenA, ref_lenB,
        interfacelen, cutoff, atom_seq_ali=atom_seq_ali)

    if name:
        print('%s %s %s %s %s' % (name, PPVa, TPa, FPa, ZscoreA))
        print('%s %s %s %s %s' % (name, PPVb, TPb, FPb, ZscoreB))
        print('%s %s %s %s %s' % ("BOTH", PPV, TP, FP, Zscore))
        print('%s %s %s %s %s' % ("Interface", PPVi, TPi, FPi, ZscoreI))
        print('%s %s %s %s' % ("Interface-Exposed", PPViE, TPiE, FPiE))
    else:
        print('%s %s %s %s %s %s' % (fasta_filenameA, c_filename, PPVa, TPa,
                                     FPa, ZscoreA))
        print('%s %s %s %s %s %s' % (fasta_filenameB, c_filename, PPVb, TPb,
                                     FPb, ZscoreB))
        print('%s %s %s %s %s %s' % ("BOTH", c_filename, PPV, TP, FP, Zscore))
        print('%s %s %s %s %s %s' % ("Interface", c_filename, PPVi, TPi, FPi,
                                     ZscoreI))
        print('%s %s %s %s %s' % ("Interface-Exposed", c_filename, PPViE,
                                  TPiE, FPiE))
        # NOTE(review): the two summary lines are emitted only when no name
        # is given; confirm they were not meant to be printed in both cases.
        print('PPV %s %s %s %s %s %s' % (c_filename, PPV, PPVa, PPVb, PPVi,
                                         PPViE))
        print('Zscore %s %s %s %s %s' % (c_filename, Zscore, ZscoreA, ZscoreB,
                                         ZscoreI))
    return (pdb_filenameA, PPV, TP, FP)
import sys
import argparse
import Bio.PDB
from Bio import pairwise2
from os.path import expanduser
home = expanduser("~")
sys.path.append(home + '/git/bioinfo-toolbox')
from parsing import parse_contacts
from parsing import parse_fasta
from parsing import parse_pdb

if __name__ == "__main__":
    p = argparse.ArgumentParser(description='Get sequence identity from two fasta files.')
    p.add_argument('fasta_fileA')
    p.add_argument('fasta_fileB')
    args = vars(p.parse_args(sys.argv[1:]))

    fasta_filenameA = args['fasta_fileA']
    fasta_filenameB = args['fasta_fileB']

    # list(...) keeps the indexing valid on Python 3, where dict.values()
    # is a view.
    with open(fasta_filenameA, 'r') as fasta_file:
        seqA = list(parse_fasta.read_fasta(fasta_file).values())[0][0]
    with open(fasta_filenameB, 'r') as fasta_file:
        seqB = list(parse_fasta.read_fasta(fasta_file).values())[0][0]

    minlen = min(len(seqA), len(seqB))
    if minlen == 0:
        # An empty sequence would otherwise end in a division by zero.
        sys.exit('Cannot compute identity: %s or %s contains an empty sequence'
                 % (fasta_filenameA, fasta_filenameB))

    # Local alignment scoring +1 per match, so the alignment score divided
    # by the length of the shorter sequence is reported as the identity.
    # (An alternative would be to divide by the aligned length,
    # align[0][4] - align[0][3].)
    align = pairwise2.align.localms(seqA, seqB, 1, -1, -0.5, -0.1)
    identity = float(align[0][2]) / float(minlen)

    # A single pre-formatted string prints identically on Python 2 and 3;
    # the double space after "Identity:" matches the original output.
    print('Identity:  %s %s %s' % (fasta_filenameA, fasta_filenameB, identity))
def _read_pdb(parser, pdb_filename, chain):
    """Run a ``parse_pdb`` reader on ``pdb_filename`` and close the file."""
    with open(pdb_filename, 'r') as pdb_file:
        return parser(pdb_file, chain)


def get_dist(fasta_filename, c_filename, pdb_filename, chain='', sep='',
             outfilename='', noalign=False, dist_type='CB'):
    """Look up the structural distance of every predicted contact.

    Args:
        fasta_filename: FASTA file; its first sequence is aligned to the
            PDB sequence unless ``noalign`` is set.
        c_filename: Predicted-contact file, parsed with ``min_dist=5``.
        pdb_filename: Reference structure.
        chain: Chain identifier passed to the ``parse_pdb`` readers.
        sep: Column separator of the contact file.
        outfilename: If given, one ``x y score distance`` line per contact
            is written to this file.
        noalign: Use the PDB coordinates as they are instead of re-indexing
            them by FASTA position.
        dist_type: ``'CB'`` or ``'CA'`` for the respective atom distances;
            any other value selects heavy-atom distances.

    Returns:
        Tuple ``(contacts_x, contacts_y, scores, contacts_dist)`` of
        parallel lists; the indices are 0-based.
    """
    with open(fasta_filename, 'r') as fasta_file:
        seq = list(parse_fasta.read_fasta(fasta_file).values())[0][0]

    with open(c_filename, 'r') as contact_file:
        contacts = parse_contacts.parse(contact_file, sep, min_dist=5)

    # Contact files are 1-based; the distance matrices are 0-based.
    scores = [contact[0] for contact in contacts]
    contacts_x = [contact[1] - 1 for contact in contacts]
    contacts_y = [contact[2] - 1 for contact in contacts]

    res_lst = _read_pdb(parse_pdb.get_coordinates, pdb_filename, chain)
    cb_lst = _read_pdb(parse_pdb.get_cb_coordinates, pdb_filename, chain)
    ca_lst = _read_pdb(parse_pdb.get_ca_coordinates, pdb_filename, chain)

    def distance_matrix(res_coords, cb_coords, ca_coords):
        # Pick the distance definition requested through dist_type.
        if dist_type == 'CB':
            return get_dist_mat(cb_coords)
        if dist_type == 'CA':
            return get_dist_mat(ca_coords)
        return get_dist_mat_heavy(res_coords)

    if noalign:
        dist_mat = distance_matrix(res_lst, cb_lst, ca_lst)
        contacts_dist = get_dist_helper(contacts_x, contacts_y, dist_mat)
    else:
        atom_seq = _read_pdb(parse_pdb.get_atom_seq, pdb_filename, chain)
        align = pairwise2.align.globalms(atom_seq, seq, 2, -1, -0.5, -0.1)
        atom_seq_ali = align[-1][0]
        seq_ali = align[-1][1]

        # Re-index the coordinates by FASTA position: positions missing in
        # the structure become '-', residues missing in the FASTA sequence
        # are dropped.
        gapped_res_lst = []
        gapped_cb_lst = []
        gapped_ca_lst = []
        j = 0
        for atom_char, seq_char in zip(atom_seq_ali, seq_ali):
            if atom_char == '-':
                gapped_res_lst.append('-')
                gapped_cb_lst.append('-')
                gapped_ca_lst.append('-')
            elif seq_char == '-':
                j += 1
            else:
                gapped_res_lst.append(res_lst[j])
                gapped_cb_lst.append(cb_lst[j])
                gapped_ca_lst.append(ca_lst[j])
                j += 1

        dist_mat = distance_matrix(gapped_res_lst, gapped_cb_lst,
                                   gapped_ca_lst)
        contacts_dist = get_dist_helper(contacts_x, contacts_y, dist_mat,
                                        atom_seq_ali=atom_seq_ali)
        # The output below pairs the lists by index.
        assert (len(contacts_dist) == len(contacts_x) == len(contacts_y)
                == len(scores))

    num_c = len(contacts_dist)
    if outfilename:
        with open(outfilename, 'w') as outfile:
            for i in range(num_c):
                outfile.write('%s %s %f %f\n' % (contacts_x[i], contacts_y[i],
                                                 scores[i], contacts_dist[i]))

    return (contacts_x, contacts_y, scores, contacts_dist)
sys.path.append("/scratch/mirco_local/bioinfo-toolbox")

from parsing import parse_fasta
from parsing import parse_contacts

# Writes the header of a CASP "RR" contact-prediction file for one target.
if len(sys.argv) != 5:
    sys.stderr.write("Incorrect number of command line arguments.\n")
    sys.stderr.write("Usage: " + sys.argv[0] + " <sequence file> <contact file> <CASP target ID> <output filename>\n\n")
    # A usage error must be visible to the calling shell or pipeline.
    sys.exit(1)

sfile = sys.argv[1]
cfile = sys.argv[2]
target = sys.argv[3]

# list(...) keeps the indexing valid on Python 3, where dict.items() is a
# view.
with open(sfile) as seq_handle:
    seq = list(parse_fasta.read_fasta(seq_handle).items())[0][1][0]
with open(cfile) as contact_handle:
    contacts = parse_contacts.parse(contact_handle, min_dist=0)

print(len(contacts))
print(contacts[0])
print(seq)

rr_header = (
    "PFRMAT RR\n"
    "TARGET %s\n"
    "AUTHOR 6685-2065-9124\n"
    "METHOD Pcons-net\n"
    "REMARK PconsC2\n"
    "METHOD Improved contact predictions using the\n"
    "METHOD recognition of protein like contact\n"
    "METHOD patterns.\n"
    "MODEL 1\n"
)

# The output file is left open on purpose: presumably the rest of the
# script keeps writing to it - TODO confirm it is closed there.
ofile = open(sys.argv[4], "w")
ofile.write(rr_header % target)

tmp_i = 1
def _read_pdb(parser, pdb_filename, chain):
    """Run a ``parse_pdb`` reader on ``pdb_filename`` and close the file."""
    with open(pdb_filename, 'r') as pdb_file:
        return parser(pdb_file, chain)


def _top_contacts_for_plot(contacts, limit, min_sep=5):
    """Collect leading contacts whose residues are at least ``min_sep`` apart.

    Returns three parallel lists ``(xs, ys, scores)`` with 0-based indices.
    Collection stops as soon as ``limit`` contacts have been gathered.
    """
    xs, ys, scores = [], [], []
    for contact in contacts:
        c_x = contact[1] - 1
        c_y = contact[2] - 1
        if abs(c_x - c_y) >= min_sep:
            xs.append(c_x)
            ys.append(c_y)
            scores.append(contact[0])
        # NOTE(review): checked for every contact, also the skipped ones.
        if len(xs) >= limit:
            break
    return xs, ys, scores


def plot_map(fasta_filename, c_filename, factor, c2_filename='',
             psipred_horiz_fname='', psipred_vert_fname='', pdb_filename='',
             is_heavy=False, chain='', sep=',', outfilename=''):
    """Plot a predicted contact map, optionally against a reference structure.

    Args:
        fasta_filename: FASTA file; its first sequence sets the axis length
            and the first four characters of the file name title the plot.
        c_filename: Predicted-contact file.
        factor: ``factor * len(sequence)`` top-ranked contacts are plotted.
        c2_filename: Optional second contact file, drawn in the other
            triangle of the map.
        psipred_horiz_fname: PSIPRED horizontal file (takes precedence).
        psipred_vert_fname: PSIPRED vertical file.
        pdb_filename: Optional reference structure; enables TP/FP colouring
            and printing of PPV/TP/FP.
        is_heavy: Use heavy-atom distances (cutoff 5) instead of CB
            distances (cutoff 8) for the reference contacts.
        chain: Chain identifier passed to the ``parse_pdb`` readers.
        sep: Column separator of the contact files.
        outfilename: Output file. ``.pdf`` is written through ``PdfPages``,
            ``.png``/``.jpg``/``.jpeg`` through ``plt.savefig``; any other
            name gets ``.pdf`` appended. When empty,
            ``<c_filename>_ContactMap.pdf`` is written.

    Returns:
        None.
    """
    acc = fasta_filename.split('.')[0][:4]

    # list(...) keeps the indexing valid on Python 3, where dict.values()
    # is a view.
    with open(fasta_filename, 'r') as fasta_file:
        seq = list(parse_fasta.read_fasta(fasta_file).values())[0][0]
    ref_len = len(seq)

    with open(c_filename, 'r') as contact_file:
        contacts = parse_contacts.parse(contact_file, sep)
    contacts_x, contacts_y, scores = _top_contacts_for_plot(
        contacts, ref_len * factor)

    fig = plt.figure()
    ax = fig.add_subplot(111)

    # Secondary structure on the diagonal, if a PSIPRED file was given.
    if psipred_horiz_fname or psipred_vert_fname:
        if psipred_horiz_fname:
            with open(psipred_horiz_fname, 'r') as ss_file:
                ss = parse_psipred.horizontal(ss_file)
        else:
            with open(psipred_vert_fname, 'r') as ss_file:
                ss = parse_psipred.vertical(ss_file)

        assert len(ss) == ref_len
        for i, state in enumerate(ss):
            if state == 'H':
                plt.plot(i, i, 'o', c='#8B0043', mec="#8B0043", markersize=2)
            elif state == 'E':
                plt.plot(i, i, 'D', c='#0080AD', mec="#0080AD", markersize=2)

    # Reference contacts in the background, if a structure was given.
    if pdb_filename:
        res_lst = _read_pdb(parse_pdb.get_coordinates, pdb_filename, chain)
        cb_lst = _read_pdb(parse_pdb.get_cb_coordinates, pdb_filename, chain)
        atom_seq = _read_pdb(parse_pdb.get_atom_seq, pdb_filename, chain)

        align = pairwise2.align.globalms(atom_seq, seq, 2, -1, -0.5, -0.1)
        atom_seq_ali = align[-1][0]
        seq_ali = align[-1][1]

        # Re-index the coordinates by FASTA position: positions missing in
        # the structure become '-', residues missing in the FASTA sequence
        # are dropped.
        gapped_res_lst = []
        gapped_cb_lst = []
        j = 0
        for atom_char, seq_char in zip(atom_seq_ali, seq_ali):
            if atom_char == '-':
                gapped_res_lst.append('-')
                gapped_cb_lst.append('-')
            elif seq_char == '-':
                j += 1
            else:
                gapped_res_lst.append(res_lst[j])
                gapped_cb_lst.append(cb_lst[j])
                j += 1

        if is_heavy:
            dist_mat = get_heavy_contacts(gapped_res_lst)
            ref_contact_map = dist_mat < 5
        else:
            dist_mat = get_cb_contacts(gapped_cb_lst)
            ref_contact_map = dist_mat < 8
        ref_contacts = np.where(ref_contact_map)
        ref_contacts_x = ref_contacts[0]
        ref_contacts_y = ref_contacts[1]

        PPVs, TPs, FPs = get_ppvs(contacts_x, contacts_y, ref_contact_map,
                                  atom_seq_ali, ref_len, factor)
        tp_colors = get_tp_colors(contacts_x, contacts_y, ref_contact_map,
                                  atom_seq_ali)
        print('%s %s %s %s' % (pdb_filename, PPVs[-1], TPs[-1], FPs[-1]))

        ax.scatter(ref_contacts_x, ref_contacts_y, marker='o', c='#CCCCCC',
                   lw=0, edgecolor='#CCCCCC')

    if c2_filename:
        # Second contact map: drawn mirrored, first map in the other triangle.
        with open(c2_filename, 'r') as contact_file:
            contacts2 = parse_contacts.parse(contact_file, sep)
        contacts2_x, contacts2_y, _ = _top_contacts_for_plot(
            contacts2, ref_len * factor)

        if pdb_filename:
            PPVs2, TPs2, FPs2 = get_ppvs(contacts2_x, contacts2_y,
                                         ref_contact_map, atom_seq_ali,
                                         ref_len, factor)
            tp2_colors = get_tp_colors(contacts2_x, contacts2_y,
                                       ref_contact_map, atom_seq_ali)
            print('%s %s %s %s' % (pdb_filename, PPVs2[-1], TPs2[-1],
                                   FPs2[-1]))
            fig.suptitle(
                '%s\nPPV (upper left) = %.2f | PPV (lower right) = %.2f'
                % (acc, PPVs[-1], PPVs2[-1]))
            ax.scatter(contacts2_y[::-1], contacts2_x[::-1], marker='o',
                       c=tp2_colors[::-1], s=6, alpha=0.75, linewidths=0.0)
            ax.scatter(contacts_x[::-1], contacts_y[::-1], marker='o',
                       c=tp_colors[::-1], s=6, alpha=0.75, linewidths=0.0)
        else:
            ax.scatter(contacts2_y[::-1], contacts2_x[::-1], marker='o',
                       c='#D70909', edgecolor='#D70909', s=4, linewidths=0.5)
            ax.scatter(contacts_x[::-1], contacts_y[::-1], marker='o',
                       c='#004F9D', edgecolor='#004F9D', s=4, linewidths=0.5)
    elif pdb_filename:
        # Single map with a reference: TP/FP colours on both triangles.
        fig.suptitle('%s\nPPV = %.2f' % (acc, PPVs[-1]))
        ax.scatter(contacts_x[::-1], contacts_y[::-1], marker='o',
                   c=tp_colors[::-1], s=6, alpha=0.75, linewidths=0.0)
        ax.scatter(contacts_y[::-1], contacts_x[::-1], marker='o',
                   c=tp_colors[::-1], s=6, alpha=0.75, linewidths=0.0)
    else:
        # Single map without reference: colour by prediction score.
        ax.scatter(contacts_x[::-1], contacts_y[::-1], marker='o',
                   c=scores[::-1], s=4, alpha=0.75, cmap=cm.jet,
                   linewidths=0.1)
        sc = ax.scatter(contacts_y[::-1], contacts_x[::-1], marker='o',
                        c=scores[::-1], s=4, alpha=0.75, cmap=cm.jet,
                        linewidths=0.1)
        plt.colorbar(sc)

    plt.gca().set_xlim([0, ref_len])
    plt.gca().set_ylim([0, ref_len])

    if outfilename.endswith(('.png', '.jpg', '.jpeg')):
        plt.savefig(outfilename)
    else:
        if not outfilename:
            pdf_name = '%s_ContactMap.pdf' % c_filename
        elif outfilename.endswith('.pdf'):
            pdf_name = outfilename
        else:
            pdf_name = '%s.pdf' % outfilename
        pp = PdfPages(pdf_name)
        try:
            pp.savefig(fig)
        finally:
            pp.close()