import sys

## make the project-local mmCIF helpers importable
sys.path.append('/home/tc/svn/tc_sandbox/pdb')
import parse_mmCIF
import mmCIF2coords

## make the GoodVibes modules importable
sys.path.append('/home/tc/svn/GoodVibes')
import NMA
import visualization

## parse entry 2lzm and extract its coordinates
d_mmCIF = parse_mmCIF.main('2lzm',)
d_coords, l_coords_alpha = mmCIF2coords.main('2lzm', d_mmCIF)

## build the Hessian with a cutoff of 10 and diagonalize it
cutoff = 10
matrix_hessian = NMA.hessian_calculation(l_coords_alpha, cutoff)
eigenvectors, eigenvalues = NMA.diagonalize_hessian(matrix_hessian,)

## hand coordinates and eigenvectors to the VMD trajectory writer
visualization.vmd_trajectory('2lzm', l_coords_alpha, eigenvectors)
def parse_GoodVibes_exclude_flexible(pdb, path,):
    """Return the coordinate of the 'EXT' probe record with the largest B-factor.

    A normal mode analysis is run on the requested chain and per-residue
    amplitudes are derived from ``eigenvectors[6]``.  The probe file
    ``output/<path>/<pdb[:4]>_<pdb[-1]>_probe.pdb`` is then scanned for
    ATOM/HETATM records whose residue name is ``EXT``; the position of the
    record with the largest value in ``line[60:66]`` is returned.

    NOTE: every filter that would use the amplitudes (or the rigid/flexible
    residue lists) to reject probe records is currently commented out, so
    the normal mode results do not influence the returned coordinate.

    Args:
        pdb: identifier; ``pdb[:4]`` is passed on as the entry ID and
            ``pdb[-1]`` as ``query_chain``.
        path: sub-directory of ``output/`` that holds the probe file.

    Returns:
        ``numpy.array([x, y, z])`` of the selected record, or ``None`` if the
        probe file contains no ``EXT`` record.
    """

    ##
    ## calculate amplitudes
    ##
    d_mmCIF = parse_mmCIF.main(pdb[:4],)
    d_coords, l_coords_alpha = mmCIF2coords.main(
        pdb[:4], d_mmCIF, query_chain=pdb[-1],
        )
    ## single parenthesised argument prints identically under Python 2 and 3
    print(len(l_coords_alpha))

    ##
    ## eigenvector
    ##
    cutoff = 10
    matrix_hessian = NMA.hessian_calculation(l_coords_alpha, cutoff,)
    eigenvectors, eigenvalues = NMA.diagonalize_hessian(matrix_hessian)
    ## NOTE(review): index 6 is presumably the first non-trivial mode - confirm
    mode = eigenvectors[6]
    ## the eigenvector is read as consecutive (x,y,z) triplets
    l_amplitudes = [
        math.sqrt(mode[i]**2 + mode[i+1]**2 + mode[i+2]**2)
        for i in range(0, len(mode), 3)
        ]

    ##
    ## write pdb (color by bfactor)
    ##
    ## amplitudes rescaled to the range 0-100; min and span are computed once
    ## and a zero span (all amplitudes equal) no longer divides by zero
    l_bfactors = []
    if l_amplitudes:
        amplitude_min = min(l_amplitudes)
        amplitude_span = max(l_amplitudes) - amplitude_min
        if amplitude_span:
            l_bfactors = [
                100*(amplitude - amplitude_min)/amplitude_span
                for amplitude in l_amplitudes
                ]
        else:
            l_bfactors = [0.]*len(l_amplitudes)
##    fd = open('output/%s/%s_%s_probe.pdb' %(path,pdb[:4],pdb[-1],),'r')
##    lines = fd.readlines()
##    fd.close()
##    index = [-1,None,]
##    lines_out = []
##    for line in lines:
##        record = line[:6].strip()
##        if record != 'ATOM':
##            lines_out += [line]
##        else:
##            res_no = int(line[22:26])
##            if res_no != index[1]:
##                index = [index[0]+1,res_no,]
##            bfactor = l_bfactors[index[0]]
##            line_out = '%s%6.2f%s' %(line[:60],bfactor,line[66:],)
##            lines_out += [line_out]
##    fd = open('output/%s/%s_%s_probe_color_by_amplitude.pdb' %(path,pdb[:4],pdb[-1],),'w')
##    fd.writelines(lines_out)
##    fd.close()

    ## average amplitude (do_stddev returns the average together with stddev)
    average, stddev = statistics.do_stddev(l_amplitudes)

    ## residue partitions; only consumed by the disabled filters further down
    l_coords_rigid = [
        coord_alpha
        for coord_alpha, amplitude in zip(l_coords_alpha, l_amplitudes)
        if amplitude < average
        ]
    l_coords_flexible = [
        coord_alpha
        for coord_alpha, amplitude in zip(l_coords_alpha, l_amplitudes)
        if amplitude > average+0.5*stddev
        ]

    ## parse output
    fn_probe = 'output/%s/%s_%s_probe.pdb' %(path,pdb[:4],pdb[-1],)
    with open(fn_probe, 'r') as fd:
        lines = fd.readlines()

    max_bfactor = None
    coord = None
    for line in lines:
        record = line[:6].strip()
        if record not in ('ATOM', 'HETATM',):
            continue
        res_name = line[17:20]
        if res_name != 'EXT':
            continue
        bfactor = float(line[60:66])
        ## explicit None test: comparing a float with None only works in
        ## Python 2 (None sorts below every number) and raises in Python 3
        if max_bfactor is None or bfactor > max_bfactor:
            x = float(line[30:38])
            y = float(line[38:46])
            z = float(line[46:54])
##            coord_tmp = numpy.array([x,y,z,])
##            bool_vicinal_to_rigid = False
##            for coord_rigid in l_coords_rigid:
##                dist_from_rigid = math.sqrt(sum((coord_rigid-coord_tmp)**2))
##                if dist_from_rigid < 6:
##                    bool_vicinal_to_rigid = True
##                    break
##            if bool_vicinal_to_rigid == False:
##                continue
##            bool_vicinal_to_flexible = False
##            for coord_flexible in l_coords_flexible:
##                dist_from_flexible = math.sqrt(sum((coord_flexible-coord_tmp)**2))
##                if dist_from_flexible < 6:
##                    bool_vicinal_to_flexible = True
##                    break
##            if bool_vicinal_to_flexible == True:
##                continue
##            min_dist = [1000.,None,]
##            for i_coord_alpha in range(len(l_coords_alpha)):
##                coord_alpha = l_coords_alpha[i_coord_alpha]
##                dist_from_alpha = math.sqrt(sum((coord_alpha-coord_tmp)**2))
##                if dist_from_alpha < min_dist[0]:
##                    min_dist = [dist_from_alpha,i_coord_alpha,]
##            if l_amplitudes[min_dist[1]] > average+stddev:
##                continue
            coord = numpy.array([x, y, z,])
            max_bfactor = bfactor

    return coord
import sys

## make the project-local mmCIF helpers importable
sys.path.append('/home/tc/svn/tc_sandbox/pdb')
import parse_mmCIF
import mmCIF2coords

## make the GoodVibes modules importable
sys.path.append('/home/tc/svn/GoodVibes')
import NMA
import visualization

## parse entry 2lzm and extract its coordinates
d_mmCIF = parse_mmCIF.main('2lzm',)
d_coords, l_coords_alpha = mmCIF2coords.main('2lzm', d_mmCIF)

## build the Hessian with a cutoff of 10 and diagonalize it
cutoff = 10
matrix_hessian = NMA.hessian_calculation(l_coords_alpha, cutoff)
eigenvectors, eigenvalues = NMA.diagonalize_hessian(matrix_hessian,)

## hand coordinates and eigenvectors to the VMD trajectory writer
visualization.vmd_trajectory('2lzm', l_coords_alpha, eigenvectors)
'1czfA', '1thgA', '1booA', '1iu4A', '1bqcA', '206lA', '1cdeA', '1snzA', '1gq8A', '1aqlA', '1ps1A', '1s95A', '1pylA', '1ra2A', '1b6bA', '1pntA', '1e1aA', '2f9rA', '1v04A', '2nlrA', '1n29A', '1pbgA', '5cpaA', '1agmA', '1byaA', '1r76A', '1u5uA', '1vidA', '1h4gA', '1akdA', '1fy2A', '1xqdA', '1d6oA', '1qv0A', '1qjeA', '1fvaA', '1bp2A', '1ah7A', '2pthA', '2engA', '2acyA', '1qazA', '2a0nA', '1dl2A', '1gp5A', '1onrA', '1cwyA', '1pudA', '1bs9A', '1dinA', '1xyzA', '1bwlA', '1eugA', '1idjA', '1g24A', '1oygA', '1hzfA', '9papA', '1eb6A', '1ghsA', '1rbnA', '1bixA', '1bs4A', '1celA', '1hkaA', '1b02A', '1qibA', '1u3fA', '1agyA', '1zioA', '1pa9A', '2tpsA', '2plcA', '1qk2A', '1j53A', '1m21A', ] cutoff = 10 for pdb in l_pdbs: pdb = pdb[:4] d = parse_mmCIF.main(pdb,) d_coords, l_coords = mmCIF2coords.main(pdb, d, query_chain = pdb[4:]) matrix_hessian = NMA.hessian_calculation(l_coords, cutoff, verbose = False) eigenvectors, eigenvalues = NMA.diagonalize_hessian(matrix_hessian, verbose = False) visualization.vmd_arrows(pdb, l_coords, eigenvectors) print pdb stop
def main():
    """Relate normal mode overlaps to RMSD for structure pairs within clusters.

    Steps, as far as the visible code shows:

    1. Obtain the candidate entries from ``exclude_include()`` and remove a
       hard-coded list of problematic entries.
    2. Read ``<path_mmCIF>/bc-100.out``; each line is treated as one cluster
       (line index = cluster number) and lines below index 4816 are skipped.
    3. For every pair (pdb1, pdb2) of a cluster that passes the checks
       (``check_monomeric``, ``check_modres``, no 'DELETION' sequence
       difference, one molecule per polymer entity, identical space group),
       align and superpose the coordinates, compute normal modes of both
       structures and the overlap of mode index 6 with the difference vector
       and between the two structures.
    4. Append ``rmsd overlap`` lines to files in ``rmsd_v_overlap2/`` and
       print a one-line summary per pair.

    NOTE(review): the function contains several bare ``stop`` statements.
    ``stop`` is not defined in the visible code, so evaluating it presumably
    raises NameError and acts as a debug halt.  The first one (directly after
    ``print(pdb1)``) makes everything after it unreachable at runtime; it is
    preserved here because removing it changes what a run does.

    Returns:
        None.
    """

    set_pdbs = exclude_include()

    ## entries removed from the data set and the reason for each removal
    ## (same entries, same order of removal as before)
    l_pdbs_remove = [
        '1f92', ## remediation _struct_ref_seq_dif incorrect residue number
        '2f6f', ## remediation _pdbx_poly_seq_scheme.auth_mon_id wrong
        '3a5j', ## remediation _struct_ref_seq_dif.db_mon_id is ? but should be MET
        '2rhx', ## remediation _struct_ref_seq_dif.db_mon_id is ? but should be SER
        '2fzb','2fzd','3dn5','1x96','1x97','1x98', ## remediation incorrect _struct_ref.pdbx_seq_one_letter_code
        '1z3n','1z8a','1z89', ## GenBank DBref - not an error...
        '2pf8', ## unusual use of alt_ids (C for highest occupancy and only altloc)
        '2pyr', ## unusual use of alt_ids (G and R)
        '3pdn', ## unusual use of alt_ids (B and C)
        '2v4c', ## alt_id B used for 100% occupancy atoms
        '1jxt','1jxu','1jxw','1jxx','1jxy', ## weird alt_id microheterogeneity...
        '4a3h','2wf5','1arl','1ee3', ## incorrect _struct_ref_seq.pdbx_db_accession
        '1uyd','1uye','1uyf','2byh','2byi', ## remediation _struct_ref_seq_dif
        '2xdu','3dn8','3dna','1ps3','1ouf','1l35','2eun','1rtc','1zon', ## _struct_ref_seq_dif missing
        '1pwl','1pwm','2fz8','2fz9', ## remediation incorrect _struct_ref.pdbx_seq_one_letter_code
        ]
##    set_pdbs.remove('1ac4') ## multiple strains and taxonomy ids but all same organism (S. cerevisiae)...
##    set_pdbs.remove('1ac8') ## multiple strains and taxonomy ids but all same organism (S. cerevisiae)...
##    set_pdbs.remove('1aeb') ## multiple strains and taxonomy ids but all same organism (S. cerevisiae)...
##    set_pdbs.remove('2rbt') ## multiple strains and taxonomy ids but all same organism (S. cerevisiae)... UNP A7A026, TAX 307796, STRAIN YJM789
##    set_pdbs.remove('2rbu') ## multiple strains and taxonomy ids but all same organism (S. cerevisiae)... UNP A7A026, TAX 307796, STRAIN YJM789
##    set_pdbs.remove('2rbv') ## multiple strains and taxonomy ids but all same organism (S. cerevisiae)... UNP A7A026, TAX 307796, STRAIN YJM789
    for pdb in l_pdbs_remove:
        set_pdbs.remove(pdb)

    with open('%s/bc-100.out' %(path_mmCIF), 'r') as fd:
        lines = fd.readlines()

    ## hard-coded resume point: clusters with a lower index are skipped
    first_cluster = 4816
    cutoff = 10

    for i_line, line in enumerate(lines):

        cluster = i_line
        if cluster < first_cluster:
            continue
##        if cluster not in [5,]:
##            continue

        ## sort the full identifiers first, then keep the 4-character entry ID
        l_pdbs = [s[:4] for s in sorted(line.lower().split())]

        for i_pdb1 in range(0, len(l_pdbs)-1):

            pdb1 = l_pdbs[i_pdb1]

##            if pdb1 != '1t49': ## tmp!!!
##                continue

            if not pdb1 in set_pdbs:
                continue

            print(pdb1)
            ## NOTE(review): debug halt (see docstring); nothing below this
            ## line runs while it is present
            stop

            d_mmCIF1 = parse_mmCIF.main(pdb1,)

            bool_monomeric = check_monomeric(d_mmCIF1)
            if bool_monomeric == False:
                ## give up on the whole cluster if its first entry fails
                if i_pdb1 == 0:
                    break
                else:
                    continue

            bool_remediation_modres = check_modres(d_mmCIF1, pdb1,)
            if bool_remediation_modres == True:
                continue

            if '_struct_ref_seq_dif.details' in d_mmCIF1.keys():
                if 'DELETION' in d_mmCIF1['_struct_ref_seq_dif.details']:
                    continue

            ## more than one copy of a polymer entity is treated as fatal here
            for i_entity in range(len(d_mmCIF1['_entity.id'])):
                if d_mmCIF1['_entity.type'][i_entity] == 'polymer':
                    if int(d_mmCIF1['_entity.pdbx_number_of_molecules'][i_entity]) != 1:
                        print(d_mmCIF1['_entity.pdbx_number_of_molecules'])
                        print('%s %s' %(pdb1, cluster))
                        stop

            SG1 = d_mmCIF1['_symmetry.space_group_name_H-M']

            for i_pdb2 in range(i_pdb1+1, len(l_pdbs)):

                pdb2 = l_pdbs[i_pdb2]

##                if pdb2 != '2pf8': ## tmp!!!
##                    continue

##                if pdb1 != '3fui' or pdb2 != '3fuj':
##                    continue

                if not pdb2 in set_pdbs:
                    continue

                d_mmCIF2 = parse_mmCIF.main(pdb2,)

                bool_monomeric = check_monomeric(d_mmCIF2)
                if bool_monomeric == False:
                    continue

                bool_remediation_modres = check_modres(d_mmCIF2, pdb2,)
                if bool_remediation_modres == True:
                    continue

                ## test for the key that is actually indexed (previously
                ## '.seq_num' was tested while '.details' was read)
                if '_struct_ref_seq_dif.details' in d_mmCIF2.keys():
                    if 'DELETION' in d_mmCIF2['_struct_ref_seq_dif.details']:
                        continue

                ## biounit monomeric?
                ## the former `continue` sat inside the entity loop and only
                ## advanced that loop; a flag is needed to skip pdb2 itself
                bool_single_copy = True
                for i_entity in range(len(d_mmCIF2['_entity.id'])):
                    if d_mmCIF2['_entity.type'][i_entity] != 'polymer':
                        continue
                    if int(d_mmCIF2['_entity.pdbx_number_of_molecules'][i_entity]) != 1:
                        bool_single_copy = False
                        break
                if bool_single_copy == False:
                    continue

                SG2 = d_mmCIF2['_symmetry.space_group_name_H-M']
                if SG1 != SG2:
                    continue

                ## parse coordinates again after being shortened in previous loop
                ## both entries are attempted so that both failures get logged,
                ## but the pair is skipped instead of continuing with stale or
                ## undefined coordinates
                bool_coords_parsed = True
                try:
                    d_coords1, l_coords_alpha1 = mmCIF2coords.main(pdb1, d_mmCIF1)
                except Exception:
                    with open('remediation_atom_site.label_alt_id.txt', 'a') as fd:
                        fd.write('%s\n' %(pdb1,))
                    bool_coords_parsed = False
                try:
                    d_coords2, l_coords_alpha2 = mmCIF2coords.main(pdb2, d_mmCIF2)
                except Exception:
                    with open('remediation_atom_site.label_alt_id.txt', 'a') as fd:
                        fd.write('%s\n' %(pdb2,))
                    bool_coords_parsed = False
                if bool_coords_parsed == False:
                    continue

                ## align sequences/coordinates
                try:
                    l_coords_alpha1, l_coords_alpha2 = create_apo_holo_dataset.sequential_alignment_of_coordinates(
                        l_coords_alpha1, l_coords_alpha2,
                        d_mmCIF1, d_mmCIF2,
                        pdb1, pdb2,
                        )
                except Exception:
                    with open('remediation_struct_ref_seq_dif.txt', 'a') as fd:
                        fd.write(
                            '%s %s %s %s\n' %(
                                pdb1, pdb2,
                                d_mmCIF1['_struct_ref_seq.pdbx_db_accession'],
                                d_mmCIF2['_struct_ref_seq.pdbx_db_accession'],
                                )
                            )
                    continue

                if len(l_coords_alpha1) != len(l_coords_alpha2):
                    ## debug dump followed by a deliberate halt
                    print(d_mmCIF1['_pdbx_poly_seq_scheme.pdb_mon_id'])
                    print(d_mmCIF2['_pdbx_poly_seq_scheme.pdb_mon_id'])
                    print('coords %s %s' %(len(l_coords_alpha1), len(l_coords_alpha2)))
                    print('seq %s' %(len(d_mmCIF1['_pdbx_poly_seq_scheme.pdb_mon_id']),))
                    print('seq %s' %(len(d_mmCIF2['_pdbx_poly_seq_scheme.pdb_mon_id']),))
                    print('%s %s' %(pdb1, pdb2))
                    d_coords1, l_coords_alpha1 = mmCIF2coords.main(pdb1, d_mmCIF1)
                    ## NOTE(review): pdb1 is passed together with d_mmCIF2 -
                    ## possibly a typo for pdb2; left unchanged
                    d_coords1, l_coords_alpha2 = mmCIF2coords.main(pdb1, d_mmCIF2)
                    print('%s %s' %(len(l_coords_alpha1), len(l_coords_alpha2)))
                    stop
                    continue

                ##
                ## align structure 1 and 2
                ##
                instance_geometry = geometry.geometry()
                rmsd = instance_geometry.superpose(l_coords_alpha1, l_coords_alpha2)
                tv1 = instance_geometry.fitcenter
                rm = instance_geometry.rotation
                tv2 = instance_geometry.refcenter

                ## structural alignment (list is updated in place)
                for i_coord in range(len(l_coords_alpha2)):
                    l_coords_alpha2[i_coord] = numpy.dot(l_coords_alpha2[i_coord]-tv1, rm)+tv2

                ##
                ## vector from structure 1 to 2
                ##
                vector = []
                for i in range(len(l_coords_alpha1)):
                    coord1 = l_coords_alpha1[i]
                    coord2 = l_coords_alpha2[i]
                    vector += [
                        coord1[0]-coord2[0],
                        coord1[1]-coord2[1],
                        coord1[2]-coord2[2],
                        ]
                vector = numpy.array(vector)

                ##
                ## calculate normal modes of structure 1 and 2
                ##
                try:
                    matrix_hessian1 = NMA.hessian_calculation(l_coords_alpha1, cutoff, verbose = False)
                    eigenvectors1, eigenvalues1 = NMA.diagonalize_hessian(matrix_hessian1, verbose = False)
                    matrix_hessian2 = NMA.hessian_calculation(l_coords_alpha2, cutoff, verbose = False)
                    eigenvectors2, eigenvalues2 = NMA.diagonalize_hessian(matrix_hessian2, verbose = False)
                except Exception:
                    continue

                ##
                ## calculate overlap between normal modes and difference vector
                ##
                eigenvector1 = eigenvectors1[6]
                eigenvector2 = eigenvectors2[6]
                overlap1 = calc_overlap(eigenvector1, vector)
                overlap2 = calc_overlap(eigenvector2, vector)
                ## mode 6 of one structure is also compared with mode 7 of the
                ## other; the largest of the three overlaps is kept
                overlap3a = calc_overlap(eigenvector1, eigenvector2)
                overlap3b = calc_overlap(eigenvectors1[6], eigenvectors2[7])
                overlap3c = calc_overlap(eigenvectors1[7], eigenvectors2[6])
                overlap3 = max(overlap3a, overlap3b, overlap3c)

                ## both mode-vs-difference overlaps go to the same file
                with open('rmsd_v_overlap2/cluster%i.txt' %(i_line), 'a') as fd:
                    fd.write('%s %s\n' %(rmsd, overlap1))
                    fd.write('%s %s\n' %(rmsd, overlap2))
                with open('rmsd_v_overlap2/cluster%i_ev_v_ev.txt' %(i_line), 'a') as fd:
                    fd.write('%s %s\n' %(rmsd, overlap3a))
                with open('rmsd_v_overlap2/cluster%i_ev_v_ev_max.txt' %(i_line), 'a') as fd:
                    fd.write('%s %s\n' %(rmsd, overlap3))

                ## one summary line per pair (same text as the former two
                ## comma-joined print statements)
                print(
                    '%s %s cluster %s size %s overlap %4.2f %4.2f %4.2f rmsd %4.2f' %(
                        pdb1, pdb2, i_line, len(l_pdbs),
                        round(overlap1,2), round(overlap2,2), round(overlap3,2),
                        round(rmsd,2),
                        )
                    )

    return
def parse_GoodVibes_exclude_flexible(pdb,path,):
    """Return the coordinate of the 'EXT' probe record with the largest B-factor.

    A normal mode analysis is run on the requested chain and per-residue
    amplitudes are derived from ``eigenvectors[6]``.  The probe file
    ``output/<path>/<pdb[:4]>_<pdb[-1]>_probe.pdb`` is then scanned for
    ATOM/HETATM records whose residue name is ``EXT``; the position of the
    record with the largest value in ``line[60:66]`` is returned.

    NOTE: every filter that would use the amplitudes (or the rigid/flexible
    residue lists) to reject probe records is currently commented out, so
    the normal mode results do not influence the returned coordinate.

    Args:
        pdb: identifier; ``pdb[:4]`` is passed on as the entry ID and
            ``pdb[-1]`` as ``query_chain``.
        path: sub-directory of ``output/`` that holds the probe file.

    Returns:
        ``numpy.array([x, y, z])`` of the selected record, or ``None`` if the
        probe file contains no ``EXT`` record.
    """

    ##
    ## calculate amplitudes
    ##
    d_mmCIF = parse_mmCIF.main(pdb[:4],)
    d_coords, l_coords_alpha = mmCIF2coords.main(
        pdb[:4],d_mmCIF,query_chain=pdb[-1],
        )
    ## single parenthesised argument prints identically under Python 2 and 3
    print(len(l_coords_alpha))

    ##
    ## eigenvector
    ##
    cutoff = 10
    matrix_hessian = NMA.hessian_calculation(l_coords_alpha,cutoff,)
    eigenvectors, eigenvalues = NMA.diagonalize_hessian(matrix_hessian)
    ## NOTE(review): index 6 is presumably the first non-trivial mode - confirm
    mode = eigenvectors[6]
    ## the eigenvector is read as consecutive (x,y,z) triplets
    l_amplitudes = [
        math.sqrt(mode[i]**2+mode[i+1]**2+mode[i+2]**2)
        for i in range(0,len(mode),3)
        ]

    ##
    ## write pdb (color by bfactor)
    ##
    ## amplitudes rescaled to the range 0-100; min and span are computed once
    ## and a zero span (all amplitudes equal) no longer divides by zero
    l_bfactors = []
    if l_amplitudes:
        amplitude_min = min(l_amplitudes)
        amplitude_span = max(l_amplitudes)-amplitude_min
        if amplitude_span:
            l_bfactors = [
                100*(amplitude-amplitude_min)/amplitude_span
                for amplitude in l_amplitudes
                ]
        else:
            l_bfactors = [0.]*len(l_amplitudes)
##    fd = open('output/%s/%s_%s_probe.pdb' %(path,pdb[:4],pdb[-1],),'r')
##    lines = fd.readlines()
##    fd.close()
##    index = [-1,None,]
##    lines_out = []
##    for line in lines:
##        record = line[:6].strip()
##        if record != 'ATOM':
##            lines_out += [line]
##        else:
##            res_no = int(line[22:26])
##            if res_no != index[1]:
##                index = [index[0]+1,res_no,]
##            bfactor = l_bfactors[index[0]]
##            line_out = '%s%6.2f%s' %(line[:60],bfactor,line[66:],)
##            lines_out += [line_out]
##    fd = open('output/%s/%s_%s_probe_color_by_amplitude.pdb' %(path,pdb[:4],pdb[-1],),'w')
##    fd.writelines(lines_out)
##    fd.close()

    ## average amplitude (do_stddev returns the average together with stddev)
    average,stddev = statistics.do_stddev(l_amplitudes)

    ## residue partitions; only consumed by the disabled filters further down
    l_coords_rigid = [
        coord_alpha
        for coord_alpha,amplitude in zip(l_coords_alpha,l_amplitudes)
        if amplitude < average
        ]
    l_coords_flexible = [
        coord_alpha
        for coord_alpha,amplitude in zip(l_coords_alpha,l_amplitudes)
        if amplitude > average+0.5*stddev
        ]

    ## parse output
    fn_probe = 'output/%s/%s_%s_probe.pdb' %(path,pdb[:4],pdb[-1],)
    with open(fn_probe,'r') as fd:
        lines = fd.readlines()

    max_bfactor = None
    coord = None
    for line in lines:
        record = line[:6].strip()
        if record not in ('ATOM','HETATM',):
            continue
        res_name = line[17:20]
        if res_name != 'EXT':
            continue
        bfactor = float(line[60:66])
        ## explicit None test: comparing a float with None only works in
        ## Python 2 (None sorts below every number) and raises in Python 3
        if max_bfactor is None or bfactor > max_bfactor:
            x = float(line[30:38])
            y = float(line[38:46])
            z = float(line[46:54])
##            coord_tmp = numpy.array([x,y,z,])
##            bool_vicinal_to_rigid = False
##            for coord_rigid in l_coords_rigid:
##                dist_from_rigid = math.sqrt(sum((coord_rigid-coord_tmp)**2))
##                if dist_from_rigid < 6:
##                    bool_vicinal_to_rigid = True
##                    break
##            if bool_vicinal_to_rigid == False:
##                continue
##            bool_vicinal_to_flexible = False
##            for coord_flexible in l_coords_flexible:
##                dist_from_flexible = math.sqrt(sum((coord_flexible-coord_tmp)**2))
##                if dist_from_flexible < 6:
##                    bool_vicinal_to_flexible = True
##                    break
##            if bool_vicinal_to_flexible == True:
##                continue
##            min_dist = [1000.,None,]
##            for i_coord_alpha in range(len(l_coords_alpha)):
##                coord_alpha = l_coords_alpha[i_coord_alpha]
##                dist_from_alpha = math.sqrt(sum((coord_alpha-coord_tmp)**2))
##                if dist_from_alpha < min_dist[0]:
##                    min_dist = [dist_from_alpha,i_coord_alpha,]
##            if l_amplitudes[min_dist[1]] > average+stddev:
##                continue
            coord = numpy.array([x,y,z,])
            max_bfactor = bfactor

    return coord
'1u3fA', '1agyA', '1zioA', '1pa9A', '2tpsA', '2plcA', '1qk2A', '1j53A', '1m21A', ] cutoff = 10 for pdb in l_pdbs: pdb = pdb[:4] d = parse_mmCIF.main(pdb, ) d_coords, l_coords = mmCIF2coords.main(pdb, d, query_chain=pdb[4:]) matrix_hessian = NMA.hessian_calculation(l_coords, cutoff, verbose=False) eigenvectors, eigenvalues = NMA.diagonalize_hessian(matrix_hessian, verbose=False) visualization.vmd_arrows(pdb, l_coords, eigenvectors) print pdb stop