def main():
    """Tabulate Matthews coefficients (MV) per space group from mmCIF entries.

    Walks the sub-directories of '/data/mmCIF' whose names lie between
    sys.argv[-2] and sys.argv[-1] (plain string comparison, both ends
    included) and parses every entry found there with parse_mmCIF.main.
    An entry contributes an MV value only if it passes every filter in the
    loop below; MV itself is computed by matthews_coefficient.main from the
    cell parameters, the summed entity formula weights and _cell.Z_PDB.

    For every space group with more than one accepted entry a line
    "average stderr n spacegroup" is written to 'MV_v_spacegroup.txt' in the
    current working directory.

    Raises:
        RuntimeError: if an entry lists an upper-case 'SPLIT' relation.  The
            previous code also halted at this point, but did so through an
            undefined name.
    """

    import math

    path = '/data/mmCIF'

    ## entries skipped by hand
    l_pdbs_excluded = [
        ## threshold
        '1e54', '1e9i',
        ## difference between calculated MV and MV in mmCIF
        '3eiq',
        ## The crystals diffracted to 1.7Angstrom and appeared to be I centered tetragonal with
        ## unit cell dimension a=198.42Angstrom and c=396.6Angstrom, however the data only merged successfully in P1
        ## unit cell a=196.61 b=196.48 c=240.63 alpha=65.91 beta=65.91 gamma=90.01.
        ## Toscana has published with Hellinga...
        '2cjf', '2bt4',
        ]

    ## tmp!!! only these space groups are tabulated at the moment
    l_spacegroups = ['F 4 3 2', 'F 41 3 2', 'I 41 3 2']

    d_MV = {}

    l_dn = os.listdir(path)
    l_dn.sort()
    for dn in l_dn:
        if dn == 'mmCIF.py':
            continue
        if dn < sys.argv[-2]:
            continue
        if dn > sys.argv[-1]:
            continue

        for fn in os.listdir('%s/%s' % (path, dn)):
            pdb = fn[:4]
            d_mmCIF = parse_mmCIF.main(
                pdb,
                d_breaks={'_exptl.method': 'SOLUTION NMR'},
                l_data_categories=[
                    '_cell', '_entity', '_exptl', '_exptl_crystal',
                    '_entity_poly',
                    '_symmetry',
                    ## virus
                    '_pdbx_struct_assembly',
                    ## split structure
                    '_pdbx_database_related',
                    ],
                )

            ## x-ray structure
            if d_mmCIF['_exptl.method'] != ['X-RAY DIFFRACTION']:
                continue

            ## polymer present
            if '_entity_poly.type' not in d_mmCIF.keys():
                continue

            ## only polymer present is protein
            l_poly_types = d_mmCIF['_entity_poly.type']
            if l_poly_types != len(l_poly_types) * ['polypeptide(L)']:
                continue

            if '_pdbx_struct_assembly.oligomeric_count' not in d_mmCIF.keys():
                continue
            l_counts = d_mmCIF['_pdbx_struct_assembly.oligomeric_count']
            if l_counts == len(l_counts) * ['?']:
                continue

            ## virus
            if int(l_counts[0]) % 60 == 0:
                continue

            ## not monomer
            if l_counts != len(l_counts) * ['1']:
                continue

            ## split structure
            if '_pdbx_database_related' in d_mmCIF.keys():
                if 'split' in d_mmCIF['_pdbx_database_related']:
                    continue
                if 'SPLIT' in d_mmCIF['_pdbx_database_related']:
                    ## deliberate halt: this spelling was not expected
                    print(pdb)
                    raise RuntimeError(
                        'unexpected upper-case SPLIT relation for %s' % (pdb,))

            if '_cell.Z_PDB' not in d_mmCIF.keys():
                continue

            if pdb in l_pdbs_excluded:
                continue

            a = float(d_mmCIF['_cell.length_a'][0])
            b = float(d_mmCIF['_cell.length_b'][0])
            c = float(d_mmCIF['_cell.length_c'][0])
            alpha = float(d_mmCIF['_cell.angle_alpha'][0])
            beta = float(d_mmCIF['_cell.angle_beta'][0])
            gamma = float(d_mmCIF['_cell.angle_gamma'][0])
            Z = int(d_mmCIF['_cell.Z_PDB'][0])

            ## sum the formula weights of all entities
            mw = 0
            for i in range(len(d_mmCIF['_entity.id'])):
                s = d_mmCIF['_entity.formula_weight'][i]
                ## unknown ligand
                if s == '?':
                    continue
                mw += float(s)

            MV = matthews_coefficient.main(a, b, c, alpha, beta, gamma, mw, Z)

            spacegroup = ''.join(d_mmCIF['_symmetry.space_group_name_H-M'])
            if spacegroup not in l_spacegroups:
                continue  ## tmp!!!

            ## The deposited MV is optional; look it up once so that the
            ## diagnostics below cannot fail on a missing key.
            l_density = None
            if '_exptl_crystal.density_Matthews' in d_mmCIF.keys():
                l_density = d_mmCIF['_exptl_crystal.density_Matthews']

            ## implausibly large MV: report the entry and skip it
            if MV > 10:
                alpha_rad = alpha * math.pi / 180.
                beta_rad = beta * math.pi / 180.
                gamma_rad = gamma * math.pi / 180.
                V = a * b * c * math.sqrt(
                    1
                    - math.cos(alpha_rad)**2
                    - math.cos(beta_rad)**2
                    - math.cos(gamma_rad)**2
                    + 2 * (math.cos(alpha_rad) * math.cos(beta_rad)
                           * math.cos(gamma_rad)))
                print(pdb)
                print('mw %s' % (mw,))
                print('MV %s %s' % (MV, l_density))
                print('Z %s' % (Z,))
                print('V %s' % (V,))
                continue

            ## calculated MV disagrees with the deposited one: report and skip
            if l_density is not None and l_density != len(l_density) * ['?']:
                if abs(MV - float(l_density[0])) > 1:
                    print('MV %s' % (MV,))
                    print('MV %s' % (l_density,))
                    print('mw %s' % (mw,))
                    print('Z %s' % (Z,))
                    continue

            if spacegroup not in d_MV.keys():
                d_MV[spacegroup] = []
            d_MV[spacegroup].append(MV)

            print('%s %s %s' % (pdb, round(MV, 2), spacegroup))

    ## NOTE(review): the header says MV_stddev but the column written below is
    ## the standard error; the header text is left unchanged in case something
    ## downstream reads it - confirm and rename.
    l = ['# MV_average MV_stddev n spacegroup\n']
    for spacegroup in d_MV.keys():
        l_MV = d_MV[spacegroup]
        if len(l_MV) <= 1:
            continue
        ## NOTE(review): stddev is computed but not written; the call is kept
        ## because do_stddev is defined outside this block.
        average, stddev = statistics.do_stddev(l_MV)
        average, stderr = statistics.do_stderr(l_MV)
        l.append('%s %s %s %s\n' % (average, stderr, len(l_MV), spacegroup))

    with open('MV_v_spacegroup.txt', 'w') as fd:
        fd.writelines(l)
def _plot_rmsd_histograms(d_rmsds, d_columns, d_divs, d_octals, d_units,
                          l_radii, l_distances, suffix):
    """Write and plot RMSD histograms per measure, distance bin and radius.

    d_rmsds is indexed as d_rmsds[radius][key][dist_from_mut] -> list of RMSDs.
    RMSDs are counted in bins of width 1/d_divs[key].  One
    'histogram_<key>_distfrommut<bin>_radius<radius>.gnuplotdata' file is
    written per combination and plot_histogram is called once per measure and
    distance bin.

    NOTE(review): this code was unreachable in main before being moved here
    and has not been exercised recently.  min()/max() below still raise
    ValueError when no histogram bin holds more than 10 values.
    """
    l_bins = ['all'] + list(l_distances)

    for key in d_columns.keys():
        xlabel = '%s RMSD (%s)' % (d_octals[key], d_units[key])
        div = d_divs[key]

        for radius in l_radii:

            ## organize data: each distance goes into the first bin whose
            ## upper limit it does not exceed, and always into 'all'
            d_l_rmsds_histogram = {}
            for dist in l_bins:
                d_l_rmsds_histogram[dist] = []
            for dist_from_mut in d_rmsds[radius][key].keys():
                l_rmsds = d_rmsds[radius][key][dist_from_mut]
                for dist in l_distances:
                    if dist_from_mut <= dist:
                        d_l_rmsds_histogram[dist] += l_rmsds
                        break
                d_l_rmsds_histogram['all'] += l_rmsds

            ## count (yvalues); bins never seen simply stay absent, so an
            ## empty distance bin no longer needs min()/max()
            d_count = {}
            l_range = []
            for dist in l_bins:
                d_count[dist] = {}
                for rmsd in d_l_rmsds_histogram[dist]:
                    i_bin = int(div * rmsd)
                    d_count[dist][i_bin] = d_count[dist].get(i_bin, 0) + 1
                for i_bin, count in d_count[dist].items():
                    if count > 10:
                        l_range.append(i_bin)

            ## convert dict to txt; only the range of well populated bins
            i_bin_min = int(min(l_range))
            i_bin_max = int(max(l_range))
            for dist in l_bins:
                lines = []
                for i_bin in range(i_bin_min, i_bin_max + 1):
                    count = d_count[dist].get(i_bin, 0)
                    lines.append('%s %s\n' % (i_bin / div, count))
                fn = 'histogram_%s_distfrommut%s_radius%s.gnuplotdata' % (
                    key, dist, radius)
                with open(fn, 'w') as fd:
                    fd.writelines(lines)

        for dist in l_bins:
            l_plot_files = [
                'histogram_%s_distfrommut%s_radius%s.gnuplotdata' % (
                    key, dist, radius)
                for radius in l_radii
                ]
            prefix = 'histogram_%s_distfrommut%s' % (key, dist,)
            if dist == 'all':
                title = 'all distances from site of mutation'
            else:
                title = '%s {\305} from site of mutation' % (dist)
            plot_histogram(
                prefix, suffix, xlabel, 'count of RMSDs',
                l_plot_files=l_plot_files, title=title)


def main(plot_histograms=False):
    """Plot RMSD within spheres against the distance from the mutation site.

    Reads 'sphere/out_sameSG_allresidues.txt' (whitespace separated columns).
    Columns 0 and 1 are compared upper-cased against s_exclude, column 8 is
    the distance from the mutation (truncated to an integer) and, for each
    measure in d_columns, the columns starting at its offset hold one RMSD
    per sphere radius in l_radii.

    Output, all in the current working directory:
      - 'histogram.gnuplotdata' with the number of CA RMSDs (radius 10) per
        distance, plotted with plot_histogram;
      - '<measure>_<radius>.gnuplotdata' with average and standard error of
        the RMSDs per distance, plotted with plot_scatter and finally
        plot_scatter_combined.

    Args:
        plot_histograms: when true also write and plot the per-measure RMSD
            histograms.  The default keeps them switched off, as before.

    Raises:
        KeyError: if a distance falls outside the pre-allocated 0..83 range.
    """

    d_units = {'CA': '{\305}', 'heavy': '\305', 'phipsi': '/Symbol \260', 'chi1': '/Symbol \260'}  ## octals
    d_octals = {'CA': 'C_{/Symbol a}', 'heavy': 'heavy atom', 'phipsi': '{/Symbol f}{/Symbol y}', 'chi1': '{/Symbol c}_1'}  ## octals

    l_radii = [10, 20, 40]
    d_columns = {'CA': 9, 'heavy': 13, 'phipsi': 17, 'chi1': 21}
    l_distances = [0, 10, 20, 40]
    suffix = 'sameSG'
    ## divide 1Angstrom into smaller units/tics
    d_divs = {'CA': 500., 'heavy': 500., 'phipsi': 50., 'chi1': 20.}
    ## distances with fewer RMSDs than this are left out of the scatter data
    min_count = 1000

    d_rmsds = {}
    for radius in l_radii:
        d_rmsds[radius] = {}
        for key in d_columns.keys():
            d_rmsds[radius][key] = {}
            for dist_from_mut in range(0, 83 + 1):
                d_rmsds[radius][key][dist_from_mut] = []

    ## the function sphere in quakes.py wrote this file when using the -singlemutants flag
    fn = 'sphere/out_%s_allresidues.txt' % (suffix)
    with open(fn, 'r') as fd:
        lines = fd.readlines()
    print('read %s' % (fn,))

    print('looping over lines')
    for line in lines:
        l = line.split()
        ## a blank line carries no data
        if not l:
            continue
        ## exclude HEWL and T4L
        if l[0].upper() in s_exclude or l[1].upper() in s_exclude:
            continue
        dist_from_mut = int(float(l[8]))
        for key, col in d_columns.items():
            for i_radii, radius in enumerate(l_radii):
                rmsd = l[col + i_radii]
                if rmsd == 'None':
                    continue
                d_rmsds[radius][key][dist_from_mut].append(float(rmsd))
    print('looped over lines')

    ##
    ## histogram
    ##
    d_CA = d_rmsds[10]['CA']
    lines = [
        '%s %s\n' % (dist_from_mut, len(d_CA[dist_from_mut]))
        for dist_from_mut in sorted(d_CA.keys())
        ]
    with open('histogram.gnuplotdata', 'w') as fd:
        fd.writelines(lines)
    plot_histogram('histogram', suffix, 'distance ({\305}) from mutation', 'count of RMSDs',)

    ##
    ## histograms
    ##
    if plot_histograms:
        _plot_rmsd_histograms(
            d_rmsds, d_columns, d_divs, d_octals, d_units,
            l_radii, l_distances, suffix)
    else:
        print('temporarily dont plot histograms')

    ##
    ## scatter
    ##
    for key in d_columns.keys():
        xlabel = 'distance ({\305}) from mutation'
        ylabel = 'RMSD ({Symbol %s}) within sphere' % (d_units[key],)
        title = '%s' % (d_octals[key])
        for i_radii, radius in enumerate(l_radii):
            lines = []
            for dist_from_mut in sorted(d_rmsds[radius][key].keys()):
                l_rmsds = d_rmsds[radius][key][dist_from_mut]
                if len(l_rmsds) < min_count:
                    if dist_from_mut < 10:
                        print('skipping %s %s %s' % (dist_from_mut, key, radius))
                    continue
                average, stderr = statistics.do_stderr(l_rmsds)
                ## shift x slightly per radius so the points do not overlap
                lines.append(
                    '%s %s %s\n' % (dist_from_mut + 0.1 * i_radii, average, stderr))
            with open('%s_%s.gnuplotdata' % (key, radius), 'w') as fd:
                fd.writelines(lines)

        plot_scatter(key, xlabel, ylabel, l_radii, title=title,)

    plot_scatter_combined(suffix)
def _read_backbone_bfactors(lines, fn, pdb):
    """Collect backbone B-factors from the lines of one PDB file.

    Returns (l_bfactors, d_bfactors2, year).  l_bfactors is the flat list of
    B-factors of the N, CA, C and O atoms of the residues known to
    init_bfactor_dic, d_bfactors2 holds the same values indexed as
    [res_name][ss][atom_name], and year is the two-digit year taken from the
    HEADER record (None without one).  Parsing stops at an EXPDTA record that
    is not plain X-ray diffraction, which normally leaves l_bfactors empty.

    Raises:
        RuntimeError: on records the parser was never taught to handle (the
            previous code halted on them through an undefined name).
    """
    l_prefixes_skip = (
        'NMR',
        'SOLUTION NMR',
        'SOLID-STATE NMR',
        'X-RAY DIFFRACTION; SOLUTION NMR',  ## 1iob
        'SOLUTION SCATTERING; SOLUTION NMR',  ## 2klj
        'ELECTRON MICROSCOPY',
        )

    l_bfactors = []
    d_bfactors2 = init_bfactor_dic()
    d_ss = {}
    year = None

    for line in lines:
        record = line[:6].strip()

        if record == 'EXPDTA':
            method = line[10:]
            if method.startswith(l_prefixes_skip):
                break
            if not method.startswith('X-RAY DIFFRACTION'):
                print(line)
                break
            if 'NMR' in line:
                print(fn)
                print(line)
                raise RuntimeError('unexpected EXPDTA record in %s' % (fn,))

        elif record == 'HEADER':
            year = int(line[50:59][-2:])

        elif record == 'HELIX':
            ## first and last residue must belong to the same chain
            if line[19] != line[31]:
                print(line)
                print(pdb)
                raise RuntimeError('HELIX spans two chains in %s' % (pdb,))
            chain = line[19]
            res_no1 = int(line[21:25])
            iCode1 = line[25]
            res_no2 = int(line[33:37])
            iCode2 = line[37]
            d_ss = SSrecord2dic(
                record, chain, res_no1, res_no2, iCode1, iCode2, d_ss)

        elif record == 'SHEET':
            if line[21] != line[32]:
                print(line)
                print(pdb)
                raise RuntimeError('SHEET spans two chains in %s' % (pdb,))
            chain = line[21]
            res_no1 = int(line[22:26])
            iCode1 = line[26]
            res_no2 = int(line[33:37])
            iCode2 = line[37]
            d_ss = SSrecord2dic(
                record, chain, res_no1, res_no2, iCode1, iCode2, d_ss)

        elif record == 'TURN':
            ## TURN records were never handled; the old code halted here too
            raise RuntimeError('TURN record not supported (%s)' % (pdb,))

        elif record == 'ATOM':
            res_name = line[17:20]
            ## amino acid residues only
            if res_name not in d_bfactors2.keys():
                continue
            atom_name = line[12:16].strip()
            ## backbone atoms only
            if atom_name not in ['N', 'CA', 'C', 'O']:
                continue
            chain = line[21]
            res_no = int(line[22:26])
            iCode = line[26]
            bfactor = float(line[60:66])
            l_bfactors.append(bfactor)
            ss = ATOMrecord2SS(chain, res_no, iCode, d_ss,)
            d_bfactors2[res_name][ss][atom_name].append(bfactor)

    return l_bfactors, d_bfactors2, year


def _append_normalized_bfactors(path):
    """Scan the PDB files below path and append normalised B-factors to disk.

    For every file in every sub-directory of path the backbone B-factors are
    divided by their average over the file and appended, one value per line,
    to 'bfac_<res_name>_<ss>_<atom_name>.txt' in the current working
    directory.  Files are opened in append mode, so scanning the same files
    twice duplicates their values.
    """
    import os

    ## averages that are exact round numbers look like placeholder values
    l_averages_suspicious = [
        2, 3, 4, 5, 6, 7, 8, 9, 99, 90, 50, 20, 25, 1, 100, 10, 0]

    l_dns = os.listdir(path)
    l_dns.sort()
    for i, dn in enumerate(l_dns):
        print('%s/%s %s' % (i + 1, len(l_dns), dn))
        for fn in os.listdir('%s/%s' % (path, dn)):
            pdb = fn[3:7]
            if pdb in ['2wto', '2wtp']:  ## remediation
                continue

            with open('%s/%s/%s' % (path, dn, fn), 'r') as fd:
                lines = fd.readlines()
            l_bfactors, d_bfactors2, year = _read_backbone_bfactors(
                lines, fn, pdb)

            ## no amino acids (or break)
            if len(l_bfactors) == 0:
                continue

            average = sum(l_bfactors) / len(l_bfactors)
            ## NMR structure
            if average == 0:
                continue
            ## identical temperature factors for all atoms
            if len(l_bfactors) * [average] == l_bfactors:
                print('all bfactors same %s year %s bfac %s' % (
                    pdb, year, average))
                continue
            if average in l_averages_suspicious:
                print(average)
                print(fn)
                raise RuntimeError(
                    'suspicious average B-factor %s in %s' % (average, fn))

            for res_name in d_bfactors2.keys():
                for ss in d_bfactors2[res_name].keys():
                    for atom_name in d_bfactors2[res_name][ss].keys():
                        l_bfactors_atom = d_bfactors2[res_name][ss][atom_name]
                        if len(l_bfactors_atom) == 0:
                            continue
                        lines_out = [
                            '%s\n' % (bfac / average)
                            for bfac in l_bfactors_atom
                            ]
                        fn_out = 'bfac_%s_%s_%s.txt' % (res_name, ss, atom_name)
                        with open(fn_out, 'a') as fd:
                            fd.writelines(lines_out)


def main(rescan=False):
    """Print the mean normalised backbone B-factor per residue type.

    Reads the 'bfac_<res_name>_<ss>_<atom_name>.txt' files in the current
    working directory for every residue name and atom name known to
    init_bfactor_dic and for the classes HELIX, SHEET and OTHER, pools the
    values per residue and prints one tab separated line per residue:
    name, number of values, mean and standard error.

    Args:
        rescan: when true, first scan the PDB files under
            '/media/WDMyBook1TB/2TB/pdb' and append their normalised
            B-factors to the bfac files.  The scan was switched off in the
            previous code, so it stays off by default; with it off the PDB
            directory is no longer touched at all.

    Raises:
        IOError/OSError: if one of the bfac files cannot be opened.
    """

    d_bfactors1 = init_bfactor_dic()

    path = '/media/WDMyBook1TB/2TB/pdb'

    if rescan:
        _append_normalized_bfactors(path)

    for res_name in d_bfactors1.keys():
        l_bfactors_res = []
        for ss in ['HELIX', 'SHEET', 'OTHER']:
            for atom_name in d_bfactors1[res_name][ss].keys():
                fn = 'bfac_%s_%s_%s.txt' % (res_name, ss, atom_name)
                with open(fn, 'r') as fd:
                    l_bfactors_res += [float(line) for line in fd]

        ## only the standard error is taken from do_stderr; the mean printed
        ## below is computed here
        _average, stderr = statistics.do_stderr(l_bfactors_res)
        print('%s\t%s\t%s\t%s' % (
            res_name,
            len(l_bfactors_res),
            sum(l_bfactors_res) / len(l_bfactors_res),
            stderr,
            ))
def _plot_rmsd_histograms(d_rmsds, d_columns, d_divs, d_octals, d_units,
                          l_radii, l_distances, suffix):
    """Write and plot RMSD histograms per measure, distance bin and radius.

    d_rmsds is indexed as d_rmsds[radius][key][dist_from_mut] -> list of RMSDs.
    RMSDs are counted in bins of width 1/d_divs[key].  One
    'histogram_<key>_distfrommut<bin>_radius<radius>.gnuplotdata' file is
    written per combination and plot_histogram is called once per measure and
    distance bin.

    NOTE(review): this code was unreachable in main before being moved here
    and has not been exercised recently.  min()/max() below still raise
    ValueError when no histogram bin holds more than 10 values.
    """
    l_bins = ['all'] + list(l_distances)

    for key in d_columns.keys():
        xlabel = '%s RMSD (%s)' % (d_octals[key], d_units[key])
        div = d_divs[key]

        for radius in l_radii:

            ## organize data: each distance goes into the first bin whose
            ## upper limit it does not exceed, and always into 'all'
            d_l_rmsds_histogram = {}
            for dist in l_bins:
                d_l_rmsds_histogram[dist] = []
            for dist_from_mut in d_rmsds[radius][key].keys():
                l_rmsds = d_rmsds[radius][key][dist_from_mut]
                for dist in l_distances:
                    if dist_from_mut <= dist:
                        d_l_rmsds_histogram[dist] += l_rmsds
                        break
                d_l_rmsds_histogram['all'] += l_rmsds

            ## count (yvalues); bins never seen simply stay absent, so an
            ## empty distance bin no longer needs min()/max()
            d_count = {}
            l_range = []
            for dist in l_bins:
                d_count[dist] = {}
                for rmsd in d_l_rmsds_histogram[dist]:
                    i_bin = int(div * rmsd)
                    d_count[dist][i_bin] = d_count[dist].get(i_bin, 0) + 1
                for i_bin, count in d_count[dist].items():
                    if count > 10:
                        l_range.append(i_bin)

            ## convert dict to txt; only the range of well populated bins
            i_bin_min = int(min(l_range))
            i_bin_max = int(max(l_range))
            for dist in l_bins:
                lines = []
                for i_bin in range(i_bin_min, i_bin_max + 1):
                    count = d_count[dist].get(i_bin, 0)
                    lines.append('%s %s\n' % (i_bin / div, count))
                fn = 'histogram_%s_distfrommut%s_radius%s.gnuplotdata' % (
                    key, dist, radius)
                with open(fn, 'w') as fd:
                    fd.writelines(lines)

        for dist in l_bins:
            l_plot_files = [
                'histogram_%s_distfrommut%s_radius%s.gnuplotdata' % (
                    key, dist, radius)
                for radius in l_radii
                ]
            prefix = 'histogram_%s_distfrommut%s' % (key, dist,)
            if dist == 'all':
                title = 'all distances from site of mutation'
            else:
                title = '%s {\305} from site of mutation' % (dist)
            plot_histogram(
                prefix, suffix, xlabel, 'count of RMSDs',
                l_plot_files=l_plot_files, title=title)


def main(plot_histograms=False):
    """Plot RMSD within spheres against the distance from the mutation site.

    Reads 'sphere/out_sameSG_allresidues.txt' (whitespace separated columns).
    Columns 0 and 1 are compared upper-cased against s_exclude, column 8 is
    the distance from the mutation (truncated to an integer) and, for each
    measure in d_columns, the columns starting at its offset hold one RMSD
    per sphere radius in l_radii.

    Output, all in the current working directory:
      - 'histogram.gnuplotdata' with the number of CA RMSDs (radius 10) per
        distance, plotted with plot_histogram;
      - '<measure>_<radius>.gnuplotdata' with average and standard error of
        the RMSDs per distance, plotted with plot_scatter and finally
        plot_scatter_combined.

    Args:
        plot_histograms: when true also write and plot the per-measure RMSD
            histograms.  The default keeps them switched off, as before.

    Raises:
        KeyError: if a distance falls outside the pre-allocated 0..83 range.
    """

    d_units = {'CA': '{\305}', 'heavy': '\305', 'phipsi': '/Symbol \260', 'chi1': '/Symbol \260'}  ## octals
    d_octals = {'CA': 'C_{/Symbol a}', 'heavy': 'heavy atom', 'phipsi': '{/Symbol f}{/Symbol y}', 'chi1': '{/Symbol c}_1'}  ## octals

    l_radii = [10, 20, 40]
    d_columns = {'CA': 9, 'heavy': 13, 'phipsi': 17, 'chi1': 21}
    l_distances = [0, 10, 20, 40]
    suffix = 'sameSG'
    ## divide 1Angstrom into smaller units/tics
    d_divs = {'CA': 500., 'heavy': 500., 'phipsi': 50., 'chi1': 20.}
    ## distances with fewer RMSDs than this are left out of the scatter data
    min_count = 1000

    d_rmsds = {}
    for radius in l_radii:
        d_rmsds[radius] = {}
        for key in d_columns.keys():
            d_rmsds[radius][key] = {}
            for dist_from_mut in range(0, 83 + 1):
                d_rmsds[radius][key][dist_from_mut] = []

    ## the function sphere in quakes.py wrote this file when using the -singlemutants flag
    fn = 'sphere/out_%s_allresidues.txt' % (suffix)
    with open(fn, 'r') as fd:
        lines = fd.readlines()
    print('read %s' % (fn,))

    print('looping over lines')
    for line in lines:
        l = line.split()
        ## a blank line carries no data
        if not l:
            continue
        ## exclude HEWL and T4L
        if l[0].upper() in s_exclude or l[1].upper() in s_exclude:
            continue
        dist_from_mut = int(float(l[8]))
        for key, col in d_columns.items():
            for i_radii, radius in enumerate(l_radii):
                rmsd = l[col + i_radii]
                if rmsd == 'None':
                    continue
                d_rmsds[radius][key][dist_from_mut].append(float(rmsd))
    print('looped over lines')

    ##
    ## histogram
    ##
    d_CA = d_rmsds[10]['CA']
    lines = [
        '%s %s\n' % (dist_from_mut, len(d_CA[dist_from_mut]))
        for dist_from_mut in sorted(d_CA.keys())
        ]
    with open('histogram.gnuplotdata', 'w') as fd:
        fd.writelines(lines)
    plot_histogram('histogram', suffix, 'distance ({\305}) from mutation', 'count of RMSDs',)

    ##
    ## histograms
    ##
    if plot_histograms:
        _plot_rmsd_histograms(
            d_rmsds, d_columns, d_divs, d_octals, d_units,
            l_radii, l_distances, suffix)
    else:
        print('temporarily dont plot histograms')

    ##
    ## scatter
    ##
    for key in d_columns.keys():
        xlabel = 'distance ({\305}) from mutation'
        ylabel = 'RMSD ({Symbol %s}) within sphere' % (d_units[key],)
        title = '%s' % (d_octals[key])
        for i_radii, radius in enumerate(l_radii):
            lines = []
            for dist_from_mut in sorted(d_rmsds[radius][key].keys()):
                l_rmsds = d_rmsds[radius][key][dist_from_mut]
                if len(l_rmsds) < min_count:
                    if dist_from_mut < 10:
                        print('skipping %s %s %s' % (dist_from_mut, key, radius))
                    continue
                average, stderr = statistics.do_stderr(l_rmsds)
                ## shift x slightly per radius so the points do not overlap
                lines.append(
                    '%s %s %s\n' % (dist_from_mut + 0.1 * i_radii, average, stderr))
            with open('%s_%s.gnuplotdata' % (key, radius), 'w') as fd:
                fd.writelines(lines)

        plot_scatter(key, xlabel, ylabel, l_radii, title=title,)

    plot_scatter_combined(suffix)
def _read_backbone_bfactors(lines, fn, pdb):
    """Collect backbone B-factors from the lines of one PDB file.

    Returns (l_bfactors, d_bfactors2, year).  l_bfactors is the flat list of
    B-factors of the N, CA, C and O atoms of the residues known to
    init_bfactor_dic, d_bfactors2 holds the same values indexed as
    [res_name][ss][atom_name], and year is the two-digit year taken from the
    HEADER record (None without one).  Parsing stops at an EXPDTA record that
    is not plain X-ray diffraction, which normally leaves l_bfactors empty.

    Raises:
        RuntimeError: on records the parser was never taught to handle (the
            previous code halted on them through an undefined name).
    """
    l_prefixes_skip = (
        'NMR',
        'SOLUTION NMR',
        'SOLID-STATE NMR',
        'X-RAY DIFFRACTION; SOLUTION NMR',  ## 1iob
        'SOLUTION SCATTERING; SOLUTION NMR',  ## 2klj
        'ELECTRON MICROSCOPY',
        )

    l_bfactors = []
    d_bfactors2 = init_bfactor_dic()
    d_ss = {}
    year = None

    for line in lines:
        record = line[:6].strip()

        if record == 'EXPDTA':
            method = line[10:]
            if method.startswith(l_prefixes_skip):
                break
            if not method.startswith('X-RAY DIFFRACTION'):
                print(line)
                break
            if 'NMR' in line:
                print(fn)
                print(line)
                raise RuntimeError('unexpected EXPDTA record in %s' % (fn,))

        elif record == 'HEADER':
            year = int(line[50:59][-2:])

        elif record == 'HELIX':
            ## first and last residue must belong to the same chain
            if line[19] != line[31]:
                print(line)
                print(pdb)
                raise RuntimeError('HELIX spans two chains in %s' % (pdb,))
            chain = line[19]
            res_no1 = int(line[21:25])
            iCode1 = line[25]
            res_no2 = int(line[33:37])
            iCode2 = line[37]
            d_ss = SSrecord2dic(
                record, chain, res_no1, res_no2, iCode1, iCode2, d_ss)

        elif record == 'SHEET':
            if line[21] != line[32]:
                print(line)
                print(pdb)
                raise RuntimeError('SHEET spans two chains in %s' % (pdb,))
            chain = line[21]
            res_no1 = int(line[22:26])
            iCode1 = line[26]
            res_no2 = int(line[33:37])
            iCode2 = line[37]
            d_ss = SSrecord2dic(
                record, chain, res_no1, res_no2, iCode1, iCode2, d_ss)

        elif record == 'TURN':
            ## TURN records were never handled; the old code halted here too
            raise RuntimeError('TURN record not supported (%s)' % (pdb,))

        elif record == 'ATOM':
            res_name = line[17:20]
            ## amino acid residues only
            if res_name not in d_bfactors2.keys():
                continue
            atom_name = line[12:16].strip()
            ## backbone atoms only
            if atom_name not in ['N', 'CA', 'C', 'O']:
                continue
            chain = line[21]
            res_no = int(line[22:26])
            iCode = line[26]
            bfactor = float(line[60:66])
            l_bfactors.append(bfactor)
            ss = ATOMrecord2SS(chain, res_no, iCode, d_ss,)
            d_bfactors2[res_name][ss][atom_name].append(bfactor)

    return l_bfactors, d_bfactors2, year


def _append_normalized_bfactors(path):
    """Scan the PDB files below path and append normalised B-factors to disk.

    For every file in every sub-directory of path the backbone B-factors are
    divided by their average over the file and appended, one value per line,
    to 'bfac_<res_name>_<ss>_<atom_name>.txt' in the current working
    directory.  Files are opened in append mode, so scanning the same files
    twice duplicates their values.
    """
    import os

    ## averages that are exact round numbers look like placeholder values
    l_averages_suspicious = [
        2, 3, 4, 5, 6, 7, 8, 9, 99, 90, 50, 20, 25, 1, 100, 10, 0]

    l_dns = os.listdir(path)
    l_dns.sort()
    for i, dn in enumerate(l_dns):
        print('%s/%s %s' % (i + 1, len(l_dns), dn))
        for fn in os.listdir('%s/%s' % (path, dn)):
            pdb = fn[3:7]
            if pdb in ['2wto', '2wtp']:  ## remediation
                continue

            with open('%s/%s/%s' % (path, dn, fn), 'r') as fd:
                lines = fd.readlines()
            l_bfactors, d_bfactors2, year = _read_backbone_bfactors(
                lines, fn, pdb)

            ## no amino acids (or break)
            if len(l_bfactors) == 0:
                continue

            average = sum(l_bfactors) / len(l_bfactors)
            ## NMR structure
            if average == 0:
                continue
            ## identical temperature factors for all atoms
            if len(l_bfactors) * [average] == l_bfactors:
                print('all bfactors same %s year %s bfac %s' % (
                    pdb, year, average))
                continue
            if average in l_averages_suspicious:
                print(average)
                print(fn)
                raise RuntimeError(
                    'suspicious average B-factor %s in %s' % (average, fn))

            for res_name in d_bfactors2.keys():
                for ss in d_bfactors2[res_name].keys():
                    for atom_name in d_bfactors2[res_name][ss].keys():
                        l_bfactors_atom = d_bfactors2[res_name][ss][atom_name]
                        if len(l_bfactors_atom) == 0:
                            continue
                        lines_out = [
                            '%s\n' % (bfac / average)
                            for bfac in l_bfactors_atom
                            ]
                        fn_out = 'bfac_%s_%s_%s.txt' % (res_name, ss, atom_name)
                        with open(fn_out, 'a') as fd:
                            fd.writelines(lines_out)


def main(rescan=False):
    """Print the mean normalised backbone B-factor per residue type.

    Reads the 'bfac_<res_name>_<ss>_<atom_name>.txt' files in the current
    working directory for every residue name and atom name known to
    init_bfactor_dic and for the classes HELIX, SHEET and OTHER, pools the
    values per residue and prints one tab separated line per residue:
    name, number of values, mean and standard error.

    Args:
        rescan: when true, first scan the PDB files under
            '/media/WDMyBook1TB/2TB/pdb' and append their normalised
            B-factors to the bfac files.  The scan was switched off in the
            previous code, so it stays off by default; with it off the PDB
            directory is no longer touched at all.

    Raises:
        IOError/OSError: if one of the bfac files cannot be opened.
    """

    d_bfactors1 = init_bfactor_dic()

    path = '/media/WDMyBook1TB/2TB/pdb'

    if rescan:
        _append_normalized_bfactors(path)

    for res_name in d_bfactors1.keys():
        l_bfactors_res = []
        for ss in ['HELIX', 'SHEET', 'OTHER']:
            for atom_name in d_bfactors1[res_name][ss].keys():
                fn = 'bfac_%s_%s_%s.txt' % (res_name, ss, atom_name)
                with open(fn, 'r') as fd:
                    l_bfactors_res += [float(line) for line in fd]

        ## only the standard error is taken from do_stderr; the mean printed
        ## below is computed here
        _average, stderr = statistics.do_stderr(l_bfactors_res)
        print('%s\t%s\t%s\t%s' % (
            res_name,
            len(l_bfactors_res),
            sum(l_bfactors_res) / len(l_bfactors_res),
            stderr,
            ))
def main():
    """Tabulate Matthews coefficients (MV) per space group from mmCIF entries.

    Walks the sub-directories of '/data/mmCIF' whose names lie between
    sys.argv[-2] and sys.argv[-1] (plain string comparison, both ends
    included) and parses every entry found there with parse_mmCIF.main.
    An entry contributes an MV value only if it passes every filter in the
    loop below; MV itself is computed by matthews_coefficient.main from the
    cell parameters, the summed entity formula weights and _cell.Z_PDB.

    For every space group with more than one accepted entry a line
    "average stderr n spacegroup" is written to 'MV_v_spacegroup.txt' in the
    current working directory.

    Raises:
        RuntimeError: if an entry lists an upper-case 'SPLIT' relation.  The
            previous code also halted at this point, but did so through an
            undefined name.
    """

    import math

    path = '/data/mmCIF'

    ## entries skipped by hand
    l_pdbs_excluded = [
        ## threshold
        '1e54', '1e9i',
        ## difference between calculated MV and MV in mmCIF
        '3eiq',
        ## The crystals diffracted to 1.7Angstrom and appeared to be I centered tetragonal with
        ## unit cell dimension a=198.42Angstrom and c=396.6Angstrom, however the data only merged successfully in P1
        ## unit cell a=196.61 b=196.48 c=240.63 alpha=65.91 beta=65.91 gamma=90.01.
        ## Toscana has published with Hellinga...
        '2cjf', '2bt4',
        ]

    ## tmp!!! only these space groups are tabulated at the moment
    l_spacegroups = ['F 4 3 2', 'F 41 3 2', 'I 41 3 2']

    d_MV = {}

    l_dn = os.listdir(path)
    l_dn.sort()
    for dn in l_dn:
        if dn == 'mmCIF.py':
            continue
        if dn < sys.argv[-2]:
            continue
        if dn > sys.argv[-1]:
            continue

        for fn in os.listdir('%s/%s' % (path, dn)):
            pdb = fn[:4]
            d_mmCIF = parse_mmCIF.main(
                pdb,
                d_breaks={'_exptl.method': 'SOLUTION NMR'},
                l_data_categories=[
                    '_cell', '_entity', '_exptl', '_exptl_crystal',
                    '_entity_poly',
                    '_symmetry',
                    ## virus
                    '_pdbx_struct_assembly',
                    ## split structure
                    '_pdbx_database_related',
                    ],
                )

            ## x-ray structure
            if d_mmCIF['_exptl.method'] != ['X-RAY DIFFRACTION']:
                continue

            ## polymer present
            if '_entity_poly.type' not in d_mmCIF.keys():
                continue

            ## only polymer present is protein
            l_poly_types = d_mmCIF['_entity_poly.type']
            if l_poly_types != len(l_poly_types) * ['polypeptide(L)']:
                continue

            if '_pdbx_struct_assembly.oligomeric_count' not in d_mmCIF.keys():
                continue
            l_counts = d_mmCIF['_pdbx_struct_assembly.oligomeric_count']
            if l_counts == len(l_counts) * ['?']:
                continue

            ## virus
            if int(l_counts[0]) % 60 == 0:
                continue

            ## not monomer
            if l_counts != len(l_counts) * ['1']:
                continue

            ## split structure
            if '_pdbx_database_related' in d_mmCIF.keys():
                if 'split' in d_mmCIF['_pdbx_database_related']:
                    continue
                if 'SPLIT' in d_mmCIF['_pdbx_database_related']:
                    ## deliberate halt: this spelling was not expected
                    print(pdb)
                    raise RuntimeError(
                        'unexpected upper-case SPLIT relation for %s' % (pdb,))

            if '_cell.Z_PDB' not in d_mmCIF.keys():
                continue

            if pdb in l_pdbs_excluded:
                continue

            a = float(d_mmCIF['_cell.length_a'][0])
            b = float(d_mmCIF['_cell.length_b'][0])
            c = float(d_mmCIF['_cell.length_c'][0])
            alpha = float(d_mmCIF['_cell.angle_alpha'][0])
            beta = float(d_mmCIF['_cell.angle_beta'][0])
            gamma = float(d_mmCIF['_cell.angle_gamma'][0])
            Z = int(d_mmCIF['_cell.Z_PDB'][0])

            ## sum the formula weights of all entities
            mw = 0
            for i in range(len(d_mmCIF['_entity.id'])):
                s = d_mmCIF['_entity.formula_weight'][i]
                ## unknown ligand
                if s == '?':
                    continue
                mw += float(s)

            MV = matthews_coefficient.main(a, b, c, alpha, beta, gamma, mw, Z)

            spacegroup = ''.join(d_mmCIF['_symmetry.space_group_name_H-M'])
            if spacegroup not in l_spacegroups:
                continue  ## tmp!!!

            ## The deposited MV is optional; look it up once so that the
            ## diagnostics below cannot fail on a missing key.
            l_density = None
            if '_exptl_crystal.density_Matthews' in d_mmCIF.keys():
                l_density = d_mmCIF['_exptl_crystal.density_Matthews']

            ## implausibly large MV: report the entry and skip it
            if MV > 10:
                alpha_rad = alpha * math.pi / 180.
                beta_rad = beta * math.pi / 180.
                gamma_rad = gamma * math.pi / 180.
                V = a * b * c * math.sqrt(
                    1
                    - math.cos(alpha_rad)**2
                    - math.cos(beta_rad)**2
                    - math.cos(gamma_rad)**2
                    + 2 * (math.cos(alpha_rad) * math.cos(beta_rad)
                           * math.cos(gamma_rad)))
                print(pdb)
                print('mw %s' % (mw,))
                print('MV %s %s' % (MV, l_density))
                print('Z %s' % (Z,))
                print('V %s' % (V,))
                continue

            ## calculated MV disagrees with the deposited one: report and skip
            if l_density is not None and l_density != len(l_density) * ['?']:
                if abs(MV - float(l_density[0])) > 1:
                    print('MV %s' % (MV,))
                    print('MV %s' % (l_density,))
                    print('mw %s' % (mw,))
                    print('Z %s' % (Z,))
                    continue

            if spacegroup not in d_MV.keys():
                d_MV[spacegroup] = []
            d_MV[spacegroup].append(MV)

            print('%s %s %s' % (pdb, round(MV, 2), spacegroup))

    ## NOTE(review): the header says MV_stddev but the column written below is
    ## the standard error; the header text is left unchanged in case something
    ## downstream reads it - confirm and rename.
    l = ['# MV_average MV_stddev n spacegroup\n']
    for spacegroup in d_MV.keys():
        l_MV = d_MV[spacegroup]
        if len(l_MV) <= 1:
            continue
        ## NOTE(review): stddev is computed but not written; the call is kept
        ## because do_stddev is defined outside this block.
        average, stddev = statistics.do_stddev(l_MV)
        average, stderr = statistics.do_stderr(l_MV)
        l.append('%s %s %s %s\n' % (average, stderr, len(l_MV), spacegroup))

    with open('MV_v_spacegroup.txt', 'w') as fd:
        fd.writelines(l)