def main(pdb):
    """Parse the mmCIF entry *pdb* and return the value computed by ``calc``.

    The six unit-cell parameters and ``_cell.Z_PDB`` are taken from the first
    row of the ``_cell`` category, the formula weights of all entities of
    type 'polymer' are summed, and everything is passed on to ``calc``
    (presumably a Matthews-coefficient calculation -- confirm against
    ``calc``).  The result is printed next to the PDB ID and returned.
    """

    ## speed up by not reading atom section...
    d_mmCIF = parse_mmCIF.main(pdb)

    ## unit cell edges and angles, in the argument order expected by calc
    a, b, c, alpha, beta, gamma = [
        float(d_mmCIF[item][0]) for item in (
            '_cell.length_a', '_cell.length_b', '_cell.length_c',
            '_cell.angle_alpha', '_cell.angle_beta', '_cell.angle_gamma',
            )
        ]
    Z = int(d_mmCIF['_cell.Z_PDB'][0])

    ## only polymer entities contribute to the molecular weight
    mw = sum(
        float(d_mmCIF['_entity.formula_weight'][i])
        for i in range(len(d_mmCIF['_entity.id']))
        if d_mmCIF['_entity.type'][i] == 'polymer'
        )

    VM = calc(a, b, c, alpha, beta, gamma, mw, Z)

    print('%s %s' % (pdb, VM))

    return VM
def one_polypeptide(pdb,):
    """Return True if exactly one '_entity_poly.type' row of *pdb* is 'polypeptide(L)'.

    Only the ``_entity_poly`` category is requested from the parser.
    Entries without any ``_entity_poly.type`` item yield False.
    """

    d = parse_mmCIF.main(
        pdb,
        l_data_categories = ['_entity_poly',],
        )

    ## make sure polymer is present (not vacomycin 1aa5)
    if '_entity_poly.type' not in d:
        return False

    ## one polypeptide?
    ## (an alternative criterion, no ',' in the joined
    ##  _entity_poly.pdbx_strand_id values, was tried and disabled;
    ##  cf. list_entity.pdbx_number_of_molecules__1.txt)
    n_polypeptides = d['_entity_poly.type'].count('polypeptide(L)')

    return n_polypeptides == 1
def modres_not_MSE(pdb,):
    """Return True if *pdb* has at least one modified residue other than MSE.

    Only the ``_pdbx_struct_mod_residue`` category is requested from the
    parser.  Entries without that category yield False.
    """

    d = parse_mmCIF.main(
        pdb,
        l_data_categories = ['_pdbx_struct_mod_residue'],
        )

    ## no MODRES
    if '_pdbx_struct_mod_residue.id' not in d:
        return False

    l_label_comp_ids = d['_pdbx_struct_mod_residue.label_comp_id']
    l_auth_comp_ids = d['_pdbx_struct_mod_residue.auth_comp_id']

    ## label and author residue names are expected to agree
    if l_label_comp_ids != l_auth_comp_ids:
        print(pdb)
        ## NOTE(review): `stop` is not defined in the visible code; evaluating
        ## it presumably raises NameError and halts the run on purpose
        stop

    ## at least one MODRES is different from MSE
    return l_auth_comp_ids != ['MSE'] * len(l_auth_comp_ids)
def main(pdb):
    """Return the value ``calc`` computes for the mmCIF entry *pdb*.

    Reads the unit-cell lengths and angles plus ``_cell.Z_PDB`` (first row of
    each item), adds up ``_entity.formula_weight`` over the entities whose
    type is 'polymer', and forwards all eight numbers to ``calc``
    (presumably the Matthews coefficient -- confirm against ``calc``).
    The PDB ID and the result are printed before the result is returned.
    """

    ## speed up by not reading atom section...
    d_mmCIF = parse_mmCIF.main(pdb)

    def first_float(item):
        ## first row of a data item, converted to float
        return float(d_mmCIF[item][0])

    a = first_float('_cell.length_a')
    b = first_float('_cell.length_b')
    c = first_float('_cell.length_c')
    alpha = first_float('_cell.angle_alpha')
    beta = first_float('_cell.angle_beta')
    gamma = first_float('_cell.angle_gamma')
    Z = int(d_mmCIF['_cell.Z_PDB'][0])

    ## molecular weight of the polymer entities only
    mw = 0
    for i_entity, _entity_id in enumerate(d_mmCIF['_entity.id']):
        if d_mmCIF['_entity.type'][i_entity] != 'polymer':
            continue
        mw += float(d_mmCIF['_entity.formula_weight'][i_entity])

    VM = calc(a, b, c, alpha, beta, gamma, mw, Z,)

    print('%s %s' % (pdb, VM))

    return VM
def modres_not_MSE(pdb, ):
    """Tell whether *pdb* contains a modified residue that is not MSE.

    Requests only ``_pdbx_struct_mod_residue`` from the parser and returns
    False when the entry has no such records.
    """

    d = parse_mmCIF.main(
        pdb,
        l_data_categories=['_pdbx_struct_mod_residue'],
    )

    bool_append = False

    ## has MODRES
    if '_pdbx_struct_mod_residue.id' in d:

        label_ids = d['_pdbx_struct_mod_residue.label_comp_id']
        auth_ids = d['_pdbx_struct_mod_residue.auth_comp_id']

        ## sanity check: label and author residue names should be identical
        if not label_ids == auth_ids:
            print(pdb)
            ## NOTE(review): `stop` is not defined in the visible code;
            ## presumably a deliberate NameError to halt the run
            stop

        ## at least one MODRES is different from MSE
        all_mse = ['MSE'] * len(auth_ids)
        if auth_ids != all_mse:
            bool_append = True

    return bool_append
def one_polypeptide(pdb, ):
    """Tell whether *pdb* has exactly one polymer of type 'polypeptide(L)'.

    Requests only ``_entity_poly`` from the parser.  The answer is False
    when the entry has no ``_entity_poly.type`` item at all.
    """

    d = parse_mmCIF.main(
        pdb,
        l_data_categories=['_entity_poly'],
    )

    ## make sure polymer is present (not vacomycin 1aa5)
    has_polymer = '_entity_poly.type' in d

    ## one polypeptide?
    ## (disabled alternative: require that the joined
    ##  _entity_poly.pdbx_strand_id values contain no ',';
    ##  cf. list_entity.pdbx_number_of_molecules__1.txt)
    if has_polymer and d['_entity_poly.type'].count('polypeptide(L)') == 1:
        return True

    return False
def parse_coords(pdb):
    """Parse the mmCIF entry *pdb* and extract its coordinates.

    Returns a 2-tuple ``(d_mmCIF, l_coords_alpha)``: the parsed mmCIF data and
    the second element returned by ``mmCIF2coords.main``.  The first element
    returned by ``mmCIF2coords.main`` is not used.
    """

    d_mmCIF = parse_mmCIF.main(pdb)

    ## mmCIF2coords.main returns a pair; only the second part is passed on
    _d_coords, l_coords_alpha = mmCIF2coords.main(pdb, d_mmCIF)

    return d_mmCIF, l_coords_alpha
def parse_dihedrals():
    """Collect backbone phi/psi angles from the mmCIF files under /data/mmCIF.

    Sub-directories whose names lie between ``sys.argv[-2]`` and
    ``sys.argv[-1]`` (inclusive, string comparison) are scanned.  Entries are
    skipped when they are NMR / powder / EM structures, have no single
    ``_refine.ls_d_res_high`` value, have a resolution above 2, or contain no
    'polypeptide(L)' polymer.

    For every residue with an observed predecessor and successor, phi and psi
    are computed with ``calc_dihedral`` and appended as ``[phi, psi]`` to the
    matching lists in two dictionaries:

    * ``d_phipsi_res`` -- keyed by residue name plus the special classes
      'prePRO', 'prePRO_notGLY', 'prePRO_GLY', 'cisPro', 'transPro' and
      'all_notgly_notpro_notprepro';
    * ``d_phipsi_ss`` -- keyed by secondary structure ('sheet',
      'helix_alpha', 'helix_pi', 'helix_310', 'Turn',
      'turns_notgly_notpro_notprepro').

    Counts of cis-prolines per environment are written to ``count.txt``.

    Returns:
        tuple: ``(d_phipsi_res, d_phipsi_ss)``.

    Changes relative to the previous version:

    * 'turns_notgly_notpro_notprepro' is now initialised in ``d_phipsi_ss``;
      it was appended to without existing, which raised KeyError.
    * the cis-Pro turn counter is incremented by 1 (it was incremented by 99,
      the turn marker value).
    * sheet residues are stored once per range in a set (the whole range used
      to be appended once per residue).
    * an omega of exactly 0.0 is classified as cis instead of being treated
      as missing.
    """

    import sys

    path = '/data/mmCIF'

    l_std_res_names = [
        'ALA','CYS','ASP','GLU','PHE',
        'GLY','HIS','ILE','LYS','LEU',
        'MET','ASN','PRO','GLN','ARG',
        'SER','THR','VAL','TRP','TYR',
        ]

    d_phipsi_res = dict((key, []) for key in l_std_res_names + [
        'prePRO','prePRO_notGLY','prePRO_GLY',
        'cisPro','transPro',
        'all_notgly_notpro_notprepro',
        ])

    d_phipsi_ss = {
        'sheet':[], ## _struct_sheet_order.sense
        ##_struct_conf.pdbx_PDB_helix_class
        'helix_alpha':[], ## i+4 # 1
        'helix_pi':[], ## i+5 # 3
        'helix_310':[], ## i+3 # 5
        'Turn':[], ## i+?
        ## this key is filled further down, so it has to exist
        'turns_notgly_notpro_notprepro':[],
        }

    d_counts = dict(
        ('cisPro%s' %(res_name), 0) for res_name in l_std_res_names
        )
    for key in ['cisPro_helix','cisPro_sheet','cisPro_turn','cisPro_random',]:
        d_counts[key] = 0

    l_dn = os.listdir(path)
    l_dn.sort()
    l_dn.remove('mmCIF.py')
    for dn in l_dn:
        ## restrict to the directory range given on the command line
        if dn < sys.argv[-2]:
            continue
        if dn > sys.argv[-1]:
            continue
        print('* %s' %(dn,))
        l_fn = os.listdir('%s/%s' %(path,dn,))
        l_fn.sort()
        for fn in l_fn:

            pdb = fn[:4]
            print(pdb)

            d_mmCIF = parse_mmCIF.main(
                pdb,
                d_breaks = {'_exptl.method':['SOLUTION NMR']},
                l_data_categories = [
                    '_exptl',
                    '_refine',
                    '_struct_conf', ## HELIX
                    '_struct_sheet_range', ## SHEET
                    '_entity',
                    '_entity_poly',
                    '_entity_poly_seq',
                    '_atom_site',
                    ],
                )

            ## skip NMR models
            if ''.join(d_mmCIF['_exptl.method']) in [
                'SOLUTION NMR',
                'POWDER DIFFRACTION',
                'ELECTRON MICROSCOPY',
                ]:
                continue

            if '_refine.ls_d_res_high' not in d_mmCIF:
                print(d_mmCIF['_exptl.method'])
                continue

            ## skip if multiple resolutions
            if len(d_mmCIF['_refine.ls_d_res_high']) > 1:
                continue
            ## skip if no resolution
            if ''.join(d_mmCIF['_refine.ls_d_res_high']) == '?':
                continue
            ## skip low resolution structures
            if float(''.join(d_mmCIF['_refine.ls_d_res_high'])) > 2:
                continue

            if 'polymer' not in d_mmCIF['_entity.type']:
                continue
            if '_entity_poly.type' not in d_mmCIF: ## e.g. 1hhu
                continue
            if d_mmCIF['_entity_poly.type'] == ['polydeoxyribonucleotide/polyribonucleotide hybrid']:
                continue
            if d_mmCIF['_entity_poly.type'] == ['polydeoxyribonucleotide']:
                continue

            ##
            ## sequence of each polymer entity
            ##
            d_sequence = {}
            for i_entity_poly_seq in range(len(d_mmCIF['_entity_poly_seq.entity_id'])):
                entity_id = int(d_mmCIF['_entity_poly_seq.entity_id'][i_entity_poly_seq])
                res_no = int(d_mmCIF['_entity_poly_seq.num'][i_entity_poly_seq])
                res_name = d_mmCIF['_entity_poly_seq.mon_id'][i_entity_poly_seq]
                d_sequence.setdefault(entity_id, []).append(
                    {'res_no':res_no,'res_name':res_name,}
                    )

            ##
            ## polypeptide entities
            ##
            l_entities_poly = []
            for i_entity_poly in range(len(d_mmCIF['_entity_poly.entity_id'])):
                ## skip if not polypeptide
                entity_poly_type = d_mmCIF['_entity_poly.type'][i_entity_poly]
                if entity_poly_type != 'polypeptide(L)':
                    continue
                ## skip if nonstd linkages
                if d_mmCIF['_entity_poly.nstd_linkage'][i_entity_poly] == 'yes':
                    print(pdb)
                    ## NOTE(review): `stop` is not defined in the visible code;
                    ## presumably a deliberate NameError to halt the run
                    stop
                    continue
                ## parse entity_id and chains
                entity_id = int(d_mmCIF['_entity_poly.entity_id'][i_entity_poly])
                l_entities_poly += [entity_id]

            ## skip if no polypeptide chains
            if l_entities_poly == []:
                continue

            ##
            ## coordinates: entity -> chain -> residue number -> atom name
            ##
            d_coords = {}
            for i_atom_site in range(len(d_mmCIF['_atom_site.id'])):

                entity_id = int(d_mmCIF['_atom_site.label_entity_id'][i_atom_site])
                ## not a polymer
                if entity_id not in l_entities_poly:
                    continue
                ## polymer, append
                elif entity_id not in d_coords:
                    d_coords[entity_id] = {}

                ## first model only
                model = int(d_mmCIF['_atom_site.pdbx_PDB_model_num'][i_atom_site])
                if model > 1:
                    continue

                chain = d_mmCIF['_atom_site.label_asym_id'][i_atom_site]
                if chain not in d_coords[entity_id]:
                    d_coords[entity_id][chain] = {}

                res_no = int(d_mmCIF['_atom_site.label_seq_id'][i_atom_site])
                if res_no not in d_coords[entity_id][chain]:
                    d_coords[entity_id][chain][res_no] = {}

                atom_name = d_mmCIF['_atom_site.label_atom_id'][i_atom_site]

                ## only the first alternate location
                altloc = d_mmCIF['_atom_site.label_alt_id'][i_atom_site]
                if altloc not in ['.','A','1',]:
                    continue

                ## skip if zero occupancy
                occupancy = float(d_mmCIF['_atom_site.occupancy'][i_atom_site])
                if altloc == '.' and occupancy == 0:
                    continue

                ## a backbone atom must not be seen twice
                if (
                    atom_name in ['CA','C','O','N',]
                    and
                    atom_name in d_coords[entity_id][chain][res_no]
                    ):
                    print('%s %s %s %s' %(pdb, chain, res_no, atom_name,))
                    print('%s %s' %(
                        d_mmCIF['_atom_site.Cartn_x'][i_atom_site],
                        d_mmCIF['_atom_site.Cartn_y'][i_atom_site],
                        ))
                    print(d_coords[entity_id][chain][res_no][atom_name])
                    stop

                x = float(d_mmCIF['_atom_site.Cartn_x'][i_atom_site])
                y = float(d_mmCIF['_atom_site.Cartn_y'][i_atom_site])
                z = float(d_mmCIF['_atom_site.Cartn_z'][i_atom_site])
                coord = numpy.array([x,y,z,])

                d_coords[entity_id][chain][res_no][atom_name] = coord

            ##
            ## helices and turns: chain -> residue number -> helix class
            ## (turns are stored with the marker class 99)
            ##
            d_helices = {}
            ## helices or turns present?
            if '_struct_conf.id' in d_mmCIF:
                for i_struct_conf in range(len(d_mmCIF['_struct_conf.id'])):
                    chain1 = d_mmCIF['_struct_conf.beg_label_asym_id'][i_struct_conf]
                    chain2 = d_mmCIF['_struct_conf.end_label_asym_id'][i_struct_conf]
                    res_no1 = int(d_mmCIF['_struct_conf.beg_label_seq_id'][i_struct_conf])
                    res_no2 = int(d_mmCIF['_struct_conf.end_label_seq_id'][i_struct_conf])
                    conf_type_id = d_mmCIF['_struct_conf.conf_type_id'][i_struct_conf]
                    if chain1 != chain2:
                        print('%s %s %s' %(chain1, chain2, pdb,))
                        stop
                    if conf_type_id == 'HELX_P':
                        helix_class = int(d_mmCIF['_struct_conf.pdbx_PDB_helix_class'][i_struct_conf])
                    elif conf_type_id == 'TURN_P':
                        helix_class = 99
                    else:
                        print(conf_type_id)
                        print(pdb)
                        stop
                    d_helix_chain = d_helices.setdefault(chain1, {})
                    for res_no in range(res_no1,res_no2+1,):
                        d_helix_chain[res_no] = helix_class

            ##
            ## sheets: chain -> set of residue numbers
            ##
            d_sheets = {}
            ## sheet present?
            if '_struct_sheet_range.sheet_id' in d_mmCIF:
                for i_struct_sheet_range in range(len(d_mmCIF['_struct_sheet_range.sheet_id'])):
                    chain1 = d_mmCIF['_struct_sheet_range.beg_label_asym_id'][i_struct_sheet_range]
                    chain2 = d_mmCIF['_struct_sheet_range.end_label_asym_id'][i_struct_sheet_range]
                    res_no1 = int(d_mmCIF['_struct_sheet_range.beg_label_seq_id'][i_struct_sheet_range])
                    res_no2 = int(d_mmCIF['_struct_sheet_range.end_label_seq_id'][i_struct_sheet_range])
                    if chain1 != chain2:
                        print('%s %s %s' %(chain1, chain2, pdb,))
                        stop
                    ## add the range once; it is only used for membership tests
                    d_sheets.setdefault(chain1, set()).update(
                        range(res_no1,res_no2+1,)
                        )

            ##
            ## dihedrals
            ##
            for entity_id in l_entities_poly:
                for chain in d_coords[entity_id].keys():

                    ## skip if short peptide (e.g. 13gs)
                    if len(d_sequence[entity_id]) <= 3:
                        continue

                    d_chain = d_coords[entity_id][chain]

                    for i_res_no in range(1,len(d_sequence[entity_id])-1):

                        res_no_prev = int(d_sequence[entity_id][i_res_no-1]['res_no'])
                        res_no = int(d_sequence[entity_id][i_res_no]['res_no'])
                        res_no_next = int(d_sequence[entity_id][i_res_no+1]['res_no'])

                        res_name = d_sequence[entity_id][i_res_no]['res_name']
                        ## selenomethionine is counted as methionine
                        if res_name == 'MSE':
                            res_name = 'MET'
                        res_name_next = d_sequence[entity_id][i_res_no+1]['res_name']

                        ## not a standard residue
                        if res_name not in d_phipsi_res:
                            continue

                        ## residue not observed
                        if res_no_prev not in d_chain:
                            continue
                        if res_no not in d_chain:
                            continue
                        if res_no_next not in d_chain:
                            continue

                        ## atom not observed
                        if 'C' not in d_chain[res_no_prev]:
                            continue
                        if 'N' not in d_chain[res_no]:
                            continue
                        if 'CA' not in d_chain[res_no]:
                            continue
                        if 'C' not in d_chain[res_no]:
                            continue
                        if 'N' not in d_chain[res_no_next]:
                            continue

                        C_prev = d_chain[res_no_prev]['C']
                        N = d_chain[res_no]['N']
                        CA = d_chain[res_no]['CA']
                        C = d_chain[res_no]['C']
                        N_next = d_chain[res_no_next]['N']

                        phi = calc_dihedral(C_prev,N,CA,C,)
                        psi = calc_dihedral(N,CA,C,N_next,)

                        ## omega of the peptide bond preceding this residue
                        omega = None
                        if 'CA' in d_chain[res_no_prev]:
                            CA_prev = d_chain[res_no_prev]['CA']
                            omega = calc_dihedral(CA_prev,C_prev,N,CA,)

                        ## classify omega (thresholds presumably in degrees;
                        ## confirm against calc_dihedral)
                        if omega is not None:
                            if omega < 150 and omega > -150: ## 12e8, PRO44D
                                if abs(omega) > 30: ## 12e8 PRO196D, 1a44 GLU82A
                                    ## neither cis nor trans
                                    omega = None
                                else:
                                    omega = 'cis'
                            else:
                                omega = 'trans'

                        bool_helix = False
                        if chain in d_helices:
                            if res_no in d_helices[chain]:
                                bool_helix = True
                                helix_class = d_helices[chain][res_no]

                        bool_sheet = False
                        if chain in d_sheets:
                            if res_no in d_sheets[chain]:
                                bool_sheet = True

##                        if bool_helix == True and bool_sheet == True and helix_class != 99:
##                            print pdb, chain, res_no, 'sheet and helix'
####                            stop

                        ##
                        ## by residue type
                        ##
                        if res_name_next == 'PRO':
                            d_phipsi_res['prePRO'] += [[phi,psi,]]
                            if res_name != 'GLY':
                                d_phipsi_res['prePRO_notGLY'] += [[phi,psi,]]
                            else:
                                d_phipsi_res['prePRO_GLY'] += [[phi,psi,]]
                        else:
                            d_phipsi_res[res_name] += [[phi,psi,]]
                            if res_name not in ['GLY','PRO',]:
                                d_phipsi_res['all_notgly_notpro_notprepro'] += [[phi,psi,]]
                            elif res_name == 'PRO' and omega:
                                d_phipsi_res['%sPro' %(omega)] += [[phi,psi,]]
                                if omega == 'cis':
                                    ## NOTE(review): res_name is always 'PRO'
                                    ## in this branch, so only 'cisProPRO' is
                                    ## ever incremented; presumably the name
                                    ## of the preceding residue was intended.
                                    ## The nesting of this block was
                                    ## reconstructed -- confirm.
                                    d_counts['cisPro%s' %(res_name)] += 1
                                    if bool_helix == True:
                                        if helix_class == 1:
                                            d_counts['cisPro_helix'] += 1
                                        elif helix_class == 99:
                                            ## count one turn residue
                                            ## (99 is the class marker,
                                            ## not an increment)
                                            d_counts['cisPro_turn'] += 1
                                    elif bool_sheet == True:
                                        d_counts['cisPro_sheet'] += 1
                                    else:
                                        d_counts['cisPro_random'] += 1

                        ##
                        ## by secondary structure
                        ##
                        if bool_helix == True:
##                            if helix_class not in [1,3,5,99,]:
##                                print pdb, chain, res_no, helix_class
##                                print 'unexpected helix class'
####                                stop_helix_class
                            if helix_class == 1:
                                d_phipsi_ss['helix_alpha'] += [[phi,psi,]]
                            elif helix_class == 3:
                                d_phipsi_ss['helix_pi'] += [[phi,psi,]]
                            elif helix_class == 5:
                                d_phipsi_ss['helix_310'] += [[phi,psi,]]
                            elif helix_class == 99:
                                d_phipsi_ss['Turn'] += [[phi,psi,]]
                                if (
                                    res_name_next != 'PRO'
                                    and
                                    res_name not in ['GLY','PRO',]
                                    ):
                                    d_phipsi_ss['turns_notgly_notpro_notprepro'] += [[phi,psi,]]
                        if bool_sheet == True:
                            d_phipsi_ss['sheet'] += [[phi,psi,]]

    ##
    ## write the cis-proline counts
    ##
    l = ['%s %s\n' %(k,count,) for k, count in d_counts.items()]
    with open('count.txt','w') as fd:
        fd.writelines(l)

    return d_phipsi_res, d_phipsi_ss
def unobs_nonterminal_residues():
    """List the PDB entries that have unobserved residues inside a chain.

    The candidate IDs are read from
    ``<path>/list_pdbx_unobs_or_zero_occ_residues.txt`` (``path`` is a
    module-level name).  For every chain named in
    ``_entity_poly.pdbx_strand_id`` the corresponding slice of
    ``_pdbx_poly_seq_scheme.auth_seq_num`` is taken, leading and trailing
    '?' values (missing termini) are ignored, and the entry is selected if a
    '?' remains in between.  The selected IDs are written, one per line, to
    ``<path>/list_pdbx_unobs_residues__NONTERMINAL``.

    Changes relative to the previous version:

    * chain boundaries are looked up in the list of strand IDs itself; the
      offsets found in the joined string were only valid list indices when
      every chain ID was a single character.
    * a chain without any observed residue is skipped instead of raising
      IndexError.
    * files are closed by context managers.

    Returns:
        None
    """

    ##
    ## unobs or zero occup not at terminals!!! (combination...)
    ## eg dont exclude 200l w 163,164 missing
    ## dont exclude 201l w 163,164 missing, but internally
    ## in _pdbx_poly_seq_scheme because 2 chains
    ##
    category = fn = '_pdbx_unobs_or_zero_occ_residues'

    with open('%s/list%s.txt' %(path,fn)) as fd:
        l_pdbs_in = fd.read().split()

    l_data_categories = [
        '_pdbx_poly_seq_scheme',
        '_pdbx_unobs_or_zero_occ_residues',
        '_entity_poly',
        ]

    fn_out = 'list_pdbx_unobs_residues__NONTERMINAL'

    ## NOTE(review): loop_residues is defined outside the visible code;
    ## its effect could not be checked here
    loop_residues(category,fn_out,)

    l_pdbs_out = []
    for pdb in l_pdbs_in:

        ## no residues are present! (e.g. 1oax, 1oay)
        if pdb in ['1oax','1oay',]:
            continue

        d = parse_mmCIF.main(pdb,l_data_categories=l_data_categories,)

        if category not in d:
            continue

        ## one strand ID per row of _pdbx_poly_seq_scheme
        l_strand_ids = list(d['_pdbx_poly_seq_scheme.pdb_strand_id'])
        n_rows = len(l_strand_ids)

        bool_append = False
        for chains in d['_entity_poly.pdbx_strand_id']:
            for chain in chains.split(','):

                ## first and last row of this chain
                ## (rows of a chain are assumed to be contiguous)
                index1 = l_strand_ids.index(chain)
                index2 = n_rows-1-l_strand_ids[::-1].index(chain)
                l_auth_seq_num = d['_pdbx_poly_seq_scheme.auth_seq_num'][index1:index2+1]

                ## positions of the observed residues
                l_observed = [
                    i for i, v in enumerate(l_auth_seq_num) if v != '?'
                    ]
                ## nothing observed in this chain
                if not l_observed:
                    continue

                ## non-terminal residues missing?
                if '?' in l_auth_seq_num[l_observed[0]:l_observed[-1]+1]:
                    print('**** %s' %(pdb,))
                    bool_append = True
                    break

            if bool_append:
                break

        if bool_append:
            print(pdb)
            l_pdbs_out += [pdb]

    with open('%s/%s' %(path,fn_out,),'w') as fd:
        fd.write('\n'.join(l_pdbs_out))

    return
def main():
    """Build 'db_resolution.txt' and derive a resolution histogram from it.

    Steps, as far as the visible code shows:

    1. Load the PDB IDs already present in 'db_resolution.txt' (if the file
       exists).
    2. Walk the sub-directories of the mmCIF mirror whose names are not
       smaller than ``sys.argv[-1]``, parse every non-gzipped file that is
       not yet known and, for X-ray structures, append the ID and the
       ``_refine.ls_d_res_high`` list to 'db_resolution.txt'.
    3. Rewrite 'db_resolution.txt' from the in-memory dictionary.
    4. Re-read it, round each resolution to two decimals and write one value
       per line to 'histogram_resolution.txt'.

    NOTE(review): the bare name ``stop`` after step 4 is not defined in the
    visible code; if it is undefined, evaluating it raises NameError, so the
    sorted "resolution count" output below it is never written and the final
    ``return`` is never reached.
    """

    ## pdb -> resolution as stored in the file (second column, a string)
    d = {}
    if os.path.isfile('db_resolution.txt'):
        fd = open('db_resolution.txt', 'r')
        lines = fd.readlines()
        fd.close()

        for line in lines:
            l = line.strip().split()
            pdb = l[0]
            v = l[1]
            d[pdb] = v

    path = '/media/WDMyBook1TB/2TB/mmCIF'

    l_dns = os.listdir(path)
    l_dns.sort()
    lines_out = []
    for i in range(len(l_dns)):
        dn = l_dns[i]
        ## resume from the directory given on the command line
        if dn < sys.argv[-1]:
            continue
        if not os.path.isdir('%s/%s' % (path, dn)):
            continue
        print '%s/%s %s' % (i + 1, len(l_dns), dn)
        l_fns = os.listdir('%s/%s' % (path, dn))
        l_fns.sort()
        for fn in l_fns:
            ## compressed files are not read
            if fn[-3:] == '.gz':
                continue
            pdb = fn[0:4]
            ## already in the database
            if pdb in d.keys():
                continue
            print pdb
            fd = open('%s/%s/%s' % (path, dn, fn), 'r')
            lines = fd.readlines()
            fd.close()

            d_mmCIF = parse_mmCIF.main(
                pdb,
                lines,
                l_data_categories=[
                    '_refine',
                    '_refine_hist',
                ],  ## parse selected data categories
                l_data_categories_break=[
                    '_refine',
                    ## '_refine_hist',
                ],
                d_breaks_negation={
                    ## break if not x-ray diffraction
                    '_exptl.method': 'X-RAY DIFFRACTION',
                })

            if d_mmCIF['_exptl.method'] != ['X-RAY DIFFRACTION']:
                continue

            ## the whole list is stored, so it is written as e.g. ['1.80']
            resolution = d_mmCIF['_refine.ls_d_res_high']

            line = '%s %s\n' % (
                pdb,
                resolution,
            )
            lines_out += [line]

            ## append immediately so that an interrupted run loses nothing
            fd = open('db_resolution.txt', 'a')
            fd.write(line)
            fd.close()

            d[pdb] = resolution

    ##
    ## write to file
    ##
    lines_out = []
    for pdb, resolution in d.items():
        line = '%s %s\n' % (
            pdb,
            resolution,
        )
        lines_out += [line]
    fd = open('db_resolution.txt', 'w')
    fd.writelines(lines_out)
    fd.close()

    ##
    ## histogram: from here on d maps rounded resolution -> count
    ##
    d = {}
    fd = open('db_resolution.txt', 'r')
    lines = fd.readlines()
    fd.close()
    lines_out = []
    for line in lines:
        ## drop the two leading and the two trailing characters of the
        ## second column (for a value written as ['1.80'] these are the
        ## brackets and quotes)
        resolution = line.strip().split()[1][2:-2]
        if resolution == '.':
            continue
        resolution = float(resolution)
        resolution = round(resolution, 2)
        if not resolution in d.keys():
            d[resolution] = 0
        d[resolution] += 1
        lines_out += ['%s\n' % (resolution, )]

    ## one rounded resolution per line
    fd = open('histogram_resolution.txt', 'w')
    fd.writelines(lines_out)
    fd.close()
    ## NOTE(review): presumably a deliberate halt (see docstring); everything
    ## below only runs if `stop` is defined elsewhere
    stop

    ## sorted "resolution count" table (overwrites the file written above)
    lines_out = []
    l_resolutions = d.keys()
    l_resolutions.sort()
    ## for resolution,count in d.items():
    for resolution in l_resolutions:
        count = d[resolution]
        lines_out += ['%s %s\n' % (
            resolution,
            count,
        )]

    fd = open('histogram_resolution.txt', 'w')
    fd.writelines(lines_out)
    fd.close()

    return
def main():
    """Calculate radii of gyration for monomeric X-ray protein structures.

    Radii already stored in 'radius_of_gyration.txt' are loaded first (the
    file may be absent on a first run).  The sub-directories of the mmCIF
    mirror whose names are not smaller than ``sys.argv[-1]`` are then
    walked.  Every non-gzipped file that is not yet known is parsed and
    kept only if it is an X-ray structure with a single 'polypeptide(L)'
    polymer entity and a 'monomeric' assembly.  For the heavy atoms of the
    polymer the mass-weighted radius of gyration is calculated with the
    masses in the module-level ``d_mass``, appended to the file at once,
    and finally the whole file is rewritten from memory.

    Changes relative to the previous version:

    * a missing 'radius_of_gyration.txt' no longer aborts the run (same
      guard as used for 'db_resolution.txt' elsewhere in this file).
    * entries without any atom of known mass are skipped instead of raising
      ZeroDivisionError.
    * the unused per-entry ``lines_out`` accumulator was removed and files
      are closed by context managers.

    Returns:
        None
    """

    fn_radii = 'radius_of_gyration.txt'

    ## pdb -> radius (strings when read from file, floats when calculated)
    d_radii = {}
    if os.path.isfile(fn_radii):
        with open(fn_radii,'r') as fd:
            for line in fd:
                l = line.strip().split()
                ## ignore blank or incomplete lines
                if len(l) < 2:
                    continue
                d_radii[l[0]] = l[1]

    path = '/media/WDMyBook1TB/2TB/mmCIF'

    l_dns = os.listdir(path)
    l_dns.sort()
    for i, dn in enumerate(l_dns):
        ## resume from the directory given on the command line
        if dn < sys.argv[-1]:
            continue
        if not os.path.isdir('%s/%s' %(path,dn)):
            continue
        print('%s/%s %s' %(i+1,len(l_dns), dn))
        l_fns = os.listdir('%s/%s' %(path,dn))
        l_fns.sort()
        for fn in l_fns:
            if fn[-3:] == '.gz':
                continue
            pdb = fn[0:4]
            ## already calculated
            if pdb in d_radii:
                continue
            print(pdb)
            with open('%s/%s/%s' %(path, dn, fn), 'r') as fd:
                lines = fd.readlines()

            d_mmCIF = parse_mmCIF.main(
                pdb,lines,
                d_breaks = {
                    ## break if multiple polymer types (not monomeric)
                    '_entity_poly.entity_id':'2',
##                    '_exptl.method':'SOLUTION NMR', ## break if e.g. _exptl.method = SOLUTION NMR
                    ## break if multiple chains
                    '_entity_poly.pdbx_strand_id':',',
                    },
                d_breaks_negation = {
                    ## break if not x-ray diffraction
                    '_exptl.method':'X-RAY DIFFRACTION',
                    ## break if not monomeric
                    '_pdbx_struct_assembly.oligomeric_details':'monomeric',
                    },
                l_data_categories = [
                    '_atom_site',
                    '_entity_poly',
                    '_pdbx_struct_assembly',
                    ], ## parse selected data categories
                )

            ## some unknown temporary error...
            ## or break before reaching this part when parsing...
            if '_pdbx_struct_assembly.oligomeric_details' not in d_mmCIF:
                continue

            ## NMR structure?
            if d_mmCIF['_exptl.method'] != ['X-RAY DIFFRACTION']:
                ## NOTE(review): `stop2` is not defined in the visible code;
                ## presumably a deliberate NameError to halt the run
                stop2
                continue

            ## no polymers in structure?
            if '_entity_poly.entity_id' not in d_mmCIF:
                continue

            ## polymer(s) is/are not polypeptide(s)
            if d_mmCIF['_entity_poly.type'] != len(d_mmCIF['_entity_poly.type'])*['polypeptide(L)']:
                continue

            ## biounit not monomeric
            if d_mmCIF['_pdbx_struct_assembly.oligomeric_details'] != len(d_mmCIF['_pdbx_struct_assembly.oligomeric_details'])*['monomeric']:
                continue

            ## one polymer in assymetric unit
            if len(d_mmCIF['_entity_poly.entity_id']) > 1:
                continue

            print(pdb)

            ##
            ## calculate center of mass
            ##
            center_of_mass = numpy.array([0.,0.,0.,])
            l_coords = []
            l_masses = []
            for i_atom_site in range(len(d_mmCIF['_atom_site.id'])):
                ## atoms of the polymer only
                if d_mmCIF['_atom_site.label_entity_id'][i_atom_site] not in d_mmCIF['_entity_poly.entity_id']:
                    continue
                element = d_mmCIF['_atom_site.type_symbol'][i_atom_site]
                ## only do heavy atoms
                if element == 'H':
                    continue
                ## element without a tabulated mass
                if element not in d_mass:
                    print('%s %s' %(pdb, element,))
                    continue
                mass = d_mass[element]
                l_masses += [mass]
                x = float(d_mmCIF['_atom_site.Cartn_x'][i_atom_site])
                y = float(d_mmCIF['_atom_site.Cartn_y'][i_atom_site])
                z = float(d_mmCIF['_atom_site.Cartn_z'][i_atom_site])
                coord = numpy.array([x,y,z,])
                l_coords += [coord]
                center_of_mass += mass*coord

            ## nothing to average over
            if not l_masses:
                continue

            total_mass = sum(l_masses)
            center_of_mass /= total_mass

            ##
            ## calculate radius of gyration
            ##
            sum_r = 0
            for coord, mass in zip(l_coords, l_masses):
                sq_dist_from_center_of_mass = sum((coord-center_of_mass)**2)
                sum_r += mass*sq_dist_from_center_of_mass
            radius_of_gyration = math.sqrt(sum_r/total_mass)

            print('%s %s %s' %(pdb, center_of_mass, radius_of_gyration,))

            ## append immediately so that an interrupted run loses nothing
            with open(fn_radii,'a') as fd:
                fd.write('%s %s\n' %(pdb,radius_of_gyration,))

            d_radii[pdb] = radius_of_gyration

    ##
    ## write calculated radii of gyration to file
    ##
    lines_out = [
        '%s %s\n' %(pdb,radius_of_gyration,)
        for pdb,radius_of_gyration in d_radii.items()
        ]
    with open(fn_radii,'w') as fd:
        fd.writelines(lines_out)

    return
def one_polysaccharide(pdb,):
    """Return True if *pdb* contains exactly one polymeric sugar and no sugar monomer.

    Three conditions must hold:

    * at least one ``_chem_comp.type`` is a saccharide type;
    * the 'polymer' entities whose description starts with 'SUGAR (' add up
      to exactly one molecule (``_entity.pdbx_number_of_molecules``);
    * no 'non-polymer' entity has a description starting with 'SUGAR'.

    For the entry '1dl2' the intermediate values are printed and the bare
    name ``stop`` is evaluated (left in place from the original).
    """

    d = parse_mmCIF.main(
        pdb,
        l_data_categories = ['_entity','_chem_comp','_entity_poly',],
        )

    l_saccharide_types = [
        'd-saccharide 1,4 and 1,4 linking', # 3amm
        'l-saccharide','d-saccharide','saccharide',
        ]

    ## any saccharide component present?
    bool_polysaccharide = False
    if '_chem_comp.type' in d:
        bool_polysaccharide = any(
            chem_comp_type.lower() in l_saccharide_types
            for chem_comp_type in d['_chem_comp.type']
            )

    count_polymer_sugar = 0
    ## included to exclude 1a14 which contains polymers and monomers
    bool_monosaccharide = False
    for i, entity_type in enumerate(d['_entity.type']):
        if entity_type == 'polymer':
            ## polymeric sugar: add its number of molecules
            if d['_entity.pdbx_description'][i].startswith('SUGAR ('):
                count_polymer_sugar += int(d['_entity.pdbx_number_of_molecules'][i])
        elif entity_type == 'non-polymer':
            ## monomeric sugar
            if d['_entity.pdbx_description'][i].startswith('SUGAR'):
                bool_monosaccharide = True

    bool_append = bool(
        not bool_monosaccharide
        and bool_polysaccharide
        and count_polymer_sugar == 1
        )

    ## debugging trap for a single entry
    if pdb == '1dl2':
        print(count_polymer_sugar)
        print(bool_append)
        ## NOTE(review): `stop` is not defined in the visible code;
        ## presumably a deliberate NameError to halt the run
        stop

    return bool_append
def main():
    """Report the first negative B-factor of each listed X-ray structure.

    The PDB IDs are the first column of the non-empty, non-comment lines of
    'remediation_negativeBiso.txt'.  The sub-directories of the mmCIF mirror
    whose names are not smaller than ``sys.argv[-1]`` are walked and every
    listed, non-gzipped entry is parsed.  For X-ray structures the atoms are
    scanned in file order; at the first non-hydrogen atom of a standard
    amino acid with a B-factor below -0.01 a tab separated line (ID, year,
    atom id, residue, element, B-factor, resolution, solution program,
    refinement program) is appended to 'remediation_negativeBiso.txt' and
    the scan of that entry stops.

    Changes relative to the previous version:

    * 'LEU' was missing from the list of standard residues, so negative
      B-factors in leucines were never reported.
    * the PDB IDs are kept in a set for the membership test and files are
      closed by context managers.

    NOTE(review): the position of the ``break`` (directly after the line is
    written) was reconstructed from collapsed source -- confirm.

    Returns:
        None
    """

    l_std_res_names = [
        'ALA','CYS','ASP','GLU','PHE',
        'GLY','HIS','ILE','LYS','LEU',
        'MET','ASN','PRO','GLN','ARG',
        'SER','THR','VAL','TRP','TYR',
        ]

    with open('remediation_negativeBiso.txt', 'r') as fd:
        lines = fd.readlines()

    ## PDB IDs to check
    s_pdbs = set()
    for line in lines:
        if line.strip() == '':
            continue
        if line[0] == '#':
            continue
        s_pdbs.add(line.strip().split()[0])

    path = '/media/WDMyBook1TB/2TB/mmCIF'

    l_dns = os.listdir(path)
    l_dns.sort()
    for i, dn in enumerate(l_dns):
        ## resume from the directory given on the command line
        if dn < sys.argv[-1]:
            continue
        if not os.path.isdir('%s/%s' % (path, dn)):
            continue
        print('%s/%s %s' % (i + 1, len(l_dns), dn))
        l_fns = os.listdir('%s/%s' % (path, dn))
        l_fns.sort()
        for fn in l_fns:
            if fn[-3:] == '.gz':
                continue
            pdb = fn[0:4]
            if pdb not in s_pdbs:
                continue
            print(pdb)
            with open('%s/%s/%s' % (path, dn, fn), 'r') as fd:
                lines = fd.readlines()

            d_mmCIF = parse_mmCIF.main(
                pdb,
                lines,
                d_breaks_negation={
                    ## break if not x-ray diffraction
                    '_exptl.method': 'X-RAY DIFFRACTION',
                },
                l_data_categories=[
                    ## parse selected data categories
                    '_database_PDB_rev',
                    '_computing',
                    '_atom_site',
                    '_refine'
                ],
            )

            if d_mmCIF['_exptl.method'] != ['X-RAY DIFFRACTION']:
                continue

            print(pdb)

            ##
            ## parse bfactors
            ##
            for i_atom_site in range(len(d_mmCIF['_atom_site.id'])):

                bfactor = float(
                    d_mmCIF['_atom_site.B_iso_or_equiv'][i_atom_site])
                element = d_mmCIF['_atom_site.type_symbol'][i_atom_site]
                comp_id = d_mmCIF['_atom_site.label_comp_id'][i_atom_site]

                ## small tolerance around zero
                if bfactor >= -0.01:
                    continue
                ## heavy atoms of standard amino acids only
                if element == 'H' or comp_id not in l_std_res_names:
                    continue

                print('')
                print('negative')
                print('')

                year = int(d_mmCIF['_database_PDB_rev.date'][0][:4])
                atom_id = int(d_mmCIF['_atom_site.id'][i_atom_site])
                refinement = ''.join(
                    d_mmCIF['_computing.structure_refinement'])
                solution = ''.join(
                    d_mmCIF['_computing.structure_solution'])
                resolution = float(''.join(
                    d_mmCIF['_refine.ls_d_res_high']))

                with open('remediation_negativeBiso.txt', 'a') as fd:
                    fd.write(
                        '%4s\t%4i\t%4i\t%3s\t%2s\t%6.2f\t%6.2f\t%30s\t%20s\n' % (
                            pdb,
                            year,
                            atom_id,
                            comp_id,
                            element,
                            bfactor,
                            resolution,
                            solution.ljust(30),
                            refinement.ljust(20),
                        ))

                ## one report per entry
                break

    return
def main():
    """Tabulate the value returned by matthews_coefficient.main per space group.

    The sub-directories of '/data/mmCIF' whose names lie between
    ``sys.argv[-2]`` and ``sys.argv[-1]`` are walked.  Entries are kept only
    if they are X-ray structures whose polymers are all 'polypeptide(L)',
    whose assemblies all have an oligomeric count of '1', and which have a
    ``_cell.Z_PDB`` item.  For each remaining entry ``MV`` is calculated
    from the unit cell, Z and the summed formula weights of all entities
    (presumably the Matthews coefficient -- confirm against
    ``matthews_coefficient``) and collected per space group.  Mean and
    standard error per space group are written to 'MV_v_spacegroup.txt'.

    NOTE(review): the indentation of this function was reconstructed from
    collapsed source.  In particular the names ``stop_treshold``, ``stop``
    and ``stop_difference`` are placed after a ``continue`` here, where they
    are unreachable; if any of them originally sat one level further out it
    would abort the run (they are not defined in the visible code).  The
    filters marked 'tmp!!!' currently restrict the run to three cubic space
    groups.
    """

    ## space group -> list of MV values
    d_MV = {}

    path = '/data/mmCIF'
    l_dn = os.listdir(path)
    l_dn.sort()
    for dn in l_dn:
        if dn == 'mmCIF.py':
            continue
        ## restrict to the directory range given on the command line
        if dn < sys.argv[-2]:
            continue
        if dn > sys.argv[-1]:
            continue
        l_fn = os.listdir('%s/%s' % (path, dn))
        for fn in l_fn:
            pdb = fn[:4]
            ## if pdb.upper() not in s_pdbs:
            ##     continue

            d_mmCIF = parse_mmCIF.main(
                pdb,
                d_breaks={'_exptl.method': 'SOLUTION NMR'},
                l_data_categories=[
                    '_cell',
                    '_entity',
                    '_exptl',
                    '_exptl_crystal',
                    '_entity_poly',
                    '_symmetry',
                    ## virus
                    '_pdbx_struct_assembly',
                    ## split structure
                    '_pdbx_database_related',
                ],
            )

            ## x-ray structure
            if d_mmCIF['_exptl.method'] != ['X-RAY DIFFRACTION']:
                continue

            ## polymer present
            if not '_entity_poly.type' in d_mmCIF.keys():
                continue

            ## only polymer present is protein
            if d_mmCIF['_entity_poly.type'] != len(
                    d_mmCIF['_entity_poly.type']) * ['polypeptide(L)']:
                continue

            if not '_pdbx_struct_assembly.oligomeric_count' in d_mmCIF.keys():
                continue

            ## oligomeric count unknown for every assembly
            if d_mmCIF['_pdbx_struct_assembly.oligomeric_count'] == len(
                    d_mmCIF['_pdbx_struct_assembly.oligomeric_count']) * ['?']:
                continue

            ## virus
            if int(d_mmCIF['_pdbx_struct_assembly.oligomeric_count']
                   [0]) % 60 == 0:
                continue

            ## not monomer
            if d_mmCIF['_pdbx_struct_assembly.oligomeric_count'] != len(
                    d_mmCIF['_pdbx_struct_assembly.oligomeric_count']) * ['1']:
                continue

            ## split structure
            ## NOTE(review): this tests a bare category name, whereas all
            ## other look-ups use 'category.item' keys -- confirm that
            ## parse_mmCIF provides such a key
            if '_pdbx_database_related' in d_mmCIF.keys():
                if 'split' in d_mmCIF['_pdbx_database_related']:
                    continue
                if 'SPLIT' in d_mmCIF['_pdbx_database_related']:
                    print pdb
                    stop

            if not '_cell.Z_PDB' in d_mmCIF.keys():
                continue

            ## entries excluded by hand
            if pdb in [
                    ## treshold
                    '1e54',
                    '1e9i',
                    ## difference between calculated MV and MV in mmCIF
                    '3eiq',
                    ## The crystals diffracted to 1.7Angstrom and appeared to be I centered tetragonal with
                    ## unit cell dimension a=198.42Angstrom and c=396.6Angstrom, however the data only merged successfully in P1
                    ## unit cell a=196.61 b=196.48 c=240.63 alpha=65.91 beta=65.91 gamma=90.01.
                    ## Toscana has published with Hellinga...
                    '2cjf',
                    '2bt4',
            ]:
                continue

            ## if not ''.join(d_mmCIF['_symmetry.space_group_name_H-M']) in [
            ##     'P 1','P 43 21 2','P 21 3','P 42 3 2','C 1 2 1','F 2 3','P 64 2 2','H 3',
            ##     ]:
            ##     continue ## tmp!!!

            ## unit cell (first row of each item)
            a = float(d_mmCIF['_cell.length_a'][0])
            b = float(d_mmCIF['_cell.length_b'][0])
            c = float(d_mmCIF['_cell.length_c'][0])
            alpha = float(d_mmCIF['_cell.angle_alpha'][0])
            beta = float(d_mmCIF['_cell.angle_beta'][0])
            gamma = float(d_mmCIF['_cell.angle_gamma'][0])
            Z = int(d_mmCIF['_cell.Z_PDB'][0])

            ## summed formula weight of all entities
            ## (the restriction to polymers is commented out)
            mw = 0
            for i in range(len(d_mmCIF['_entity.id'])):
                ## if d_mmCIF['_entity.type'][i] == 'polymer':
                s = d_mmCIF['_entity.formula_weight'][i]
                ## unknown ligand
                if s == '?':
                    continue
                mw += float(s)

            MV = matthews_coefficient.main(a, b, c, alpha, beta, gamma, mw, Z)

            spacegroup = ''.join(d_mmCIF['_symmetry.space_group_name_H-M'])

            if spacegroup not in [
                    'F 4 3 2',
                    'F 41 3 2',
                    'I 41 3 2',
            ]:
                continue  ## tmp!!!

            ## implausibly large value: print diagnostics and skip the entry
            if MV > 10:
                print pdb
                print 'mw', mw
                print 'MV', MV, d_mmCIF['_exptl_crystal.density_Matthews']
                print 'Z', Z
                import math
                ## angles are converted from degrees to radians in place
                alpha *= math.pi / 180.
                beta *= math.pi / 180.
                gamma *= math.pi / 180.
                ## volume of a general (triclinic) unit cell
                V = a * b * c * math.sqrt(
                    1 - math.cos(alpha)**2 - math.cos(beta)**2 -
                    math.cos(gamma)**2 + 2 *
                    (math.cos(alpha) * math.cos(beta) * math.cos(gamma)))
                print 'V', V
                continue
                ## unreachable as placed here (see docstring)
                stop_treshold
                stop

            ## compare with the value deposited in the file, if any
            if '_exptl_crystal.density_Matthews' in d_mmCIF.keys():
                if d_mmCIF['_exptl_crystal.density_Matthews'] not in [
                    ['?'],
                        len(d_mmCIF['_exptl_crystal.density_Matthews']) * ['?'],
                ]:
                    ## skip entries that differ by more than 1
                    if abs(MV -
                           float(d_mmCIF['_exptl_crystal.density_Matthews'][0])
                           ) > 1:
                        print 'MV', MV
                        print 'MV', d_mmCIF['_exptl_crystal.density_Matthews']
                        print 'mw', mw
                        print 'Z', Z
                        continue
                        ## unreachable as placed here (see docstring)
                        stop_difference

            if not spacegroup in d_MV.keys():
                d_MV[spacegroup] = []
            d_MV[spacegroup] += [MV]

            print pdb, round(MV, 2), spacegroup

    ## fd = open('MV_v_spacegroup.txt','w')
    ## fd.write(str(d_MV))
    ## fd.close()

    ## NOTE(review): the header says MV_stddev, but the standard error is
    ## what is written in that column
    l = ['# MV_average MV_stddev n spacegroup\n']
    for spacegroup in d_MV.keys():
        l_MV = d_MV[spacegroup]
        ## no spread from a single value
        if len(l_MV) <= 1:
            continue
        average, stddev = statistics.do_stddev(l_MV)
        average, stderr = statistics.do_stderr(l_MV)
        ## l += ['%s %s %s %s\n' %(average,stddev,len(l_MV),spacegroup,)]
        l += ['%s %s %s %s\n' % (
            average,
            stderr,
            len(l_MV),
            spacegroup,
        )]

    fd = open('MV_v_spacegroup.txt', 'w')
    fd.writelines(l)
    fd.close()

    return
def get_position_ligand(pdb,pdb_apo,d_apo2holo,):

    """Return the ligand centroid and bookkeeping data for an apo/holo pair.

    The holo partner and ligand name are looked up in d_apo2holo[pdb_apo].
    Both structures are parsed, their alpha carbon coordinate lists are
    trimmed to the residue range observed in both, and the centroid of the
    holo ligand HETATM coordinates is calculated.

    Returns the tuple
    (position_ligand, chain, n_residues, n_atoms, ligand, overlap_site),
    where chain and n_residues describe whichever structure pdb refers to
    (apo if pdb == pdb_apo, otherwise holo).

    The bare name `stop` is the file's idiom for halting (NameError) on
    unexpected data: more than one binding site record for the ligand, or
    no binding site residues at all.
    """

    entry = d_apo2holo[pdb_apo]
    pdb_holo = entry['holo']

    cif_holo = parse_mmCIF.main(pdb_holo,)
    d_coords, ca_holo = mmCIF2coords.main(pdb_holo,cif_holo)

    ##
    ## ligand binding site residues (and their sequence neighbours)
    ##
    ligand = entry['ligand']
    site_label = 'BINDING SITE FOR RESIDUE %s' %(ligand)
    l_residues = []
    for i_site, site_id in enumerate(cif_holo['_struct_site.id']):
        if site_label not in cif_holo['_struct_site.details'][i_site]:
            continue
        ## a second matching site record is unexpected
        if l_residues:
            print('%s %s %s' %(pdb, pdb_apo, pdb_holo))
            print(l_residues)
            print(cif_holo['_struct_site.details'][i_site])
            stop
        for i_gen, gen_site_id in enumerate(cif_holo['_struct_site_gen.site_id']):
            if gen_site_id != site_id:
                continue
            seq_id = int(cif_holo['_struct_site_gen.auth_seq_id'][i_gen])
            ## include neighboring residues
            l_residues.extend([seq_id-1, seq_id, seq_id+1])
        l_residues = list(set(l_residues))

    if not l_residues:
        print(pdb)
        stop

    ##
    ## ligand coordinates
    ##
    l_coords_ligand = []
    for i_atom, atom_id in enumerate(cif_holo['_atom_site.id']):
        if cif_holo['_atom_site.group_PDB'][i_atom] != 'HETATM':
            continue
        if cif_holo['_atom_site.label_comp_id'][i_atom] != ligand:
            continue
        l_coords_ligand.append(numpy.array([
            float(cif_holo['_atom_site.Cartn_x'][i_atom]),
            float(cif_holo['_atom_site.Cartn_y'][i_atom]),
            float(cif_holo['_atom_site.Cartn_z'][i_atom]),
            ]))

    cif_apo = parse_mmCIF.main(pdb_apo,)
    d_coords, ca_apo = mmCIF2coords.main(pdb_apo,cif_apo)

    ##
    ## sequential alignment on the observed residues
    ## works also when residues are missing at either terminus
    ## (e.g. 2d59 and 2d5a)
    ##
    def index_first_observed(l_mon_ids):
        ## index of the first entry that is not '?'
        return next(i for i, mon_id in enumerate(l_mon_ids) if mon_id != '?')

    def index_end_observed(l_mon_ids):
        ## one past the index of the last entry that is not '?'
        n_trailing = next(
            i for i, mon_id in enumerate(reversed(l_mon_ids)) if mon_id != '?'
            )
        return len(l_mon_ids)-n_trailing

    first_apo = index_first_observed(cif_apo['_pdbx_poly_seq_scheme.pdb_mon_id'])
    first_holo = index_first_observed(cif_holo['_pdbx_poly_seq_scheme.pdb_mon_id'])
    end_apo = index_end_observed(cif_apo['_pdbx_poly_seq_scheme.pdb_mon_id'])
    end_holo = index_end_observed(cif_holo['_pdbx_poly_seq_scheme.pdb_mon_id'])

    ## first/last residue present in both coordinate lists
    start_apo = max(0,first_holo-first_apo)
    start_holo = max(0,first_apo-first_holo)
    stop_apo = len(ca_apo)+min(0,end_holo-end_apo)
    stop_holo = len(ca_holo)+min(0,end_apo-end_holo)

    ca_apo = ca_apo[start_apo:stop_apo]
    ca_holo = ca_holo[start_holo:stop_holo]

    if pdb == pdb_apo:
        cif_ref, ca_ref = cif_apo, ca_apo
        start_ref, stop_ref = start_apo, stop_apo
    else:
        cif_ref, ca_ref = cif_holo, ca_holo
        start_ref, stop_ref = start_holo, stop_holo
    ## l_seq_num was only used by the disabled overlap calculation
    l_seq_num = cif_ref['_pdbx_poly_seq_scheme.pdb_seq_num'][start_ref:stop_ref]
    chain = ''.join(cif_ref['_entity_poly.pdbx_strand_id'])
    n_residues = len(ca_ref)

    ## The calculation of the overlap between the lowest normal mode and the
    ## apo-to-holo difference vector at the binding site is disabled in this
    ## version, so a constant is returned in its place.
    overlap_site = 1.

    position_ligand = sum(l_coords_ligand)/len(l_coords_ligand)
    n_atoms = len(l_coords_ligand)

    return position_ligand, chain, n_residues, n_atoms, ligand, overlap_site
def _polymers_are_single_copy(d_mmCIF):

    """Return True if every polymer entity has exactly one molecule."""

    for i_entity in range(len(d_mmCIF['_entity.id'])):
        if d_mmCIF['_entity.type'][i_entity] != 'polymer':
            continue
        if int(d_mmCIF['_entity.pdbx_number_of_molecules'][i_entity]) != 1:
            return False

    return True


def _append_text(fn, text):

    """Append text to the file fn and close the file again."""

    with open(fn,'a') as fd:
        fd.write(text)

    return


def main():

    """Relate the RMSD of structure pairs to normal mode overlaps.

    For every sequence cluster (one line of bc-100.out) all pairs of
    accepted structures with identical space groups are superimposed, and
    the overlap between the mode at index 6 of each structure and the
    difference vector between the two structures is appended to
    rmsd_v_overlap2/cluster<N>*.txt.

    The bare name `stop` is the file's idiom for a deliberate halt
    (NameError) and is kept wherever the original used it.
    """

    set_pdbs = exclude_include()

    ## structures excluded because of problems with their mmCIF annotation
    l_pdbs_remove = [
        '4a3h','2wf5','1arl','1ee3', ## incorrect _struct_ref_seq.pdbx_db_accession
        '1uyd','1uye','1uyf','2byh','2byi', ## remediation _struct_ref_seq_dif
        '2xdu','3dn8','3dna','1ps3','1ouf','1l35','2eun','1rtc','1zon', ## _struct_ref_seq_dif missing
        '1pwl','1pwm','2fz8','2fz9', ## remediation incorrect _struct_ref.pdbx_seq_one_letter_code
        '1f92', ## remediation _struct_ref_seq_dif incorrect residue number
        '2f6f', ## remediation _pdbx_poly_seq_scheme.auth_mon_id wrong
        '3a5j', ## remediation _struct_ref_seq_dif.db_mon_id is ? but should be MET
        '2rhx', ## remediation _struct_ref_seq_dif.db_mon_id is ? but should be SER
        '2fzb','2fzd','3dn5','1x96','1x97','1x98', ## remediation incorrect _struct_ref.pdbx_seq_one_letter_code
        '1z3n','1z8a','1z89', ## GenBank DBref - not an error...
        '2pf8', ## alt_ids misused (C for highest occupancy and only altloc)
        '2pyr', ## alt_ids misused (G and R)
        '3pdn', ## alt_ids misused (B and C)
        '2v4c', ## alt_id B used for 100% occupancy atoms
        '1jxt','1jxu','1jxw','1jxx','1jxy', ## weird alt_id microheterogeneity...
        ]
    for pdb in l_pdbs_remove:
        ## an entry that is already absent is not an error
        if pdb in set_pdbs:
            set_pdbs.remove(pdb)

    with open('%s/bc-100.out' %(path_mmCIF),'r') as fd:
        lines = fd.readlines()

    for i_line, line in enumerate(lines):

        cluster = i_line
        ## NOTE(review): hard-coded resume point; earlier clusters are skipped
        if cluster < 4816:
            continue

        ## sorting before or after truncation to the 4 character ID
        ## yields the same list
        l_pdbs = sorted(s[:4] for s in line.lower().split())

        for i_pdb1 in range(len(l_pdbs)-1):

            pdb1 = l_pdbs[i_pdb1]

            if pdb1 not in set_pdbs:
                continue

            print(pdb1)
            ## NOTE(review): unconditional debugging halt kept from the
            ## original; nothing below runs until it is removed
            stop

            d_mmCIF1 = parse_mmCIF.main(pdb1,)

            bool_monomeric = check_monomeric(d_mmCIF1)
            if bool_monomeric == False:
                if i_pdb1 == 0:
                    break
                else:
                    continue

            bool_remediation_modres = check_modres(d_mmCIF1,pdb1,)
            if bool_remediation_modres == True:
                continue

            if '_struct_ref_seq_dif.details' in d_mmCIF1:
                if 'DELETION' in d_mmCIF1['_struct_ref_seq_dif.details']:
                    continue

            ## unexpected for a structure that passed the monomer check
            if not _polymers_are_single_copy(d_mmCIF1):
                print(d_mmCIF1['_entity.pdbx_number_of_molecules'])
                print('%s %s' %(pdb1, cluster))
                stop

            SG1 = d_mmCIF1['_symmetry.space_group_name_H-M']

            for i_pdb2 in range(i_pdb1+1,len(l_pdbs)):

                pdb2 = l_pdbs[i_pdb2]

                if pdb2 not in set_pdbs:
                    continue

                d_mmCIF2 = parse_mmCIF.main(pdb2,)

                bool_monomeric = check_monomeric(d_mmCIF2)
                if bool_monomeric == False:
                    continue

                bool_remediation_modres = check_modres(d_mmCIF2,pdb2,)
                if bool_remediation_modres == True:
                    continue

                ## test for the item that is read
                ## (the original tested for .seq_num but read .details)
                if '_struct_ref_seq_dif.details' in d_mmCIF2:
                    if 'DELETION' in d_mmCIF2['_struct_ref_seq_dif.details']:
                        continue

                ## biounit monomeric?
                ## (the original `continue` only advanced the entity loop
                ## and therefore never skipped the pair)
                if not _polymers_are_single_copy(d_mmCIF2):
                    continue

                SG2 = d_mmCIF2['_symmetry.space_group_name_H-M']
                if SG1 != SG2:
                    continue

                ## parse coordinates again after being shortened in previous loop
                try:
                    d_coords1, l_coords_alpha1 = mmCIF2coords.main(pdb1, d_mmCIF1)
                except Exception:
                    _append_text(
                        'remediation_atom_site.label_alt_id.txt',
                        '%s\n' %(pdb1,),
                        )
                    ## no pair involving pdb1 can be processed
                    break
                try:
                    d_coords2, l_coords_alpha2 = mmCIF2coords.main(pdb2, d_mmCIF2)
                except Exception:
                    _append_text(
                        'remediation_atom_site.label_alt_id.txt',
                        '%s\n' %(pdb2,),
                        )
                    continue

                ## align sequences/coordinates
                try:
                    l_coords_alpha1, l_coords_alpha2 = create_apo_holo_dataset.sequential_alignment_of_coordinates(
                        l_coords_alpha1, l_coords_alpha2,
                        d_mmCIF1, d_mmCIF2,
                        pdb1, pdb2,
                        )
                except Exception:
                    _append_text(
                        'remediation_struct_ref_seq_dif.txt',
                        '%s %s %s %s\n' %(
                            pdb1,pdb2,
                            d_mmCIF1['_struct_ref_seq.pdbx_db_accession'],
                            d_mmCIF2['_struct_ref_seq.pdbx_db_accession'],
                            ),
                        )
                    continue

                if len(l_coords_alpha1) != len(l_coords_alpha2):
                    print(d_mmCIF1['_pdbx_poly_seq_scheme.pdb_mon_id'])
                    print(d_mmCIF2['_pdbx_poly_seq_scheme.pdb_mon_id'])
                    print('coords %s %s' %(len(l_coords_alpha1), len(l_coords_alpha2)))
                    print('seq %s' %(len(d_mmCIF1['_pdbx_poly_seq_scheme.pdb_mon_id'])))
                    print('seq %s' %(len(d_mmCIF2['_pdbx_poly_seq_scheme.pdb_mon_id'])))
                    print('%s %s' %(pdb1, pdb2))
                    d_coords1, l_coords_alpha1 = mmCIF2coords.main(pdb1, d_mmCIF1)
                    ## structure 2 is parsed with its own ID (was pdb1)
                    d_coords2, l_coords_alpha2 = mmCIF2coords.main(pdb2, d_mmCIF2)
                    print('%s %s' %(len(l_coords_alpha1), len(l_coords_alpha2)))
                    stop

                ##
                ## align structure 1 and 2
                ##
                instance_geometry = geometry.geometry()
                rmsd = instance_geometry.superpose(l_coords_alpha1,l_coords_alpha2)
                tv1 = instance_geometry.fitcenter
                rm = instance_geometry.rotation
                tv2 = instance_geometry.refcenter

                ## apply the superposition to the coordinates of structure 2
                for i_coord in range(len(l_coords_alpha2)):
                    l_coords_alpha2[i_coord] = numpy.dot(l_coords_alpha2[i_coord]-tv1,rm)+tv2

                ##
                ## vector from structure 1 to 2 (x,y,z per residue, flattened)
                ##
                vector = numpy.array([
                    l_coords_alpha1[i][k]-l_coords_alpha2[i][k]
                    for i in range(len(l_coords_alpha1))
                    for k in range(3)
                    ])

                ##
                ## calculate normal modes of both structures
                ##
                cutoff = 10
                try:
                    matrix_hessian1 = NMA.hessian_calculation(l_coords_alpha1, cutoff, verbose = False)
                    eigenvectors1, eigenvalues1 = NMA.diagonalize_hessian(matrix_hessian1, verbose = False)
                    matrix_hessian2 = NMA.hessian_calculation(l_coords_alpha2, cutoff, verbose = False)
                    eigenvectors2, eigenvalues2 = NMA.diagonalize_hessian(matrix_hessian2, verbose = False)
                except Exception:
                    continue

                ##
                ## calculate overlap between normal modes and difference vector
                ##
                eigenvector1 = eigenvectors1[6]
                eigenvector2 = eigenvectors2[6]
                overlap1 = calc_overlap(eigenvector1,vector)
                overlap2 = calc_overlap(eigenvector2,vector)
                overlap3a = calc_overlap(eigenvector1,eigenvector2)
                overlap3b = calc_overlap(eigenvectors1[6],eigenvectors2[7])
                overlap3c = calc_overlap(eigenvectors1[7],eigenvectors2[6])
                overlap3 = max(overlap3a,overlap3b,overlap3c)

                _append_text(
                    'rmsd_v_overlap2/cluster%i.txt' %(i_line),
                    '%s %s\n' %(rmsd,overlap1) + '%s %s\n' %(rmsd,overlap2),
                    )
                _append_text(
                    'rmsd_v_overlap2/cluster%i_ev_v_ev.txt' %(i_line),
                    '%s %s\n' %(rmsd,overlap3a),
                    )
                _append_text(
                    'rmsd_v_overlap2/cluster%i_ev_v_ev_max.txt' %(i_line),
                    '%s %s\n' %(rmsd,overlap3),
                    )

                print(
                    '%s %s cluster %s size %s overlap %4.2f %4.2f %4.2f rmsd %4.2f' %(
                        pdb1, pdb2, i_line, len(l_pdbs),
                        round(overlap1,2), round(overlap2,2), round(overlap3,2),
                        round(rmsd,2),
                        )
                    )

    return
def main():

    """Update db_MatthewsCoefficient.txt with new monomeric x-ray structures.

    The existing database (one "pdb value" pair per line) is read first.
    Every mmCIF file in directories sorting at or after sys.argv[-1] that is
    not yet in the database is parsed. Only x-ray structures with a single
    L-polypeptide entity and monomeric assemblies are accepted. The Matthews
    coefficient is taken from _exptl_crystal.density_Matthews or, if that
    is unknown, calculated. Each new value is appended to the database
    immediately and the whole file is rewritten at the end.
    """

    fn_out = 'db_MatthewsCoefficient.txt'

    ## previously stored coefficients
    d = {}
    with open(fn_out,'r') as fd:
        for line in fd:
            l = line.split()
            ## skip blank and incomplete lines instead of crashing
            if len(l) < 2:
                continue
            pdb = l[0]
            v = l[1]
            if pdb == '2p51':
                v = '1.72610466393'
            d[pdb] = v

    ## Matthews Coefficient not calculated...
    set_pdbs_not_calculated = set([
        '1vh7','1vho','1vhu','1vi3','1vi4','1vis',
        ])
    ## Matthews Coefficient *wrong*
    ## (the original list carries "too high" / "too low" remarks)
    set_pdbs_wrong = set([
        '2p51',
        '1c5v','1q9i','1ut6','1x6x','1x6y','1xdn','1y63','1zix',
        '1t95','1jih','1d5t','1c7k',
        '1dbo','1d9x','1qt9','1ia5','1dcq',
        ])

    path = '/media/WDMyBook1TB/2TB/mmCIF'

    l_dns = os.listdir(path)
    l_dns.sort()
    for i, dn in enumerate(l_dns):

        if dn < sys.argv[-1]:
            continue

        if not os.path.isdir('%s/%s' %(path,dn)):
            continue

        print('%s/%s %s' %(i+1,len(l_dns), dn))
        l_fns = os.listdir('%s/%s' %(path,dn))
        l_fns.sort()
        for fn in l_fns:

            if fn[-3:] == '.gz':
                continue

            pdb = fn[0:4]

            ## dictionary lookup instead of a scan of d.keys()
            if pdb in d:
                continue
            if pdb in set_pdbs_not_calculated:
                continue
            if pdb in set_pdbs_wrong:
                continue

            with open('%s/%s/%s' %(path, dn, fn), 'r') as fd:
                lines = fd.readlines()

            d_mmCIF = parse_mmCIF.main(
                pdb,lines,
                d_breaks = {
                    ## break if multiple polymer types (not monomeric)
                    '_entity_poly.entity_id':'2',
                    ## break if multiple chains
                    '_entity_poly.pdbx_strand_id':',',
                    },
                d_breaks_negation = {
                    ## break if not x-ray diffraction
                    '_exptl.method':'X-RAY DIFFRACTION',
                    ## break if not monomeric
                    '_pdbx_struct_assembly.oligomeric_details':'monomeric',
                    },
                l_data_categories = [
                    '_exptl_crystal',
                    ], ## parse selected data categories
                l_data_categories_break = ['_exptl_crystal']
                )

            ## some unknown temporary error... or break before reaching this part when parsing...
            if '_pdbx_struct_assembly.oligomeric_details' not in d_mmCIF:
                continue

            ## NMR structure? deliberate halt (NameError), as in the original
            if d_mmCIF['_exptl.method'] != ['X-RAY DIFFRACTION']:
                stop2

            ## no polymers in structure?
            if '_entity_poly.entity_id' not in d_mmCIF:
                continue

            ## polymer(s) is/are not polypeptide(s)
            if d_mmCIF['_entity_poly.type'] != len(d_mmCIF['_entity_poly.type'])*['polypeptide(L)']:
                continue

            ## biounit not monomeric
            if d_mmCIF['_pdbx_struct_assembly.oligomeric_details'] != len(d_mmCIF['_pdbx_struct_assembly.oligomeric_details'])*['monomeric']:
                continue

            ## one polymer in asymmetric unit
            if len(d_mmCIF['_entity_poly.entity_id']) > 1:
                continue

            ## use the first known value; the original joined all values,
            ## which cannot be parsed if there are several crystals
            l_VM = [
                s for s in d_mmCIF.get('_exptl_crystal.density_Matthews', [])
                if s not in ('?','.',)
                ]
            if not l_VM:
                v = calc_matthews_coefficient.main(pdb)
            else:
                v = float(l_VM[0])

            ## append immediately so an interrupted run loses nothing
            with open(fn_out,'a') as fd:
                fd.write('%s %s\n' %(pdb,v,))

            d[pdb] = v

    ##
    ## write all Matthews coefficients to file
    ##
    lines_out = ['%s %s\n' %(pdb,v,) for pdb,v in d.items()]
    with open(fn_out,'w') as fd:
        fd.writelines(lines_out)

    return
def main():

    """Tabulate the Matthews coefficient (MV) per space group.

    All mmCIF files in the directories between sys.argv[-2] and sys.argv[-1]
    (inclusive, by string comparison) are parsed. Monomeric x-ray structures
    containing only L-polypeptides are kept, MV is calculated from the unit
    cell, Z and the summed entity formula weights, and the average MV and
    its standard error are written per space group to MV_v_spacegroup.txt.
    """

    import math

    d_MV = {}

    path = '/data/mmCIF'

    set_pdbs_skip = set([
        ## treshold
        '1e54','1e9i',
        ## difference between calculated MV and MV in mmCIF
        '3eiq',
        ## The crystals diffracted to 1.7Angstrom and appeared to be I centered tetragonal with
        ## unit cell dimension a=198.42Angstrom and c=396.6Angstrom, however the data only merged successfully in P1
        ## unit cell a=196.61 b=196.48 c=240.63 alpha=65.91 beta=65.91 gamma=90.01.
        ## Toscana has published with Hellinga...
        '2cjf','2bt4',
        ])

    ## tmp!!! only these space groups are tabulated at present
    l_spacegroups = ['F 4 3 2','F 41 3 2','I 41 3 2',]

    l_dn = os.listdir(path)
    l_dn.sort()
    for dn in l_dn:

        if dn == 'mmCIF.py':
            continue
        if dn < sys.argv[-2]:
            continue
        if dn > sys.argv[-1]:
            continue

        l_fn = os.listdir('%s/%s' %(path,dn))
        for fn in l_fn:

            pdb = fn[:4]

            d_mmCIF = parse_mmCIF.main(
                pdb,
                d_breaks = {'_exptl.method':'SOLUTION NMR'},
                l_data_categories = [
                    '_cell','_entity','_exptl','_exptl_crystal',
                    '_entity_poly',
                    '_symmetry',
                    ## virus
                    '_pdbx_struct_assembly',
                    ## split structure
                    '_pdbx_database_related',
                    ],
                )

            ## x-ray structure
            if d_mmCIF['_exptl.method'] != ['X-RAY DIFFRACTION']:
                continue

            ## polymer present
            if '_entity_poly.type' not in d_mmCIF:
                continue

            ## only polymer present is protein
            if d_mmCIF['_entity_poly.type'] != len(d_mmCIF['_entity_poly.type'])*['polypeptide(L)']:
                continue

            if '_pdbx_struct_assembly.oligomeric_count' not in d_mmCIF:
                continue
            l_counts = d_mmCIF['_pdbx_struct_assembly.oligomeric_count']
            if l_counts == len(l_counts)*['?']:
                continue

            ## virus
            if int(l_counts[0]) % 60 == 0:
                continue

            ## not monomer
            if l_counts != len(l_counts)*['1']:
                continue

            ## split structure
            ## The parsed dictionary is keyed by "category.item" everywhere
            ## else in this file, so the content_type item is checked; the
            ## bare category key of the original is kept as a fallback.
            l_related = d_mmCIF.get(
                '_pdbx_database_related.content_type',
                d_mmCIF.get('_pdbx_database_related', []),
                )
            if 'split' in l_related:
                continue
            if 'SPLIT' in l_related:
                ## unexpected spelling; deliberate halt as in the original
                print(pdb)
                stop

            if '_cell.Z_PDB' not in d_mmCIF:
                continue

            if pdb in set_pdbs_skip:
                continue

            a = float(d_mmCIF['_cell.length_a'][0])
            b = float(d_mmCIF['_cell.length_b'][0])
            c = float(d_mmCIF['_cell.length_c'][0])
            alpha = float(d_mmCIF['_cell.angle_alpha'][0])
            beta = float(d_mmCIF['_cell.angle_beta'][0])
            gamma = float(d_mmCIF['_cell.angle_gamma'][0])
            Z = int(d_mmCIF['_cell.Z_PDB'][0])

            ## sum over all entities; unknown ligand weights are skipped
            mw = 0
            for i in range(len(d_mmCIF['_entity.id'])):
                s = d_mmCIF['_entity.formula_weight'][i]
                if s == '?':
                    continue
                mw += float(s)

            MV = matthews_coefficient.main(a,b,c,alpha,beta,gamma,mw,Z)

            spacegroup = ''.join(d_mmCIF['_symmetry.space_group_name_H-M'])

            if spacegroup not in l_spacegroups:
                continue

            ## MV values in the file; None if the item is absent
            l_MV_cif = d_mmCIF.get('_exptl_crystal.density_Matthews')

            ## implausibly large value: report and skip
            if MV > 10:
                print(pdb)
                print('mw %s' %(mw))
                print('MV %s %s' %(MV, l_MV_cif))
                print('Z %s' %(Z))
                cos_a = math.cos(alpha*math.pi/180.)
                cos_b = math.cos(beta*math.pi/180.)
                cos_c = math.cos(gamma*math.pi/180.)
                ## unit cell volume
                V = a*b*c*math.sqrt(1-cos_a**2-cos_b**2-cos_c**2+2*(cos_a*cos_b*cos_c))
                print('V %s' %(V))
                continue

            ## skip if calculated and deposited values differ by more than 1
            ## (first known value is used; the original failed on a leading '?')
            l_MV_known = [s for s in (l_MV_cif or []) if s not in ('?','.',)]
            if l_MV_known:
                if abs(MV-float(l_MV_known[0])) > 1:
                    print('MV %s' %(MV))
                    print('MV %s' %(l_MV_cif))
                    print('mw %s' %(mw))
                    print('Z %s' %(Z))
                    continue

            if spacegroup not in d_MV:
                d_MV[spacegroup] = []
            d_MV[spacegroup] += [MV]

            print('%s %s %s' %(pdb, round(MV,2), spacegroup))

    ## the second column holds the standard error (header said stddev)
    l = ['# MV_average MV_stderr n spacegroup\n']
    for spacegroup in d_MV.keys():
        l_MV = d_MV[spacegroup]
        if len(l_MV) <= 1:
            continue
        average, stderr = statistics.do_stderr(l_MV)
        l += ['%s %s %s %s\n' %(average,stderr,len(l_MV),spacegroup,)]

    with open('MV_v_spacegroup.txt','w') as fd:
        fd.writelines(l)

    return
def main():

    """Update db_authors.txt with the authors of structures not yet listed.

    Each line of the database holds a PDB ID followed by the semicolon
    separated _audit_author.name values. Every mmCIF file in directories
    sorting at or after sys.argv[-1] that is not yet in the database is
    parsed, its line is appended immediately, and the whole database is
    rewritten at the end.
    """

    fn_db = 'db_authors.txt'

    ##
    ## read previously stored authors
    ##
    ## The author string is kept as one string. The original split it into
    ## a list of tokens, which was then written back as the repr of a list
    ## when the database was rewritten.
    d_authors = {}
    with open(fn_db,'r') as fd:
        for line in fd:
            l = line.strip().split(None,1)
            if not l:
                continue ## blank line
            pdb = l[0]
            if len(l) > 1:
                s_authors = l[1]
            else:
                s_authors = ''
            d_authors[pdb] = s_authors

    path = '/media/WDMyBook1TB/2TB/mmCIF'

    l_dns = os.listdir(path)
    l_dns.sort()
    for i, dn in enumerate(l_dns):

        if dn < sys.argv[-1]:
            continue

        if not os.path.isdir('%s/%s' %(path,dn)):
            continue

        print('%s/%s %s' %(i+1,len(l_dns), dn))
        l_fns = os.listdir('%s/%s' %(path,dn))
        l_fns.sort()
        for fn in l_fns:

            if fn[-3:] == '.gz':
                continue

            pdb = fn[0:4]

            if pdb in d_authors:
                continue

            print(pdb)

            with open('%s/%s/%s' %(path, dn, fn), 'r') as fd:
                lines = fd.readlines()

            d_mmCIF = parse_mmCIF.main(
                pdb,lines,
                l_data_categories = [
                    '_audit_author',
                    '_citation_author',
                    ], ## parse selected data categories
                l_data_categories_break = [
                    '_citation_author',
                    ],
                )

            l_authors = d_mmCIF['_audit_author.name']
            s_authors = ';'.join(l_authors)

            ## no authors is unexpected; deliberate halt as in the original
            if l_authors == []:
                print(d_mmCIF['_citation_author.name'])
                print(l_authors)
                stop

            ## append immediately so an interrupted run loses nothing
            with open(fn_db,'a') as fd:
                fd.write('%s %s\n' %(pdb,s_authors,))

            d_authors[pdb] = s_authors

    ##
    ## write to file
    ##
    lines_out = [
        '%s %s\n' %(pdb,s_authors,) for pdb,s_authors in d_authors.items()
        ]
    with open(fn_db,'w') as fd:
        fd.writelines(lines_out)

    return
def main():

    """Record the first negative protein atom B-factor of listed structures.

    The PDB IDs in the first column of remediation_negativeBiso.txt are
    read (blank lines and lines starting with '#' are ignored). For each of
    those IDs found in directories sorting at or after sys.argv[-1], the
    x-ray structure is parsed and the first non-hydrogen atom of a standard
    amino acid with a B-factor below -0.01 is appended to the same file
    together with year, resolution and software.

    NOTE(review): the nesting was reconstructed from a source without
    indentation; it is assumed that the record is written only for atoms
    passing the element/residue filter.
    """

    fn_log = 'remediation_negativeBiso.txt'

    set_pdbs = set()
    with open(fn_log,'r') as fd:
        for line in fd:
            if line.strip() == '':
                continue
            if line[0] == '#':
                continue
            set_pdbs.add(line.strip().split()[0])

    ## the 20 standard amino acids (LEU was missing from the original list)
    set_residues = set([
        'ALA','CYS','ASP','GLU','PHE','GLY','HIS','ILE','LYS','LEU',
        'MET','ASN','PRO','GLN','ARG','SER','THR','VAL','TRP','TYR',
        ])

    path = '/media/WDMyBook1TB/2TB/mmCIF'

    l_dns = os.listdir(path)
    l_dns.sort()
    for i, dn in enumerate(l_dns):

        if dn < sys.argv[-1]:
            continue

        if not os.path.isdir('%s/%s' %(path,dn)):
            continue

        print('%s/%s %s' %(i+1,len(l_dns), dn))
        l_fns = os.listdir('%s/%s' %(path,dn))
        l_fns.sort()
        for fn in l_fns:

            if fn[-3:] == '.gz':
                continue

            pdb = fn[0:4]

            if pdb not in set_pdbs:
                continue

            print(pdb)

            with open('%s/%s/%s' %(path, dn, fn), 'r') as fd:
                lines = fd.readlines()

            d_mmCIF = parse_mmCIF.main(
                pdb,lines,
                d_breaks_negation = {
                    ## break if not x-ray diffraction
                    '_exptl.method':'X-RAY DIFFRACTION',
                    },
                l_data_categories = [
                    ## parse selected data categories
                    '_database_PDB_rev',
                    '_computing',
                    '_atom_site',
                    '_refine'
                    ],
                )

            if d_mmCIF['_exptl.method'] != ['X-RAY DIFFRACTION']:
                continue

            print(pdb)

            ##
            ## parse bfactors
            ##
            for i_atom_site in range(len(d_mmCIF['_atom_site.id'])):

                s_bfactor = d_mmCIF['_atom_site.B_iso_or_equiv'][i_atom_site]
                ## unknown B-factor cannot be converted
                if s_bfactor in ('?','.',):
                    continue
                bfactor = float(s_bfactor)
                if bfactor >= -0.01:
                    continue

                element = d_mmCIF['_atom_site.type_symbol'][i_atom_site]
                comp_id = d_mmCIF['_atom_site.label_comp_id'][i_atom_site]
                if element == 'H' or comp_id not in set_residues:
                    continue

                print('')
                print('negative')
                print('')

                year = int(d_mmCIF['_database_PDB_rev.date'][0][:4])
                atom_id = int(d_mmCIF['_atom_site.id'][i_atom_site])
                refinement = ''.join(d_mmCIF['_computing.structure_refinement'])
                solution = ''.join(d_mmCIF['_computing.structure_solution'])
                resolution = float(''.join(d_mmCIF['_refine.ls_d_res_high']))

                with open(fn_log,'a') as fd:
                    fd.write(
                        '%4s\t%4i\t%4i\t%3s\t%2s\t%6.2f\t%6.2f\t%30s\t%20s\n' %(
                            pdb,year,atom_id,
                            comp_id,element,bfactor,resolution,
                            solution.ljust(30),refinement.ljust(20),
                            )
                        )

                ## one record per structure
                break

    return
def _dihedrals_classify_omega(omega):

    """Return 'cis', 'trans' or None for an omega dihedral.

    None is returned if omega is None or falls between the two ranges.
    The thresholds 30 and 150 presumably are degrees - confirm against
    calc_dihedral.
    """

    ## explicit None test; the original truth test discarded omega == 0
    if omega is None:
        return None
    if abs(omega) <= 30:
        return 'cis'
    if abs(omega) >= 150:
        return 'trans'
    ## e.g. 12e8 PRO196D, 1a44 GLU82A
    return None


def _dihedrals_sequences(d_mmCIF):

    """Return {entity_id: [{'res_no':..., 'res_name':...}, ...]} from _entity_poly_seq."""

    d_sequence = {}
    for i in range(len(d_mmCIF['_entity_poly_seq.entity_id'])):
        entity_id = int(d_mmCIF['_entity_poly_seq.entity_id'][i])
        d_sequence.setdefault(entity_id, []).append({
            'res_no': int(d_mmCIF['_entity_poly_seq.num'][i]),
            'res_name': d_mmCIF['_entity_poly_seq.mon_id'][i],
            })

    return d_sequence


def _dihedrals_polypeptide_entities(d_mmCIF, pdb):

    """Return the entity IDs of all polypeptide(L) entities."""

    l_entities_poly = []
    for i in range(len(d_mmCIF['_entity_poly.entity_id'])):
        ## skip if not polypeptide
        if d_mmCIF['_entity_poly.type'][i] != 'polypeptide(L)':
            continue
        ## nonstandard linkages are unexpected; deliberate halt
        if d_mmCIF['_entity_poly.nstd_linkage'][i] == 'yes':
            print(pdb)
            stop
        l_entities_poly += [int(d_mmCIF['_entity_poly.entity_id'][i])]

    return l_entities_poly


def _dihedrals_coords(d_mmCIF, l_entities_poly, pdb):

    """Return {entity_id: {chain: {res_no: {atom_name: coordinate}}}} for model 1."""

    d_coords = {}
    for i in range(len(d_mmCIF['_atom_site.id'])):

        entity_id = int(d_mmCIF['_atom_site.label_entity_id'][i])
        ## not a polymer
        if entity_id not in l_entities_poly:
            continue
        d_chains = d_coords.setdefault(entity_id, {})

        model = int(d_mmCIF['_atom_site.pdbx_PDB_model_num'][i])
        if model > 1:
            continue

        chain = d_mmCIF['_atom_site.label_asym_id'][i]
        d_residues = d_chains.setdefault(chain, {})
        res_no = int(d_mmCIF['_atom_site.label_seq_id'][i])
        d_atoms = d_residues.setdefault(res_no, {})

        atom_name = d_mmCIF['_atom_site.label_atom_id'][i]
        ## only the first alternate location is used
        altloc = d_mmCIF['_atom_site.label_alt_id'][i]
        if altloc not in ['.','A','1',]:
            continue

        ## skip if zero occupancy
        occupancy = float(d_mmCIF['_atom_site.occupancy'][i])
        if altloc == '.' and occupancy == 0:
            continue

        ## a duplicate backbone atom is unexpected; deliberate halt
        if atom_name in ['CA','C','O','N',] and atom_name in d_atoms:
            print('%s %s %s %s' %(pdb, chain, res_no, atom_name))
            print('%s %s' %(
                d_mmCIF['_atom_site.Cartn_x'][i],
                d_mmCIF['_atom_site.Cartn_y'][i],
                ))
            print(d_atoms[atom_name])
            stop

        d_atoms[atom_name] = numpy.array([
            float(d_mmCIF['_atom_site.Cartn_x'][i]),
            float(d_mmCIF['_atom_site.Cartn_y'][i]),
            float(d_mmCIF['_atom_site.Cartn_z'][i]),
            ])

    return d_coords


def _dihedrals_helices(d_mmCIF, pdb):

    """Return {chain: {res_no: helix_class}}; turns (TURN_P) get class 99."""

    d_helices = {}
    ## helices or turns present?
    if '_struct_conf.id' not in d_mmCIF:
        return d_helices

    for i in range(len(d_mmCIF['_struct_conf.id'])):
        chain1 = d_mmCIF['_struct_conf.beg_label_asym_id'][i]
        chain2 = d_mmCIF['_struct_conf.end_label_asym_id'][i]
        res_no1 = int(d_mmCIF['_struct_conf.beg_label_seq_id'][i])
        res_no2 = int(d_mmCIF['_struct_conf.end_label_seq_id'][i])
        conf_type_id = d_mmCIF['_struct_conf.conf_type_id'][i]
        if chain1 != chain2:
            print('%s %s %s' %(chain1, chain2, pdb))
            stop
        if conf_type_id == 'HELX_P':
            helix_class = int(d_mmCIF['_struct_conf.pdbx_PDB_helix_class'][i])
        elif conf_type_id == 'TURN_P':
            helix_class = 99
        else:
            print(conf_type_id)
            print(pdb)
            stop
        d_chain = d_helices.setdefault(chain1, {})
        for res_no in range(res_no1,res_no2+1,):
            d_chain[res_no] = helix_class

    return d_helices


def _dihedrals_sheets(d_mmCIF, pdb):

    """Return {chain: set of residue numbers in sheets}."""

    d_sheets = {}
    ## sheet present?
    if '_struct_sheet_range.sheet_id' not in d_mmCIF:
        return d_sheets

    for i in range(len(d_mmCIF['_struct_sheet_range.sheet_id'])):
        chain1 = d_mmCIF['_struct_sheet_range.beg_label_asym_id'][i]
        chain2 = d_mmCIF['_struct_sheet_range.end_label_asym_id'][i]
        res_no1 = int(d_mmCIF['_struct_sheet_range.beg_label_seq_id'][i])
        res_no2 = int(d_mmCIF['_struct_sheet_range.end_label_seq_id'][i])
        if chain1 != chain2:
            print('%s %s %s' %(chain1, chain2, pdb))
            stop
        ## the original appended the whole range once per residue;
        ## a set gives the same membership answers without the duplicates
        d_sheets.setdefault(chain1, set()).update(range(res_no1,res_no2+1,))

    return d_sheets


def parse_dihedrals():

    """Collect backbone phi/psi dihedrals from high resolution structures.

    All mmCIF files in the directories of /data/mmCIF between sys.argv[-2]
    and sys.argv[-1] (inclusive, by string comparison) are parsed. Entries
    without a single known resolution of at most 2 are skipped. For every
    residue of a polypeptide(L) entity with observed backbone atoms, phi
    and psi are appended to the lists of the matching residue and
    secondary structure classes. Counts of cis peptide bonds preceding
    proline are written to count.txt.

    Returns d_phipsi_res, d_phipsi_ss; both map a class name to a list of
    [phi, psi] pairs.
    """

    import sys

    path = '/data/mmCIF'

    l_res_names = [
        'ALA','CYS','ASP','GLU','PHE','GLY','HIS','ILE','LYS','LEU',
        'MET','ASN','PRO','GLN','ARG','SER','THR','VAL','TRP','TYR',
        ]

    d_phipsi_res = {}
    for k in l_res_names+[
        'prePRO','prePRO_notGLY','prePRO_GLY',
        'cisPro','transPro',
        'all_notgly_notpro_notprepro',
        ]:
        d_phipsi_res[k] = []

    d_phipsi_ss = {
        'sheet':[], ## _struct_sheet_order.sense
        ## _struct_conf.pdbx_PDB_helix_class
        'helix_alpha':[], ## i+4 # 1
        'helix_pi':[], ## i+5 # 3
        'helix_310':[], ## i+3 # 5
        'Turn':[], ## i+?
        ## this key was commented out although it is appended to below,
        ## which raised a KeyError
        'turns_notgly_notpro_notprepro':[],
        }

    d_counts = {
        'cisPro_helix':0, 'cisPro_sheet':0, 'cisPro_turn':0, 'cisPro_random':0,
        }
    for res_name in l_res_names:
        d_counts['cisPro%s' %(res_name)] = 0

    l_dn = os.listdir(path)
    l_dn.sort()
    for dn in l_dn:

        ## skip instead of list.remove, which raises if the file is absent
        if dn == 'mmCIF.py':
            continue
        if dn < sys.argv[-2]:
            continue
        if dn > sys.argv[-1]:
            continue

        print('* %s' %(dn))

        l_fn = os.listdir('%s/%s' %(path,dn,))
        l_fn.sort()
        for fn in l_fn:

            pdb = fn[:4]
            print(pdb)

            d_mmCIF = parse_mmCIF.main(
                pdb,
                d_breaks={'_exptl.method': ['SOLUTION NMR']},
                l_data_categories=[
                    '_exptl',
                    '_refine',
                    '_struct_conf', ## HELIX
                    '_struct_sheet_range', ## SHEET
                    '_entity',
                    '_entity_poly',
                    '_entity_poly_seq',
                    '_atom_site',
                    ],
                )

            ## skip NMR models
            if ''.join(d_mmCIF['_exptl.method']) in [
                'SOLUTION NMR','POWDER DIFFRACTION','ELECTRON MICROSCOPY',
                ]:
                continue

            if '_refine.ls_d_res_high' not in d_mmCIF:
                print(d_mmCIF['_exptl.method'])
                continue
            ## skip if multiple resolutions
            if len(d_mmCIF['_refine.ls_d_res_high']) > 1:
                continue
            ## skip if no resolution
            if ''.join(d_mmCIF['_refine.ls_d_res_high']) == '?':
                continue
            ## skip low resolution structures
            if float(''.join(d_mmCIF['_refine.ls_d_res_high'])) > 2:
                continue

            if 'polymer' not in d_mmCIF['_entity.type']:
                continue
            if '_entity_poly.type' not in d_mmCIF: ## e.g. 1hhu
                continue
            if d_mmCIF['_entity_poly.type'] == [
                'polydeoxyribonucleotide/polyribonucleotide hybrid'
                ]:
                continue
            if d_mmCIF['_entity_poly.type'] == ['polydeoxyribonucleotide']:
                continue

            d_sequence = _dihedrals_sequences(d_mmCIF)

            l_entities_poly = _dihedrals_polypeptide_entities(d_mmCIF, pdb)
            ## skip if no polypeptide chains
            if l_entities_poly == []:
                continue

            d_coords = _dihedrals_coords(d_mmCIF, l_entities_poly, pdb)
            d_helices = _dihedrals_helices(d_mmCIF, pdb)
            d_sheets = _dihedrals_sheets(d_mmCIF, pdb)

            for entity_id in l_entities_poly:
                ## an entity without model 1 atoms has no chains
                d_chains = d_coords.get(entity_id, {})
                for chain in d_chains.keys():

                    l_seq = d_sequence[entity_id]
                    ## skip if short peptide (e.g. 13gs)
                    if len(l_seq) <= 3:
                        continue

                    d_residues = d_chains[chain]

                    for i_res_no in range(1,len(l_seq)-1):

                        res_no_prev = int(l_seq[i_res_no-1]['res_no'])
                        res_no = int(l_seq[i_res_no]['res_no'])
                        res_no_next = int(l_seq[i_res_no+1]['res_no'])

                        res_name = l_seq[i_res_no]['res_name']
                        if res_name == 'MSE':
                            res_name = 'MET'
                        res_name_next = l_seq[i_res_no+1]['res_name']

                        ## not a standard residue
                        if res_name not in d_phipsi_res:
                            continue

                        ## residue not observed
                        if res_no_prev not in d_residues:
                            continue
                        if res_no not in d_residues:
                            continue
                        if res_no_next not in d_residues:
                            continue

                        d_atoms_prev = d_residues[res_no_prev]
                        d_atoms = d_residues[res_no]
                        d_atoms_next = d_residues[res_no_next]

                        ## atom not observed
                        if 'C' not in d_atoms_prev:
                            continue
                        if 'N' not in d_atoms or 'CA' not in d_atoms or 'C' not in d_atoms:
                            continue
                        if 'N' not in d_atoms_next:
                            continue

                        C_prev = d_atoms_prev['C']
                        N = d_atoms['N']
                        CA = d_atoms['CA']
                        C = d_atoms['C']
                        N_next = d_atoms_next['N']

                        phi = calc_dihedral(C_prev,N,CA,C,)
                        psi = calc_dihedral(N,CA,C,N_next,)

                        ## conformation of the preceding peptide bond
                        if 'CA' in d_atoms_prev:
                            omega = calc_dihedral(d_atoms_prev['CA'],C_prev,N,CA,)
                        else:
                            omega = None
                        omega = _dihedrals_classify_omega(omega)

                        helix_class = d_helices.get(chain, {}).get(res_no)
                        bool_helix = helix_class is not None
                        bool_sheet = res_no in d_sheets.get(chain, ())

                        if res_name_next == 'PRO':
                            d_phipsi_res['prePRO'] += [[phi,psi,]]
                            if res_name != 'GLY':
                                d_phipsi_res['prePRO_notGLY'] += [[phi,psi,]]
                            else:
                                d_phipsi_res['prePRO_GLY'] += [[phi,psi,]]
                        else:
                            d_phipsi_res[res_name] += [[phi,psi,]]
                            if res_name not in ['GLY','PRO',]:
                                d_phipsi_res['all_notgly_notpro_notprepro'] += [[phi,psi,]]
                            elif res_name == 'PRO' and omega:
                                d_phipsi_res['%sPro' %(omega)] += [[phi,psi,]]
                                if omega == 'cis':
                                    ## NOTE(review): res_name is always PRO
                                    ## here, so only cisProPRO is incremented;
                                    ## the preceding residue may be intended
                                    d_counts['cisPro%s' %(res_name)] += 1
                                    if bool_helix == True:
                                        if helix_class == 1:
                                            d_counts['cisPro_helix'] += 1
                                        elif helix_class == 99:
                                            ## count one residue
                                            ## (the original added 99)
                                            d_counts['cisPro_turn'] += 1
                                    elif bool_sheet == True:
                                        d_counts['cisPro_sheet'] += 1
                                    else:
                                        d_counts['cisPro_random'] += 1

                        if bool_helix == True:
                            if helix_class == 1:
                                d_phipsi_ss['helix_alpha'] += [[phi,psi,]]
                            elif helix_class == 3:
                                d_phipsi_ss['helix_pi'] += [[phi,psi,]]
                            elif helix_class == 5:
                                d_phipsi_ss['helix_310'] += [[phi,psi,]]
                            elif helix_class == 99:
                                d_phipsi_ss['Turn'] += [[phi,psi,]]
                                if (
                                    res_name_next != 'PRO'
                                    and
                                    res_name not in ['GLY','PRO',]
                                    ):
                                    d_phipsi_ss['turns_notgly_notpro_notprepro'] += [[phi,psi,]]
                        if bool_sheet == True:
                            d_phipsi_ss['sheet'] += [[phi,psi,]]

    ## sorted for a reproducible file
    l = ['%s %s\n' %(k,d_counts[k],) for k in sorted(d_counts.keys())]
    with open('count.txt','w') as fd:
        fd.writelines(l)

    return d_phipsi_res, d_phipsi_ss
def main():
    """Append the mean backbone B-factor of monomeric X-ray entries to a data file.

    PDB IDs already listed in ``Biso_v_resolution.gnuplotdata`` are skipped.
    Sub-directories of the module-level ``path`` whose names lie between
    ``sys.argv[-2]`` and ``sys.argv[-1]`` (string comparison, inclusive) are
    walked.  For every accepted entry one line
    ``Biso_average resolution pdb year`` is appended to
    ``Biso_v_resolution.gnuplotdata``.  Entries in which a single B-factor
    value occurs more than 20 times among the selected atoms are logged to
    ``remediation_Biso_duplicates.txt`` and skipped.  ``plot()`` is called
    after the scan.

    NOTE: the bare name ``stop`` is not defined in the visible code; it is
    kept unchanged and presumably aborts the run (NameError) when unexpected
    data is met.
    """

    ## PDB IDs processed by a previous run (set: constant-time lookup)
    set_pdbs_done = set()
    with open('Biso_v_resolution.gnuplotdata', 'r') as fd:
        lines = fd.readlines()
    for line in lines:
        l = line.split()
        resolution = float(l[1])
        Biso = float(l[0])
        ## echo entries with a suspiciously low B-factor for their resolution
        if resolution > 3.5 and Biso < 10:
            print(line)
        if resolution > 2.5 and Biso < 10:
            print(line)
        if resolution > 2.0 and Biso < 5:
            print(line)
##        if resolution > 1.5 and Biso < 5:
##            print line
        set_pdbs_done.add(l[2])

    ## NOTE(review): this value is never updated inside the loop, so the
    ## "same average as previous entry" check below only compares with 0
    Biso_average_prev = 0

    ## residue names whose backbone atoms are averaged
    l_comp_ids = [
        'MSE', 'ALA', 'CYS', 'ASP', 'GLU', 'PHE', 'GLY', 'HIS', 'ILE', 'LYS',
        'LEU', 'MET', 'ASN', 'PRO', 'GLN', 'ARG', 'SER', 'THR', 'VAL', 'TRP',
        'TYR',
        ]

    l_dn = os.listdir(path)
    l_dn.sort()
    for dn in l_dn:
        if dn < sys.argv[-2]:
            continue
        if dn > sys.argv[-1]:
            continue
        if not os.path.isdir('%s/%s' % (path, dn)):
            continue

        l_fns = os.listdir('%s/%s' % (path, dn))
        l_fns.sort()
        for fn in l_fns:
            if fn[-3:] == '.gz':
                continue
            pdb = fn[0:4]
            if pdb in set_pdbs_done:
                continue
            if pdb in [
                '3bfn',  ## PISA left out chains from biological unit
                '2jjg', '1qjb',  ## _pdbx_struct_assembly missing
                ]:
                continue

            ##
            ## parse header
            ##
            d_mmCIF = parse_mmCIF.main(
                pdb,
                ## parse selected data categories
                l_data_categories = [
                    '_pdbx_struct_assembly',
                    '_entity_poly',
                    '_citation',
                    '_pdbx_database_related',
                    ],
                d_breaks_negation = {
                    ## break if not x-ray diffraction
                    '_exptl.method': 'X-RAY DIFFRACTION',
                    },
                )

            if d_mmCIF['_exptl.method'] != ['X-RAY DIFFRACTION']:
                continue
            if not 'polypeptide(L)' in d_mmCIF['_entity_poly.type']:
                continue
            if '_pdbx_database_related.content_type' in d_mmCIF.keys():
                if 'split' in d_mmCIF['_pdbx_database_related.content_type']:
                    continue
            ## only a missing item is expected here, so do not swallow
            ## unrelated errors with a bare except
            try:
                if d_mmCIF['_pdbx_struct_assembly.oligomeric_details'] != ['monomeric']:
                    continue
            except KeyError:
                print(pdb)
                stop
            if not '_citation.id' in d_mmCIF.keys():
                continue

            ##
            ## parse coordinate section
            ##
            d_mmCIF = parse_mmCIF.main(
                pdb,
                ## parse selected data categories
                l_data_categories = [
                    '_database_PDB_rev',
                    '_refine', '_refine_hist',
                    '_atom_site',
                    '_software',
                    '_entity', '_entity_poly',
                    '_pdbx_struct_assembly',
                    '_pdbx_database_status',
                    ],
                d_breaks_negation = {
                    ## break if not x-ray diffraction
                    '_exptl.method': 'X-RAY DIFFRACTION',
                    },
                )

            if d_mmCIF['_exptl.method'] != ['X-RAY DIFFRACTION']:
                continue
            if not 'polypeptide(L)' in d_mmCIF['_entity_poly.type']:
                continue
            if d_mmCIF['_pdbx_struct_assembly.oligomeric_details'] != ['monomeric']:
                continue

            resolution = float(''.join(d_mmCIF['_refine.ls_d_res_high']))

            ## anything but a single polymer entity with a single chain is unexpected
            if (
                int(d_mmCIF['_entity.pdbx_number_of_molecules'][0]) != 1
                or
                len(d_mmCIF['_entity_poly.pdbx_strand_id']) > 1
                or
                len(''.join(d_mmCIF['_entity_poly.pdbx_strand_id']).split(',')) > 1
                or
                len(d_mmCIF['_entity_poly.entity_id']) > 1
                ):
                print(pdb)
                print(d_mmCIF['_entity.pdbx_number_of_molecules'])
                print(d_mmCIF['_entity_poly.pdbx_strand_id'])
                stop

            ## at most one polymer entity is left after the check above;
            ## the ID stays a string because it is compared with the
            ## _atom_site.label_entity_id strings below
            entity_poly_id = d_mmCIF['_entity_poly.entity_id'][-1]

            ## collect B-factors of fully occupied backbone atoms (N, CA, C)
            l_Biso = []
            for i_atom_site in range(len(d_mmCIF['_atom_site.id'])):
                occupancy = float(d_mmCIF['_atom_site.occupancy'][i_atom_site])
                if occupancy != 1:
                    continue
                alt_id = d_mmCIF['_atom_site.label_alt_id'][i_atom_site]
                if alt_id != '.':
                    continue
                entity_id = d_mmCIF['_atom_site.label_entity_id'][i_atom_site]
                if entity_id != entity_poly_id:
                    continue
                comp_id = d_mmCIF['_atom_site.label_comp_id'][i_atom_site]
                if not comp_id in l_comp_ids:
                    continue
                type_symbol = d_mmCIF['_atom_site.type_symbol'][i_atom_site]
                if type_symbol == 'H':
                    continue
                atom_id = d_mmCIF['_atom_site.label_atom_id'][i_atom_site]
                if not atom_id in ['N', 'CA', 'C',]:
                    continue
                Biso = float(d_mmCIF['_atom_site.B_iso_or_equiv'][i_atom_site])
                l_Biso += [Biso]

            year = int(d_mmCIF['_database_PDB_rev.date'][0][:4])
            site = ''.join(d_mmCIF['_pdbx_database_status.process_site'])

            if len(l_Biso) == 0:
                continue

##            if l_Biso == len(l_Biso)*[l_Biso[0]]:
##                print pdb, year, l_Biso[0:3]
##                if year >= 2010:
##                    stop
##                continue

            Biso_average = sum(l_Biso) / len(l_Biso)

            ## the same B-factor repeated many times: log and skip the entry
            bool_continue = False
            for Biso in set(l_Biso):
                count = l_Biso.count(Biso)
                if count > 20:
                    if '_software.name' in d_mmCIF.keys():
                        print('%s %s %s %s %s' % (
                            pdb, Biso_average, Biso, count,
                            d_mmCIF['_software.name'],
                            ))
                        s = '%s %6.2f %4i %6.2f %4i %s %s\n' % (
                            pdb, Biso, count, Biso_average, year, site,
                            str(d_mmCIF['_software.name']),
                            )
                    else:
                        print('%s %s %s %s' % (pdb, Biso_average, Biso, count))
                        s = '%s %6.2f %4i %6.2f %4i %s\n' % (
                            pdb, Biso, count, Biso_average, year, site,
                            )
                    bool_continue = True
                    with open('remediation_Biso_duplicates.txt', 'a') as fd:
                        fd.write(s)
                    break
            if bool_continue == True:
                continue

##            if Biso_average in [2,3,4,5,6,7,8,9,99,90,50,20,25,1,100,10,0]:
            ## an average that is an exact integer between 0 and 100 is suspicious
            if Biso_average in range(0, 100 + 1):
                print(l_Biso)
                print(Biso_average)
                print(pdb)
                print(year)
                stop

            if '_refine.pdbx_TLS_residual_ADP_flag' in d_mmCIF.keys():
                s_flag = ''.join(d_mmCIF['_refine.pdbx_TLS_residual_ADP_flag'])
                if s_flag in ['UNVERIFIED', 'LIKELY RESIDUAL',]:
                    continue
                elif s_flag in ['?',]:
                    pass
                else:
                    print(d_mmCIF['_refine.pdbx_TLS_residual_ADP_flag'])
                    print('%s %s' % (pdb, Biso_average))
                    stop

            if round(Biso_average, 4) == round(Biso_average_prev, 4):
                print('%s %s %s' % (pdb, Biso_average, Biso_average_prev))
                stop

            print('%s %s %s' % (pdb, round(Biso_average, 2), resolution))

            with open('Biso_v_resolution.gnuplotdata', 'a') as fd:
                fd.write('%s %s %s %s\n' % (Biso_average, resolution, pdb, year,))

    plot()
import matthews_coefficient, parse_mmCIF for pdb in [ '2hhb', '1hho', '1hv4', '3hl9', '3hlb', '3hlc', '3hld', '3hle', '3hlf', '3hlg', ]: d_mmCIF = parse_mmCIF.main(pdb) a = float(d_mmCIF['_cell.length_a'][0]) b = float(d_mmCIF['_cell.length_b'][0]) c = float(d_mmCIF['_cell.length_c'][0]) alpha = float(d_mmCIF['_cell.angle_alpha'][0]) beta = float(d_mmCIF['_cell.angle_beta'][0]) gamma = float(d_mmCIF['_cell.angle_gamma'][0]) Z = int(d_mmCIF['_cell.Z_PDB'][0]) ## number of polymers in unit cell mw = 0 for i in range(len(d_mmCIF['_entity.id'])): if d_mmCIF['_entity.type'][i] == 'polymer': mw += float(d_mmCIF['_entity.formula_weight'][i]) MV = matthews_coefficient.main( a, b,
dn, )) l_fn.sort() for fn in l_fn: pdb = fn[:4] if fn[-3:] == '.gz': continue ######## if pdb in ['2fl9','3gau','3gav','3gaw',]: ## tmp!!! ######## continue ## print pdb fd = open('%s/%s/%s' % (path, dn, fn), 'r') lines = fd.readlines() fd.close() d = parse_mmCIF.main( pdb, lines, l_data_categories=l_data_categories, d_breaks=d_breaks, ) if d_exclude_subset: bool_continue = False for item_exclude, l_values_exclude in d_exclude_subset.items(): if not item_exclude in d.keys(): bool_continue = True fd = open('%s/remediation_%s.txt' % ( path, item_exclude, ), 'a') fd.write('%s\n' % (pdb)) fd.close() continue
def parse_cifs(
    l_pdbs, ref_seq, l_db_codes, n_mutations_max,
    resolution_min,
    bool_multiple_entities = False,
    ):
    """Parse mmCIF entries and sort them into wild types and mutants.

    Parameters
    ----------
    l_pdbs : identifiers whose first four characters are the PDB ID and whose
        last character is the chain ID (the code compares e.g. ``'1KS3_A'``).
    ref_seq : reference sequence as a list of residue names, or a false value
        to skip the length check and the mutation listing.
    l_db_codes : accepted values of ``_struct_ref.db_code``.
    n_mutations_max : entries with more differences from ``ref_seq`` than this
        are skipped; ``None`` disables the limit.
    resolution_min : entries whose ``_refine.ls_d_res_high`` is greater than
        this value are skipped; a false value disables the check.
    bool_multiple_entities : accept entries with more than one polymer entity.

    Returns
    -------
    ``(d_mmCIF_main, l_wts, d_mutants, l_wts_cysfree)``

    NOTE: the bare name ``stop`` is not defined in the visible code; it is
    kept unchanged and presumably aborts the run (NameError) on unexpected
    data.
    """

    print('parse cifs')

    n_mutants = 0
    l_wts = []
    l_wts_cysfree = []
    d_mutants = {}
    d_mmCIF_main = {}

    for pdb in l_pdbs:

        ## NOTE(review): the lookup is lower case but entries are stored
        ## under pdb[:4] unchanged, so upper-case IDs are never matched here
        if pdb[:4].lower() in d_mmCIF_main.keys():
            continue

        d_mmCIF = parse_mmCIF.main(pdb[:4].lower(),)

        ## not an x-ray structure
        if d_mmCIF['_exptl.method'] != ['X-RAY DIFFRACTION']:
            print('%s %s' % (pdb, d_mmCIF['_exptl.method']))
            continue

        ## more than one type of polymer present
        n_entities = len(d_mmCIF['_entity_poly.entity_id'])
        if bool_multiple_entities == False:
            if n_entities > 1:
                print('%s entities %s' % (pdb, n_entities))  #, d_mmCIF['_struct.title']
                continue

        ## low resolution
        if d_mmCIF['_refine.ls_d_res_high'] != d_mmCIF['_refine_hist.d_res_high']:
            print(d_mmCIF['_refine.ls_d_res_high'])
            print(d_mmCIF['_refine_hist.d_res_high'])
            stop
        if resolution_min:
##            if float(d_mmCIF['_refine.ls_d_res_high'][0]) >= resolution_min:
            if float(d_mmCIF['_refine.ls_d_res_high'][0]) > resolution_min:
                print('%s resolution %s' % (pdb, d_mmCIF['_refine.ls_d_res_high']))
                continue

        ## get entity ID from chain ID
        for i_entity in range(len(d_mmCIF['_entity_poly.entity_id'])):
            entity_id = d_mmCIF['_entity_poly.entity_id'][i_entity]
            s_chain_ids = d_mmCIF['_entity_poly.pdbx_strand_id'][i_entity]
            if pdb[-1] in s_chain_ids:
                break
        if pdb[-1] not in s_chain_ids:
            print(pdb)
            print(s_chain_ids)
            stop

        ## get sequence from entity ID
        seq = []
        for i in range(len(d_mmCIF['_entity_poly_seq.entity_id'])):
            if d_mmCIF['_entity_poly_seq.entity_id'][i] == entity_id:
                mon_id = d_mmCIF['_entity_poly_seq.mon_id'][i]
                ## hard-coded correction for one entry
                if pdb[:4] == '1RCM' and i == 126:
                    if mon_id != 'CYS':
                        stop
                    mon_id = 'CCS'
                seq += [mon_id]

        ## wrong chain length
        if ref_seq:
            if len(seq) != len(ref_seq):
                if ''.join(ref_seq) in ''.join(seq):
                    print(ref_seq)
                    print(seq)
                    stop
                ## unobserved atoms not in seqres
                elif ''.join(seq) in ''.join(ref_seq):
                    pass
                ## last two residues unobserved
                elif len(seq) == 162 and pdb in [
                    '1KS3_A', '1KW5_A', '1KW7_A', '1KY0_A', '1KY1_A', '1L0J_A',
                    '1LOK_A', '1LPY_A', '1LW9_A', '1LWG_A', '1LWK_A',
                    ]:
                    pass
                ## last two residues unobserved
                elif len(seq) == 162 and seq[-1] == 'LYS':
                    pass
                else:
                    print('%s seqlen %s' % (pdb, len(seq)))
                    continue

        ## not from Gallus gallus
        ## check not necessary, because sequence checked against ref seq
        entity_id = d_mmCIF['_entity_poly.entity_id'][i_entity]
        db_code = d_mmCIF['_struct_ref.db_code'][
            d_mmCIF['_struct_ref.entity_id'].index(entity_id)
            ]
        if db_code not in l_db_codes:
            print('%s uniprot %s' % (pdb, db_code))
            continue

        ## list differences from the reference sequence.
        ## Both names are bound on every pass: previously they were only set
        ## when n_mutations_max was given, so the code below either failed
        ## with a NameError or reused the values of the previous structure.
        l_mutations = []
        startmodel = None
        if ref_seq:
            for i_seq in range(len(seq)):
                res_id_mmCIF = seq[i_seq]
                res_id_uniprot = ref_seq[i_seq]
                if res_id_mmCIF != res_id_uniprot:
                    l_mutations += ['%3s%i%3s' % (res_id_uniprot, i_seq + 1, res_id_mmCIF,)]

        ## more than the allowed number of mutations?
##        if len(l_mutations) == 1:
        if n_mutations_max is not None and len(l_mutations) > n_mutations_max:
            print('%s muts %s' % (pdb, len(l_mutations)))
            continue

        if len(l_mutations) > 0:
            n_mutants += 1
            startmodel = parse_mmCIF_item(d_mmCIF, '_refine.pdbx_starting_model', pdb,)

        ## append to lists and dictionaries
        d_mmCIF_main[pdb[:4]] = d_mmCIF
        if len(l_mutations) > 0:
            if l_mutations == ['CYS54THR', 'CYS97ALA']:
                l_wts_cysfree += [pdb]
            d_mutants[pdb] = {'mutations': l_mutations, 'startmodel': startmodel}
        else:
            l_wts += [pdb]

##    print 'd_mutants', d_mutants

    return d_mmCIF_main, l_wts, d_mutants, l_wts_cysfree
'1u3fA', '1agyA', '1zioA', '1pa9A', '2tpsA', '2plcA', '1qk2A', '1j53A', '1m21A', ] cutoff = 10 for pdb in l_pdbs: pdb = pdb[:4] d = parse_mmCIF.main(pdb, ) d_coords, l_coords = mmCIF2coords.main(pdb, d, query_chain=pdb[4:]) matrix_hessian = NMA.hessian_calculation(l_coords, cutoff, verbose=False) eigenvectors, eigenvalues = NMA.diagonalize_hessian(matrix_hessian, verbose=False) visualization.vmd_arrows(pdb, l_coords, eigenvectors) print pdb stop
'1czfA', '1thgA', '1booA', '1iu4A', '1bqcA', '206lA', '1cdeA', '1snzA', '1gq8A', '1aqlA', '1ps1A', '1s95A', '1pylA', '1ra2A', '1b6bA', '1pntA', '1e1aA', '2f9rA', '1v04A', '2nlrA', '1n29A', '1pbgA', '5cpaA', '1agmA', '1byaA', '1r76A', '1u5uA', '1vidA', '1h4gA', '1akdA', '1fy2A', '1xqdA', '1d6oA', '1qv0A', '1qjeA', '1fvaA', '1bp2A', '1ah7A', '2pthA', '2engA', '2acyA', '1qazA', '2a0nA', '1dl2A', '1gp5A', '1onrA', '1cwyA', '1pudA', '1bs9A', '1dinA', '1xyzA', '1bwlA', '1eugA', '1idjA', '1g24A', '1oygA', '1hzfA', '9papA', '1eb6A', '1ghsA', '1rbnA', '1bixA', '1bs4A', '1celA', '1hkaA', '1b02A', '1qibA', '1u3fA', '1agyA', '1zioA', '1pa9A', '2tpsA', '2plcA', '1qk2A', '1j53A', '1m21A', ] cutoff = 10 for pdb in l_pdbs: pdb = pdb[:4] d = parse_mmCIF.main(pdb,) d_coords, l_coords = mmCIF2coords.main(pdb, d, query_chain = pdb[4:]) matrix_hessian = NMA.hessian_calculation(l_coords, cutoff, verbose = False) eigenvectors, eigenvalues = NMA.diagonalize_hessian(matrix_hessian, verbose = False) visualization.vmd_arrows(pdb, l_coords, eigenvectors) print pdb stop
def unobs_nonterminal_atoms_alpha():
    """List entries that have an unobserved polymer CA atom (work in progress).

    Reads candidate PDB IDs from ``list_pdbx_unobs_or_zero_occ_atoms.txt`` and
    IDs to skip from ``list_pdbx_unobs_residues__NONTERMINAL.txt`` (both under
    the module-level ``path``), parses each remaining entry and collects those
    with a ``_pdbx_unobs_or_zero_occ_atoms`` record that is a CA atom, has
    polymer_flag 'Y' and occupancy_flag '1'.

    As written the function is in a debugging state:
    * only the entry '3e3d' passes the filter at the top of the loop;
    * after the loop the result is printed and the bare name ``stop`` is
      evaluated (not defined in the visible code, presumably a deliberate
      NameError halt), so the file writes below it are not reached;
    * the ``for x in []`` section never executes; it appears to be an older
      terminal/non-terminal algorithm kept for reference.
    """

    ## this method is not entirely correct... e.g. 1kwr...

    category = fn = '_pdbx_unobs_or_zero_occ_atoms'
    fd = open('%s/list%s.txt' %(path,fn))
    s = fd.read()
    fd.close()
    l_pdbs_include = s.split()

    ## if a whole residue is missing, then all of its atoms are also missing
    fd = open('%s/list_pdbx_unobs_residues__NONTERMINAL.txt' %(path))
    s = fd.read()
    fd.close()
    l_pdbs_exclude = s.split()

    l_data_categories = [
        '_pdbx_poly_seq_scheme',
        '_pdbx_unobs_or_zero_occ_atoms',
        '_entity_poly',
        '_struct', ## .pdbx_model_type_details
        '_exptl',
        ]
    d_breaks = {'_exptl.method':['SOLUTION NMR','SOLID-STATE NMR']}

    fn_out = 'list_pdbx_unobs_atoms__CA.txt'

    l_pdbs_out = []
    for pdb in l_pdbs_include:

##        if pdb[1:3] < 'fe':
##            continue
##        if pdb == '2kzt': ## takes too long...
##            continue
        ## debugging filter: every entry except 3e3d is skipped
        if pdb != '3e3d':
            continue

        if pdb in l_pdbs_exclude:
            continue

        print pdb

        d = parse_mmCIF.main(pdb,l_data_categories=l_data_categories,d_breaks=d_breaks,)

        ## something has to be missing in the first place for it to be terminal/nonterminal
        if not category in d.keys():
            continue

        ## it has to be a polymer in the first place for anything to be terminal/nonterminal
        if not '_pdbx_poly_seq_scheme' in d.keys():
            continue

        ## don't deal with NMR models for now... (too many unobs records when hydrogen...)
        if d['_exptl.method'] != ['X-RAY DIFFRACTION']:
            continue

        if '_struct.pdbx_model_type_details' in d.keys():
            if d['_struct.pdbx_model_type_details'] in [
                ['?'], ['minimized average'], ['MINIMIZED AVERAGE'],
                ]:
                pass
            ## if residues are not missing, and model is CA only, then no CA are missing!!!
            elif 'CA ATOMS ONLY' in d['_struct.pdbx_model_type_details'][0]:
                continue
            else:
                ## unexpected model type: print it and halt
                print d['_struct.pdbx_model_type_details']
                stop

##        if not 'CA' in d['_pdbx_unobs_or_zero_occ_atoms.auth_atom_id']:
##            continue

        bool_append = False
        ## NOTE(review): the length is taken from the bare category key, whereas
        ## the columns are read from '<category>.<item>' keys -- confirm that
        ## parse_mmCIF provides this key with the row count
        for i_unobs in range(len(d['_pdbx_unobs_or_zero_occ_atoms'])):
            if (
                d['_pdbx_unobs_or_zero_occ_atoms.auth_atom_id'][i_unobs] == 'CA'
                and
                d['_pdbx_unobs_or_zero_occ_atoms.polymer_flag'][i_unobs] == 'Y'
                and
                ## unobs (1), zero_occ (0)
                d['_pdbx_unobs_or_zero_occ_atoms.occupancy_flag'][i_unobs] == '1'
                ):
                l_pdbs_out += [pdb]
                print '***', pdb
                ## one hit is enough for this entry
                break

        continue

    print l_pdbs_out
    ## halts here; the write below is not reached while this line is present
    stop

    fd = open('%s/%s' %(path,fn_out,),'w')
    fd.write('\n'.join(l_pdbs_out))
    fd.close()

    ## never executed (empty iterable); the names pdb and d used inside would
    ## be whatever the loop above left behind
    for x in []:

        l_indexes_unobs = []
        bool_append = False
        ## chain IDs of all polymer residues joined into one string, so that
        ## index/rindex give the first/last row of a chain
        ## (assumes one-character chain IDs -- TODO confirm)
        s = ''.join(d['_pdbx_poly_seq_scheme.pdb_strand_id'])
        for i_unobs in range(len(d['_pdbx_unobs_or_zero_occ_atoms'])):

            ## skip if not alpha carbon
            if d['_pdbx_unobs_or_zero_occ_atoms.auth_atom_id'][i_unobs] != 'CA':
                continue

            ## skip if zero occupancy
            if d['_pdbx_unobs_or_zero_occ_atoms.occupancy_flag'][i_unobs] == '0':
                continue

            if 'HA' in d['_pdbx_unobs_or_zero_occ_atoms.auth_atom_id']:
                print pdb
                print len(d['_pdbx_unobs_or_zero_occ_atoms.auth_seq_id'])
                stop2

            if d['_pdbx_unobs_or_zero_occ_atoms.polymer_flag'].count('Y') > 800:
                print pdb
                print len(d['_pdbx_unobs_or_zero_occ_atoms.auth_seq_id'])
                stop1

            asymID_unobs = d['_pdbx_unobs_or_zero_occ_atoms.auth_asym_id'][i_unobs]
            seqID_unobs = d['_pdbx_unobs_or_zero_occ_atoms.auth_seq_id'][i_unobs]

            ## rows [index1, index2) of _pdbx_poly_seq_scheme belong to this chain
            index1 = s.index(asymID_unobs)
            index2 = s.rindex(asymID_unobs)+1

            for i_poly in range(index1,index2,):

                asymID_poly = d['_pdbx_poly_seq_scheme.pdb_strand_id'][i_poly]
                seqID_poly = d['_pdbx_poly_seq_scheme.auth_seq_num'][i_poly]

                if seqID_poly == seqID_unobs:

                    ## the insertion codes have to agree as well ('.' and '?' both mean none)
                    if d['_pdbx_poly_seq_scheme.pdb_ins_code'][i_poly] == '.' and d['_pdbx_unobs_or_zero_occ_atoms.PDB_ins_code'][i_unobs] == '?':
                        pass
                    elif d['_pdbx_poly_seq_scheme.pdb_ins_code'][i_poly] == d['_pdbx_unobs_or_zero_occ_atoms.PDB_ins_code'][i_unobs]:
                        pass
                    elif d['_pdbx_unobs_or_zero_occ_atoms.PDB_ins_code'][i_unobs] == '?' and d['_pdbx_poly_seq_scheme.pdb_ins_code'][i_poly] != '.':
                        continue
                    elif not d['_pdbx_unobs_or_zero_occ_atoms.PDB_ins_code'][i_unobs] in d['_pdbx_poly_seq_scheme.pdb_ins_code']:
                        print d['_pdbx_poly_seq_scheme.pdb_ins_code'][i_poly]
                        ## NOTE(review): insCode_unobs is not assigned anywhere in this function
                        print insCode_unobs
                        print pdb
                        print seqID_unobs, asymID_unobs
                        stop
                    else:
                        continue

                    if asymID_unobs != asymID_poly:
                        stop_add_with_check_of_identiiical_seqID

                    ## tmp!!! check!!!
                    if d['_pdbx_unobs_or_zero_occ_atoms.auth_comp_id'][i_unobs] != d['_pdbx_poly_seq_scheme.pdb_mon_id'][i_poly]:
                        print pdb
                        stop
##                    ## last residue
##                    if index2-i_poly == 0:
##                        pass ## should append...
##                    ## first residue
##                    elif i_poly-index1 == 0:
##                        pass ## should append...
####                    elif i_poly-index1 > 1 and bool_unobs_prev == False:
####                        bool_append = True
                    ## previous residues are missing
                    elif d['_pdbx_poly_seq_scheme.auth_seq_num'][index1:i_poly] == (i_poly-index1)*['?']:
                        bool_append = True
                    ## next residues are missing
                    elif d['_pdbx_poly_seq_scheme.auth_seq_num'][i_poly+1:index2] == (index2-i_poly-1)*['?']:
                        bool_append = True
                    ## zero occupancy residue prior to residue with unobserved atom(s)
                    elif pdb in ['7adh']:
                        bool_append = False
                        pass
                    else:
                        if len( set(range(index1,i_poly)) - set(l_indexes_unobs) ) == 0:
                            l_indexes_unobs += [i_poly]
                            stop1
                            pass
                        elif len( set(range(i_poly+1,index2)) - set(l_indexes_unobs) ) == 0:
                            l_indexes_unobs += [i_poly]
                            print pdb
                            print l_indexes_unobs
                            print i_poly, index1, index2
                            stop2
                            pass
                        else:
                            ## this method is not entirely correct... e.g. 1kwr...
                            if i_poly-index1 < 10 or index2-i_poly < 10:
                                print pdb
                                print i_poly-index1
##                                print index2-i_poly
                                print seqID_unobs
                                print pdb
                                print d['_pdbx_poly_seq_scheme.auth_seq_num'][index1:i_poly]
                                print d['_pdbx_poly_seq_scheme.auth_seq_num'][i_poly:index2]
                                print pdb
##                                stop
                            bool_append = True

                    break

            if bool_append == True:
                break

        if bool_append == True:
            print pdb
            l_pdbs_out += [pdb]
            continue

        if l_indexes_unobs != []:
            print l_indexes_unobs
            stop

    fd = open('%s/%s' %(path,fn_out,),'w')
    fd.write('\n'.join(l_pdbs_out))
    fd.close()

    return
def main():
    """Cache crystal-growth records of selected entries and log remediation candidates.

    * ``db_exptl_crystal_grow.txt`` / ``db_exptl_crystal_grow_comp.txt`` hold
      one ``pdb value`` line per entry; they are read at start, appended to
      after every parsed entry (so an interrupted run loses nothing) and
      rewritten without duplicate IDs at the end.
    * Only entries listed in ``remediation_exptl_crystal_grow.pH.txt`` are
      parsed; directories sorting before ``sys.argv[-1]`` are skipped.
    * Entries whose pH items are empty although the growth details mention a
      pH are appended to ``remediation_exptl_crystal_grow.pH.txt``.
    * Entries without ``_exptl_crystal_grow_comp.name`` whose growth details
      still contain candidate component names are appended to
      ``remediation_exptl_crystal_grow_comp.name.txt``.

    NOTE(review): the end-of-run rewrite assumes every cached value fits on
    one line; confirm that the growth details never contain line breaks.
    """

    l_fn_out = [
        '_exptl_crystal_grow',
        '_exptl_crystal_grow_comp',
        ]

    ## previously stored results: {category: {pdb: text after the PDB ID}}
    d = {}
    for fn_out in l_fn_out:
        with open('db%s.txt' % (fn_out), 'r') as fd:
            lines = fd.readlines()
        d[fn_out] = {}
        for line in lines:
            if line == '\n':
                continue
            pdb = line[:4]
            ## drop the line terminator; it is added again when the file is
            ## rewritten at the end
            d[fn_out][pdb] = line[5:].rstrip('\n')

    ## only entries named in the remediation file are (re)parsed
    with open('remediation_exptl_crystal_grow.pH.txt', 'r') as fd:
        lines = fd.readlines()
    set_pdbs = set(line[:4] for line in lines)

    path = '/media/WDMyBook1TB/2TB/mmCIF'

    l_dns = os.listdir(path)
    l_dns.sort()
    for i, dn in enumerate(l_dns):

        if dn < sys.argv[-1]:
            continue

        if not os.path.isdir('%s/%s' % (path, dn)):
            continue

        print('%s/%s %s' % (i + 1, len(l_dns), dn))
        l_fns = os.listdir('%s/%s' % (path, dn))
        l_fns.sort()
        for fn in l_fns:

            if fn[-3:] == '.gz':
                continue

            pdb = fn[0:4]

            ## continue if already in txt file from previous attempt to run loop
##            if pdb in d['_exptl_crystal_grow_comp'].keys():
##                continue
##            if pdb in d['_exptl_crystal_grow'].keys():
##                continue

##            print pdb

            if not pdb in set_pdbs:
                continue

            with open('%s/%s/%s' % (path, dn, fn), 'r') as fd:
                lines = fd.readlines()

            d_mmCIF = parse_mmCIF.main(
                pdb, lines,
                d_breaks_negation = {
                    ## break if not x-ray diffraction
                    '_exptl.method': 'X-RAY DIFFRACTION',
                    },
                l_data_categories_break = [
##                    '_atom',
                    '_diffrn',
                    ],
                l_data_categories = [
                    ## parse selected data categories
                    '_database_PDB_rev',
                    '_pdbx_database_status',
                    '_exptl',
                    '_exptl_crystal_grow',
                    '_exptl_crystal_grow_comp',
                    ],
                )

##            ## no polymers in structure?
##            if not '_entity_poly.entity_id' in d_mmCIF.keys():
##                continue

            if d_mmCIF['_exptl.method'] != ['X-RAY DIFFRACTION']:
                continue

##            print pdb

            year = int(d_mmCIF['_database_PDB_rev.date'][0][:4])
            process_site = ''.join(d_mmCIF['_pdbx_database_status.process_site'])

            ## neither growth details nor growth components: nothing to store
            if (
                not '_exptl_crystal_grow.pdbx_details' in d_mmCIF.keys()
                and
                not '_exptl_crystal_grow_comp.name' in d_mmCIF.keys()
##                ''.join(d_mmCIF['_exptl_crystal_grow.pdbx_details']).strip() == '?'
                ):
                if process_site != '?':
                    print('%s %s %s' % (pdb, year, process_site))
                continue

            if '_exptl_crystal_grow.pdbx_details' in d_mmCIF.keys():

                s_grow = ' '.join(d_mmCIF['_exptl_crystal_grow.pdbx_details']).strip()
                s_grow_upper = s_grow.upper()

                if (
                    ## pH not given
                    d_mmCIF['_exptl_crystal_grow.pH'] in [['?'], [''], ['.'],]
                    and
                    d_mmCIF['_exptl_crystal_grow.pdbx_pH_range'] in [['?'], [''], ['.'],]
                    and
                    ## but pH in growth details
                    (
                        ' PH ' in s_grow_upper
                        or
                        'PH=' in s_grow_upper
                        or
                        ',PH ' in s_grow_upper
                        )
                    ):
                    with open('remediation_exptl_crystal_grow.pH.txt', 'a') as fd:
                        fd.write('%s\t%s\t%s\t%4i\t%s\t%s\n' % (
                            pdb,
                            ''.join(d_mmCIF['_exptl_crystal_grow.pH']),
                            ''.join(d_mmCIF['_exptl_crystal_grow.pdbx_pH_range']),
                            year, process_site, s_grow,
                            ))

                if (
                    not '_exptl_crystal_grow_comp.name' in d_mmCIF.keys()
                    or
                    ''.join(d_mmCIF['_exptl_crystal_grow_comp.name']) in ['.', '', '?',]
                    ):

                    if '_exptl_crystal_grow_comp.name' in d_mmCIF.keys():
                        name = ''.join(d_mmCIF['_exptl_crystal_grow_comp.name'])
                    else:
                        name = 'N/A'

##                    ## remove end punctuation
##                    s = s_grow[:-1]+s_grow[-1].replace('.','')
                    ## split
##                    l_grow_punctuation = s_grow.upper().split('. ')
##                    l_grow = l_grow_comma = [s_grow.upper().split(',') for s in l_grow_punctuation]
                    l_grow = s_grow_upper.split(',')
                    ## strip space
                    l_grow = [x.strip() for x in l_grow]
                    ## remove empty elements (all of them: a leftover empty
                    ## string made the punctuation step below fail on x[-1])
                    l_grow = [x for x in l_grow if x]
                    ## remove end punctuation
                    l_grow = [x[:-1] + x[-1].replace('.', '') for x in l_grow]
                    ## remove selected words from elements of list
                    for x in [
                        'CRYSTALS OBTAINED BY CO-CRYSTALLIZATION AT ',
                        'PROTEIN SOLUTION (',
                        ]:
                        l_grow = [s.replace(x, '') for s in l_grow]
                    ## replace abbreviations
                    l_grow = [s.replace('MILLIMOLAR', 'MM') for s in l_grow]

                    ## remove selected words from list
                    l_remove = []
                    for x in [
                        'VAPOR DIFFUSION', 'VAPOUR DIFFUSION',
                        'HANGING DROP', 'SITTING DROP',
                        ]:
                        if x in l_grow:
                            l_remove += [x]
                    ## remove other selected words from list
                    for s in l_grow:
                        ## remove physical conditions
                        if s.startswith(('TEMPERATURE', 'PH=', 'PH ', 'AT PH ',)):
                            l_remove += [s]
                            continue
                        ## remove long words (sentences); as before, the scan
                        ## ends at the first long element
                        if len(s) > 50:
                            l_remove += [s]
                            break
                    for remove in l_remove:
                        l_grow.remove(remove)

                    if len(l_grow) > 0:
                        ## write to file
                        line = '%s\t%s\t%s\t%4i\t%s\t%s\n' % (
                            pdb, name, l_grow, year, process_site, s_grow,
                            )
                        with open('remediation_exptl_crystal_grow_comp.name.txt', 'a') as fd:
                            fd.write(line)

            else:

                s_grow = ''

            if '_exptl_crystal_grow_comp.name' in d_mmCIF.keys():
                l_grow_comp = d_mmCIF['_exptl_crystal_grow_comp.name']
            else:
                l_grow_comp = []

            ## append to txt file in case loop doesn't finish
            d_lines = {
                '_exptl_crystal_grow': '%s %s\n' % (pdb, s_grow,),
                '_exptl_crystal_grow_comp': '%s %s\n' % (pdb, l_grow_comp,),
                }
            for fn_out in l_fn_out:
                with open('db%s.txt' % (fn_out), 'a') as fd:
                    fd.write(d_lines[fn_out])

            ## append to dic for when loop finishes
            d['_exptl_crystal_grow'][pdb] = s_grow
            d['_exptl_crystal_grow_comp'][pdb] = l_grow_comp

    ## rewrite each db file with one line per entry.  The previous code
    ## iterated over the categories instead of the entries and wrote to a
    ## file named after the last category rather than to 'db<category>.txt'.
    for fn_out in l_fn_out:
        lines_out = ['%s %s\n' % (pdb, s,) for pdb, s in d[fn_out].items()]
        with open('db%s.txt' % (fn_out), 'w') as fd:
            fd.writelines(lines_out)

    return
import sys

## the project modules live outside the default search path; each directory
## is added right before the modules it provides are imported
sys.path.append('/home/tc/svn/tc_sandbox/pdb')
import parse_mmCIF
import mmCIF2coords

sys.path.append('/home/tc/svn/GoodVibes')
import NMA
import visualization

## entry used for this run
pdb = '2lzm'

## parse the entry and extract its coordinates
d_mmCIF = parse_mmCIF.main(pdb, )
d_coords, l_coords_alpha = mmCIF2coords.main(pdb, d_mmCIF)

## build the Hessian with a cutoff of 10 and diagonalise it
cutoff = 10
matrix_hessian = NMA.hessian_calculation(l_coords_alpha, cutoff)
eigenvectors, eigenvalues = NMA.diagonalize_hessian(matrix_hessian, )

## hand the coordinates and eigenvectors to the VMD trajectory writer
visualization.vmd_trajectory(pdb, l_coords_alpha, eigenvectors)
def main():
    """Cache the audit authors of every local mmCIF entry in ``db_authors.txt``.

    The file holds one ``pdb name;name;...`` line per entry.  Entries already
    present are skipped; every newly parsed entry is appended immediately (so
    an interrupted run loses nothing) and the file is rewritten from memory
    at the end.  Directories sorting before ``sys.argv[-1]`` are skipped.

    NOTE: the bare name ``stop`` is not defined in the visible code; it is
    kept unchanged and presumably aborts the run (NameError) when an entry
    has no audit authors.
    """

    ## previously stored authors: {pdb: 'name;name;...'}
    with open('db_authors.txt', 'r') as fd:
        lines = fd.readlines()
    d_authors = {}
    for line in lines:
        line = line.strip()
        if not line:
            continue
        ## split at the first space only: author names contain spaces, and
        ## keeping them as a list made the final rewrite emit list reprs
        pdb, _, s_authors = line.partition(' ')
        d_authors[pdb] = s_authors.strip()

    path = '/media/WDMyBook1TB/2TB/mmCIF'

    l_dns = os.listdir(path)
    l_dns.sort()
    for i, dn in enumerate(l_dns):

        if dn < sys.argv[-1]:
            continue

        if not os.path.isdir('%s/%s' % (path, dn)):
            continue

        print('%s/%s %s' % (i + 1, len(l_dns), dn))
        l_fns = os.listdir('%s/%s' % (path, dn))
        l_fns.sort()
        for fn in l_fns:

            if fn[-3:] == '.gz':
                continue

            pdb = fn[0:4]

            if pdb in d_authors:
                continue

            print(pdb)

            with open('%s/%s/%s' % (path, dn, fn), 'r') as fd:
                lines = fd.readlines()

            d_mmCIF = parse_mmCIF.main(
                pdb, lines,
                ## parse selected data categories
                l_data_categories=[
                    '_audit_author',
                    '_citation_author',
                    ],
                l_data_categories_break=[
                    '_citation_author',
                    ],
                )

            l_authors = d_mmCIF['_audit_author.name']

            ## an entry without audit authors is unexpected: show both lists and halt
            if l_authors == []:
                print(d_mmCIF['_citation_author.name'])
                print(d_mmCIF['_audit_author.name'])
                stop

            s_authors = ';'.join(l_authors)

            ## append right away in case the loop does not finish
            with open('db_authors.txt', 'a') as fd:
                fd.write('%s %s\n' % (pdb, s_authors,))

            d_authors[pdb] = s_authors

    ##
    ## write to file
    ##
    lines_out = [
        '%s %s\n' % (pdb, s_authors,)
        for pdb, s_authors in d_authors.items()
        ]
    with open('db_authors.txt', 'w') as fd:
        fd.writelines(lines_out)

    return
## Walk over the input lines; each line names two entries (columns 0 and 1)
## and one chain for each of them (columns 4 and 5).
## NOTE(review): this fragment appears to be cut off -- d_coords is created
## but never filled and nothing is stored in d_coordinates in the visible part.
for i_line in range(len(lines)):

    ## start with an empty coordinate cache every 100 lines
    if i_line % 100 == 0:
        d_coordinates = {}

    line = lines[i_line]
    l = line.split()
    pdb1 = l[0]
    pdb2 = l[1]
    chain1 = l[4]
    chain2 = l[5]

    for pdb,chain in [[pdb1,chain1,],[pdb2,chain2,],]:

        ## entry already in the cache
        if pdb in d_coordinates.keys():
            continue

        d_mmCIF = parse_mmCIF.main(pdb)

        ## the two numbering columns are expected to be identical; otherwise
        ## print both and halt (`stop` is not defined in the visible code,
        ## presumably a deliberate NameError)
        if d_mmCIF['_pdbx_poly_seq_scheme.pdb_seq_num'] != d_mmCIF['_pdbx_poly_seq_scheme.author_seq_num']:
            print d_mmCIF['_pdbx_poly_seq_scheme.pdb_seq_num']
            print d_mmCIF['_pdbx_poly_seq_scheme.author_seq_num']
            stop

        d_coords = {}
        ## map pdb_seq_num -> ndb_seq_num for the residues of the requested chain
        d_ndb_seq_num = {}
        for i_seq in range(len(d_mmCIF['_pdbx_poly_seq_scheme.ndb_seq_num'])):
            if d_mmCIF['_pdbx_poly_seq_scheme.pdb_strand_id'][i_seq] != chain:
                continue
            ndb_seq_num = d_mmCIF['_pdbx_poly_seq_scheme.ndb_seq_num'][i_seq]
            pdb_seq_num = d_mmCIF['_pdbx_poly_seq_scheme.pdb_seq_num'][i_seq]
            d_ndb_seq_num[pdb_seq_num] = ndb_seq_num
def unobs_nonterminal_atoms_alpha():
    """List entries that have an unobserved polymer CA atom (work in progress).

    Reads candidate PDB IDs from ``list_pdbx_unobs_or_zero_occ_atoms.txt`` and
    IDs to skip from ``list_pdbx_unobs_residues__NONTERMINAL.txt`` (both under
    the module-level ``path``), parses each remaining entry and collects those
    with a ``_pdbx_unobs_or_zero_occ_atoms`` record that is a CA atom, has
    polymer_flag 'Y' and occupancy_flag '1'.

    As written the function is in a debugging state:
    * only the entry '3e3d' passes the filter at the top of the loop;
    * after the loop the result is printed and the bare name ``stop`` is
      evaluated (not defined in the visible code, presumably a deliberate
      NameError halt), so the file writes below it are not reached;
    * the ``for x in []`` section never executes; it appears to be an older
      terminal/non-terminal algorithm kept for reference.
    """

    ## this method is not entirely correct... e.g. 1kwr...

    category = fn = '_pdbx_unobs_or_zero_occ_atoms'
    fd = open('%s/list%s.txt' % (path, fn))
    s = fd.read()
    fd.close()
    l_pdbs_include = s.split()

    ## if a whole residue is missing, then all of its atoms are also missing
    fd = open('%s/list_pdbx_unobs_residues__NONTERMINAL.txt' % (path))
    s = fd.read()
    fd.close()
    l_pdbs_exclude = s.split()

    l_data_categories = [
        '_pdbx_poly_seq_scheme',
        '_pdbx_unobs_or_zero_occ_atoms',
        '_entity_poly',
        '_struct',  ## .pdbx_model_type_details
        '_exptl',
    ]
    d_breaks = {'_exptl.method': ['SOLUTION NMR', 'SOLID-STATE NMR']}

    fn_out = 'list_pdbx_unobs_atoms__CA.txt'

    l_pdbs_out = []
    for pdb in l_pdbs_include:

        ##        if pdb[1:3] < 'fe':
        ##            continue
        ##        if pdb == '2kzt': ## takes too long...
        ##            continue
        ## debugging filter: every entry except 3e3d is skipped
        if pdb != '3e3d':
            continue

        if pdb in l_pdbs_exclude:
            continue

        print pdb

        d = parse_mmCIF.main(
            pdb,
            l_data_categories=l_data_categories,
            d_breaks=d_breaks,
        )

        ## something has to be missing in the first place for it to be terminal/nonterminal
        if not category in d.keys():
            continue

        ## it has to be a polymer in the first place for anything to be terminal/nonterminal
        if not '_pdbx_poly_seq_scheme' in d.keys():
            continue

        ## don't deal with NMR models for now... (too many unobs records when hydrogen...)
        if d['_exptl.method'] != ['X-RAY DIFFRACTION']:
            continue

        if '_struct.pdbx_model_type_details' in d.keys():
            if d['_struct.pdbx_model_type_details'] in [
                    ['?'],
                    ['minimized average'],
                    ['MINIMIZED AVERAGE'],
            ]:
                pass
            ## if residues are not missing, and model is CA only, then no CA are missing!!!
            elif 'CA ATOMS ONLY' in d['_struct.pdbx_model_type_details'][0]:
                continue
            else:
                ## unexpected model type: print it and halt
                print d['_struct.pdbx_model_type_details']
                stop

        ##        if not 'CA' in d['_pdbx_unobs_or_zero_occ_atoms.auth_atom_id']:
        ##            continue

        bool_append = False
        ## NOTE(review): the length is taken from the bare category key, whereas
        ## the columns are read from '<category>.<item>' keys -- confirm that
        ## parse_mmCIF provides this key with the row count
        for i_unobs in range(len(d['_pdbx_unobs_or_zero_occ_atoms'])):
            if (d['_pdbx_unobs_or_zero_occ_atoms.auth_atom_id'][i_unobs] == 'CA'
                    and
                    d['_pdbx_unobs_or_zero_occ_atoms.polymer_flag'][i_unobs] == 'Y'
                    and
                    ## unobs (1), zero_occ (0)
                    d['_pdbx_unobs_or_zero_occ_atoms.occupancy_flag'][i_unobs] == '1'):
                l_pdbs_out += [pdb]
                print '***', pdb
                ## one hit is enough for this entry
                break

        continue

    print l_pdbs_out
    ## halts here; the write below is not reached while this line is present
    stop

    fd = open('%s/%s' % (
        path,
        fn_out,
    ), 'w')
    fd.write('\n'.join(l_pdbs_out))
    fd.close()

    ## never executed (empty iterable); the names pdb and d used inside would
    ## be whatever the loop above left behind
    for x in []:

        l_indexes_unobs = []
        bool_append = False
        ## chain IDs of all polymer residues joined into one string, so that
        ## index/rindex give the first/last row of a chain
        ## (assumes one-character chain IDs -- TODO confirm)
        s = ''.join(d['_pdbx_poly_seq_scheme.pdb_strand_id'])
        for i_unobs in range(len(d['_pdbx_unobs_or_zero_occ_atoms'])):

            ## skip if not alpha carbon
            if d['_pdbx_unobs_or_zero_occ_atoms.auth_atom_id'][i_unobs] != 'CA':
                continue

            ## skip if zero occupancy
            if d['_pdbx_unobs_or_zero_occ_atoms.occupancy_flag'][
                    i_unobs] == '0':
                continue

            if 'HA' in d['_pdbx_unobs_or_zero_occ_atoms.auth_atom_id']:
                print pdb
                print len(d['_pdbx_unobs_or_zero_occ_atoms.auth_seq_id'])
                stop2

            if d['_pdbx_unobs_or_zero_occ_atoms.polymer_flag'].count(
                    'Y') > 800:
                print pdb
                print len(d['_pdbx_unobs_or_zero_occ_atoms.auth_seq_id'])
                stop1

            asymID_unobs = d['_pdbx_unobs_or_zero_occ_atoms.auth_asym_id'][
                i_unobs]
            seqID_unobs = d['_pdbx_unobs_or_zero_occ_atoms.auth_seq_id'][
                i_unobs]

            ## rows [index1, index2) of _pdbx_poly_seq_scheme belong to this chain
            index1 = s.index(asymID_unobs)
            index2 = s.rindex(asymID_unobs) + 1

            for i_poly in range(
                    index1,
                    index2,
            ):

                asymID_poly = d['_pdbx_poly_seq_scheme.pdb_strand_id'][i_poly]
                seqID_poly = d['_pdbx_poly_seq_scheme.auth_seq_num'][i_poly]

                if seqID_poly == seqID_unobs:

                    ## the insertion codes have to agree as well ('.' and '?' both mean none)
                    if d['_pdbx_poly_seq_scheme.pdb_ins_code'][
                            i_poly] == '.' and d[
                                '_pdbx_unobs_or_zero_occ_atoms.PDB_ins_code'][
                                    i_unobs] == '?':
                        pass
                    elif d['_pdbx_poly_seq_scheme.pdb_ins_code'][i_poly] == d[
                            '_pdbx_unobs_or_zero_occ_atoms.PDB_ins_code'][
                                i_unobs]:
                        pass
                    elif d['_pdbx_unobs_or_zero_occ_atoms.PDB_ins_code'][
                            i_unobs] == '?' and d[
                                '_pdbx_poly_seq_scheme.pdb_ins_code'][
                                    i_poly] != '.':
                        continue
                    elif not d['_pdbx_unobs_or_zero_occ_atoms.PDB_ins_code'][
                            i_unobs] in d['_pdbx_poly_seq_scheme.pdb_ins_code']:
                        print d['_pdbx_poly_seq_scheme.pdb_ins_code'][i_poly]
                        ## NOTE(review): insCode_unobs is not assigned anywhere in this function
                        print insCode_unobs
                        print pdb
                        print seqID_unobs, asymID_unobs
                        stop
                    else:
                        continue

                    if asymID_unobs != asymID_poly:
                        stop_add_with_check_of_identiiical_seqID

                    ## tmp!!! check!!!
                    if d['_pdbx_unobs_or_zero_occ_atoms.auth_comp_id'][
                            i_unobs] != d['_pdbx_poly_seq_scheme.pdb_mon_id'][
                                i_poly]:
                        print pdb
                        stop
                    ##                    ## last residue
                    ##                    if index2-i_poly == 0:
                    ##                        pass ## should append...
                    ##                    ## first residue
                    ##                    elif i_poly-index1 == 0:
                    ##                        pass ## should append...
                    ####                    elif i_poly-index1 > 1 and bool_unobs_prev == False:
                    ####                        bool_append = True
                    ## previous residues are missing
                    elif d['_pdbx_poly_seq_scheme.auth_seq_num'][
                            index1:i_poly] == (i_poly - index1) * ['?']:
                        bool_append = True
                    ## next residues are missing
                    elif d['_pdbx_poly_seq_scheme.auth_seq_num'][
                            i_poly + 1:index2] == (index2 - i_poly - 1) * ['?']:
                        bool_append = True
                    ## zero occupancy residue prior to residue with unobserved atom(s)
                    elif pdb in ['7adh']:
                        bool_append = False
                        pass
                    else:
                        if len(
                                set(range(index1, i_poly)) -
                                set(l_indexes_unobs)) == 0:
                            l_indexes_unobs += [i_poly]
                            stop1
                            pass
                        elif len(
                                set(range(i_poly + 1, index2)) -
                                set(l_indexes_unobs)) == 0:
                            l_indexes_unobs += [i_poly]
                            print pdb
                            print l_indexes_unobs
                            print i_poly, index1, index2
                            stop2
                            pass
                        else:
                            ## this method is not entirely correct... e.g. 1kwr...
                            if i_poly - index1 < 10 or index2 - i_poly < 10:
                                print pdb
                                print i_poly - index1
                                ##                                print index2-i_poly
                                print seqID_unobs
                                print pdb
                                print d['_pdbx_poly_seq_scheme.auth_seq_num'][
                                    index1:i_poly]
                                print d['_pdbx_poly_seq_scheme.auth_seq_num'][
                                    i_poly:index2]
                                print pdb
                            ##                                stop
                            bool_append = True

                    break

            if bool_append == True:
                break

        if bool_append == True:
            print pdb
            l_pdbs_out += [pdb]
            continue

        if l_indexes_unobs != []:
            print l_indexes_unobs
            stop

    fd = open('%s/%s' % (
        path,
        fn_out,
    ), 'w')
    fd.write('\n'.join(l_pdbs_out))
    fd.close()

    return
import sys

## the project modules live outside the default search path; each directory
## is added right before the modules it provides are imported
sys.path.append('/home/tc/svn/tc_sandbox/pdb')
import parse_mmCIF
import mmCIF2coords

sys.path.append('/home/tc/svn/GoodVibes')
import NMA
import visualization

## entry used for this run
pdb = '2lzm'

## parse the entry and extract its coordinates
d_mmCIF = parse_mmCIF.main(pdb,)
d_coords, l_coords_alpha = mmCIF2coords.main(pdb, d_mmCIF)

## build the Hessian with a cutoff of 10 and diagonalise it
cutoff = 10
matrix_hessian = NMA.hessian_calculation(l_coords_alpha, cutoff)
eigenvectors, eigenvalues = NMA.diagonalize_hessian(matrix_hessian,)

## hand the coordinates and eigenvectors to the VMD trajectory writer
visualization.vmd_trajectory(pdb, l_coords_alpha, eigenvectors)
def parse_coords(pdb):
    """Parse the mmCIF entry *pdb* and return it with its coordinate list.

    Returns the dictionary produced by ``parse_mmCIF.main`` and the second
    element of the pair returned by ``mmCIF2coords.main`` (presumably the
    alpha-carbon coordinates, going by the variable name -- confirm).
    """

    parsed = parse_mmCIF.main(pdb,)
    ## the first element of the pair is not needed by callers of this function
    _d_coords, coords_alpha = mmCIF2coords.main(pdb, parsed)

    return parsed, coords_alpha
def main():
    """Calculate radii of gyration for monomeric X-ray protein structures.

    Walks the two-level mmCIF directory tree below a hard-coded path.  Entries
    already listed in ``radius_of_gyration.txt`` are skipped.  For each
    remaining entry that passes the filters below, the mass-weighted radius of
    gyration of the non-hydrogen atoms belonging to the polymer entity is
    calculated, printed, and appended to ``radius_of_gyration.txt``.  After
    the walk the file is rewritten from the in-memory dictionary.

    The last command line argument selects the first sub-directory to process
    (sub-directories that sort before it are skipped).  Element masses are
    looked up in the module-level ``d_mass``.

    Entries without any atom of known mass are reported and skipped (they
    used to end the run with a division by zero).
    """

    fn_radii = 'radius_of_gyration.txt'

    ## radii calculated by previous runs (values stay strings as read)
    d_radii = {}
    with open(fn_radii, 'r') as fd:
        for line in fd:
            l = line.strip().split()
            d_radii[l[0]] = l[1]

    path = '/media/WDMyBook1TB/2TB/mmCIF'
    l_dns = sorted(os.listdir(path))
    for i, dn in enumerate(l_dns):

        if dn < sys.argv[-1]:
            continue
        path_dn = '%s/%s' % (path, dn)
        if not os.path.isdir(path_dn):
            continue

        print('%s/%s %s' % (i + 1, len(l_dns), dn))

        for fn in sorted(os.listdir(path_dn)):

            if fn[-3:] == '.gz':
                continue

            pdb = fn[0:4]

            ## dictionary membership instead of scanning a list of keys
            if pdb in d_radii:
                continue

            print(pdb)

            with open('%s/%s' % (path_dn, fn), 'r') as fd:
                lines = fd.readlines()

            d_mmCIF = parse_mmCIF.main(
                pdb,
                lines,
                d_breaks={
                    ## break if multiple polymer types (not monomeric)
                    '_entity_poly.entity_id': '2',
                    ## break if multiple chains
                    '_entity_poly.pdbx_strand_id': ',',
                },
                d_breaks_negation={
                    ## break if not x-ray diffraction
                    '_exptl.method': 'X-RAY DIFFRACTION',
                    ## break if not monomeric
                    '_pdbx_struct_assembly.oligomeric_details': 'monomeric',
                },
                ## parse selected data categories
                l_data_categories=[
                    '_atom_site',
                    '_entity_poly',
                    '_pdbx_struct_assembly',
                ],
            )

            ## some unknown temporary error...
            ## or break before reaching this part when parsing...
            if '_pdbx_struct_assembly.oligomeric_details' not in d_mmCIF:
                continue

            ## NMR structure?
            if d_mmCIF['_exptl.method'] != ['X-RAY DIFFRACTION']:
                ## deliberate halt: the undefined name raises NameError
                stop2
                continue

            ## no polymers in structure?
            if '_entity_poly.entity_id' not in d_mmCIF:
                continue

            ## polymer(s) is/are not polypeptide(s)
            l_types = d_mmCIF['_entity_poly.type']
            if l_types != len(l_types) * ['polypeptide(L)']:
                continue

            ## biounit not monomeric
            l_details = d_mmCIF['_pdbx_struct_assembly.oligomeric_details']
            if l_details != len(l_details) * ['monomeric']:
                continue

            ## one polymer in asymmetric unit
            if len(d_mmCIF['_entity_poly.entity_id']) > 1:
                continue

            print(pdb)

            ##
            ## calculate center of mass
            ##
            center_of_mass = numpy.array([0., 0., 0., ])
            l_coords = []
            l_masses = []
            for i_atom_site in range(len(d_mmCIF['_atom_site.id'])):

                ## polymer atoms only
                if d_mmCIF['_atom_site.label_entity_id'][
                        i_atom_site] not in d_mmCIF['_entity_poly.entity_id']:
                    continue

                element = d_mmCIF['_atom_site.type_symbol'][i_atom_site]
                ## only do heavy atoms
                if element == 'H':
                    continue
                ## elements without a mass entry are reported and ignored
                if element not in d_mass:
                    print('%s %s' % (pdb, element))
                    continue
                mass = d_mass[element]

                coord = numpy.array([
                    float(d_mmCIF['_atom_site.Cartn_x'][i_atom_site]),
                    float(d_mmCIF['_atom_site.Cartn_y'][i_atom_site]),
                    float(d_mmCIF['_atom_site.Cartn_z'][i_atom_site]),
                ])

                l_masses += [mass]
                l_coords += [coord]
                center_of_mass += mass * coord

            ## nothing to average over: skip instead of dividing by zero
            total_mass = sum(l_masses)
            if total_mass == 0:
                print('%s no atoms with known mass' % (pdb,))
                continue

            center_of_mass /= total_mass

            ##
            ## calculate radius of gyration
            ##
            sum_r = 0
            for coord, mass in zip(l_coords, l_masses):
                sq_dist_from_center_of_mass = sum(
                    (coord - center_of_mass)**2)
                sum_r += mass * sq_dist_from_center_of_mass

            radius_of_gyration = math.sqrt(sum_r / total_mass)

            print('%s %s %s' % (pdb, center_of_mass, radius_of_gyration))

            ## append immediately; the file is read again at the next start
            with open(fn_radii, 'a') as fd:
                fd.write('%s %s\n' % (pdb, radius_of_gyration, ))

            d_radii[pdb] = radius_of_gyration

    ##
    ## write calculated radii of gyration to file
    ##
    lines_out = [
        '%s %s\n' % (pdb, radius_of_gyration, )
        for pdb, radius_of_gyration in d_radii.items()
    ]
    with open(fn_radii, 'w') as fd:
        fd.writelines(lines_out)

    return
def main():
    """Collect crystal growth information that needs remediation.

    Reads the existing ``db_exptl_crystal_grow.txt`` and
    ``db_exptl_crystal_grow_comp.txt`` files and the list of PDB IDs in
    ``remediation_exptl_crystal_grow.pH.txt``; only IDs present in that list
    are processed.  For each such X-ray entry found in the mmCIF tree:

    * if no pH is given but the free-text growth details mention a pH, a line
      is appended to ``remediation_exptl_crystal_grow.pH.txt``;
    * if no component names are given, the growth details are split at commas
      and cleaned up, and the remaining fragments are appended to
      ``remediation_exptl_crystal_grow_comp.name.txt``;
    * the growth details and component names are appended to the two
      ``db*.txt`` files.

    The last command line argument selects the first sub-directory to process.

    Growth details containing several empty comma-separated fragments used to
    raise IndexError; every empty fragment is now dropped.
    """

    l_fn_out = [
        '_exptl_crystal_grow',
        '_exptl_crystal_grow_comp',
    ]

    ## d[category][pdb] = remainder of the stored line
    d = {}
    for fn_out in l_fn_out:
        with open('db%s.txt' % (fn_out), 'r') as fd:
            lines = fd.readlines()
        d[fn_out] = {}
        for line in lines:
            if line == '\n':
                continue
            d[fn_out][line[:4]] = line[5:]

    ## only entries listed in the pH remediation file are (re)processed;
    ## a set makes the membership test per file O(1)
    with open('remediation_exptl_crystal_grow.pH.txt', 'r') as fd:
        s_pdbs = set(line[:4] for line in fd)

    ## values that count as "not given"
    l_not_given = [['?'], [''], ['.'], ]

    path = '/media/WDMyBook1TB/2TB/mmCIF'
    l_dns = sorted(os.listdir(path))
    for i, dn in enumerate(l_dns):

        if dn < sys.argv[-1]:
            continue
        path_dn = '%s/%s' % (path, dn)
        if not os.path.isdir(path_dn):
            continue

        print('%s/%s %s' % (i + 1, len(l_dns), dn))

        for fn in sorted(os.listdir(path_dn)):

            if fn[-3:] == '.gz':
                continue

            pdb = fn[0:4]

            if pdb not in s_pdbs:
                continue

            with open('%s/%s' % (path_dn, fn), 'r') as fd:
                lines = fd.readlines()

            d_mmCIF = parse_mmCIF.main(
                pdb,
                lines,
                d_breaks_negation={
                    ## break if not x-ray diffraction
                    '_exptl.method': 'X-RAY DIFFRACTION',
                },
                l_data_categories_break=[
                    '_diffrn',
                ],
                ## parse selected data categories
                l_data_categories=[
                    '_database_PDB_rev',
                    '_pdbx_database_status',
                    '_exptl',
                    '_exptl_crystal_grow',
                    '_exptl_crystal_grow_comp',
                ],
            )

            if d_mmCIF['_exptl.method'] != ['X-RAY DIFFRACTION']:
                continue

            year = int(d_mmCIF['_database_PDB_rev.date'][0][:4])
            process_site = ''.join(
                d_mmCIF['_pdbx_database_status.process_site'])

            bool_details = '_exptl_crystal_grow.pdbx_details' in d_mmCIF
            bool_comp = '_exptl_crystal_grow_comp.name' in d_mmCIF

            ## neither growth details nor component names
            if not bool_details and not bool_comp:
                if process_site != '?':
                    print('%s %s %s' % (pdb, year, process_site))
                continue

            if bool_details:

                s_grow = ' '.join(
                    d_mmCIF['_exptl_crystal_grow.pdbx_details']).strip()
                s_grow_upper = s_grow.upper()

                ## pH not given, but pH mentioned in growth details
                if (d_mmCIF['_exptl_crystal_grow.pH'] in l_not_given
                        and d_mmCIF['_exptl_crystal_grow.pdbx_pH_range']
                        in l_not_given
                        and (' PH ' in s_grow_upper
                             or 'PH=' in s_grow_upper
                             or ',PH ' in s_grow_upper)):
                    with open('remediation_exptl_crystal_grow.pH.txt',
                              'a') as fd:
                        fd.write('%s\t%s\t%s\t%4i\t%s\t%s\n' % (
                            pdb,
                            ''.join(d_mmCIF['_exptl_crystal_grow.pH']),
                            ''.join(
                                d_mmCIF['_exptl_crystal_grow.pdbx_pH_range']),
                            year,
                            process_site,
                            s_grow,
                        ))

                ## component names not given
                if (not bool_comp
                        or ''.join(d_mmCIF['_exptl_crystal_grow_comp.name'])
                        in ['.', '', '?', ]):

                    if bool_comp:
                        name = ''.join(
                            d_mmCIF['_exptl_crystal_grow_comp.name'])
                    else:
                        name = 'N/A'

                    ## split at commas and strip space
                    l_grow = [x.strip() for x in s_grow_upper.split(',')]
                    ## remove *all* empty fragments; removing only the first
                    ## one let x[-1] below fail on e.g. 'A,,,B'
                    l_grow = [x for x in l_grow if x]
                    ## remove end punctuation
                    l_grow = [
                        x[:-1] if x[-1] == '.' else x for x in l_grow
                    ]
                    ## remove selected words from elements of list,
                    ## then replace abbreviations
                    l_grow = [
                        x.replace(
                            'CRYSTALS OBTAINED BY CO-CRYSTALLIZATION AT ', ''
                        ).replace(
                            'PROTEIN SOLUTION (', ''
                        ).replace(
                            'MILLIMOLAR', 'MM'
                        )
                        for x in l_grow
                    ]

                    ## remove selected words from list
                    l_remove = [
                        x for x in [
                            'VAPOR DIFFUSION',
                            'VAPOUR DIFFUSION',
                            'HANGING DROP',
                            'SITTING DROP',
                        ] if x in l_grow
                    ]
                    for s_fragment in l_grow:
                        ## remove physical conditions
                        if s_fragment.startswith((
                                'TEMPERATURE', 'PH=', 'PH ', 'AT PH ', )):
                            l_remove += [s_fragment]
                            continue
                        ## remove long words (sentences); the scan ends at
                        ## the first long fragment
                        if len(s_fragment) > 50:
                            l_remove += [s_fragment]
                            break
                    for remove in l_remove:
                        l_grow.remove(remove)

                    if len(l_grow) > 0:
                        ## write to file
                        line = '%s\t%s\t%s\t%4i\t%s\t%s\n' % (
                            pdb,
                            name,
                            l_grow,
                            year,
                            process_site,
                            s_grow,
                        )
                        with open(
                                'remediation_exptl_crystal_grow_comp.name.txt',
                                'a') as fd:
                            fd.write(line)

            else:
                s_grow = ''

            if bool_comp:
                l_grow_comp = d_mmCIF['_exptl_crystal_grow_comp.name']
            else:
                l_grow_comp = []

            ## append to txt file in case loop doesn't finish
            d_lines = {
                '_exptl_crystal_grow': '%s %s\n' % (pdb, s_grow, ),
                '_exptl_crystal_grow_comp': '%s %s\n' % (pdb, l_grow_comp, ),
            }
            for fn_out in l_fn_out:
                with open('db%s.txt' % (fn_out), 'a') as fd:
                    fd.write(d_lines[fn_out])

            ## append to dic for when loop finishes
            d['_exptl_crystal_grow'][pdb] = s_grow
            d['_exptl_crystal_grow_comp'][pdb] = l_grow_comp

    ## NOTE(review): d is keyed by category here (not by PDB ID) and fn_out is
    ## the value left over from the loops above, so this writes one line per
    ## category to a file named after the last category.  It looks like a
    ## copy of the "rewrite the db file" step of the sibling scripts that was
    ## never adapted; behavior is kept unchanged because a fix would start
    ## overwriting the db*.txt files - confirm the intent before changing it.
    lines_out = []
    for pdb, s in d.items():
        lines_out += ['%s %s\n' % (pdb, s, )]
    with open(fn_out, 'w') as fd:
        fd.writelines(lines_out)

    return
def exclude(l_chainIDs):
    """Filter chain IDs in four stages and return the remaining ones.

    *l_chainIDs* holds strings whose first four characters are the PDB ID and
    whose last character is the chain ID.  The stages are:

    1. chains without a file ``/data/mmCIF/<id[1:3]>/<id[0:4]>.cif`` are
       removed (obsolete/theoretical); this stage modifies the list passed in,
       in place, as it always did;
    2. only chains listed with one domain in ``../CathDomall`` are kept;
    3. chains of entries that are not X-ray structures, or whose
       ``_pdbx_struct_assembly.oligomeric_count`` values are not all ``'1'``,
       are removed;
    4. within each cluster (line) of ``../bc-50.out`` only the chain with the
       lowest ``_refine_hist.d_res_high`` value is kept (the alphabetically
       first one if several share that value).

    Progress and list sizes are printed along the way.  The returned list is
    a new list; its order after stage 2 follows set iteration order.

    Changes compared to the previous version: the best chain of a cluster is
    selected with ``min`` instead of comparing floats with a string sentinel;
    an entry without assembly information (for which no resolution is stored)
    is ranked last in its cluster instead of raising KeyError; a resolution
    that cannot be read re-raises the original KeyError/ValueError after the
    chain ID was printed.
    """

    ##
    ## exclude obsolete structures and theoretical structures
    ##
    print('obsolete/theoretical')
    print(len(l_chainIDs))
    ## slice assignment keeps the in-place effect on the caller's list
    l_chainIDs[:] = [
        chainID for chainID in l_chainIDs
        if os.path.isfile('/data/mmCIF/%s/%s.cif' % (
            chainID[1:3], chainID[0:4], ))
    ]
    print(len(l_chainIDs))
    print('')

    ##
    ## exclude multidomain structures
    ##
    print('multidomain')
    print(len(l_chainIDs))
    s_chainIDs = set(l_chainIDs)
    l_single_domain_chains = []
    with open('../CathDomall', 'r') as fd:
        for line in fd:
            chainID = line[:5]
            if chainID not in s_chainIDs:
                continue
            n_domains = int(line[7:9])
            if n_domains == 1:
                l_single_domain_chains += [chainID]
    l_chainIDs = list(s_chainIDs & set(l_single_domain_chains))
    print(len(l_chainIDs))
    print('')

    ##
    ## exclude multichain biological units
    ## exclude non-x-ray structures
    ##
    print('multichain')
    print(len(l_chainIDs))
    s_exclude = set()
    s_pdbs_parsed = set()
    d_resolutions = {}
    for i_chainID, chainID in enumerate(l_chainIDs):
        print('%s %s %s' % (i_chainID, len(l_chainIDs), chainID))
        pdbID = chainID[:4]
        ## parse each entry once, even if several of its chains are listed
        if pdbID in s_pdbs_parsed:
            continue
        d_mmCIF = parse_mmCIF.main(pdbID)
        s_pdbs_parsed.add(pdbID)
        if d_mmCIF['_exptl.method'] != ['X-RAY DIFFRACTION']:
            s_exclude.add(pdbID)
            continue
        try:
            l_oligomeric_counts = d_mmCIF[
                '_pdbx_struct_assembly.oligomeric_count']
        except KeyError:
            ## no assembly information: report and keep the entry
            print(chainID)
            continue
        if l_oligomeric_counts != len(l_oligomeric_counts) * ['1']:
            s_exclude.add(pdbID)
        try:
            d_resolutions[pdbID] = float(''.join(
                d_mmCIF['_refine_hist.d_res_high']))
        except (KeyError, ValueError):
            ## report the offending chain, then halt with the real error
            print(chainID)
            raise
    l_chainIDs = [
        chainID for chainID in l_chainIDs if chainID[:4] not in s_exclude
    ]
    print(len(l_chainIDs))
    print('')

    ##
    ## exclude redundancies
    ##
    print('redunant')
    print(len(l_chainIDs))
    s_remaining = set(l_chainIDs)
    with open('../bc-50.out', 'r') as fd:
        for line in fd:
            ## cluster members as lower case PDB ID + chain ID
            l_cluster = [x[:4].lower() + x[-1] for x in line.split()]
            l = sorted(set(l_cluster) & s_remaining)
            if len(l) > 1:
                ## lowest resolution value wins; min returns the first of
                ## equal candidates, i.e. the alphabetically first chain
                chainID_best = min(
                    l,
                    key=lambda chainID: d_resolutions.get(
                        chainID[:4], float('inf')),
                )
                s_remaining.difference_update(
                    [chainID for chainID in l if chainID != chainID_best])
    l_chainIDs = [
        chainID for chainID in l_chainIDs if chainID in s_remaining
    ]
    print(len(l_chainIDs))
    print('')

    return l_chainIDs
def parse_GoodVibes_exclude_flexible(pdb,path,):
    """Return the position of the highest-valued 'EXT' atom of a probe file.

    *pdb* is a PDB ID followed by a chain ID (``pdb[:4]`` and ``pdb[-1]`` are
    used).  The file ``output/<path>/<pdbID>_<chain>_probe.pdb`` is scanned
    for ATOM/HETATM records with residue name ``EXT``; the coordinates of the
    first record with the largest value in columns 61-66 are returned as a
    ``numpy`` array.  ``None`` is returned if there is no such record.

    The normal mode calculation (Hessian, eigenvectors, amplitudes of
    eigenvector 6 and their average/standard deviation) is still carried out
    although nothing below reads its results.
    NOTE(review): the filters that consumed the amplitudes (probe must be
    close to rigid and far from flexible residues) were commented out in the
    previous version and have been dropped here; if they are not coming back
    this whole section can probably go - confirm that the NMA/statistics
    calls have no side effects that are relied upon.

    Changes: the first probe record is accepted through an explicit ``None``
    test instead of the Python 2 ``float > None`` ordering; the unused
    quadratic ``l_bfactors`` list (which divided by zero when all amplitudes
    were equal), the overwritten ``average`` and the unused rigid/flexible
    coordinate lists were removed.
    """

    pdbID = pdb[:4]
    chainID = pdb[-1]

    ##
    ## calculate amplitudes
    ##
    d_mmCIF = parse_mmCIF.main(pdbID,)
    d_coords, l_coords_alpha = mmCIF2coords.main(
        pdbID, d_mmCIF, query_chain=chainID)
    print(len(l_coords_alpha))

    ##
    ## eigenvector
    ##
    cutoff = 10
    matrix_hessian = NMA.hessian_calculation(l_coords_alpha, cutoff,)
    eigenvectors, eigenvalues = NMA.diagonalize_hessian(matrix_hessian)
    ## length of each consecutive (x,y,z) triple of eigenvector 6
    eigenvector = eigenvectors[6]
    l_amplitudes = [
        math.sqrt(
            eigenvector[i]**2 + eigenvector[i + 1]**2 + eigenvector[i + 2]**2
        )
        for i in range(0, len(eigenvector), 3)
    ]

    ## average amplitude
    average, stddev = statistics.do_stddev(l_amplitudes)

    ##
    ## parse output
    ##
    max_bfactor = None
    coord = None
    with open('output/%s/%s_%s_probe.pdb' % (
            path, pdbID, chainID, ), 'r') as fd:
        for line in fd:
            record = line[:6].strip()
            if record not in ['ATOM', 'HETATM', ]:
                continue
            res_name = line[17:20]
            if res_name != 'EXT':
                continue
            bfactor = float(line[60:66])
            ## strict comparison: the first of equal maxima is kept
            if max_bfactor is None or bfactor > max_bfactor:
                x = float(line[30:38])
                y = float(line[38:46])
                z = float(line[46:54])
                coord = numpy.array([x, y, z, ])
                max_bfactor = bfactor

    return coord
def one_polysaccharide(pdb, ):
    """Tell whether entry *pdb* contains exactly one polymeric sugar molecule.

    ``True`` is returned when all of the following hold:

    * at least one ``_chem_comp.type`` value is one of the saccharide types
      listed below (compared in lower case);
    * the ``_entity.pdbx_number_of_molecules`` values of the polymer entities
      whose description starts with ``'SUGAR ('`` add up to 1;
    * no non-polymer entity has a description starting with ``'SUGAR'``
      (excludes e.g. 1a14, which contains polymers and monomers).

    For entry ``1dl2`` the intermediate results are printed and the run is
    halted on purpose (``stop`` is an undefined name and raises NameError).
    """

    d = parse_mmCIF.main(
        pdb,
        l_data_categories=[
            '_entity',
            '_chem_comp',
            '_entity_poly',
        ],
    )

    ## is any chemical component a saccharide?
    l_saccharide_types = [
        'd-saccharide 1,4 and 1,4 linking',  # 3amm
        'l-saccharide',
        'd-saccharide',
        'saccharide',
    ]
    has_saccharide = False
    if '_chem_comp.type' in d.keys():
        has_saccharide = any(
            s_type.lower() in l_saccharide_types
            for s_type in d['_chem_comp.type']
        )

    ## count polymeric sugar molecules and look for monomeric sugars
    n_polymer_sugar = 0
    has_monosaccharide = False
    for i_entity, s_entity_type in enumerate(d['_entity.type']):
        if s_entity_type == 'polymer':
            if d['_entity.pdbx_description'][i_entity][:7] == 'SUGAR (':
                n_polymer_sugar += int(
                    d['_entity.pdbx_number_of_molecules'][i_entity])
        elif (s_entity_type == 'non-polymer'
              and d['_entity.pdbx_description'][i_entity][:5] == 'SUGAR'):
            has_monosaccharide = True

    bool_append = bool(
        not has_monosaccharide and has_saccharide and n_polymer_sugar == 1
    )

    ## debugging trap left in place for this one entry
    if pdb == '1dl2':
        print(n_polymer_sugar)
        print(bool_append)
        stop

    return bool_append
def main():
    """Collect average backbone B factors versus resolution.

    Entries already present in ``Biso_v_resolution.gnuplotdata`` are skipped
    (while reading the file, lines with a suspiciously low average B factor
    for their resolution are printed, once per threshold pair they satisfy).
    The mmCIF tree below the module-level ``path`` is walked, restricted to
    sub-directories between the last two command line arguments (inclusive).

    Each entry is parsed twice: first a small set of categories to decide
    whether it qualifies (X-ray, contains polypeptide(L), not a split entry,
    monomeric, has a citation), then including the atom records.  The average
    ``B_iso_or_equiv`` of the N/CA/C atoms of standard residues (and MSE) with
    full occupancy and no alternate location is calculated.  Entries in which
    one B value occurs more than 20 times are logged to
    ``remediation_Biso_duplicates.txt`` and skipped; entries flagged
    'UNVERIFIED' or 'LIKELY RESIDUAL' are skipped.  Results are appended to
    ``Biso_v_resolution.gnuplotdata`` and the module-level ``plot()`` is
    called at the end.

    ``stop`` is an undefined name used as a deliberate halt (NameError) for
    unexpected data.

    Changes: ``Biso_average_prev`` is now updated after every stored entry;
    it was never assigned, so the "same average as the previous entry" check
    could not trigger.  A missing ``oligomeric_details`` item re-raises the
    KeyError after printing the entry.  Value counts are collected in one
    pass instead of calling ``list.count`` per value.  The unused (and
    wrongly keyed) ``entity_poly_type`` lookup was removed.
    """

    ## entries already in the data file
    s_pdbs = set()
    with open('Biso_v_resolution.gnuplotdata', 'r') as fd:
        lines = fd.readlines()
    for line in lines:
        l = line.split()
        resolution = float(l[1])
        Biso = float(l[0])
        for resolution_min, Biso_max in ((3.5, 10), (2.5, 10), (2.0, 5)):
            if resolution > resolution_min and Biso < Biso_max:
                print(line)
        s_pdbs.add(l[2])

    l_comp_ids = [
        'MSE', 'ALA', 'CYS', 'ASP', 'GLU', 'PHE', 'GLY', 'HIS', 'ILE',
        'LYS', 'LEU', 'MET', 'ASN', 'PRO', 'GLN', 'ARG', 'SER', 'THR',
        'VAL', 'TRP', 'TYR',
    ]
    l_atom_ids = ['N', 'CA', 'C', ]
    d_breaks_negation = {
        ## break if not x-ray diffraction
        '_exptl.method': 'X-RAY DIFFRACTION',
    }

    Biso_average_prev = 0

    for dn in sorted(os.listdir(path)):

        if dn < sys.argv[-2]:
            continue
        if dn > sys.argv[-1]:
            continue
        path_dn = '%s/%s' % (path, dn)
        if not os.path.isdir(path_dn):
            continue

        for fn in sorted(os.listdir(path_dn)):

            if fn[-3:] == '.gz':
                continue

            pdb = fn[0:4]

            if pdb in s_pdbs:
                continue

            if pdb in [
                    '3bfn',  ## PISA left out chains from biological unit
                    '2jjg', '1qjb',  ## _pdbx_struct_assembly missing
            ]:
                continue

            ##
            ## parse header
            ##
            d_mmCIF = parse_mmCIF.main(
                pdb,
                ## parse selected data categories
                l_data_categories=[
                    '_pdbx_struct_assembly',
                    '_entity_poly',
                    '_citation',
                    '_pdbx_database_related',
                ],
                d_breaks_negation=d_breaks_negation,
            )

            if d_mmCIF['_exptl.method'] != ['X-RAY DIFFRACTION']:
                continue

            if 'polypeptide(L)' not in d_mmCIF['_entity_poly.type']:
                continue

            if '_pdbx_database_related.content_type' in d_mmCIF:
                if 'split' in d_mmCIF['_pdbx_database_related.content_type']:
                    continue

            try:
                l_details = d_mmCIF[
                    '_pdbx_struct_assembly.oligomeric_details']
            except KeyError:
                ## report the entry, then halt with the real error
                print(pdb)
                raise
            if l_details != ['monomeric']:
                continue

            if '_citation.id' not in d_mmCIF:
                continue

            ##
            ## parse coordinate section
            ##
            d_mmCIF = parse_mmCIF.main(
                pdb,
                ## parse selected data categories
                l_data_categories=[
                    '_database_PDB_rev',
                    '_refine',
                    '_refine_hist',
                    '_atom_site',
                    '_software',
                    '_entity',
                    '_entity_poly',
                    '_pdbx_struct_assembly',
                    '_pdbx_database_status',
                ],
                d_breaks_negation=d_breaks_negation,
            )

            if d_mmCIF['_exptl.method'] != ['X-RAY DIFFRACTION']:
                continue

            if 'polypeptide(L)' not in d_mmCIF['_entity_poly.type']:
                continue

            if d_mmCIF['_pdbx_struct_assembly.oligomeric_details'] != [
                    'monomeric']:
                continue

            resolution = float(''.join(d_mmCIF['_refine.ls_d_res_high']))

            ## exactly one polymer molecule / chain / entity expected
            if (int(d_mmCIF['_entity.pdbx_number_of_molecules'][0]) != 1
                    or len(d_mmCIF['_entity_poly.pdbx_strand_id']) > 1
                    or len(''.join(
                        d_mmCIF['_entity_poly.pdbx_strand_id']).split(',')) > 1
                    or len(d_mmCIF['_entity_poly.entity_id']) > 1):
                print(pdb)
                print(d_mmCIF['_entity.pdbx_number_of_molecules'])
                print(d_mmCIF['_entity_poly.pdbx_strand_id'])
                stop

            ## the check above leaves a single polymer entity
            entity_poly_id = d_mmCIF['_entity_poly.entity_id'][-1]

            l_Biso = []
            for i_atom_site in range(len(d_mmCIF['_atom_site.id'])):
                occupancy = float(
                    d_mmCIF['_atom_site.occupancy'][i_atom_site])
                if occupancy != 1:
                    continue
                alt_id = d_mmCIF['_atom_site.label_alt_id'][i_atom_site]
                if alt_id != '.':
                    continue
                entity_id = d_mmCIF['_atom_site.label_entity_id'][i_atom_site]
                if entity_id != entity_poly_id:
                    continue
                comp_id = d_mmCIF['_atom_site.label_comp_id'][i_atom_site]
                if comp_id not in l_comp_ids:
                    continue
                type_symbol = d_mmCIF['_atom_site.type_symbol'][i_atom_site]
                if type_symbol == 'H':
                    continue
                atom_id = d_mmCIF['_atom_site.label_atom_id'][i_atom_site]
                if atom_id not in l_atom_ids:
                    continue
                l_Biso += [
                    float(d_mmCIF['_atom_site.B_iso_or_equiv'][i_atom_site])
                ]

            year = int(d_mmCIF['_database_PDB_rev.date'][0][:4])
            site = ''.join(d_mmCIF['_pdbx_database_status.process_site'])

            if len(l_Biso) == 0:
                continue

            Biso_average = sum(l_Biso) / len(l_Biso)

            ## occurrences per B value, counted in a single pass
            d_counts = {}
            for Biso in l_Biso:
                d_counts[Biso] = d_counts.get(Biso, 0) + 1

            ## one value shared by more than 20 atoms: log and skip entry
            bool_continue = False
            for Biso in set(l_Biso):
                count = d_counts[Biso]
                if count > 20:
                    if '_software.name' in d_mmCIF:
                        print('%s %s %s %s %s' % (
                            pdb, Biso_average, Biso, count,
                            d_mmCIF['_software.name'],
                        ))
                        s = '%s %6.2f %4i %6.2f %4i %s %s\n' % (
                            pdb,
                            Biso,
                            count,
                            Biso_average,
                            year,
                            site,
                            str(d_mmCIF['_software.name']),
                        )
                    else:
                        print('%s %s %s %s' % (
                            pdb, Biso_average, Biso, count, ))
                        s = '%s %6.2f %4i %6.2f %4i %s\n' % (
                            pdb,
                            Biso,
                            count,
                            Biso_average,
                            year,
                            site,
                        )
                    bool_continue = True
                    with open('remediation_Biso_duplicates.txt', 'a') as fd:
                        fd.write(s)
                    break
            if bool_continue:
                continue

            ## an average that is exactly an integer 0..100 is suspicious
            if Biso_average in range(0, 100 + 1):
                print(l_Biso)
                print(Biso_average)
                print(pdb)
                print(year)
                stop

            if '_refine.pdbx_TLS_residual_ADP_flag' in d_mmCIF:
                s_flag = ''.join(
                    d_mmCIF['_refine.pdbx_TLS_residual_ADP_flag'])
                if s_flag in ['UNVERIFIED', 'LIKELY RESIDUAL', ]:
                    continue
                elif s_flag in ['?', ]:
                    pass
                else:
                    print(d_mmCIF['_refine.pdbx_TLS_residual_ADP_flag'])
                    print('%s %s' % (pdb, Biso_average))
                    stop

            ## same average as the previously stored entry?
            if round(Biso_average, 4) == round(Biso_average_prev, 4):
                print('%s %s %s' % (pdb, Biso_average, Biso_average_prev))
                stop

            print('%s %s %s' % (pdb, round(Biso_average, 2), resolution))

            with open('Biso_v_resolution.gnuplotdata', 'a') as fd:
                fd.write('%s %s %s %s\n' % (
                    Biso_average,
                    resolution,
                    pdb,
                    year,
                ))

            ## remember for the comparison with the next entry
            Biso_average_prev = Biso_average

    plot()
def unobs_nonterminal_residues():
    """List entries that have unobserved residues inside a chain.

    Unobserved or zero occupancy residues at the chain termini do not count
    (e.g. 200l with residues 163,164 missing is not listed; neither is 201l,
    where 163,164 are missing but sit inside _pdbx_poly_seq_scheme because
    there are two chains).

    The PDB IDs are read from
    ``<path>/list_pdbx_unobs_or_zero_occ_residues.txt`` (``path`` is a
    module-level name), ``loop_residues`` is called with the category and the
    output file name, and then every entry is parsed: for each chain the
    ``auth_seq_num`` values between its first and last occurrence in
    ``_pdbx_poly_seq_scheme.pdb_strand_id`` are inspected, and the entry is
    listed if a ``'?'`` occurs between two observed residues.  The list is
    written to ``<path>/list_pdbx_unobs_residues__NONTERMINAL``.

    NOTE(review): chains are located with ``str.index`` on the concatenated
    strand IDs, which presumably assumes one-character chain IDs - confirm.

    Change: a chain without any observed residue no longer raises IndexError
    (the terminal trimming used to index an empty list); such a chain has no
    internal gap.
    """

    category = fn = '_pdbx_unobs_or_zero_occ_residues'

    with open('%s/list%s.txt' % (path, fn)) as fd:
        l_pdbs_in = fd.read().split()

    l_data_categories = [
        '_pdbx_poly_seq_scheme',
        '_pdbx_unobs_or_zero_occ_residues',
        '_entity_poly',
    ]

    fn_out = 'list_pdbx_unobs_residues__NONTERMINAL'

    loop_residues(category, fn_out, )

    l_pdbs_out = []
    for pdb in l_pdbs_in:

        ## no residues are present! (e.g. 1oax, 1oay)
        if pdb in ['1oax', '1oay', ]:
            continue

        d = parse_mmCIF.main(
            pdb,
            l_data_categories=l_data_categories,
        )

        if category not in d.keys():
            continue

        bool_append = False

        s = ''.join(d['_pdbx_poly_seq_scheme.pdb_strand_id'])
        for chains in d['_entity_poly.pdbx_strand_id']:
            for chain in chains.split(','):
                index1 = s.index(chain)
                index2 = s.rindex(chain)
                l_auth_seq_num = d['_pdbx_poly_seq_scheme.auth_seq_num'][
                    index1:index2 + 1]
                ## positions of the observed residues; terminal '?' entries
                ## lie outside the span first..last observed
                l_observed = [
                    i for i, auth_seq_num in enumerate(l_auth_seq_num)
                    if auth_seq_num != '?'
                ]
                ## non-terminal residues missing?
                if (l_observed
                        and l_observed[-1] - l_observed[0] + 1
                        != len(l_observed)):
                    print('**** %s' % (pdb,))
                    bool_append = True
                    break
            if bool_append:
                break

        if bool_append:
            print(pdb)
            l_pdbs_out += [pdb]

    with open('%s/%s' % (path, fn_out, ), 'w') as fd:
        fd.write('\n'.join(l_pdbs_out))

    return
def main():
    """Collect Matthews coefficients of monomeric X-ray protein structures.

    Entries already listed in ``db_MatthewsCoefficient.txt`` are skipped, as
    are two hard-coded lists of entries (not calculated / known to be wrong).
    For every other entry in the mmCIF tree that passes the filters below,
    ``_exptl_crystal.density_Matthews`` is stored; if that value is ``'?'``
    the coefficient is obtained from ``calc_matthews_coefficient.main``.
    Each value is appended to the text file immediately and the file is
    rewritten from the in-memory dictionary after the walk.

    The last command line argument selects the first sub-directory to process.

    Files are now closed by ``with`` blocks also when an error occurs, and
    known entries are looked up in the dictionary directly instead of in a
    freshly built list of its keys.
    """

    fn_out = 'db_MatthewsCoefficient.txt'

    ## previously stored values (kept as strings)
    d = {}
    with open(fn_out, 'r') as fd:
        for line in fd:
            l = line.strip().split()
            pdb = l[0]
            v = l[1]
            ## hard-coded replacement value for this entry
            if pdb == '2p51':
                v = '1.72610466393'
            d[pdb] = v

    ## Matthews Coefficient not calculated...
    l_not_calculated = [
        '1vh7', '1vho', '1vhu', '1vi3', '1vi4', '1vis',
    ]
    ## Matthews Coefficient *wrong*
    l_wrong = [
        '2p51',
        ## too high
        '1c5v', '1q9i', '1ut6', '1x6x', '1x6y', '1xdn', '1y63', '1zix',
        ## too low
        '1t95', '1jih', '1t95', '1d5t', '1c7k', '1dbo', '1d9x', '1qt9',
        '1ia5', '1dcq',
    ]

    path = '/media/WDMyBook1TB/2TB/mmCIF'
    l_dns = sorted(os.listdir(path))
    for i, dn in enumerate(l_dns):

        if dn < sys.argv[-1]:
            continue
        path_dn = '%s/%s' % (path, dn)
        if not os.path.isdir(path_dn):
            continue

        print('%s/%s %s' % (i + 1, len(l_dns), dn))

        for fn in sorted(os.listdir(path_dn)):

            if fn[-3:] == '.gz':
                continue

            pdb = fn[0:4]

            if pdb in d:
                continue
            if pdb in l_not_calculated:
                continue
            if pdb in l_wrong:
                continue

            with open('%s/%s' % (path_dn, fn), 'r') as fd:
                lines = fd.readlines()

            d_mmCIF = parse_mmCIF.main(
                pdb,
                lines,
                d_breaks={
                    ## break if multiple polymer types (not monomeric)
                    '_entity_poly.entity_id': '2',
                    ## break if multiple chains
                    '_entity_poly.pdbx_strand_id': ',',
                },
                d_breaks_negation={
                    ## break if not x-ray diffraction
                    '_exptl.method': 'X-RAY DIFFRACTION',
                    ## break if not monomeric
                    '_pdbx_struct_assembly.oligomeric_details': 'monomeric',
                },
                ## parse selected data categories
                l_data_categories=[
                    '_exptl_crystal',
                ],
                l_data_categories_break=['_exptl_crystal'],
            )

            ## some unknown temporary error...
            ## or break before reaching this part when parsing...
            if '_pdbx_struct_assembly.oligomeric_details' not in d_mmCIF:
                continue

            ## NMR structure?
            if d_mmCIF['_exptl.method'] != ['X-RAY DIFFRACTION']:
                ## deliberate halt: the undefined name raises NameError
                stop2
                continue

            ## no polymers in structure?
            if '_entity_poly.entity_id' not in d_mmCIF:
                continue

            ## polymer(s) is/are not polypeptide(s)
            l_types = d_mmCIF['_entity_poly.type']
            if l_types != len(l_types) * ['polypeptide(L)']:
                continue

            ## biounit not monomeric
            l_details = d_mmCIF['_pdbx_struct_assembly.oligomeric_details']
            if l_details != len(l_details) * ['monomeric']:
                continue

            ## one polymer in asymmetric unit
            if len(d_mmCIF['_entity_poly.entity_id']) > 1:
                continue

            ## value missing in the file: calculate it
            if d_mmCIF['_exptl_crystal.density_Matthews'] == ['?']:
                v = calc_matthews_coefficient.main(pdb)
            else:
                v = float(''.join(d_mmCIF['_exptl_crystal.density_Matthews']))

            ## append immediately; the file is read again at the next start
            with open(fn_out, 'a') as fd:
                fd.write('%s %s\n' % (pdb, v, ))

            d[pdb] = v

    ##
    ## write all collected values to file
    ##
    lines_out = ['%s %s\n' % (pdb, v, ) for pdb, v in d.items()]
    with open(fn_out, 'w') as fd:
        fd.writelines(lines_out)

    return
def main():
    """Collect refinement resolutions and write the values for a histogram.

    Entries already listed in ``db_resolution.txt`` (if the file exists) are
    skipped.  For every other X-ray entry in the mmCIF tree the
    ``_refine.ls_d_res_high`` item (a list) is appended to the file in its
    printed form, e.g. ``['1.80']``.  After the walk the file is rewritten
    from the in-memory dictionary and read back; the characters between the
    first two and last two of the second column are converted to a float,
    rounded to two decimals and written, one per line, to
    ``histogram_resolution.txt`` (values equal to ``'.'`` are skipped).

    The function then halts on purpose: ``stop`` is an undefined name and
    raises NameError.  The code behind it, which would overwrite the
    histogram file with value/count pairs, is therefore never executed.

    The last command line argument selects the first sub-directory to process.

    Files are now closed by ``with`` blocks also when an error occurs, and
    known entries are looked up in the dictionary directly.
    """

    fn_db = 'db_resolution.txt'

    d = {}
    if os.path.isfile(fn_db):
        with open(fn_db, 'r') as fd:
            for line in fd:
                l = line.strip().split()
                d[l[0]] = l[1]

    path = '/media/WDMyBook1TB/2TB/mmCIF'
    l_dns = sorted(os.listdir(path))
    for i, dn in enumerate(l_dns):

        if dn < sys.argv[-1]:
            continue
        path_dn = '%s/%s' % (path, dn)
        if not os.path.isdir(path_dn):
            continue

        print('%s/%s %s' % (i + 1, len(l_dns), dn))

        for fn in sorted(os.listdir(path_dn)):

            if fn[-3:] == '.gz':
                continue

            pdb = fn[0:4]

            if pdb in d:
                continue

            print(pdb)

            with open('%s/%s/%s' % (path, dn, fn), 'r') as fd:
                lines = fd.readlines()

            d_mmCIF = parse_mmCIF.main(
                pdb, lines,
                ## parse selected data categories
                l_data_categories=[
                    '_refine', '_refine_hist',
                ],
                l_data_categories_break=[
                    '_refine',
                ],
                d_breaks_negation={
                    ## break if not x-ray diffraction
                    '_exptl.method': 'X-RAY DIFFRACTION',
                },
            )

            if d_mmCIF['_exptl.method'] != ['X-RAY DIFFRACTION']:
                continue

            resolution = d_mmCIF['_refine.ls_d_res_high']

            ## append immediately; the file is read again at the next start
            with open(fn_db, 'a') as fd:
                fd.write('%s %s\n' % (pdb, resolution, ))

            d[pdb] = resolution

    ##
    ## write to file
    ##
    lines_out = [
        '%s %s\n' % (pdb, resolution, ) for pdb, resolution in d.items()
    ]
    with open(fn_db, 'w') as fd:
        fd.writelines(lines_out)

    ##
    ## read back and write the rounded values
    ##
    d = {}
    with open(fn_db, 'r') as fd:
        lines = fd.readlines()
    lines_out = []
    for line in lines:
        ## strip the leading [' and the two trailing characters
        resolution = line.strip().split()[1][2:-2]
        if resolution == '.':
            continue
        resolution = round(float(resolution), 2)
        d[resolution] = d.get(resolution, 0) + 1
        lines_out += ['%s\n' % (resolution, )]

    with open('histogram_resolution.txt', 'w') as fd:
        fd.writelines(lines_out)

    ## deliberate halt (NameError)
    stop

    ## NOTE(review): unreachable because of the halt above; kept as the
    ## alternative output format (value and count per line)
    lines_out = []
    for resolution in sorted(d.keys()):
        count = d[resolution]
        lines_out += ['%s %s\n' % (resolution, count, )]

    with open('histogram_resolution.txt', 'w') as fd:
        fd.writelines(lines_out)

    return
def exclude(l_chainIDs):
    """Filter chain IDs in four stages and return the remaining ones.

    *l_chainIDs* holds strings whose first four characters are the PDB ID and
    whose last character is the chain ID.  The stages are:

    1. chains without a file ``/data/mmCIF/<id[1:3]>/<id[0:4]>.cif`` are
       removed (obsolete/theoretical); this stage modifies the list passed in,
       in place, as it always did;
    2. only chains listed with one domain in ``../CathDomall`` are kept;
    3. chains of entries that are not X-ray structures, or whose
       ``_pdbx_struct_assembly.oligomeric_count`` values are not all ``'1'``,
       are removed;
    4. within each cluster (line) of ``../bc-50.out`` only the chain with the
       lowest ``_refine_hist.d_res_high`` value is kept (the alphabetically
       first one if several share that value).

    Progress and list sizes are printed along the way.  The returned list is
    a new list; its order after stage 2 follows set iteration order.

    Changes compared to the previous version: the best chain of a cluster is
    selected with ``min`` instead of comparing floats with a string sentinel;
    an entry without assembly information (for which no resolution is stored)
    is ranked last in its cluster instead of raising KeyError; a resolution
    that cannot be read re-raises the original KeyError/ValueError after the
    chain ID was printed.
    """

    ##
    ## exclude obsolete structures and theoretical structures
    ##
    print('obsolete/theoretical')
    print(len(l_chainIDs))
    ## slice assignment keeps the in-place effect on the caller's list
    l_chainIDs[:] = [
        chainID for chainID in l_chainIDs
        if os.path.isfile('/data/mmCIF/%s/%s.cif' % (
            chainID[1:3], chainID[0:4], ))
    ]
    print(len(l_chainIDs))
    print('')

    ##
    ## exclude multidomain structures
    ##
    print('multidomain')
    print(len(l_chainIDs))
    s_chainIDs = set(l_chainIDs)
    l_single_domain_chains = []
    with open('../CathDomall', 'r') as fd:
        for line in fd:
            chainID = line[:5]
            if chainID not in s_chainIDs:
                continue
            n_domains = int(line[7:9])
            if n_domains == 1:
                l_single_domain_chains += [chainID]
    l_chainIDs = list(s_chainIDs & set(l_single_domain_chains))
    print(len(l_chainIDs))
    print('')

    ##
    ## exclude multichain biological units
    ## exclude non-x-ray structures
    ##
    print('multichain')
    print(len(l_chainIDs))
    s_exclude = set()
    s_pdbs_parsed = set()
    d_resolutions = {}
    for i_chainID, chainID in enumerate(l_chainIDs):
        print('%s %s %s' % (i_chainID, len(l_chainIDs), chainID))
        pdbID = chainID[:4]
        ## parse each entry once, even if several of its chains are listed
        if pdbID in s_pdbs_parsed:
            continue
        d_mmCIF = parse_mmCIF.main(pdbID)
        s_pdbs_parsed.add(pdbID)
        if d_mmCIF['_exptl.method'] != ['X-RAY DIFFRACTION']:
            s_exclude.add(pdbID)
            continue
        try:
            l_oligomeric_counts = d_mmCIF[
                '_pdbx_struct_assembly.oligomeric_count']
        except KeyError:
            ## no assembly information: report and keep the entry
            print(chainID)
            continue
        if l_oligomeric_counts != len(l_oligomeric_counts) * ['1']:
            s_exclude.add(pdbID)
        try:
            d_resolutions[pdbID] = float(''.join(
                d_mmCIF['_refine_hist.d_res_high']))
        except (KeyError, ValueError):
            ## report the offending chain, then halt with the real error
            print(chainID)
            raise
    l_chainIDs = [
        chainID for chainID in l_chainIDs if chainID[:4] not in s_exclude
    ]
    print(len(l_chainIDs))
    print('')

    ##
    ## exclude redundancies
    ##
    print('redunant')
    print(len(l_chainIDs))
    s_remaining = set(l_chainIDs)
    with open('../bc-50.out', 'r') as fd:
        for line in fd:
            ## cluster members as lower case PDB ID + chain ID
            l_cluster = [x[:4].lower() + x[-1] for x in line.split()]
            l = sorted(set(l_cluster) & s_remaining)
            if len(l) > 1:
                ## lowest resolution value wins; min returns the first of
                ## equal candidates, i.e. the alphabetically first chain
                chainID_best = min(
                    l,
                    key=lambda chainID: d_resolutions.get(
                        chainID[:4], float('inf')),
                )
                s_remaining.difference_update(
                    [chainID for chainID in l if chainID != chainID_best])
    l_chainIDs = [
        chainID for chainID in l_chainIDs if chainID in s_remaining
    ]
    print(len(l_chainIDs))
    print('')

    return l_chainIDs
print dn l_fn = os.listdir('%s/%s' %(path,dn,)) l_fn.sort() for fn in l_fn: pdb = fn[:4] if fn[-3:] == '.gz': continue ######## if pdb in ['2fl9','3gau','3gav','3gaw',]: ## tmp!!! ######## continue ## print pdb fd = open('%s/%s/%s' %(path,dn,fn), 'r') lines = fd.readlines() fd.close() d = parse_mmCIF.main( pdb,lines, l_data_categories = l_data_categories, d_breaks = d_breaks, ) if d_exclude_subset: bool_continue = False for item_exclude,l_values_exclude in d_exclude_subset.items(): if not item_exclude in d.keys(): bool_continue = True fd = open('%s/remediation_%s.txt' %(path,item_exclude,),'a') fd.write('%s\n' %(pdb)) fd.close() continue if len( set(d[item_exclude]) & set(l_values_exclude) ) > 0: bool_continue = True break
def parse_cifs(
    l_pdbs,
    ref_seq,
    l_db_codes,
    n_mutations_max,
    resolution_min,
    bool_multiple_entities=False,
):
    """Parse mmCIF entries and sort the chains into wild types and mutants.

    Parameters:
        l_pdbs: chain identifiers; the first four characters are the entry
            ID and the last character is the chain ID (the hard-coded
            exceptions below have the form '1KS3_A').
        ref_seq: reference sequence, indexable and comparable item by item
            with '_entity_poly_seq.mon_id' values; a false value skips the
            chain length check.
        l_db_codes: accepted '_struct_ref.db_code' values.
        n_mutations_max: chains with more differences from ref_seq than
            this are skipped; None disables the mutation count, in which
            case every accepted chain is reported as wild type.
        resolution_min: chains whose '_refine.ls_d_res_high' is larger than
            this are skipped; a false value disables the check.
        bool_multiple_entities: unless true, entries with more than one
            '_entity_poly' entity are skipped.

    Returns:
        Tuple (d_mmCIF_main, l_wts, d_mutants, l_wts_cysfree):
        d_mmCIF_main maps pdb[:4] to the parsed mmCIF dictionary,
        l_wts lists chains without differences from ref_seq,
        d_mutants maps chains with differences to a dictionary with the
        keys 'mutations' and 'startmodel', and l_wts_cysfree lists the
        chains whose only differences are CYS54THR and CYS97ALA (these
        are also present in d_mutants).

    Raises:
        ValueError: if the two resolution items of an entry disagree, the
            chain is not part of any polymer entity, the hard-coded 1RCM
            correction does not apply, or ref_seq is contained in a longer
            chain sequence.
    """

    print('parse cifs')

    ## chains accepted although their length differs from the reference
    ## (last two residues unobserved)
    l_pdbs_truncated = (
        '1KS3_A', '1KW5_A', '1KW7_A', '1KY0_A', '1KY1_A',
        '1L0J_A', '1LOK_A', '1LPY_A', '1LW9_A', '1LWG_A', '1LWK_A',
    )

    l_wts = []
    l_wts_cysfree = []
    d_mutants = {}
    d_mmCIF_main = {}
    for pdb in l_pdbs:

        ## NOTE(review): the lookup is lower case but entries are stored
        ## under pdb[:4] as given, so upper case IDs never match here -
        ## confirm whether skipping further chains of an entry is intended
        if pdb[:4].lower() in d_mmCIF_main:
            continue

        d_mmCIF = parse_mmCIF.main(pdb[:4].lower(), )

        ## not an x-ray structure
        if d_mmCIF['_exptl.method'] != ['X-RAY DIFFRACTION']:
            print('%s %s' % (pdb, d_mmCIF['_exptl.method'],))
            continue

        ## more than one type of polymer present
        l_entity_ids = d_mmCIF['_entity_poly.entity_id']
        n_entities = len(l_entity_ids)
        if not bool_multiple_entities and n_entities > 1:
            print('%s entities %s' % (pdb, n_entities,))
            continue

        ## the two resolution items are expected to agree
        l_res_refine = d_mmCIF['_refine.ls_d_res_high']
        l_res_refine_hist = d_mmCIF['_refine_hist.d_res_high']
        if l_res_refine != l_res_refine_hist:
            raise ValueError(
                '%s: _refine.ls_d_res_high %s != _refine_hist.d_res_high %s'
                % (pdb, l_res_refine, l_res_refine_hist,)
            )

        ## low resolution
        if resolution_min:
            if float(l_res_refine[0]) > resolution_min:
                print('%s resolution %s' % (pdb, l_res_refine,))
                continue

        ## get entity ID from chain ID
        for entity_id, s_chain_ids in zip(
            l_entity_ids, d_mmCIF['_entity_poly.pdbx_strand_id'],
        ):
            if pdb[-1] in s_chain_ids:
                break
        else:
            raise ValueError(
                '%s: chain %s not found in _entity_poly.pdbx_strand_id %s'
                % (pdb, pdb[-1], d_mmCIF['_entity_poly.pdbx_strand_id'],)
            )

        ## get sequence from entity ID
        seq = []
        for i, (seq_entity_id, mon_id) in enumerate(zip(
            d_mmCIF['_entity_poly_seq.entity_id'],
            d_mmCIF['_entity_poly_seq.mon_id'],
        )):
            if seq_entity_id != entity_id:
                continue
            ## hard-coded correction: row 126 of 1RCM must read CYS and is
            ## recorded as CCS
            if pdb[:4] == '1RCM' and i == 126:
                if mon_id != 'CYS':
                    raise ValueError(
                        '%s: expected CYS at row 126, found %s'
                        % (pdb, mon_id,)
                    )
                mon_id = 'CCS'
            seq.append(mon_id)

        ## wrong chain length
        if ref_seq and len(seq) != len(ref_seq):
            s_seq = ''.join(seq)
            s_ref_seq = ''.join(ref_seq)
            ## a chain containing the whole reference is not expected
            if s_ref_seq in s_seq:
                raise ValueError(
                    '%s: reference sequence %s is contained in longer '
                    'sequence %s' % (pdb, ref_seq, seq,)
                )
            bool_accept = (
                ## unobserved atoms not in seqres
                s_seq in s_ref_seq
                ## last two residues unobserved
                or (len(seq) == 162 and pdb in l_pdbs_truncated)
                ## last two residues unobserved
                or (len(seq) == 162 and seq[-1] == 'LYS')
            )
            if not bool_accept:
                print('%s seqlen %s' % (pdb, len(seq),))
                continue

        ## not from Gallus gallus
        ## check not necessary, because sequence checked against ref seq
        db_code = d_mmCIF['_struct_ref.db_code'][
            d_mmCIF['_struct_ref.entity_id'].index(entity_id)
        ]
        if db_code not in l_db_codes:
            print('%s uniprot %s' % (pdb, db_code,))
            continue

        ## more than 1 mutation?
        ## the list is reset for every chain so that the bookkeeping below
        ## is well defined when the mutation count is disabled
        l_mutations = []
        if n_mutations_max is not None:
            for i_seq, res_id_mmCIF in enumerate(seq):
                res_id_uniprot = ref_seq[i_seq]
                if res_id_mmCIF != res_id_uniprot:
                    l_mutations.append(
                        '%3s%i%3s' % (
                            res_id_uniprot, i_seq + 1, res_id_mmCIF,
                        )
                    )
            if len(l_mutations) > n_mutations_max:
                print('%s muts %s' % (pdb, len(l_mutations),))
                continue

        startmodel = parse_mmCIF_item(
            d_mmCIF, '_refine.pdbx_starting_model', pdb,
        )

        ## append to lists and dictionaries
        d_mmCIF_main[pdb[:4]] = d_mmCIF
        if l_mutations:
            if l_mutations == ['CYS54THR', 'CYS97ALA']:
                l_wts_cysfree.append(pdb)
            d_mutants[pdb] = {
                'mutations': l_mutations,
                'startmodel': startmodel,
            }
        else:
            l_wts.append(pdb)

    return d_mmCIF_main, l_wts, d_mutants, l_wts_cysfree
def parse_GoodVibes_exclude_flexible(pdb, path,):
    """Return the position of the 'EXT' probe atom with the highest B-factor.

    The probe atoms are read from
    'output/<path>/<pdb[:4]>_<pdb[-1]>_probe.pdb'.  Before that, the
    per-residue amplitudes of eigenvector 6 of the Hessian built from the
    alpha coordinates are calculated and the coordinates are split into a
    rigid and a flexible set.

    NOTE(review): the rigid/flexible sets were only consumed by distance
    filters on the probe atoms (probe within 6 of a rigid coordinate, not
    within 6 of a flexible one, nearest residue below average+stddev) that
    were disabled in the code this replaces; they are still computed here
    so that the filters can be reinstated - confirm whether they are
    still wanted.

    Parameters:
        pdb: chain identifier; pdb[:4] is the entry ID, pdb[-1] the chain.
        path: sub-directory of 'output/' that holds the probe file.

    Returns:
        numpy array [x, y, z] of the first 'EXT' ATOM/HETATM record with
        the maximum B-factor, or None if the file has no such record.
    """

    ##
    ## calculate amplitudes
    ##
    d_mmCIF = parse_mmCIF.main(pdb[:4], )
    d_coords, l_coords_alpha = mmCIF2coords.main(
        pdb[:4], d_mmCIF, query_chain=pdb[-1],
    )
    print(len(l_coords_alpha))

    ##
    ## eigenvector
    ##
    cutoff = 10
    matrix_hessian = NMA.hessian_calculation(l_coords_alpha, cutoff,)
    eigenvectors, eigenvalues = NMA.diagonalize_hessian(matrix_hessian)
    ## NOTE(review): index 6 presumably skips six rigid-body modes - confirm
    mode = eigenvectors[6]
    ## amplitude = length of each consecutive (x, y, z) triplet of the mode
    l_amplitudes = [
        math.sqrt(mode[i] ** 2 + mode[i + 1] ** 2 + mode[i + 2] ** 2)
        for i in range(0, len(mode), 3)
    ]

    ## average amplitude
    ## (statistics is expected to be the project module providing do_stddev)
    average, stddev = statistics.do_stddev(l_amplitudes)

    ## rigid: below average amplitude
    l_coords_rigid = [
        coord_alpha
        for coord_alpha, amplitude in zip(l_coords_alpha, l_amplitudes)
        if amplitude < average
    ]
    ## flexible: more than half a standard deviation above the average
    l_coords_flexible = [
        coord_alpha
        for coord_alpha, amplitude in zip(l_coords_alpha, l_amplitudes)
        if amplitude > average + 0.5 * stddev
    ]

    ## parse output
    fn_probe = 'output/%s/%s_%s_probe.pdb' % (path, pdb[:4], pdb[-1],)
    max_bfactor = None
    coord = None
    with open(fn_probe, 'r') as fd:
        for line in fd:
            record = line[:6].strip()
            if record not in ('ATOM', 'HETATM',):
                continue
            res_name = line[17:20]
            if res_name != 'EXT':
                continue
            bfactor = float(line[60:66])
            ## explicit None test instead of relying on None comparing
            ## lower than any number
            if max_bfactor is None or bfactor > max_bfactor:
                x = float(line[30:38])
                y = float(line[38:46])
                z = float(line[46:54])
                coord = numpy.array([x, y, z,])
                max_bfactor = bfactor

    return coord
def identify_CH_bonds():
    """Identify the carbon atoms of C-H single bonds per residue.

    For every enabled residue the chemical component file is downloaded
    from http://www.pdb.org/pdb/files/ligand/<residue>.cif and parsed; each
    '_chem_comp_bond' row with value order 'SING' whose first atom name
    starts with 'C' and whose second atom name starts with 'H' is printed
    and its first atom is recorded.

    Returns:
        Dictionary mapping the residue name to a list of carbon atom
        names, one item per C-H bond (a carbon bound to several hydrogens
        is therefore listed several times).
    """

    ##
    ## identify all C-H single bonds in the standard residues
    ##
    d_atoms = {}
    for residue in [
        'ALA',
##        'ALA','CYS','ASP','GLU','PHE',
##        'GLY','HIS','ILE','LYS','LEU',
##        'MET','ASN','PRO','GLN','ARG',
##        'SER','THR','VAL','TRP','TYR',
    ]:

        url = 'http://www.pdb.org/pdb/files/ligand/%s.cif' % (residue,)
        response = urllib2.urlopen(url)
        ## close the connection even if reading fails
        try:
            lines = response.readlines()
        finally:
            response.close()

        d = parse_mmCIF.main(residue, lines)

        l_carbons = []
        for value_order, atom1, atom2 in zip(
            d['_chem_comp_bond.value_order'],
            d['_chem_comp_bond.atom_id_1'],
            d['_chem_comp_bond.atom_id_2'],
        ):
            if value_order != 'SING':
                continue
            ## heavy element is always listed before hydrogen
            if atom1[0] != 'C' or atom2[0] != 'H':
                continue
            print('%s %s %s' % (residue, atom1, atom2,))
            l_carbons.append(atom1)
        d_atoms[residue] = l_carbons

    return d_atoms