def _add_colors(self):
    """Store each predicted residue's deviation from the native one as B-factor.

    Residues of ``self._native`` and ``self._prediction`` are paired in
    iteration order (``zip`` stops at the shorter hierarchy). For every pair
    the norm of the coordinate difference, i.e. the square root of the summed
    squared coordinate differences of the residue, is written to the beta
    column of the predicted residue.
    """
    native_residues = prody.HierView(self._native).iterResidues()
    predicted_residues = prody.HierView(self._prediction).iterResidues()
    for native_res, pred_res in zip(native_residues, predicted_residues):
        # NOTE(review): the subtraction presumably needs both residues to
        # hold the same number of atoms -- confirm against the callers.
        deviation = numpy.linalg.norm(
            native_res.getCoords() - pred_res.getCoords())
        # setBetas is the B-factor setter used by align_and_color;
        # setTempFactors is the older spelling of the same setter.
        pred_res.setBetas(deviation)
def must_be_filtered(cls, pdb):
    """
    Checks whether the protein chains of the structure do not all share the
    same sequence.

    :param pdb: A prody pdb data structure.

    :return: False only when every protein chain has exactly the same
    sequence; True otherwise, including when the structure holds no protein.
    """
    protein = pdb.select("protein")
    if protein is None:
        # An empty selection comes back as None: there are zero distinct
        # sequences, which is "not exactly one".
        return True
    unique_sequences = {
        chain.getSequence()
        for chain in prody.HierView(protein).iterChains()
    }
    return len(unique_sequences) != 1
def register_all_ligand_onsite(self, hetero_part, OUT=True):
    '''
    Bundle ligand data for every residue of hetero_part that has more
    than 3 atoms.

    :param hetero_part: atoms that are split into residues with pd.HierView
    :param OUT: forwarded unchanged to bundle_ligand_data
    :return: None
    '''
    residues = pd.HierView(hetero_part).iterResidues()
    for residue in residues:
        # residues with 3 atoms or fewer may be not ok, they are left out
        if residue.numAtoms() > 3:
            self.bundle_ligand_data(residue, fake_ligand=False, OUT=OUT)
def get_residue_onfly(self, resid):
    '''
    Bundle ligand data for the residue(s) of self.hetero whose residue
    index equals resid.

    :param resid: residue index to look for; it is compared as text, so
                  both '12' and 12 are accepted
    :return: None
    '''
    wanted = str(resid)
    for pick_one in pd.HierView(self.hetero).iterResidues():
        if str(pick_one.getResindex()) == wanted:
            # Progress marker; the call form prints the same text under
            # Python 2 and Python 3.
            print('here')
            self.bundle_ligand_data(pick_one, fake_ligand=False, OUT=True)
def process_water_structures(initial_pdb, main_chains, ligand):
    """
    Detects the waters we have to keep (important for the simulation) and
    returns a structure holding them.

    For every chain listed in main_chains a reference residue is taken by
    position (the residue is not guaranteed to be conserved, so it cannot be
    selected by name). The water oxygen whose summed distance to the center
    of that residue and to the center of the binding site is minimal is kept.
    The binding site center is the center of the ligand or, when there is no
    ligand, of the whole structure.

    :param initial_pdb: The pdb (prody structure) we want to extract the
    waters from.
    :param main_chains: Chain ids of the chains to process.
    :param ligand: The prody structure of the ligand, or None.

    :return: A dictionary indexed by the water id (res. num. + chain id)
    holding the prody pdb structure of that water. It is empty when the
    structure has no waters.
    """
    hw = prody.HierView(initial_pdb.select("protein"))
    water_structs = {}

    # The water oxygens and the binding site center do not depend on the
    # chain, so they are computed once.
    waters = initial_pdb.select("name O and water")
    if waters is None:
        return water_structs
    water_coords = waters.getCoords()
    water_resnums = waters.getResnums()
    water_chids = waters.getChids()
    ligand_com = prody.calcCenter(initial_pdb if ligand is None else ligand)
    distance_to_bind_site = numpy.sqrt(
        ((ligand_com - water_coords)**2).sum(axis=1))

    for chain in hw.iterChains():
        if chain.getChid() not in main_chains:
            continue

        # We cannot do a direct selection, instead we iterate and stop at
        # position 50 (or at the last residue of a shorter chain).
        # NOTE(review): position 50 is 0-based, i.e. the 51st residue of the
        # chain, while the text above speaks of residue 50 -- confirm which
        # one is intended.
        reference_residue = None
        for i, residue in enumerate(chain.iterResidues()):
            reference_residue = residue
            if i == 50:
                break
        if reference_residue is None:
            # Chain without residues: nothing to measure against.
            continue

        residue_com = prody.calcCenter(reference_residue)
        distance_to_r50 = numpy.sqrt(
            ((residue_com - water_coords)**2).sum(axis=1))
        distances = distance_to_r50 + distance_to_bind_site

        # argmin gives one scalar index (the first one on ties), so the
        # residue number below is a plain number and not an array.
        closest = int(numpy.argmin(distances))
        water_resnum = water_resnums[closest]
        water_chid = water_chids[closest]
        water_id = "%d:%s" % (water_resnum, water_chid)

        # We use a dict in order to get rid of repeats
        selection_string = "resnum %d and chain %s" % (water_resnum,
                                                       water_chid)
        water_structs[water_id] = initial_pdb.water.select(
            selection_string).copy()

    return water_structs
def align_and_color(self, native, prediction):
    '''
    Aligns native and prediction, then writes energies to the B-factor
    column of the prediction, one value per residue.

    Returns the tuple (self._align_results, self._native, self._prediction).
    '''
    self.align(native, prediction)

    model = hamiltonian.EDENMHamiltonian(self._native.getCoords())
    # The returned total is not used; the call is kept because the energy
    # matrix is read right after it (presumably it is filled by this call --
    # confirm against the hamiltonian module).
    model.evaluate_energy(self._prediction.getCoords())
    summed_energy = numpy.sum(model.get_energy_matrix(), axis=0)

    # NOTE(review): the sums are indexed by residue position, which looks
    # like it assumes one matrix column per residue -- confirm.
    residues = prody.HierView(self._prediction).iterResidues()
    for position, residue in enumerate(residues):
        residue.setBetas(summed_energy[position])

    return self._align_results, self._native, self._prediction
def split_structure(pdb_path):
    """Split a PDB file into a receptor file and one file per ligand residue.

    The receptor is the 'protein or nucleic' selection. Ligands are the
    non-water hetero residues plus ATP/ADP/AMP/GTP/GDP/GMP; a ligand is
    written only when it has at least config.heavy_atom_threshold
    non-hydrogen atoms. The receptor is written only when at least one ligand
    was written. Failures are reported through log() and end the function.

    :param pdb_path: path of the PDB file; the lower-cased part of its base
                     name before the first dot names the outputs.
    :return: None
    """
    pdb_name = os.path.basename(pdb_path).split('.')[0].lower()

    try:
        parsed = prody.parsePDB(pdb_path)
    except Exception as e:
        log('parse_failed.log', '{},{}\n'.format(pdb_name, str(e)))
        return

    # Every nucleotide is selected with the 'resname' keyword ('sesname' is
    # not a selection keyword).
    hetero = parsed.select(
        '(hetero and not water) or resname ATP or resname ADP or resname AMP '
        'or resname GTP or resname GDP or resname GMP')
    receptor = parsed.select('protein or nucleic')
    if receptor is None:
        log("select_failed.log",
            "{},doesn't have receptor.\n".format(pdb_name))
        return
    if hetero is None:
        log("select_failed.log", "{},doesn't have ligand.\n".format(pdb_name))
        return

    # write ligand into file
    ligand_flags = False
    for each in prody.HierView(hetero).iterResidues():
        heavy_atoms = each.select('not hydrogen')
        # An empty selection comes back as None; treat it as zero heavy atoms.
        if (heavy_atoms is None
                or heavy_atoms.numAtoms() < config.heavy_atom_threshold):
            continue
        ligand_flags = True
        ResId = each.getResindex()
        ligand_path = os.path.join(
            config.splited_ligand_folder, pdb_name,
            "{}_{}_ligand.pdb".format(pdb_name, ResId))
        mkdir(os.path.dirname(ligand_path))
        prody.writePDB(ligand_path, each)

    # if have valid ligand, write down receptor
    if ligand_flags:
        receptor_path = os.path.join(config.splited_receptor_folder,
                                     pdb_name + '.pdb')
        prody.writePDB(receptor_path, receptor)
    else:
        log(
            "threshold_failed.log",
            "{}, no ligand above threshold {}.\n".format(
                pdb_name, config.heavy_atom_threshold))
def get_protein_sequence(pdb):
    """
    Builds the one-letter-per-residue sequence of a protein.

    Each residue name is lower-cased and looked up in aa_dic_standard, the
    dictionary mapping the 3 letter naming to the 1 letter naming convention.
    Names missing from the dictionary become "X".
    Source:
        - Biskit (http://biskit.pasteur.fr/)

    @param pdb: A prody pdb data structure.

    @return: A string with the sequence of this protein.
    """
    letters = []
    for residue in prody.HierView(pdb).iterResidues():
        three_letter = residue.getResname().lower()
        letters.append(aa_dic_standard.get(three_letter, "X"))
    return "".join(letters)
def choose_main_chains(initial_pdb):
    """
    Picks the (at most) two main protein chains of a structure.

    A structure can carry complexes attached to a chain or duplicated chains
    covering the same space; only two main chains must be left. Chains are
    ranked by sequence length, ties being ordered by chain id, and the two
    highest ranked ones are kept.

    :param initial_pdb: The pdb (prody structure) we want to extract the
    chains.

    :return: An array containing the chain ids of the main chains, in
    ascending rank order.
    """
    protein_view = prody.HierView(initial_pdb.select("protein"))
    ranked = [(len(chain.getSequence()), chain.getChid())
              for chain in protein_view.iterChains()]
    ranked.sort()
    return [chain_id for _, chain_id in ranked[-2:]]
def __init__(self, PDB, filepos=None):
    '''
    Parse a PDB entry and write the whole structure, its protein part and
    its ligands below data/<PDB>/.

    :param PDB: name of PDB; it names the output folder and files and is
                what gets parsed when filepos is None
    :param filepos: path of an available PDB file, parsed instead of PDB
                    when given
    '''
    self.PDBname = PDB
    self.heterodict = {}
    self.ct = 0
    self.sequence = ''

    # filepos is to determine whether we download pdb files from wwPDB
    # or use what we have
    # Using downloaded is better
    try:
        parse = pd.parsePDB(filepos if filepos is not None else PDB)
    except Exception:
        # Exception (not a bare except) so that Ctrl-C and SystemExit still
        # propagate.
        parse = None
    if parse is None:
        logging.warning(
            'PDB {} is ignored due to file-not-found error'.format(PDB))
        return

    if not os.path.exists('data/' + PDB):
        os.mkdir('data/' + PDB)
    pd.writePDB('data/{0}/{0}.pdb'.format(PDB), parse)

    receptor = parse.select('protein')
    if receptor is None:
        # An empty selection comes back as None and cannot be written.
        logging.warning(
            'PDB {} is ignored because it has no protein'.format(PDB))
        return
    receptor_file = 'data/{0}/{0}_hydro_receptor.pdb'.format(PDB)
    pd.writePDB(receptor_file, receptor)
    repair_pdbfile(receptor_file, PDB)

    hetero = parse.select(
        '(hetero and not water) or resname ATP or resname ADP')
    if hetero is None:
        # No ligand candidates: nothing left to extract.
        return
    for pick_one in pd.HierView(hetero).iterResidues():
        # 3 atoms or fewer may be not ok
        if pick_one.numAtoms() <= 3:
            continue
        ResId = str(pick_one.getResindex())
        # Extract this ligand from protein (as input for openbabel)
        filename = 'data/{0}/{0}_{1}_ligand.pdb'.format(PDB, ResId)
        if not os.path.exists(filename):
            pd.writePDB(filename, pick_one)
def downloads(self, item):
    '''
    Download one pdb file and split it into receptor and ligands.

    Nothing is done when <item>.pdb already exists in FLAGS.rowdata_folder.
    Ligands are the non-water hetero residues plus ATP/ADP/AMP/GTP/GDP/GMP
    with more than 10 atoms; the receptor ('protein or nucleic') is written
    only when at least one ligand was written. Problems are reported through
    self.error_log.

    :param item: PDB ID; the address comes from self.get_address(item)
    :return: None
    '''
    download_address = self.get_address(item)
    raw_pdb_path = os.path.join(FLAGS.rowdata_folder, item + '.pdb')
    if os.path.exists(raw_pdb_path):
        # Same text as the former print statement (its comma added the
        # second space); the call form also runs under Python 3.
        print('{}  exists'.format(item))
        return None

    print('download  {}'.format(item))
    # NOTE(review): the command line is assembled as text and run through
    # the shell, so folder and address must not contain shell metacharacters.
    os.system('wget -P {} {}'.format(FLAGS.rowdata_folder, download_address))

    pdbname = item.lower()
    ligand_folder = os.path.join(FLAGS.splited_ligand_folder, pdbname)
    try_create_chain_folder(ligand_folder)

    try:
        parsed = prody.parsePDB(raw_pdb_path)
    except Exception:
        self.error_log('can not parse {}.\n'.format(item))
        return None

    # Every nucleotide is selected with the 'resname' keyword ('sesname' is
    # not a selection keyword).
    hetero = parsed.select(
        '(hetero and not water) or resname ATP or resname ADP or resname AMP '
        'or resname GTP or resname GDP or resname GMP')
    receptor = parsed.select('protein or nucleic')
    if receptor is None:
        self.error_log("{} doesn't have receptor.\n".format(item))
        return None
    if hetero is None:
        self.error_log("{} doesn't have ligand.\n".format(item))
        return None

    ligand_flags = False
    for each in prody.HierView(hetero).iterResidues():
        if each.numAtoms() <= 10:
            continue
        ligand_flags = True
        ResId = each.getResindex()
        ligand_path = os.path.join(
            FLAGS.splited_ligand_folder, pdbname,
            "{}_{}_ligand.pdb".format(pdbname, ResId))
        try_create_chain_parent_folder(ligand_path)
        prody.writePDB(ligand_path, each)

    if ligand_flags:
        receptor_path = os.path.join(FLAGS.splited_receptor_folder,
                                     pdbname + '.pdb')
        prody.writePDB(receptor_path, receptor)
    else:
        self.error_log(
            "{} doesn't convert, not ligand have more than 10 atoms.\n".format(
                item))
def curate_struct(initial_pdb, main_chains, pdb_alignment, parameters):
    """
    Returns the "curated" pdb. A curated pdb has potentially 2 waters around
    residue 50 of each chain, a ligand and two main (symmetric) chains;
    everything else must be deleted. This function will work even in the case
    that the 2 later are not present, which can happen when processing any of
    the "mandatory" structures (those can pass the filters automatically).

    As a side effect the number of protein chains and the ids of the added
    waters are stored in pdb_alignment["pdb"].

    :param initial_pdb: The prody pdb structure we want to extract the chains.
    :param main_chains: Chain ids of the chains to keep.
    :param pdb_alignment: Dictionary updated under its "pdb" key
    ("num_chains" and "waters").
    :param parameters: Dictionary holding "min_ligand_atoms", the minimum
    size a ligand needs in order to be added.

    :return: The "curated" pdb and the ligand (None if no ligand was
    selected; the ligand is returned even when it was too small to be added)
    """
    # Get chain info (without ligand or waters)
    hw = prody.HierView(initial_pdb.select("protein"))
    pdb_alignment["pdb"]["num_chains"] = hw.numChains()

    # Pick main chains
    chain_selection = CurationSelections.PROTEIN_CHAIN_TEMPLATE % (
        " ".join(main_chains))
    curated = initial_pdb.select(chain_selection).copy()

    # Add the ligand (if found), must be part of other chains (not main_chains)
    ligand_struct = initial_pdb.select(CurationSelections.LIGAND_SELECTION)
    if (ligand_struct is not None
            and ligand_struct.numAtoms() >= parameters["min_ligand_atoms"]):
        curated = curated + ligand_struct.copy()

    # Add "important" waters, if found
    water_structs = process_water_structures(initial_pdb, main_chains,
                                             ligand_struct)
    # Keep track of added waters in the alignment file. A real list is stored
    # so the value is the same kind of object under Python 2 and Python 3
    # (where keys() would be a live view of the dictionary).
    pdb_alignment["pdb"]["waters"] = list(water_structs)
    for water in water_structs.values():
        curated = curated + water

    return curated, ligand_struct
def must_be_filtered(cls, pdb, num_chains):
    """
    Checks if the structure does not have exactly num_chains proteic chains.

    :param pdb: A prody pdb data structure.
    :param num_chains: The number of protein chains a structure must have
    in order to be kept.

    :return: True when the number of protein chains differs from num_chains
    (a structure without protein counts as having 0 chains).
    """
    protein = pdb.select("protein")
    if protein is None:
        # An empty selection comes back as None: there are no protein chains.
        return num_chains != 0
    return prody.HierView(protein).numChains() != num_chains
def __init__(self, PDB, filepos=None, OUT=True, **kwargs):
    '''
    Parse a PDB entry, store a repaired copy and its receptor below
    temp_pdb_PREFIX/<PDB>, and bundle the data of its ligands.

    Entries that cannot be parsed, that contain nucleic acids or that have
    no receptor end the constructor early.

    :param PDB: name of PDB
    :param filepos: directory of where PDB file stores; when None the entry
                    is parsed by its name and '<PDB>.pdb.gz' is used as the
                    source file
    :param OUT: if true, splitted files will be output in './data' folder
                (the flag is forwarded to bundle_ligand_data)
    :param kwargs: for further extension; 'BOX' (default 20) and 'Size'
                   (default 1) are read here
    :raises IOError: when the copied and repaired file cannot be parsed
    '''
    self.PDBname = PDB
    self.heterodict = {}
    self.ct = 0
    self.sequence = {}
    self.pure_protein = None
    self.pure_nucleic = None
    # filepos defaults to None, so fall back to the file name that is used
    # for that case further down instead of calling split() on None.
    if filepos is not None:
        self.pdb_filename = filepos.split('/')[-1]
    else:
        self.pdb_filename = PDB + '.pdb.gz'
    self.BOX_range = kwargs.get('BOX', 20)
    self.BOX_size = kwargs.get('Size', 1)

    pdb_store_dir = os.path.join(temp_pdb_PREFIX, PDB)
    if not os.path.exists(pdb_store_dir):
        os.mkdir(pdb_store_dir)

    # filepos is to determine whether we download pdb files from wwPDB
    # or use what we have
    # Using downloaded is better
    # parse header for first time
    try:
        if filepos is not None:
            parse, header = pd.parsePDB(filepos, header=True)
        else:
            parse, header = pd.parsePDB(PDB, header=True)
            filepos = PDB + '.pdb.gz'
    except Exception:
        # Exception (not a bare except) so that Ctrl-C and SystemExit still
        # propagate. The call form of print runs under Python 2 and 3.
        print(filepos)
        logging.warning(
            'PDB {} is ignored due to file-not-found error'.format(PDB))
        return

    # Save resolution
    try:
        self.resolution = header['resolution']
    except Exception:
        self.resolution = 'NA'

    self.pure_protein = parse.select('protein')
    self.pure_nucleic = parse.select('nucleic')
    # dirty way to throw away nucleic one
    if self.pure_nucleic is not None:
        return

    # Copy the file
    stored_pdb = pdb_store_dir + '/{0}.pdb'.format(PDB)
    copy_pdbfile(filepos, stored_pdb, zipped=filepos.split('.')[-1] == 'gz')
    # repair by guess, i think
    repair_pdbfile(stored_pdb, PDB)

    # split files by chain
    try:
        parse = pd.parsePDB(stored_pdb)
    except Exception:
        raise IOError('Cannot parse added H')

    # Generating sequence here
    self.chain_list = []
    for chain in parse.getHierView():
        chid = chain.getChid()
        self.chain_list.append(chid)
        self.sequence[chid] = chain.getSequence()

    # now try to fix the pdb from autodock tools
    hetero = parse.select(
        '(hetero and not water) or resname ATP or resname ADP')
    other = parse.select('protein or nucleic')
    self.receptor = other
    if other is None:
        return
    pd.writePDB(pdb_store_dir + '/{0}_receptor.pdb'.format(PDB), other)

    if hetero is None:
        # An empty selection comes back as None: there is no ligand to bundle.
        return

    # Make vectors for every single hetero parts
    # Their values will be stored in a dict
    for pick_one in pd.HierView(hetero).iterResidues():
        # 3 atoms or fewer may be not ok
        if pick_one.numAtoms() <= 3:
            continue
        self.bundle_ligand_data(pick_one, fake_ligand=False, OUT=OUT)
def downloads(self, item):
    '''
    Download pdb from rcsb and split it into receptor and ligand

    Ligands are the non-water hetero residues plus ATP/ADP with more than
    FLAGS.atom_num_threahold atoms; the receptor ('protein or nucleic') is
    written only when at least one ligand was written. Problems are reported
    through self.error_log.

    :param item: 4 letter PDB ID '3EML'
    :return: None
    '''
    # Download pdb to rowdata_folder
    # NOTE(review): the command line is assembled as text and run through
    # the shell, so item must not contain shell metacharacters.
    download_address = 'https://files.rcsb.org/download/' + item + '.pdb'
    os.system('wget -P {} {}'.format(FLAGS.rowdata_folder, download_address))

    # create folder to store ligand
    pdbname = item.lower()
    ligand_folder = os.path.join(FLAGS.splited_ligand_folder, pdbname)
    if not os.path.exists(ligand_folder):
        os.mkdir(ligand_folder)

    # parse pdb
    try:
        parsed = prody.parsePDB(
            os.path.join(FLAGS.rowdata_folder, item + '.pdb'))
    except Exception:
        # Exception (not a bare except) so that Ctrl-C and SystemExit still
        # propagate.
        self.error_log('can not parse {}.\n'.format(item))
        return None

    # select receptor and ligand
    hetero = parsed.select(
        '(hetero and not water) or resname ATP or resname ADP')
    receptor = parsed.select('protein or nucleic')
    if receptor is None:
        self.error_log("{} doesn't have receptor.\n".format(item))
        return None
    if hetero is None:
        self.error_log("{} doesn't have ligand.\n".format(item))
        return None

    ligand_flags = False
    for each in prody.HierView(hetero).iterResidues():
        if each.numAtoms() <= FLAGS.atom_num_threahold:
            # ignore ligand if atom num is not above the threshold
            continue
        ligand_flags = True
        ResId = each.getResindex()
        # Same folder as the one created above, so it already exists.
        ligand_path = os.path.join(
            ligand_folder, "{}_{}_ligand.pdb".format(pdbname, ResId))
        prody.writePDB(ligand_path, each)

    if ligand_flags:
        receptor_path = os.path.join(FLAGS.splited_receptor_folder,
                                     pdbname + '.pdb')
        prody.writePDB(receptor_path, receptor)
    else:
        # The message is filled in with the entry and the threshold that was
        # really applied.
        self.error_log(
            "{} doesn't convert, no ligand have more than {} atoms.\n".format(
                item, FLAGS.atom_num_threahold))
def get_str(index):
    """Select coordinate set ``index`` of the shared ``structure`` and return it.

    The module-level ``structure`` object itself is modified and returned;
    no copy is made here.
    """
    structure.setACSIndex(index)
    return structure


#Pairs phi/psi lists by residue for a given frame
def phi_psi_pair(phis, psis):
    """Placeholder: ignores both arguments and returns 1."""
    return 1


#Generates lists of phi and psi angles for all frames
with mp.Pool() as pool:
    # Named callables are handed to the pool instead of lambdas: the standard
    # pickler cannot serialize a lambda, a named function or class can be
    # sent to the worker processes by reference.
    frame_list = pool.map(int, frame_list)
    structure_list = pool.map(get_str, frame_list)
    hier_list = pool.map(prd.HierView, structure_list)
    # One residue of chain 'A' per (hierarchy, residue number) pair. A real
    # list can be reused for both angles, a lazy map() would be used up by
    # the first pool.map call.
    res_list = [
        hier.getResidue('A', resnum)
        for hier, resnum in zip(hier_list, core_res)
    ]
    phi_list = pool.map(prd.calcPhi, res_list)
    psi_list = pool.map(prd.calcPsi, res_list)
    phi_list = list(phi_list)
    psi_list = list(psi_list)

#Generates the columns and rows for a dataframe to store all angles
clmns = []
rows = {}
for i in range(len(init_core_res)):
    # Column names are numbered from 1: phi1, psi1, phi2, psi2, ...
    clmns.append(f'phi{i+1}')
    clmns.append(f'psi{i+1}')

#Generates a dataframe and stores all angle values
def downloads(self, item):
    '''
    Download pdb from rcsb and split it into receptor and ligand

    Ligands are the non-water hetero residues plus ATP/ADP with more than
    FLAGS.atom_num_threahold atoms; the receptor ('protein or nucleic') is
    written only when at least one ligand was written. Problems are reported
    through self.error_log.

    :param item: 4 letter PDB ID '3EML'
    :return: None
    '''
    # NOTE(review) 4: name of the function is not informative
    # Download pdb to rowdata_folder
    # NOTE(review): the command line is assembled as text and run through
    # the shell, so folder and address must not contain shell metacharacters.
    download_address = self.get_address(item)
    os.system('wget -P {} {}'.format(FLAGS.rowdata_folder, download_address))

    # create folder to store ligand
    pdbname = item.lower()
    ligand_folder = os.path.join(FLAGS.splited_ligand_folder, pdbname)
    if not os.path.exists(ligand_folder):
        os.mkdir(ligand_folder)

    # parse pdb
    try:
        parsed = prody.parsePDB(
            os.path.join(FLAGS.rowdata_folder, item + '.pdb'))
    except Exception:
        # Exception (not a bare except) so that Ctrl-C and SystemExit still
        # propagate.
        self.error_log('can not parse {}.\n'.format(item))
        return None

    # select receptor and ligand
    hetero = parsed.select(
        '(hetero and not water) or resname ATP or resname ADP')
    receptor = parsed.select('protein or nucleic')
    if receptor is None:
        self.error_log("{} doesn't have receptor.\n".format(item))
        return None
    if hetero is None:
        self.error_log("{} doesn't have ligand.\n".format(item))
        return None
    # NOTE(review) 5: I would create a printable class "statistics"

    ligand_flags = False
    for each in prody.HierView(hetero).iterResidues():
        # NOTE(review) 6: there will be many thresholds, let's organize them
        # together into a class FLAGS
        if each.numAtoms() <= FLAGS.atom_num_threahold:
            # ignore ligand if atom num is not above the threshold
            continue
        ligand_flags = True
        ResId = each.getResindex()
        # Same folder as the one created above, so it already exists.
        ligand_path = os.path.join(
            ligand_folder, "{}_{}_ligand.pdb".format(pdbname, ResId))
        prody.writePDB(ligand_path, each)

    if ligand_flags:
        # NOTE(review) 7: splited receptor folder is a bad name
        receptor_path = os.path.join(FLAGS.splited_receptor_folder,
                                     pdbname + '.pdb')
        prody.writePDB(receptor_path, receptor)
    else:
        # NOTE(review) 8: look at 5, a single class "statistics" would help
        # The message is filled in with the entry and the threshold that was
        # really applied.
        self.error_log(
            "{} doesn't convert, no ligand have more than {} atoms.\n".format(
                item, FLAGS.atom_num_threahold))