def test_single_file(tmpdir, pdb, options):
    """Regression test: run ``propka.run.single`` on a PDB file on disk.

    The PDB path obtained from ``get_paths`` is handed to PROPKA as a plain
    filename string.  The run happens inside ``tmpdir`` and whatever ends up
    in that directory is checked with ``compare_output`` against the second
    path returned by ``get_paths``.
    """
    reference_dir, input_pdb = get_paths(pdb)
    with tmpdir.as_cwd():
        pkrun.single(str(input_pdb), options)
        # Path.cwd() is evaluated inside the context, i.e. it is tmpdir
        results_dir = Path.cwd()
        compare_output(pdb, results_dir, reference_dir)
def test_single_nopka(tmpdir):
    """Check that no pKa file is written when ``write_pka`` is ``False``.

    PROPKA is fed the PDB contents through an in-memory stream and is run
    *inside* ``tmpdir``.  Running in the temporary directory matters for two
    reasons: a stale ``1FTJ-Chain-A.pka`` left in the launch directory can no
    longer make the assertion fail, and any file PROPKA does write stays out
    of the directory the tests were started from.
    """
    pdb = "1FTJ-Chain-A"
    _, pdb_path = get_paths(pdb)
    filename = f"{pdb}.pdb"

    # read the input before changing directory, as the sibling stream-based
    # tests do
    with open(pdb_path, 'r') as handle:
        pdb_text = handle.read()

    with StringIO(pdb_text) as filestream, tmpdir.as_cwd():
        pkrun.single(filename, stream=filestream, write_pka=False)
        # the check must happen while tmpdir is still the working directory
        assert not os.path.isfile(f"{pdb}.pka")
def test_single_extra_files_logwarn(tmpdir, caplog):
    """Check that passing filenames through the option arguments is logged.

    Two ``-f`` options carrying extra filenames are passed to
    ``propka.run.single``; the first captured log record must contain the
    "Ignoring extra filenames" message listing them.
    """
    pdb = "1FTJ-Chain-A"
    options = ('-f foo.pdb bar.pdb', '-f test.pdb test2.pdb')
    _ref_dir, input_pdb = get_paths(pdb)

    with tmpdir.as_cwd():
        pkrun.single(str(input_pdb), options)

    expected = ("Ignoring extra filenames passed: [' foo.pdb bar.pdb', "
                "' test.pdb test2.pdb']")
    first_record = caplog.records[0]
    assert expected in first_record.message
def test_single_filestream(tmpdir, pdb, options):
    """Regression test: feed the PDB to PROPKA through a ``StringIO`` stream.

    The PDB file is read into memory first; ``propka.run.single`` then gets a
    bare ``<pdb>.pdb`` filename plus the stream, runs inside ``tmpdir`` and
    its output is checked with ``compare_output``.
    """
    reference_dir, input_pdb = get_paths(pdb)
    stream_name = f"{pdb}.pdb"

    with open(input_pdb, 'r') as handle:
        pdb_text = handle.read()
    filestream = StringIO(pdb_text)

    with tmpdir.as_cwd():
        pkrun.single(stream_name, options, stream=filestream)
        results_dir = Path.cwd()
        compare_output(pdb, results_dir, reference_dir)

    filestream.close()
def test_single_propka_input(tmpdir):
    """Check that ``--generate-propka-input`` produces a propka_input file.

    PROPKA is run inside ``tmpdir`` on an in-memory copy of the
    ``1FTJ-Chain-A`` PDB and ``1FTJ-Chain-A.propka_input`` must exist
    afterwards.
    """
    pdb = "1FTJ-Chain-A"
    options = ('--generate-propka-input', )
    _ref_dir, input_pdb = get_paths(pdb)
    stream_name = f"{pdb}.pdb"

    with open(input_pdb, 'r') as handle:
        pdb_text = handle.read()
    filestream = StringIO(pdb_text)

    with tmpdir.as_cwd():
        pkrun.single(stream_name, options, stream=filestream)
        expected_output = f"{pdb}.propka_input"
        assert os.path.isfile(expected_output)
def _single_frame(self):
    """Run PROPKA on the current frame and record the resulting pKa values.

    The atom group ``self.ag`` is written into an in-memory named stream
    (named ``self.tmpfile``) which is then passed to ``pk.single``.  On
    success the pKa of every titratable group is appended to ``self._pkas``
    and, the first time only, the residue numbers are stored in
    ``self._columns``.

    If ``pk.single`` raises ``IndexError`` or ``AttributeError`` the frame
    counts as failed: with ``self.skip_failure`` set, a warning is issued and
    the frame number and time are logged on the instance; otherwise a
    ``RuntimeError`` chained to the original error is raised.
    """
    stream = mda.lib.util.NamedStream(StringIO(), self.tmpfile)
    self.ag.write(stream)
    # rewind so that the stream can be read from its beginning
    stream.reset()

    try:
        # TODO: it would be nice to allow for other options, maybe for 3.2?
        molecule = pk.single(stream, optargs=['--quiet'])
    except (IndexError, AttributeError) as err:
        errmsg = "failure on frame: {0}".format(self._ts.frame)
        if self.skip_failure:
            warnings.warn(errmsg)
            self.num_failed_frames += 1
            self.failed_frames_log.append(self._ts.frame)
            self.failed_times.append(self._ts.time)
        else:
            raise_from(RuntimeError(errmsg), err)
    else:
        first_name = molecule.conformation_names[0]
        titratable = molecule.conformations[first_name].get_titratable_groups()

        # one pKa estimate per titratable group
        self._pkas.append([group.pka_value for group in titratable])
        if self._columns is None:
            self._columns = [group.atom.resNumb for group in titratable]
    finally:
        # always release the stream, whether or not PROPKA succeeded
        stream.close(force=True)
def get_pka_dict(pdb_fp):
    """Run PROPKA on a PDB file and return its pKa values as a nested dict.

    Parameters
    ----------
    pdb_fp
        Passed unchanged to ``run.single`` as the PDB input.

    Returns
    -------
    dict
        ``{group: {residue_position: pka}}`` where ``group`` is the third
        column of a pKa row, ``residue_position`` the second column as
        ``int`` and ``pka`` the fourth column as ``float``.  Rows whose first
        column is ``"N+"`` or ``"C-"`` (the termini) are left out.

    Raises
    ------
    AssertionError
        If the same (group, residue position) pair occurs twice.  Raised
        explicitly so that the check also runs under ``python -O``.
    """
    protein = run.single(pdb_fp, write_pka=False)
    pka_string = protein.write_pka().strip()

    # The first three lines are headers; everything after is one row per site.
    rows = pka_string.split("\n")[3:]

    pka_dict = {}
    for row in rows:
        # str.split() with no argument collapses any run of blanks/tabs, which
        # replaces the manual (and non-terminating) whitespace-squeezing loop.
        fields = row.split()
        if not fields:
            continue  # tolerate blank lines inside the table
        if fields[0] in ("N+", "C-"):
            continue  # terminal pKas are not reported

        residue_position = int(fields[1])
        group = fields[2]
        positions = pka_dict.setdefault(group, {})
        if residue_position in positions:
            raise AssertionError(
                "duplicate pKa entry for group {0!r} at residue {1}".format(
                    group, residue_position))
        positions[residue_position] = float(fields[3])

    # NOTE(review): debugging output kept because existing callers may rely on
    # seeing it; consider switching to logging.
    print(pka_dict)
    return pka_dict
def get_propka(universe, sel='protein', start=None, stop=None, step=None,
               skip_failure=False):
    """Get and store pKas for titrateable residues along a trajectory.

    Parameters
    ----------
    universe : :class:`MDAnalysis.Universe`
        Universe to obtain pKas for.
    sel : str, array_like
        Selection string to use for selecting atoms to use from given
        ``universe``. Can also be a numpy array or list of atom indices to
        use.
    start : int
        Frame of trajectory to start from. `None` means start from beginning.
    stop : int
        Frame of trajectory to end at. `None` means end at trajectory end.
    step : int
        Step by which to iterate through trajectory frames. propka is slow,
        so set according to how finely you need resulting timeseries.
    skip_failure : bool
        If set to ``True``, skip frames where PROPKA fails. If ``False``
        raise an exception. The default is ``False``.
        Log file (at level warning) contains information on failed frames.

    Returns
    -------
    pkas : :class:`pandas.DataFrame`
        DataFrame giving estimated pKa value for each residue for each
        trajectory frame. Residue numbers are given as column labels, times
        as row labels. The frame is empty if no frame could be analysed.

    Raises
    ------
    TypeError
        If ``sel`` is neither a string, a list nor a numpy array.
    IndexError, AttributeError
        Re-raised unchanged from PROPKA when a frame fails and
        ``skip_failure`` is ``False``.
    """
    # need AtomGroup to write out for propka
    if isinstance(sel, string_types):
        atomsel = universe.select_atoms(sel)
    elif isinstance(sel, (list, np.ndarray)):
        # np.ndarray is the array *type*; np.array is only a factory function
        # and makes isinstance() raise TypeError
        atomsel = universe.atoms[sel]
    else:
        raise TypeError(
            "sel must be a selection string, list or numpy array, "
            "got {0!r}".format(type(sel)))

    # "filename" for our stream
    # use same name so that propka overwrites
    try:
        newname = os.path.join(os.path.dirname(universe.filename),
                               'current.pdb')
    except TypeError:
        # universe.filename is not a path (e.g. None): use current directory
        newname = os.path.join(os.path.curdir, 'current.pdb')

    # progress logging output (because this is slow...)
    pm = mda.lib.log.ProgressMeter(
        universe.trajectory.n_frames,
        format="{step:5d}/{numsteps} t={time:12.3f} ps "
               "[{percentage:5.1f}%]",
        interval=1)

    times = []
    pkas = []
    columns = None  # residue numbers, filled in by the first good frame
    failed_frames = 0
    failed_frames_log = []

    for ts in universe.trajectory[start:stop:step]:
        pm.echo(ts.frame, time=ts.time)

        # we create a named stream to write the atoms of interest into
        pstream = mda.lib.util.NamedStream(StringIO(), newname)
        atomsel.write(pstream)
        pstream.reset()  # reset for reading

        # we feed the stream to propka, and it reads it as if it were a file
        # on disk
        try:
            mol = pk.single(pstream, optargs=['--quiet'])
        except (IndexError, AttributeError):
            # https://github.com/Becksteinlab/propkatraj/issues/13
            # https://github.com/Becksteinlab/propkatraj/issues/10
            if not skip_failure:
                raise
            err_msg = "{0} (failure {2}): failing frame {1}".format(
                universe.trajectory.filename, ts.frame, failed_frames)
            failed_frames += 1
            failed_frames_log.append(ts.frame)
            logging.warning(err_msg)
            continue
        finally:
            pstream.close(force=True)  # deallocate

        # parse propka data structures to get out what we actually want
        confname = mol.conformation_names[0]
        conformation = mol.conformations[confname]
        groups = conformation.get_titratable_groups()

        # extract pka estimates from each residue
        pkas.append([g.pka_value for g in groups])
        if columns is None:
            columns = [g.atom.resNumb for g in groups]

        # record time
        times.append(ts.time)

    if failed_frames_log:
        # percentage is relative to the frames actually visited, which is
        # what start/stop/step selected, not the full trajectory length
        n_analysed = len(times) + failed_frames
        logging.warning('number of failed frames = {0}'.format(failed_frames))
        logging.warning('percent failure = {0:.3f}%'.format(
            float(failed_frames) / n_analysed * 100))
        logging.warning('failed frames: %r', failed_frames_log)

    # a `pandas.DataFrame` is a good data structure for this data
    # (pd.Index with a float dtype replaces pd.Float64Index, which no longer
    # exists in pandas >= 2.0)
    df = pd.DataFrame(pkas,
                      index=pd.Index(times, dtype='float64', name='time'),
                      columns=columns)

    return df
def evaluate_pkas(self, protein):
    """Run PROPKA (and optionally MSMS) and update the protonation states.

    The protein is relabelled to the "Standard" format and written to
    ``_propka_inp.pdb``; PROPKA is run on that file.  When
    ``self._buried_cutoff == "sas"`` the MSMS tools are run as well.  The
    ``.pka``/``inConstr``/``.area`` files that exist are copied into
    ``save/`` under the current ``self._step``, which is then incremented.
    Titratable residues are read back from the PDB, residues listed in
    ``self._default_protonation_states`` are dropped, and the ``montecarlo``
    helpers decide which residues change protonation.  Those changes are
    merged into ``self._updated_protonation``.

    Parameters
    ----------
    protein
        Object providing ``relabel``, ``write_pdb`` and ``get_residue``.

    Returns
    -------
    list
        ``self._updated_protonation``: a de-duplicated list of lists of the
        form ``[chain, res_num, "protonate" | "deprotonate", ...]``.  A copy
        is also appended to ``self._history``.

    Raises
    ------
    exceptions.Propka_Error
        If ``run.single`` is missing, if running PROPKA fails, or if a
        histidine ends up with a "deprotonate" entry but no "protonate"
        entry.
    exceptions.MSMS_Error
        If building or launching the MSMS commands raises.
    """
    # First we transform protein to Standard
    protein.relabel(format="Standard")
    protein.write_pdb("_propka_inp.pdb")

    # Then we call propka from the import
    if not hasattr(run, "single"):
        logger.error("Propka not properly installed, or wrong version")
        raise exceptions.Propka_Error

    try:
        # only the files PROPKA writes are used; the return value is not
        run.single("_propka_inp.pdb")
    except Exception as err:
        logger.error("Error running propka")
        raise exceptions.Propka_Error from err

    if os.path.isfile("_propka_inp.propka_input"):
        logger.debug("Removing file: _propka_inp.propka_input")
        os.remove("_propka_inp.propka_input")

    logger.info("[propka] ==>> SUCCESS")

    # And run MSMS to generate the SAS if called for
    if self._buried_cutoff == "sas":
        try:
            msms_dir = config["PATHS"]["MSMS_DIR"]
            pdb_to_xyzrn_cmd = msms_dir + 'pdb_to_xyzrn _propka_inp.pdb > _msms_inp.xyzrn'
            msms_cmd = msms_dir + 'msms.x86_64Linux2.2.6.1 -if _msms_inp.xyzrn -af _msms_out'
            for cmd in (pdb_to_xyzrn_cmd, msms_cmd):
                # os.system does not raise when the command fails, so at
                # least report a non-zero exit status
                status = os.system(cmd)
                if status != 0:
                    logger.warning(
                        f"Command exited with status {status}: {cmd}")
        except Exception as err:
            logger.error("Error running MSMS")
            raise exceptions.MSMS_Error from err

    # Now we move onto davids actual script for evaluation of the protons
    # and what not

    # SAVE THE DATA
    if not os.path.isdir("save"):
        os.mkdir("save")

    if os.path.isfile("_propka_inp.pka"):
        shutil.copy("_propka_inp.pka", f"save/{self._step}.pka")

    if os.path.isfile("inConstr") and self._step > 0:
        shutil.copy("inConstr", f"save/{self._step-1}.inConstr")

    if os.path.isfile("_msms_out.area"):
        shutil.copy("_msms_out.area", f"save/{self._step}.area")

    self._step += 1

    with open("_propka_inp.pdb", 'r') as in_pdb:
        # Get all of the titratable residues as a list
        titratable_residues = montecarlo.process_pdb(in_pdb.readlines())

    # REMOVE THE RESIDUES HERE THAT ARE STATIC
    logger.debug("Removing static protonation state residues...")
    static_residues = [
        protein.get_residue(default_state)
        for default_state in self._default_protonation_states
    ]
    static_indices = []
    for i, titratable_residue in enumerate(titratable_residues):
        residue = protein.get_residue(
            [titratable_residue.chain, int(titratable_residue.res_num)])
        # any() records each index once, even when the same residue is listed
        # several times among the static states; recording it repeatedly
        # would pop unrelated neighbours below
        if any(residue == static for static in static_residues):
            static_indices.append(i)

    # delete from the back so the remaining indices stay valid
    for i in reversed(static_indices):
        del titratable_residues[i]

    # Define connections between residues
    montecarlo.define_connections(titratable_residues, PROTON_PARTNER_CUTOFF)

    chains = bool(titratable_residues[0].chain)

    # propka output, titratable residues, and if we have multiple chains...
    calc_pKa_data = montecarlo.calc_pKa_total_pdb("_propka_inp.pka",
                                                  titratable_residues, chains)
    for res in titratable_residues:
        res.assign_pKa(calc_pKa_data)

    solv_data = montecarlo.find_solv_shell("_propka_inp.pka", chains)
    if self._buried_cutoff == "sas":
        msms_data = montecarlo.store_sas_area("_msms_out.area", chains)

    # Shallow copy: a separate list that still points at the same residue
    # objects, so the helper cannot reorder or shrink our own list.
    titr_stack = list(titratable_residues)

    all_networks = montecarlo.define_aa_networks(titr_stack)
    access_data = msms_data if self._buried_cutoff == "sas" else solv_data
    all_networks = montecarlo.find_network_solvent_access(
        all_networks, access_data, self._buried_cutoff, self._partner_dist)

    # Now we do monte carlo
    montecarlo.MC_prot_change(all_networks, self._pH)
    for residue in titratable_residues:
        residue.update_prots()

    protonation_changes = []
    remove = []
    switch_his = []
    for residue in titratable_residues:
        command = residue.change[0]
        if command == "None":
            continue

        if command == "Add":
            protonate = True
        elif command == "Remove":
            protonate = False
        else:
            logger.warning("Unknown residue change command")
            continue

        verb = "protonate" if protonate else "deprotonate"
        change = [residue.chain, int(residue.res_num), verb]

        # Use -1 and -2 to deprotonate the C and N Terminus
        if residue.ter_name == 'N+' and not protonate:
            # we are deprotonating the n-terminus
            change.append(-1)
        elif residue.ter_name == 'C-' and protonate:
            logger.warning("Tried to protonate the c-terminus")
            logger.warning(
                "This has been turned off permanently due to DMD issues")
            continue
        else:
            # The protonate and deprotonate paths differ only in the lookup
            # table and in which histidine nitrogen triggers a switch.
            if protonate:
                standard = constants.PROTONATED_STANDARD
                his_switch_atom = "ND1"
            else:
                standard = constants.DEPROTONATED_STANDARD
                his_switch_atom = "NE2"

            amino_acid = residue.amino_acid.upper()
            if amino_acid not in standard:
                logger.debug(f"Cannot {verb} residue {residue.amino_acid}")
                remove.append(change[:2])
                continue
            if (amino_acid == "HIS"
                    and residue.change_heteroatom[0] == his_switch_atom):
                switch_his.append(change[:2])
                continue

            for index, pairs in enumerate(standard[amino_acid]):
                if residue.change_heteroatom[0] == pairs[0]:
                    change.append(index + 1)
                    break

        # a complete change is [chain, res_num, verb, index]
        if len(change) != 4:
            logger.warning(
                f"Cannot change protonation state of atom {residue.change_heteroatom[0]} in res {residue.amino_acid} {residue.res_num}"
            )
            continue

        protonation_changes.append(change)

    for change in protonation_changes:
        residue = protein.get_residue(change[:2])
        # a histidine that is being deprotonated also gets a plain
        # "protonate" entry
        needs_his_partner = (residue.name.upper() == "HIS"
                             and change[2] != "protonate")
        for i, current in enumerate(self._updated_protonation):
            current_residue = protein.get_residue(current[:2])
            if residue == current_residue:
                # replace the existing entry for this residue
                self._updated_protonation[i] = change
                if needs_his_partner:
                    self._updated_protonation.append(
                        [change[0], change[1], "protonate"])
                break
        else:
            # residue had no entry yet
            if needs_his_partner:
                self._updated_protonation.append(
                    [change[0], change[1], "protonate"])
            self._updated_protonation.append(change)

    # Drop every stored state that belongs to a switched/unsupported residue
    for switch in switch_his + remove:
        switch_res = protein.get_residue(switch[:2])
        stale = [
            i for i, state in enumerate(self._updated_protonation)
            if protein.get_residue(state[:2]) == switch_res
        ]
        for i in reversed(stale):
            del self._updated_protonation[i]

    # Get rid of duplicates; dict.fromkeys keeps first-seen order so the
    # result is deterministic (a set would not be)
    self._updated_protonation = [
        list(item) for item in dict.fromkeys(
            tuple(row) for row in self._updated_protonation)
    ]

    # Every deprotonated histidine must have its "protonate" entry
    for state in self._updated_protonation:
        residue = protein.get_residue(state[:2])
        if residue.name.upper() == "HIS" and state[2] == "deprotonate":
            has_partner = any(
                residue == protein.get_residue(other_state[:2])
                and other_state[2] == "protonate"
                for other_state in self._updated_protonation)
            if not has_partner:
                logger.error(
                    f"Histidine {state[:2]} has a 'deprotonate' entry "
                    "without a matching 'protonate' entry")
                raise exceptions.Propka_Error

    self._history.append(self._updated_protonation.copy())
    return self._updated_protonation
def get_propka(universe, sel='protein', start=None, stop=None, step=None,
               skip_failure=False):
    """Get and store pKas for titrateable residues along trajectory.

    Parameters
    ----------
    universe : :class:`MDAnalysis.Universe`
        Universe to obtain pKas for.
    sel : str, array_like
        Selection string to use for selecting atoms to use from given
        ``universe``. Can also be a numpy array or list of atom indices to
        use.
    start : int
        Frame of trajectory to start from. `None` means start from beginning.
    stop : int
        Frame of trajectory to end at. `None` means end at trajectory end.
    step : int
        Step by which to iterate through trajectory frames. propka is slow,
        so set according to how finely you need resulting timeseries.
    skip_failure : bool
        If set to ``True``, skip frames where PROPKA fails. If ``False``
        raise an exception. The default is ``False``.
        Log file (at level warning) contains information on failed frames.

    Returns
    -------
    pkas : :class:`pandas.DataFrame`
        DataFrame giving estimated pKa value for each residue for each
        trajectory frame. Residue numbers are given as column labels, times
        as row labels. The frame is empty if no frame could be analysed.

    Raises
    ------
    TypeError
        If ``sel`` is neither a string, a list nor a numpy array.
    RuntimeError
        If PROPKA fails on a frame and ``skip_failure`` is ``False``; the
        original error is chained as the cause.

    Notes
    -----
    Currently, temporary :program:`propka` files are written in the same
    directory as the input trajectory file. This will leave a
    ``current.pka`` and ``current.propka_input`` file post-analysis. These
    are the temporary files for the final frame and can be removed. Should
    the trajectory file not have an input directory (e.g. when using
    MDAnalysis' `fetch_mmtf` method), then the files will be written to the
    current directory.

    Known issues:

    1. Due to the current behaviour of the MDAnalysis PDBWriter, non-protein
       atoms are written to PDBs using `ATOM` records instead of `HETATM`.
       This is likely to lead to undefined behaviour in :program:`propka`,
       which will likely expect `HETATM` inputs. We recommend users to only
       pass protein atoms for now. See the following issue for more details:
       https://github.com/Becksteinlab/propkatraj/issues/24
    """
    # need AtomGroup to write out for propka
    if isinstance(sel, string_types):
        atomsel = universe.select_atoms(sel)
    elif isinstance(sel, (list, np.ndarray)):
        atomsel = universe.atoms[sel]
    else:
        # fail here with a clear message rather than with a NameError below
        raise TypeError(
            "sel must be a selection string, list or numpy array, "
            "got {0!r}".format(type(sel)))

    # Issue #23 (keep until the PDBWriter is fixed)
    if len(atomsel.select_atoms('not protein')) > 0:
        wmsg = ("Non protein atoms passed to propka 3.1.\n MDAnalysis' "
                "PDBWriter does not currently write non-standard residues "
                "correctly as HETATM records and this may lead to "
                "incorrect pKa predictions.\n"
                "See https://github.com/Becksteinlab/propkatraj/issues/24 "
                " for more details")
        warnings.warn(wmsg)

    # "filename" for our stream
    # use same name so that propka overwrites
    try:
        newname = os.path.join(os.path.dirname(universe.filename),
                               'current.pdb')
    except TypeError:
        # we have a trajectory without a directory
        newname = os.path.join(os.path.curdir, 'current.pdb')

    # progress logging output (because this is slow...)
    pm = mda.lib.log.ProgressMeter(
        universe.trajectory.n_frames,
        format="{step:5d}/{numsteps} t={time:12.3f} ps "
               "[{percentage:5.1f}%]",
        interval=1)

    times = []
    pkas = []
    columns = None  # residue numbers, filled in by the first good frame
    failed_frames = 0
    failed_frames_log = []

    for ts in universe.trajectory[start:stop:step]:
        pm.echo(ts.frame, time=ts.time)

        # we create a named stream to write the atoms of interest into
        pstream = mda.lib.util.NamedStream(StringIO(), newname)
        atomsel.write(pstream)
        pstream.reset()  # reset for reading

        # we feed the stream to propka, and it reads it as if it were a file
        # on disk
        try:
            mol = pk.single(pstream, optargs=['--quiet'])
        except (IndexError, AttributeError) as err:
            # https://github.com/Becksteinlab/propkatraj/issues/13
            # https://github.com/Becksteinlab/propkatraj/issues/10
            err_msg = "{0} (failure {2}): failing frame {1}".format(
                universe.trajectory.filename, ts.frame, failed_frames)
            if not skip_failure:
                raise_from(RuntimeError(err_msg), err)
            failed_frames += 1
            failed_frames_log.append(ts.frame)
            logging.warning(err_msg)
            continue
        finally:
            pstream.close(force=True)  # deallocate

        # parse propka data structures to get out what we actually want
        confname = mol.conformation_names[0]
        conformation = mol.conformations[confname]
        groups = conformation.get_titratable_groups()

        # extract pka estimates from each residue
        pkas.append([g.pka_value for g in groups])
        if columns is None:
            columns = [g.atom.resNumb for g in groups]

        # record time
        times.append(ts.time)

    if failed_frames_log:
        # percentage is relative to the frames actually visited, which is
        # what start/stop/step selected, not the full trajectory length
        n_analysed = len(times) + failed_frames
        logging.warning('number of failed frames = {0}'.format(failed_frames))
        logging.warning('percent failure = {0:.3f}%'.format(
            float(failed_frames) / n_analysed * 100))
        logging.warning('failed frames: %r', failed_frames_log)

    # a `pandas.DataFrame` is a good data structure for this data
    # (pd.Index with a float dtype replaces pd.Float64Index, which no longer
    # exists in pandas >= 2.0; `columns` stays None when every frame failed,
    # giving an empty frame instead of a NameError)
    df = pd.DataFrame(pkas,
                      index=pd.Index(times, dtype='float64', name='time'),
                      columns=columns)

    return df
def get_propka(universe, sel='protein', start=None, stop=None, step=None):
    """Get and store pKas for titrateable residues along a trajectory.

    Parameters
    ----------
    universe : :class:`MDAnalysis.Universe`
        Universe to obtain pKas for.
    sel : str, array_like
        Selection string to use for selecting atoms to use from given
        ``universe``. Can also be a numpy array or list of atom indices to
        use.
    start : int
        Frame of trajectory to start from. `None` means start from beginning.
    stop : int
        Frame of trajectory to end at. `None` means end at trajectory end.
    step : int
        Step by which to iterate through trajectory frames. propka is slow,
        so set according to how finely you need resulting timeseries.

    Returns
    -------
    pkas : :class:`pandas.DataFrame`
        DataFrame giving estimated pKa value for each residue for each
        trajectory frame. Residue numbers are given as column labels, times
        as row labels. The frame is empty if no frame was visited.

    Raises
    ------
    TypeError
        If ``sel`` is neither a string, a list nor a numpy array.
    """
    # need AtomGroup to write out for propka
    if isinstance(sel, string_types):
        atomsel = universe.select_atoms(sel)
    elif isinstance(sel, (list, np.ndarray)):
        # np.ndarray is the array *type*; np.array is only a factory function
        # and makes isinstance() raise TypeError
        atomsel = universe.atoms[sel]
    else:
        raise TypeError(
            "sel must be a selection string, list or numpy array, "
            "got {0!r}".format(type(sel)))

    # "filename" for our stream
    # use same name so that propka overwrites
    try:
        newname = os.path.join(os.path.dirname(universe.filename),
                               'current.pdb')
    except TypeError:
        # universe.filename is not a path (e.g. None): use current directory
        newname = os.path.join(os.path.curdir, 'current.pdb')

    # progress logging output (because this is slow...)
    pm = mda.lib.log.ProgressMeter(
        universe.trajectory.n_frames,
        format="{step:5d}/{numsteps} t={time:12.3f} ps "
               "[{percentage:5.1f}%]",
        interval=1)

    times = []
    pkas = []
    columns = None  # residue numbers, filled in by the first frame

    for ts in universe.trajectory[start:stop:step]:
        pm.echo(ts.frame, time=ts.time)

        # we create a named stream to write the atoms of interest into
        pstream = mda.lib.util.NamedStream(cStringIO.StringIO(), newname)
        atomsel.write(pstream)
        pstream.reset()  # reset for reading

        # we feed the stream to propka, and it reads it as if it were a file
        # on disk
        try:
            mol = pk.single(pstream, optargs=['--quiet'])
        finally:
            # deallocate even when propka raises
            pstream.close(force=True)

        # parse propka data structures to get out what we actually want
        confname = mol.conformation_names[0]
        conformation = mol.conformations[confname]
        groups = conformation.get_titratable_groups()

        # extract pka estimates from each residue
        pkas.append([g.pka_value for g in groups])
        if columns is None:
            columns = [g.atom.resNumb for g in groups]

        # record time
        times.append(ts.time)

    # a `pandas.DataFrame` is a good data structure for this data
    # (pd.Index with a float dtype replaces pd.Float64Index, which no longer
    # exists in pandas >= 2.0)
    df = pd.DataFrame(pkas,
                      index=pd.Index(times, dtype='float64', name='time'),
                      columns=columns)

    return df