def __init__(self,
            Protein,
            solvent_box,
            MD_program_path='orac',
            number_of_cores_per_node=64,
            number_of_replicas=8):
    """
    Store the inputs needed to prepare an orac HREM run.

    Protein :: the protein object; it must expose get_ligand_list()
    solvent_box :: stored as is in self.solvent_box
    MD_program_path :: string, default 'orac', it is resolved through
    path.absolute_programpath before being stored
    number_of_cores_per_node :: default 64
    number_of_replicas :: default 8, stored in self.replicas
    """

    self.Protein = Protein
    self.solvent_box = solvent_box

    self.MD_program_path = path.absolute_programpath(program=MD_program_path)

    self.number_of_cores_per_node = number_of_cores_per_node

    #name of the orac input file
    self.orac_in_file = "HREM.in"

    #placeholder, flagged as a dummy value by the original author
    self.kind_of_processor = 'skylake'

    #helper built from the protein and its ligands
    ligand_list = self.Protein.get_ligand_list()
    self.orient = orient.Orient(self.Protein, ligand_list)

    #NOTE(review): _get_BATTERIES runs before self.replicas exists,
    #the original order of these two statements is kept on purpose
    self.BATTERIES = self._get_BATTERIES()

    self.replicas = number_of_replicas
def __init__(self, Protein, MD_program_path='orac'):
    """
    Store the protein and the MD program and build the orient helper.

    Protein :: the protein object; it must expose get_ligand_list()
    MD_program_path :: string, default 'orac', stored without any processing
    """

    self.Protein = Protein
    self.MD_program_path = MD_program_path

    #helper built from the protein and its ligands
    ligand_list = self.Protein.get_ligand_list()
    self.orient = orient.Orient(Protein=self.Protein, Ligand=ligand_list)

    #starts empty, it is meant to be filled in later on
    self.template = []
def __init__(self, Protein, solvent_pdb=None, MD_program_path='orac'):
    """
    Protein :: HPC_Drug.structures.protein.Protein instance

    solvent_pdb :: string, it is the pdb file that contains the coordinates
    of a solvent molecule; it is needed if a solvent box has to be added
    around the protein. Default: the "water.pdb" file shipped in HPC_Drug.lib

    MD_program_path :: string, the absolute path to the orac executable;
    the default will look for an executable called orac in the PATH and in
    the working directory (in this order)
    """

    self.Protein = Protein

    #input and output files are created in the current working directory
    self.orac_in_file = os.getcwd() + f"/{self.Protein.protein_id}_orac.in"

    self.solvent_pdb = solvent_pdb
    #if no path is given use the standard water.pdb inside the lib module
    if self.solvent_pdb is None:
        #NOTE(review): the resolved path is used after the context manager
        #exits; this is fine for a package installed on the file system,
        #confirm it if the package can be imported from a zip archive
        with importlib_resources.path('HPC_Drug.lib', 'water.pdb') as _path:
            self.solvent_pdb = str(_path.resolve())

    self.MD_program_path = path.absolute_programpath(program=MD_program_path)

    self.output_pdb_file = os.getcwd() + f"/{self.Protein.protein_id}_orac.pdb"

    #an instance of orient.Orient class
    self.orient = orient.Orient(self.Protein, self.Protein.get_ligand_list())

    self.template = []

    #some values that are needed many times are calculated in the constructor
    #The box sizes (lx, ly, lz)
    #the structure is rotated in its tensor of inertia ref
    self.box = self._create_selfbox()
def __init__(self,
            tpg_file,
            prm_file,
            solvent_box=None,
            MD_program_path="orac",
            chain="A"):
    """
    Prepare the orac input template for a box that contains only solvent.

    tpg_file :: written in the template after READ_TPG_ASCII
    prm_file :: written in the template after READ_PRM_ASCII
    solvent_box :: the pdb file of the solvent box; if None the
    "only_water.pdb" file shipped in HPC_Drug.lib is copied in the current
    working directory and that copy is used
    MD_program_path :: string, default "orac", stored without any processing
    chain :: string, default "A", given to the Protein that wraps the box
    """

    self.tpg_file = tpg_file
    self.prm_file = prm_file

    self.solvent_box = solvent_box
    if self.solvent_box is None:
        #the context variable is not called `path` in order not to hide
        #the module with the same name that other parts of the file use
        with importlib_resources.path('HPC_Drug.lib', 'only_water.pdb') as _path:
            shutil.copy(str(_path.resolve()), os.getcwd())

        self.solvent_box = os.getcwd() + "/" + "only_water.pdb"

    self.MD_program_path = MD_program_path

    self.chain = chain

    self.orac_in_file = os.getcwd() + "/only_water_orac.in"

    self.output_pdb_file = "optimized_only_solvent_box.pdb"

    #the solvent box is wrapped in a Protein instance with a dummy id
    self.Protein = protein.Protein(protein_id="slvt",
                                pdb_file=self.solvent_box,
                                chain=self.chain,
                                file_type="pdb")

    #there are no ligands in a solvent only box
    self.orient = orient.Orient(Protein=self.Protein, Ligand=[])

    self.box = self._create_selfbox()

    #the restart and the plot files share the name of the output pdb
    #without its extension
    output_root = self.output_pdb_file.rsplit('.', 1)[0].strip()

    self.template = [
        "#&T NTHREADS 8 CACHELINE 16",
        "#&T NT-LEVEL1 2 CACHELINE 16",
        "#&T NT-LEVEL2 4 CACHELINE 16",
        "###############################################################",
        "# Minimize Crystallographic structure from PDBank",
        "###############################################################",
        "",
        "! this is a comment",
        #a comma was missing after the next string, so the two comment
        #lines were silently joined in a single element of the list
        "!! two exclamation points: system-dependent section",
        "! one exclamation point: system indipendent section (same for all inputs)",
        "#",
        "# Set MD cell and read pdb coordinates",
        "#",
        "&SETUP",
        self._write_box(),
        f"READ_PDB {self.solvent_box}",
        "&END",
        "#",
        "# reads the force fields",
        "#",
        "&PARAMETERS",
        f" READ_TPG_ASCII {self.tpg_file} ! protein",
        f" READ_PRM_ASCII {self.prm_file} ! protein",
        " WRITE_TPGPRM_BIN only_water.tpgprm",
        " JOIN SOLVENT ! solvent",
        " tip3",
        " END",
        "&END",
        "&SOLVENT",
        f"ADD_UNITS {self._get_number_of_residues()}",
        "&END",
        "&SIMULATION ! simulation parameters (same for all)",
        " MDSIM",
        " TEMPERATURE 280.0 20.0",
        " ISOSTRESS PRESS-EXT 0.1 BARO-MASS 30.0",
        " THERMOS",
        " solute 10.0",
        " solvent 10.0",
        " cofm 10.0",
        " temp_limit 1000.0",
        " END",
        "&END",
        "&INTEGRATOR ! integration parameters (same for all)",
        " TIMESTEP 9.0",
        " MTS_RESPA",
        " step intra 2",
        " step intra 2",
        " step nonbond 2 5.1",
        " step nonbond 5 7.8 reciprocal",
        " step nonbond 1 10.0",
        " test_times OPEN G0.tt 20",
        " very_cold_start 0.1",
        " END",
        "&END",
        "&POTENTIAL !! potential parameters",
        self._write_EWALD_PME(),
        self._write_ADD_STR_COM(),
        " UPDATE 60.0 1.8",
        self._write_LINKED_CELL(),
        " STRETCHING HEAVY",
        " QQ-FUDGE 0.83333",
        " LJ-FUDGE 0.50",
        "&END",
        "&RUN ! run lenght (same for all)",
        " CONTROL 0",
        " PROPERTY 20000.0",
        " REJECT 20000.0",
        " TIME 6000.0",
        " STEER 0.0 30000.0",
        " PRINT 300.0",
        "&END",
        "",
        "#",
        "# write restart file every 60.0 (approximately)",
        "#",
        "&INOUT ! files I/O",
        " RESTART",
        f" write 15000.0 OPEN {output_root}.rst",
        " END",
        f" ASCII 3000.0 OPEN {self.output_pdb_file}",
        f" PLOT STEER_ANALYTIC 500.0 OPEN {output_root}.dat",
        "&END",
    ]
def __init__(self,
            Protein,
            MD_program_path = 'gmx',
            kind_of_processor = 'skylake',
            number_of_cores_per_node = 64,
            use_gpu = 'auto',
            gpus_per_node = 1,
            number_of_replicas = 8,
            batteries = None,
            n_steps=None,
            timestep=None,
            constraints='h-bonds'):
    """
    Collect the settings of a gromacs HREM run and build the mdp template.

    Protein, MD_program_path :: forwarded to the parent constructor
    kind_of_processor, number_of_cores_per_node, gpus_per_node :: stored as given
    use_gpu :: string, one of 'auto' 'cpu' 'gpu' (case and surrounding
    spaces are ignored), anything else raises ValueError
    number_of_replicas :: stored in self.replicas
    batteries :: if None the value comes from self._get_BATTERIES()
    n_steps :: if None it is int(self._get_ns_per_day() * 1.E+3),
    otherwise int(n_steps)
    timestep :: if None 0.002 is used
    constraints :: if None 'h-bonds' is used
    """

    super().__init__(Protein = Protein, MD_program_path = MD_program_path)

    protein_id = self.Protein.protein_id

    #the input directory that will be created
    self.HREM_dir = f"{protein_id}_HREM"

    self.elaborated_top_file = f"{protein_id}_elaborated_topology.top"

    self.mdp_file = f"{protein_id}_HREM.mdp"

    self.output_tpr_file = "HREM.tpr"

    self.kind_of_processor = kind_of_processor
    self.number_of_cores_per_node = number_of_cores_per_node

    #an instance of orient.Orient class
    self.orient = orient.Orient(self.Protein, self.Protein.get_ligand_list())

    #gromacs has various options to use gpu
    #auto (default) will use all the available ones automatically
    #cpu uses no GPU even if available
    #gpu forces the use of GPU (auto would be the safer and more robust choice)
    self.use_gpu = use_gpu.lower().strip()
    valid_gpu_options = ('auto', 'cpu', 'gpu')
    if self.use_gpu not in valid_gpu_options:
        raise ValueError(f"{self.use_gpu} is not a valid gpu option, valid options are auto cpu gpu")

    self.gpus_per_node = gpus_per_node

    #the helper is only called when no value was given
    self.BATTERIES = self._get_BATTERIES() if batteries is None else batteries

    #the replicas for BATTERY
    self.replicas = number_of_replicas

    self.temperature = 298.15

    self.timestep = 0.002 if timestep is None else timestep

    #the helper is only called when no value was given
    self.n_steps = int(self._get_ns_per_day() * 1.E+3) if n_steps is None else int(n_steps)

    self.constraints = 'h-bonds' if constraints is None else constraints

    #every item is one line of the mdp file
    self.template = [
        "; VARIOUS PREPROCESSING OPTIONS",
        "; Preprocessor information: use cpp syntax.",
        "; e.g.: -I/home/joe/doe -I/home/mary/roe",
        "include =",
        "; e.g.: -DPOSRES -DFLEXIBLE (note these variable names are case sensitive)",
        "define =",
        "",
        "; RUN CONTROL PARAMETERS",
        "integrator = md",
        "; Start time and timestep in ps",
        self._write_TIME_TIMESTEP_string(),
        "; For exact run continuation or redoing part of a run",
        "init-step = 0",
        "; Part index is updated automatically on checkpointing (keeps files separate)",
        "simulation-part = 1",
        "; mode for center of mass motion removal",
        "comm-mode = Linear",
        "; number of steps for center of mass motion removal",
        "nstcomm = 100",
        "; group(s) for center of mass motion removal",
        "comm-grps =",
        "",
        "; TEST PARTICLE INSERTION OPTIONS",
        "rtpi = 0.05",
        "",
        "; OUTPUT CONTROL OPTIONS",
        "; Output frequency for coords (x), velocities (v) and forces (f)",
        "nstxout = 100000",
        "nstvout = 100000",
        "nstfout = 100000",
        "; Output frequency for energies to log file and energy file",
        "nstlog = 100000",
        "nstcalcenergy = 100",
        "nstenergy = 100000",
        "; Output frequency and precision for .xtc file",
        "nstxtcout = 80000",
        "xtc-precision = 1000",
        "; This selects the subset of atoms for the .xtc file. You can",
        "; select multiple groups. By default all atoms will be written.",
        "xtc-grps =",
        "; Selection of energy groups",
        "energygrps = System",
        "",
        "; NEIGHBORSEARCHING PARAMETERS",
        "; cut-off scheme (group: using charge groups, Verlet: particle based cut-offs)",
        "; nblist update frequency",
        "cutoff-scheme = Verlet",
        "nstlist = 20",
        "verlet-buffer-tolerance = 0.0001",
        "; ns algorithm (simple or grid)",
        "ns_type = grid",
        "; Periodic boundary conditions: xyz, no, xy",
        "pbc = xyz",
        "periodic-molecules = no",
        "; Allowed energy drift due to the Verlet buffer in kJ/mol/ps per atom,",
        "; a value of -1 means: use rlist",
        "; nblist cut-off",
        "rlist = 1.0",
        "; long-range cut-off for switched potentials",
        "rlistlong = -1",
        "",
        "; OPTIONS FOR ELECTROSTATICS AND VDW",
        "; Method for doing electrostatics",
        "coulombtype = PME",
        "rcoulomb-switch = 0",
        "rcoulomb = 1.0",
        "; Relative dielectric constant for the medium and the reaction field",
        "epsilon-r = 1",
        "epsilon-rf = 0",
        "; Method for doing Van der Waals",
        "vdw-type = Cut-off",
        "; cut-off lengths",
        "rvdw-switch = 0",
        "rvdw = 1.0",
        "; Apply long range dispersion corrections for Energy and Pressure",
        "DispCorr = EnerPres",
        "; Extension of the potential lookup tables beyond the cut-off",
        "table-extension = 1",
        "; Separate tables between energy group pairs",
        "energygrp-table =",
        "; Spacing for the PME/PPPM FFT grid",
        "fourierspacing = 0.12",
        "; FFT grid size, when a value is 0 fourierspacing will be used",
        "fourier-nx = 0",
        "fourier-ny = 0",
        "fourier-nz = 0",
        "; EWALD/PME/PPPM parameters",
        "pme-order = 4",
        "ewald-rtol = 1e-06",
        "ewald-geometry = 3d",
        "epsilon-surface =",
        "optimize-fft = no",
        "",
        "; IMPLICIT SOLVENT ALGORITHM",
        "implicit-solvent = No",
        "",
        "; OPTIONS FOR WEAK COUPLING ALGORITHMS",
        "; Temperature coupling",
        "tcoupl = v-rescale",
        "nsttcouple = -1",
        "nh-chain-length = 1",
        "; Groups to couple separately",
        "tc-grps = System",
        "; Time constant (ps) and reference temperature (K)",
        "tau-t = 0.2",
        f"ref-t = {self.temperature}",
        "; pressure coupling",
        "pcoupl = Parrinello-Rahman",
        "pcoupltype = Isotropic",
        "nstpcouple = -1",
        "; Time constant (ps), compressibility (1/bar) and reference P (bar)",
        "tau-p = 1.0",
        "compressibility = 4.6e-5",
        "ref-p = 1",
        "; Scaling of reference coordinates, No, All or COM",
        "refcoord-scaling = COM",
        "",
        "; GENERATE VELOCITIES FOR STARTUP RUN",
        "gen-vel = no",
        "gen-temp = 500",
        "gen-seed = 173529",
        "",
        "; OPTIONS FOR BONDS",
        f"constraints = {self.constraints}",
        "; Type of constraint algorithm",
        "constraint-algorithm = Lincs",
        "; Do not constrain the start configuration",
        "continuation = no",
        "; Use successive overrelaxation to reduce the number of shake iterations",
        "Shake-SOR = no",
        "; Relative tolerance of shake",
        "shake-tol = 0.00001",
        "; Highest order in the expansion of the constraint coupling matrix",
        "lincs-order = 5",
        "; Number of iterations in the final step of LINCS. 1 is fine for",
        "; normal simulations, but use 2 to conserve energy in NVE runs.",
        "; For energy minimization with constraints it should be 4 to 8.",
        "lincs-iter = 2",
        "; Lincs will write a warning to the stderr if in one step a bond",
        "; rotates over more degrees than",
        "lincs-warnangle = 30",
        "; Convert harmonic bonds to morse potentials",
        "morse = no"
    ]
def get_metal_binding_residues_with_no_header(structure,
                                            cutoff = 3.0,
                                            protein_chain = 'A',
                                            protein_model = 0,
                                            COM_distance = 10.0,
                                            metals = important_lists.metals):
    """
    This function gets called by get_metalbinding_disulf_ligands

    This function iterates through the structure many times in order to
    return the metal binding residues through a substitution dictionary

    {residue_id : [residue_name, binding_atom, binding_metal]}

    where residue_id is other_residue.id[1] of the binding residue

    It uses biopython structures

    structure :: a biopython structure of the protein

    cutoff :: double, the maximum distance between a metal atom and the
    nearest atom of a residue for the residue to be considered binding,
    default 3.0 angstrom

    protein_chain :: string, default 'A', if None no chain selection is done
    (a chain id that is not found is silently ignored too)

    protein_model :: integer, default 0, if None no model and no chain
    selection are done (a model id that is not found is silently ignored too)

    COM_distance :: double, default 10.0, only the residues whose center of
    mass is not farther than this from the one of the metal are checked atom
    by atom. It should not be necessary to change it

    metals :: a list (or tuple etc) that contains all the resnames
    (in capital letters) of the metals to look for,
    default HPC_Drug.important_lists.metals
    (Actually the easiest way to personalize metals is to append your custom
    values to this list)

    this function is slow and error prone and should only be used if there
    is no mmCIF with a good header
    """

    orient_object = orient.Orient()

    substitutions_dict = {}

    #selects model and chain if required
    if protein_model is not None:

        try:
            model = structure[protein_model]
        except KeyError:
            model = structure

        # select only the right chain
        if protein_chain is not None:

            try:
                chain = model[protein_chain.strip().upper()]
            except KeyError:
                chain = model

        else:
            chain = model

    else:
        chain = structure

    _chain = copy.deepcopy(chain)

    #get_residues() is used because iterating directly would give chains
    #(or models) and not residues when no chain (or model) was selected
    for residue in _chain.get_residues():

        metal_resname = residue.resname.strip().upper()

        if metal_resname not in metals:
            continue

        for atom in residue:

            #I get a second copy of all the residues in the chain
            tmp_struct = copy.deepcopy(chain)

            #and iterate through them
            for other_residue in tmp_struct.get_residues():

                other_resname = other_residue.resname.strip().upper()

                #I avoid scanning the metal against itself and against the
                #other metals, the custom `metals` are checked too otherwise
                #a metal that is only there would be matched with itself
                if other_resname in metals or other_resname in important_lists.metals:
                    continue

                _, _, distance = orient_object.center_mass_distance(structure_1 = residue, structure_2 = other_residue)

                if distance <= COM_distance:

                    TMP_atom_dist = [1.E+20, 'DUMMY']

                    #check for the nearest atom of the binding residue
                    for other_atom in other_residue:

                        d = (atom.coord[0] - other_atom.coord[0])**2. + (atom.coord[1] - other_atom.coord[1])**2. + (atom.coord[2] - other_atom.coord[2])**2.
                        d = d ** (0.5)

                        if d < TMP_atom_dist[0]:
                            #falls back on the element if the atom has no usable name
                            try:
                                TMP_atom_dist = [d, other_atom.name.upper()]
                            except AttributeError:
                                TMP_atom_dist = [d, other_atom.element.upper()]

                    #checking if the nearest atom is near enough to be part of a binding residue
                    if TMP_atom_dist[0] <= cutoff:
                        #I add the other residue _id to the dictionary keys and give a value
                        substitutions_dict[other_residue.id[1]] = [other_resname, TMP_atom_dist[1], metal_resname]

    return substitutions_dict
def execute(self):
    """
    Run the whole preparation pipeline and return the Protein instance.

    The protein file (PDB or mmCIF, local or obtained from the protein id)
    is repaired and cleaned, the organic ligands are separated from it and
    both the protein and the ligands are rotated with the same matrix and
    written on file

    returns a Protein instance
    """

    #If requested in input will download the pdb file
    #If the given local file doesn't exist raises FileNotFoundError
    #otherwise updates self.protein_filename with the given path
    #all the paths are converted to absolute paths
    self.get_protein_file()

    # creating protein instance
    prot = protein.Protein(protein_id = self.protein_id,
                        pdb_file = self.protein_filename,
                        model = self.model,
                        chain = self.chain,
                        file_type = self.protein_filetype,
                        tpg_file = self.protein_tpg_file,
                        prm_file = self.protein_prm_file)

    #Get Protein.substitutions_dict Protein.sulf_bonds
    #repairs the Protein.pdb_file
    #returns a list containing the resnames and resnumbers of organic ligands
    # [[resname, resnum], [...], ...]
    #if there are none it will be a None item
    info_repair = structural_information_and_repair.InfoRepair(Protein = prot, repairing_method = self.repairing_method)
    prot, ligand_resnames_resnums = info_repair.get_info_and_repair()

    #remove still present disordered atoms (if any)
    prot = remove_disordered_atoms.remove_disordered_atoms(Protein = prot)

    #keeps only the selected model and chain (Protein.model Protein.chain)
    prot = select_model_chain.select_model_chain(Protein = prot)

    #if the protein was a mmCIF it is converted to PDB
    prot = mmcif2pdb.mmcif2pdb(Protein = prot)

    #create the Ligand instances and add them to Protein._ligands
    prot = get_ligands.get_ligands(Protein = prot, ligand_resnames_resnums = ligand_resnames_resnums)

    prot.update_structure(struct_type = "prody")

    selector = prody.ProdySelect(structure = prot.structure)

    #gets the protein's structure from the pdb
    #The only HETATM remaining are the metal ions
    prot.structure = selector.protein_and_ions()

    #Write Protein only pdb
    prot.write(file_name = f"{prot.protein_id}_protein.pdb", struct_type = 'prody')

    #removes the remaining trash ions
    prot = remove_trash_metal_ions.remove_trash_metal_ions(Protein = prot)

    #quick patch, will do it better
    #The structure is put in the reference system of the
    #inertia tensor
    orienter = orient.Orient(Protein = prot)
    _, _, rotation_matrix = orienter.calculate_moment_of_intertia_tensor()
    prot.structure = orienter.base_change_structure()
    prot.write()

    #the ligands are rotated with the same matrix that was used for the protein
    ligands = prot.get_ligand_list()
    for ligand in ligands:
        ligand.update_structure(struct_type = "biopython")
        ligand.structure = orienter.base_change_structure(structure = ligand.structure, rot_matrix = rotation_matrix)
        ligand.write()

    return prot
def get_metal_binding_residues_with_no_header(protein_id=None,
                                            pdb_file=None,
                                            mmcif_file=None,
                                            cutoff=3.0,
                                            substitutions_dict=None,
                                            protein_chain='A',
                                            protein_model=0,
                                            COM_distance=10.0):
    """This function iterates through the structure many times in order to
    return the metal binding residues through a substitution dictionary

    {residue_id : [residue_name, binding_atom, binding_metal]}

    where residue_id is str(other_residue.id[1]) of the binding residue

    it can be used both for pdb files and mmcif files (give the path to the
    file as a string to pdb_file or to mmcif_file, exactly one of the two,
    otherwise a ValueError is raised)

    protein_id :: string, the id given to the parsed structure, if None the
    first three characters of the given file name are used

    cutoff :: double, the maximum distance between a metal atom and the
    nearest atom of a residue for the residue to be considered binding,
    default 3.0 angstrom

    substitutions_dict :: if you already have a substitution dictionary and
    you want to update it give it as input, it is updated in place and
    returned. With the default (None) a new dictionary is created at any call

    protein_chain :: string default 'A'

    protein_model :: integer default 0

    COM_distance :: double, default 10.0, only the residues whose center of
    mass is not farther than this from the one of the metal are checked atom
    by atom. It should not be necessary to change it

    this function is slow and error prone and should only be used if there
    is no mmCIF with a good header"""

    #a `{}` default would be shared between calls, so the residues found
    #for one protein would also show up in the results of the next one
    if substitutions_dict is None:
        substitutions_dict = {}

    if pdb_file is None and mmcif_file is None:
        raise ValueError(
            "I need a pdb_file or a mmcif_file filename cannot both be None type"
        )

    elif pdb_file is not None and mmcif_file is not None:
        raise ValueError(
            f"You can only pass a pdb_file or a mmcif_file not both\npdb_file = {pdb_file} mmcif_file = {mmcif_file}"
        )

    elif pdb_file is not None:
        structure_file = pdb_file
        p = Bio.PDB.PDBParser()

    else:
        structure_file = mmcif_file
        p = Bio.PDB.MMCIFParser()

    if protein_id is None:
        protein_id = structure_file[0:3]  # Get from filename

    structure = p.get_structure(protein_id, structure_file)

    orient_object = orient.Orient()

    chain_id = protein_chain.strip().upper()

    #select the right model
    model = structure[protein_model]

    # select only the right chain
    chain = model[chain_id]

    for residue in chain:

        metal_resname = residue.resname.strip().upper()

        if metal_resname not in important_lists.metals:
            continue

        for atom in residue:

            #I get a second copy of all the residues in the chain
            #NOTE(review): the file is parsed again for any metal atom,
            #it is kept like this because it cannot be told from here if
            #a single shared copy would be safe
            tmp_struct = p.get_structure(protein_id, structure_file)
            tmp_struct = tmp_struct[protein_model][chain_id]

            #and iterate through them
            for other_residue in tmp_struct.get_residues():

                other_resname = other_residue.resname.strip().upper()

                #I avoid scanning the metal against itself and against the other metals
                if other_resname in important_lists.metals:
                    continue

                _, _, distance = orient_object.center_mass_distance(
                    structure_1=residue, structure_2=other_residue)

                if distance <= COM_distance:

                    TMP_atom_dist = [1.E+20, 'DUMMY']

                    #check for the nearest atom of the binding residue
                    for other_atom in other_residue:

                        d = (atom.coord[0] - other_atom.coord[0])**2. + (
                            atom.coord[1] - other_atom.coord[1])**2. + (
                                atom.coord[2] - other_atom.coord[2])**2.
                        d = d**(0.5)

                        if d < TMP_atom_dist[0]:
                            #falls back on the element if the atom has no usable name
                            try:
                                TMP_atom_dist = [d, other_atom.name.upper()]
                            except AttributeError:
                                TMP_atom_dist = [d, other_atom.element.upper()]

                    #checking if the nearest atom is near enough to be part of a binding residue
                    if TMP_atom_dist[0] <= cutoff:
                        #I add the other residue _id to the dictionary keys and give a value
                        substitutions_dict[str(other_residue.id[1])] = [
                            other_resname, TMP_atom_dist[1], metal_resname
                        ]

    return substitutions_dict