def standarize_mol_by_inchi(mol, neutralize=True):
    """Rebuild a molecule from its (optionally neutralised) InChI.

    Hydrogens are added to ``mol``, an InChI is generated with
    ``FixedH=False`` and ``RecMet=False``, and a new molecule is parsed
    back from that InChI with ``removeHs=False``.

    :param mol: RDKit molecule to standardise.
    :param neutralize: when true, the InChI is passed through
        ``neutralize_inchi`` before the molecule is rebuilt.
    :return: the rebuilt molecule after ``AddHs(..., explicitOnly=True)``.
    """
    hydrogenated = AddHs(mol)
    # generate_inchi yields three values; only the InChI string is used.
    std_inchi, _code, _msg = generate_inchi(hydrogenated, FixedH=False, RecMet=False)
    final_inchi = neutralize_inchi(std_inchi) if neutralize else std_inchi
    rebuilt = MolFromInchi(final_inchi, removeHs=False)
    return AddHs(rebuilt, explicitOnly=True)
def xyz_to_rdmol(nxyz, smiles):
    """Build a molecule from SMILES and attach one conformer from ``nxyz``.

    :param nxyz: sequence of rows; for each row everything after the first
        entry is used as the atom position (presumably rows are
        ``[Z, x, y, z]`` -- TODO confirm against callers).
    :param smiles: SMILES string handed to ``get_mol``.
    :return: the hydrogen-added molecule carrying the new conformer.

    NOTE(review): row ``i`` is assigned to atom index ``i``, so the row
    order is assumed to match the atom order after ``AddHs``.
    """
    molecule = AddHs(get_mol(smiles))
    coords = Conformer(len(nxyz))
    for atom_index, row in enumerate(nxyz):
        coords.SetAtomPosition(atom_index, row[1:])
    molecule.AddConformer(coords)
    return molecule
def generate_conformers(mol,
                        add_hydrogens=True,
                        rmsd_threshold=2.0,  # Arbitrarily selected
                        num_conformers=None,  # None means best guess
                        parallelism=None,
                        forcefield=DEFAULT_FORCEFIELD,
                        log=logging):
    """Embed, optimise and RMSD-filter conformers of ``mol``.

    :param mol: RDKit molecule.
    :param add_hydrogens: add hydrogens with ``AddHs`` before embedding.
    :param rmsd_threshold: used both as the pruning threshold during
        embedding and as the post-optimisation similarity cut-off.
    :param num_conformers: number of conformers to request; ``None`` asks
        ``get_num_confs_for_mol`` for a value.
    :param parallelism: forwarded to ``optimize_conformers``.
    :param forcefield: forwarded to ``optimize_conformers``.
    :param log: object exposing ``info``/``debug`` (defaults to ``logging``).
    :return: ``(mol, selected)`` where ``selected`` is a list of
        ``(conformer_id, energy)`` pairs in ascending energy order.
    """
    if add_hydrogens:
        log.info("Adding implicit hydrogens")
        mol = AddHs(mol)

    if num_conformers is None:
        num_conformers = get_num_confs_for_mol(mol)

    log.info("Attempting to generate {0} conformations with min RMSD of {1:.4f}".format(num_conformers, rmsd_threshold))
    orig_conf_ids = EmbedMultipleConfs(mol,
                                       numConfs=num_conformers,
                                       pruneRmsThresh=rmsd_threshold,
                                       ignoreSmoothingFailures=True)  # Prevents crashes in some situations
    log.info("Generated {0} initial conformations".format(len(orig_conf_ids)))

    log.info("Optimizing and calculating energies using {0}".format(forcefield))
    conf_energy = optimize_conformers(mol,
                                      interfragment=True,
                                      parallelism=parallelism,
                                      forcefield=forcefield)

    # dict.items() exists on both Python 2 and 3; iteritems() is Python 2
    # only and raises AttributeError on Python 3.
    sorted_by_energy = sorted(conf_energy.items(), key=operator.itemgetter(1))

    log.info("Filtering similar conformers")
    selected = []
    min_rmsd, max_rmsd = float('inf'), float('-inf')
    for idx, id_energy in enumerate(sorted_by_energy):
        conf_id, energy = id_energy
        keep = True
        # Compare only against conformers later in the energy ordering;
        # the current conformer is dropped as soon as one of them is
        # within the threshold.
        for comp_id, other_energy in sorted_by_energy[idx+1:]:
            rmsd = AlignMol(mol, mol, prbCid=comp_id, refCid=conf_id)
            if rmsd <= rmsd_threshold:
                mol.RemoveConformer(conf_id)
                keep = False
                break
            else:
                if rmsd < min_rmsd:
                    min_rmsd = rmsd
                if rmsd > max_rmsd:
                    max_rmsd = rmsd
        if keep:
            selected.append(id_energy)

    log.debug("Removed {0} after post-optimization RMSD filtering".format(len(orig_conf_ids) - len(selected)))
    # min/max stay at +/-inf when no pair survived the comparison.
    log.info("RMSD: min={0:.4f} max={1:.4f}".format(min_rmsd, max_rmsd))
    return mol, selected
def embed_etkdg(mol, seed):
    """Return a hydrogen-added copy of ``mol`` embedded with ETKDG.

    :param mol: RDKit molecule.
    :param seed: value assigned to the ETKDG parameters' ``randomSeed``.
    :return: the hydrogen-added molecule. The return code of
        ``EmbedMolecule`` is not inspected, so a failed embedding is not
        reported to the caller.
    """
    from rdkit.Chem import AllChem, AddHs

    with_hydrogens = AddHs(mol)
    etkdg_settings = AllChem.ETKDG()
    etkdg_settings.randomSeed = seed
    AllChem.EmbedMolecule(with_hydrogens, etkdg_settings)
    return with_hydrogens
def rdkit_3d_descirptors(mols, regex="(NPR1)|(NPR2)|(PMI1)|(PMI2)|(PMI3)|(SpherocityIndex)|(InertialShapeFactor)|"
                                     "(Eccentricity)|(Asphericity)"):
    """Embed molecules in 3D and evaluate matching ``Descriptors3D`` functions.

    :param mols: iterable of RDKit molecules
    :param regex: {str} regular expression matched (from the start of the
        name) against the attribute names of ``Descriptors3D``
    :return: {pandas DataFrame} one column per matched descriptor, one row
        per molecule
    """
    # Phase 1: add hydrogens to every molecule, then embed each with ETKDG.
    hydrogenated = [AddHs(entry) for entry in mols]
    for entry in hydrogenated:
        AllChem.EmbedMolecule(entry, AllChem.ETKDG())

    # Phase 2: one column per descriptor whose name matches the pattern.
    pattern = re.compile(regex)
    table = dict()
    for name in Descriptors3D.__dict__.keys():
        if not pattern.match(name):
            continue
        print("\t%s..." % name)
        compute = getattr(Descriptors3D, name)
        progress = ProgressBar()
        table[name] = [compute(entry) for entry in progress(hydrogenated)]
    return pd.DataFrame(table)
def calculate(self):
    """
    Validate the SMILES held by ``self.smiles`` and update its properties.

    The SMILES is built from ``p.config['long_prefix']`` plus
    ``self.smiles.element``. When it parses, hydrogens are added, the
    molecule is embedded and UFF-optimised, and its InChI is stored under
    ``"InChI"``. Any exception is printed and treated as an invalid
    molecule. The good/bad counter in ``p.tree_info`` is incremented under
    ``p.lock_update_data`` and its new value stored under ``p.s_id``.

    :return: RDKit Mol object, or None when parsing/processing failed
    """
    mol = None
    try:
        text = "".join(p.config['long_prefix']) + "".join(self.smiles.element)
        mol = MolFromSmiles(text)
        self.smiles.properties[p.s_valid] = False
        if mol is not None:
            mol = AddHs(mol)
            AllChem.EmbedMolecule(mol)
            AllChem.UFFOptimizeMolecule(mol)
            self.smiles.properties["InChI"] = MolToInchi(mol)
    except Exception as e:
        print("Error rdkit : " + repr(e))
        mol = None

    succeeded = mol is not None
    if succeeded:
        self.smiles.properties[p.s_valid] = True

    # Same bookkeeping for both outcomes; only the counter differs.
    counter_key = p.info_good if succeeded else p.info_bad
    with p.lock_update_data:
        p.tree_info[counter_key] += 1
        self.smiles.properties[p.s_id] = p.tree_info[counter_key]
    return mol
def add_hs(self, inplace=False, add_coords=True, explicit_only=False, only_on_atoms=False):
    """ Return a copy of self with hydrogens added.

    Args:
        inplace (bool):
            Must be false; in-place addition raises.
        add_coords (bool):
            Forwarded to ``AddHs`` as ``addCoords``.
        explicit_only (bool):
            Forwarded to ``AddHs`` as ``explicitOnly``.
        only_on_atoms (iterable<bool>):
            Forwarded to ``AddHs`` as ``onlyOnAtoms``.

    Returns:
        skchem.Mol:
            Result of ``AddHs`` wrapped via ``self.__class__.from_super``.

    Raises:
        NotImplementedError: if ``inplace`` is true.
    """
    if not inplace:
        hydrogenated = AddHs(self, addCoords=add_coords, onlyOnAtoms=only_on_atoms, explicitOnly=explicit_only)
        return self.__class__.from_super(hydrogenated)

    raise NotImplementedError('Inplace addition of Hs is not yet supported.')
def add_hydrogen(cls, mol_in, addCoords=True):
    """Return ``mol_in`` with hydrogens added (``explicitOnly=False``).

    :param mol_in: RDKit Mol
    :param addCoords: forwarded to ``AddHs``; add coordinates to added Hs, bool
    :return mol_out: RDKit Mol
    """
    mol_out = AddHs(mol_in, explicitOnly=False, addCoords=addCoords)
    return mol_out
def generate(self, max_generated_conformers=50, prune_thresh=0.01, maxattempts_per_conformer=5, output=None, threads=1):
    '''
    Generate conformers for ``self.mol`` and store their ids.

    Hydrogens are added to ``self.mol`` first. If the first embedding
    yields no conformers, a second attempt is made with random starting
    coordinates and ten times as many attempts per conformer.

    Note the number max_generated_conformers required is related to the
    number of rotatable bonds.

    :param max_generated_conformers: ``numConfs`` passed to the embedder.
    :param prune_thresh: ``pruneRmsThresh`` passed to the embedder.
    :param maxattempts_per_conformer: ``maxAttempts`` for the first try.
    :param output: optional object with ``write``; when ``None`` progress
        messages are skipped instead of raising ``AttributeError``.
    :param threads: ``numThreads`` passed to the embedder.
    :return: the conformer ids stored in ``self.initial_confs``.
    '''
    def _report(message):
        # ``output`` defaults to None, so writing must be optional.
        if output is not None:
            output.write(message)

    self.mol = AddHs(self.mol, addCoords=True)
    self.initial_confs = EmbedMultipleConfs(
        self.mol,
        numConfs=max_generated_conformers,
        pruneRmsThresh=prune_thresh,
        maxAttempts=maxattempts_per_conformer,
        useRandomCoords=False,
        # Despite what the documentation says -1 is a seed!!
        # It doesn't mean random generation
        numThreads=threads,
        randomSeed=random.randint(1, 10000000))

    if len(self.initial_confs) == 0:
        retry_attempts = 10 * maxattempts_per_conformer
        _report((f"Generated {len(self.initial_confs)} "
                 "initial confs\n"))
        # Report the attempt count that is actually passed to the embedder.
        _report((f"Trying again with {retry_attempts} "
                 "attempts and random coords\n"))
        self.initial_confs = EmbedMultipleConfs(
            self.mol,
            numConfs=max_generated_conformers,
            pruneRmsThresh=prune_thresh,
            useRandomCoords=True,
            maxAttempts=retry_attempts,
            # Despite what the documentation says -1 is a seed!!
            # It doesn't mean random generation
            numThreads=threads,
            randomSeed=random.randint(1, 10000000))

    _report("Generated " + str(len(self.initial_confs)) + " initial confs\n")
    return self.initial_confs
def docksmile(smile, filename):
    '''
    Convert a SMILES string to pdbqt, run AutoDock Vina and return the
    binding energy of its top pose.

    Vina configuration details in config.txt

    :param smile: SMILES string.
    :param filename: base name (no extension) for the files written to /tmp.
    :return: ``(smile, energy)``; ``energy`` is ``np.nan`` when embedding,
        conversion or docking did not produce a result.
    :raises TypeError: if ``smile`` is not a ``str``.
    :raises ValueError: if ``smile`` cannot be parsed.
    '''
    if not isinstance(smile, str):
        raise TypeError('Input is not a class of string')

    m = MolFromSmiles(smile)
    # assert valid smiles
    if m is None:
        raise ValueError(smile, 'is not a valid smile string')

    mh = AddHs(m)
    embed = AllChem.EmbedMolecule(mh, useRandomCoords=False)
    # check if rdkit successfully generates structure
    if embed != 0:
        print('RDkit fails to embed molecule', smile, '; file:%s.pdb' % filename)
        return smile, np.nan

    pdb_path = '/tmp/' + filename + '.pdb'
    pdbqt_path = '/tmp/' + filename + '.pdbqt'

    # generate pdb file; the context manager guarantees the handle is
    # flushed and closed before the converter reads it.
    pdb = MolToPDBBlock(mh, flavor=4)
    with open(pdb_path, 'w') as handle:
        handle.write(pdb)

    # convert pdb to pdbqt
    # NOTE(review): subprocess.run without check=True never raises
    # CalledProcessError, so this handler only fires if that is added.
    try:
        subprocess.run([py_path, lig_path, '-l', pdb_path, '-o', pdbqt_path])
    except subprocess.CalledProcessError as e:
        print(e.output)

    if not os.path.exists(pdbqt_path):
        print("%s does't exist" % (filename + '.pdbqt'))
        return smile, np.nan

    try:
        result = subprocess.run(['sh', './run_spike_open_docking.sh', filename],
                                stdout=subprocess.PIPE)
        result = result.stdout.decode('utf-8')
    except subprocess.CalledProcessError as er:
        print(er.output)
        print(smile, '; file:%s.pdbqt' % filename)
        return smile, np.nan

    # read energy from output: the row whose first column is mode "1"
    # carries the affinity in its second column. Splitting on whitespace
    # avoids depending on the exact column padding of the table.
    energy = np.nan
    for line in result.split('\n'):
        fields = line.split()
        if len(fields) >= 2 and fields[0] == '1':
            try:
                energy = float(fields[1])
            except ValueError:
                # Not a results row after all; keep scanning.
                continue
    return smile, energy
def __init__(self, smiles, forcefield="mmff"):
    '''
    Store the SMILES, parse it into ``self.mol`` and reset result holders.

    :param smiles: SMILES string; kept verbatim in ``self.smiles``.
    :param forcefield: force-field name stored in ``self.forcefield``.
    '''
    # Inputs.
    self.smiles = smiles
    self.forcefield = forcefield
    self.mol = MolFromSmiles(smiles)
    # Results filled in by later steps.
    self.initial_confs = None
    self.conf_energies = []
    self.full_clusters = []
def processline(t, step, line):
    """Run benchmark step ``step`` on ``line`` and add its size to ``lensum``.

    :param t: object whose ``incr()`` is called first; a truthy result
        stops processing immediately.
    :param step: integer selecting the operation to perform.
    :param line: input text (parsed as SMILES for every step except 0).
    :return: 1 when ``t.incr()`` is truthy, otherwise 0.
    :raises ValueError: for a non-zero step that is not implemented.
    """
    global lensum
    if t.incr():
        return 1

    # Step 0 is the baseline: no molecule is parsed at all.
    if step == 0:
        lensum += len(line)
        return 0

    m = MolFromSmiles(line)
    if step == 100:
        gained = len(line)
    elif step == 105:
        gained = len(sha256(line).hexdigest())
    elif step in (110, 120):
        with open(tmpname, 'wb+') as f:
            print(line, file=f)
            if step == 120:
                os.fsync(f.fileno())
        gained = os.stat(tmpname).st_size
    elif step == 210:
        gained = m.GetNumAtoms()
    elif step == 220:
        gained = m.GetNumBonds()
    elif step == 300:
        gained = len(MolToSmiles(m))
    elif step == 400:
        gained = len(MolToMolBlock(m))
    elif step == 420:
        m2 = AddHs(m)
        EmbedMolecule(m2, randomSeed=2020)
        m2 = RemoveHs(m2)
        m2.SetProp("_Name", "test")
        gained = len(MolToMolBlock(m2))
    elif step == 600:
        gained = mol2file(m, 'svg')
    elif step == 610:
        gained = mol2file(m, 'png')
    else:
        raise ValueError("Not implemented step " + str(step))

    lensum += gained
    return 0
def get_max_atom_bond_size(smiles_iterator, explicit_hs=True):
    """ Convenience function to get max_atoms, max_bonds for a set of input
    SMILES.

    :param smiles_iterator: iterable of SMILES strings (wrapped in ``tqdm``).
    :param explicit_hs: add hydrogens before counting atoms and bonds.
    :return: dict with ``max_atoms`` and ``max_bonds``; the bond figure is
        twice the largest bond count seen.
    """
    largest_atoms = 0
    largest_bonds = 0

    for smiles in tqdm(smiles_iterator):
        mol = MolFromSmiles(smiles)
        if explicit_hs:
            mol = AddHs(mol)

        atom_count = len(mol.GetAtoms())
        if atom_count > largest_atoms:
            largest_atoms = atom_count

        bond_count = len(mol.GetBonds())
        if bond_count > largest_bonds:
            largest_bonds = bond_count

    return dict(max_atoms=largest_atoms, max_bonds=largest_bonds * 2)
def rdmols_from_document(document, build_from="inchi", add_hs=True):
    """Convert a document back into nested lists of RDKit molecules.

    This method is a companion of "as_document".

    :param document: a document produced by the "as_mongo_document" method,
        dict; read keys are ``list_stoechiometry`` and either
        ``list_list_inchis`` or ``list_list_smiles``
    :param build_from: the type of depiction to be used to build back the
        rdmols, str in ["inchi", "smiles"]
    :param add_hs: add Hs to RDKit mol object, default is True
    :returns list_list_rdmols, list_stoechiometry: list of list of rdmols,
        and the stoichiometry entry of the document unchanged
    """
    assert build_from in ["inchi", "smiles"]
    assert add_hs in [True, False]

    def _to_rdmol(depiction):
        # Parse one depiction with the selected reader, then optionally
        # add hydrogens.
        if build_from == 'inchi':
            parsed = MolFromInchi(depiction, sanitize=True)
        else:
            parsed = MolFromSmiles(depiction, sanitize=True)
        return AddHs(parsed) if add_hs else parsed

    list_stoechiometry = document['list_stoechiometry']

    if build_from == 'inchi':
        nested_depictions = document['list_list_inchis']
    elif build_from == 'smiles':
        nested_depictions = document['list_list_smiles']
    else:
        raise NotImplementedError()

    list_list_rdmols = list()
    for depictions in nested_depictions:
        list_list_rdmols.append([_to_rdmol(item) for item in depictions])

    return list_list_rdmols, list_stoechiometry
def _predict_rt(self, smiles: str) -> Optional[float]:
    """Predict Retention Time from SMILES string using provided predictor.

    Parameters
    ----------
    smiles : str
        SMILES string of input compound.

    Returns
    -------
    predicted_rt : Optional[float]
        Predicted retention time, None if errors occur during prediction,
        for example if the SMILES cannot be parsed or if certain features
        of the input compound that are required for the prediction cannot
        be calculated.
    """
    mol = MolFromSmiles(smiles)
    # MolFromSmiles signals a parse failure by returning None; honour the
    # documented "None on error" contract instead of passing None on.
    if mol is None:
        return None
    mol = AddHs(mol)

    fp = self.fp_calculator(mol)
    # Transform dict into array of values (fingerprint)
    # NOTE(review): when rt_important_features is empty, fp is left as
    # returned by fp_calculator and fp[0] below indexes it directly --
    # confirm that path is intended.
    if self.rt_important_features:
        fp = np.array(
            [fp[feature] for feature in self.rt_important_features]
        ).reshape(1, -1)

    def validate_np_val(val: float) -> bool:
        """Make sure value is numeric, not NaN, and not infinity.

        Parameters
        ----------
        val : float
            Value to check.

        Returns
        -------
        bool
            True if input value is a finite float, False otherwise.
        """
        return bool(
            isinstance(val, float) and not np.isnan(val) and not np.isinf(val)
        )

    if not all(validate_np_val(val) for val in fp[0]):
        return None
    return self.rt_predictor.predict(fp)[0]
def smiles_reaction_matrix(smarts, *sources, **kwargs):
    """Enumerate reaction products and yield those passing MW/logP filters.

    :param smarts: reaction SMARTS passed to ``ReactionFromSmarts``.
    :param sources: inputs for ``load_smiles_file``, one per reactant slot.
    :param kwargs: optional ``sep`` (field separator, default ``' '``),
        ``molValue`` (maximum molecular weight, default 400) and
        ``logValue`` (exclusive logP upper bound, default 4.0).
    :return: generator of text lines
        ``smiles<sep>product_id<sep>mwt<sep>logp`` terminated by a newline,
        where ``product_id`` joins the reactants' ``_Name`` values with '.'.
    """
    sep = kwargs.setdefault('sep', ' ')
    molValue = int(kwargs.get('molValue', 400))
    logValue = float(kwargs.get('logValue', 4.0))

    reaction = ReactionFromSmarts(smarts)
    smilesLists = [load_smiles_file(source) for source in sources]
    products = reaction_matrix(reaction, *smilesLists)

    for reactants, product in products:
        cids = [r.GetProp("_Name") for r in reactants]
        product_id = '.'.join(cids)
        for mol in product:
            smiles = MolToSmiles(mol, isomericSmiles=True)
            mol.UpdatePropertyCache(strict=False)
            # The previous AddHs(mol, addCoords=True) result was never
            # used, so the call has been dropped.
            mwt = MolWt(mol)
            if mwt > molValue:
                continue
            logp = MolLogP(mol)
            if logp < logValue:
                yield sep.join((smiles, product_id, str(mwt), str(logp))) + "\n"
def construct_feature_matrices(self, smiles, train=True):
    """ Build atom/bond class arrays and directed connectivity for a SMILES.

    Both tokenizers have their ``train`` flag set to ``train`` first. Every
    bond is recorded once per endpoint, so the bond arrays hold two
    entries per bond. A molecule without bonds gets a single all-zero bond
    row and a warning is logged.

    Returns
    dict with entries
    'n_atom'        : number of atoms in the molecule
    'n_bond'        : number of bonds in the molecule (undoubled)
    'bond_indices'  : per directed edge, the index of the underlying bond
    'atom'          : (n_atom,) length list of atom classes
    'bond'          : per directed edge, the bond class
    'connectivity'  : per directed edge, (source atom, target atom)

    When ``train`` is true, ``self.max_atoms`` / ``self.max_bonds`` are
    raised to this molecule's counts if those are larger.
    """
    self.atom_tokenizer.train = train
    self.bond_tokenizer.train = train

    logger = logging.getLogger(__name__)

    mol = MolFromSmiles(smiles)
    if self.explicit_hs:
        mol = AddHs(mol)

    n_atom = mol.GetNumAtoms()
    real_bond_count = mol.GetNumBonds()

    # Two directed edges per bond; an isolated atom gets one placeholder row.
    n_edges = 2 * real_bond_count
    if n_edges == 0:
        n_edges = 1
        logger.warning(f'Found molecule {smiles} with zero bonds')

    atom_feature_matrix = np.zeros(n_atom, dtype='int')
    bond_feature_matrix = np.zeros(n_edges, dtype='int')
    bond_indices = np.zeros(n_edges, dtype='int')
    connectivity = np.zeros((n_edges, 2), dtype='int')

    edge = 0
    for row, atom in enumerate(mol.GetAtoms()):
        # Atom class
        atom_feature_matrix[row] = self.atom_tokenizer(
            self.atom_features(atom))

        origin = atom.GetIdx()
        for bond in atom.GetBonds():
            begin = bond.GetBeginAtomIdx()
            end = bond.GetEndAtomIdx()
            # The bond is "flipped" when this atom is its end atom.
            flipped = begin != origin

            bond_feature_matrix[edge] = self.bond_tokenizer(
                self.bond_features(bond, flipped=flipped))
            bond_indices[edge] = bond.GetIdx()

            # Edges always run from the current atom to its neighbour.
            source, target = (end, begin) if flipped else (begin, end)
            connectivity[edge, 0] = source
            connectivity[edge, 1] = target
            edge += 1

    # Track the largest atom and bond counts seen during training.
    if train and n_atom > self.max_atoms:
        self.max_atoms = n_atom
    if train and real_bond_count > self.max_bonds:
        self.max_bonds = real_bond_count

    return {
        'n_atom': n_atom,
        'n_bond': real_bond_count,  # the real number of bonds
        'bond_indices': bond_indices,
        'atom': atom_feature_matrix,
        'bond': bond_feature_matrix,
        'connectivity': connectivity,
    }
#!/usr/bin/python2 # Little harness for timing how long it takes to embed a molecule # which seems extremely variable on one machine, from __future__ import print_function, division import sys, time, os from rdkit.Chem import MolFromSmiles, AddHs, RemoveHs from rdkit.Chem.AllChem import EmbedMolecule if __name__ == "__main__": dotimestamp = int(os.getenv('MOLEMBED_TIME', '0')) doaddh = int(os.getenv('MOLEMBED_ADDH', '0')) rseed = int(os.getenv('MOLEMBED_SEED', '0')) t0 = time.time() for line in sys.stdin.readlines(): s = line.strip() if dotimestamp: t1 = time.time() dt = (t1 - t0) * 1e3 print('%.3f' % dt, s) t0 = t1 else: print(s) m = MolFromSmiles(s) if doaddh: m2 = AddHs(m) else: m2 = m EmbedMolecule(m2, randomSeed=rseed)
def addhs(self, mol):
    """Return ``mol`` with hydrogens added via RDKit's ``AddHs``.

    RDKit is imported inside the method, so it is only required when the
    method is actually called.
    """
    from rdkit.Chem import AddHs

    hydrogenated = AddHs(mol)
    return hydrogenated
def construct_feature_matrices(self, smiles):
    """ Build atom/bond class arrays and directed connectivity for a SMILES.

    Every bond is recorded once per endpoint, so the bond arrays hold two
    entries per bond. A molecule without bonds gets a single all-zero bond
    row (a self-link on atom 0).

    Returns
    dict with entries
    'n_atom'        : number of atoms in the molecule
    'n_bond'        : number of directed edges (2 x bonds, or 1 if none)
    'atom'          : (n_atom,) length list of atom classes
    'bond'          : (n_bond,) list of bond classes
    'connectivity'  : (n_bond, 2) array of source atom, target atom pairs.
    """
    mol = MolFromSmiles(smiles)
    if self.explicit_hs:
        mol = AddHs(mol)

    n_atom = len(mol.GetAtoms())
    # Two directed edges per bond; fall back to one row for a lone atom.
    n_bond = 2 * len(mol.GetBonds()) or 1

    atom_feature_matrix = np.zeros(n_atom, dtype='int')
    bond_feature_matrix = np.zeros(n_bond, dtype='int')
    connectivity = np.zeros((n_bond, 2), dtype='int')

    atom_seq = mol.GetAtoms()
    edge = 0
    for row in range(n_atom):
        atom = atom_seq[row]

        # Atom class
        atom_feature_matrix[row] = self.atom_tokenizer(
            self.atom_features(atom))

        origin = atom.GetIdx()
        for bond in atom.GetBonds():
            begin = bond.GetBeginAtomIdx()
            end = bond.GetEndAtomIdx()
            # The bond is "flipped" when this atom is its end atom.
            flipped = begin != origin

            bond_feature_matrix[edge] = self.bond_tokenizer(
                self.bond_features(bond, flipped=flipped))

            # Edges always run from the current atom to its neighbour.
            source, target = (end, begin) if flipped else (begin, end)
            connectivity[edge, 0] = source
            connectivity[edge, 1] = target
            edge += 1

    return {
        'n_atom': n_atom,
        'n_bond': n_bond,
        'atom': atom_feature_matrix,
        'bond': bond_feature_matrix,
        'connectivity': connectivity,
    }
#! /usr/bin/env python
# Read molecules from the SD file given as the first argument, build a 3D
# structure for each, write it to ligand_<i>.pdb and save an overview
# image of all input molecules to ligands.png.
import sys
from rdkit.Chem import SDMolSupplier, MolToPDBFile, AllChem, AddHs, RemoveHs
from rdkit.Chem.Draw import MolsToGridImage

spl = SDMolSupplier(sys.argv[1])
mols = [m for m in spl]
for i, m in enumerate(mols):
    # Hydrogens are needed for a sensible embedding and MMFF optimisation.
    m = AddHs(m)
    AllChem.EmbedMolecule(m, useBasicKnowledge=True, maxAttempts=100)
    AllChem.MMFFOptimizeMolecule(m)
    # RemoveHs returns a new molecule rather than editing its argument, so
    # the result must be kept for the hydrogens to actually be stripped.
    m = RemoveHs(m)
    MolToPDBFile(m, 'ligand_%d.pdb' % i)

# The grid image is drawn from the molecules as read from the SD file.
img = MolsToGridImage(mols, legends=["ligand_%d" % i for i in range(len(mols))])
img.save('ligands.png')
class ConformerGenerator(object):
    '''
    Generates conformations of molecules from 2D representation.

    Typical order of use, as suggested by the attributes each method
    reads: ``generate`` -> ``minimise`` -> ``cluster`` -> ``recluster``.
    '''

    def __init__(self, smiles, forcefield="mmff"):
        '''
        Initialises the class

        :param smiles: SMILES string; parsed into ``self.mol`` and also
            kept verbatim in ``self.smiles``.
        :param forcefield: force-field name; ``minimise`` accepts only
            "mmff" or "uff".
        '''
        self.mol = MolFromSmiles(smiles)
        self.full_clusters = []
        self.forcefield = forcefield
        self.conf_energies = []
        self.initial_confs = None
        self.smiles = smiles

    def generate(self,
                 max_generated_conformers=50,
                 prune_thresh=0.01,
                 maxattempts_per_conformer=5,
                 output=None,
                 threads=1):
        '''
        Generates conformers

        Note the number max_generated_conformers required is related to
        the number of rotatable bonds

        Hydrogens are added to ``self.mol`` first. If the first embedding
        returns no conformers, a second one is run with random coordinates
        and ten times the attempts per conformer.

        :return: the conformer ids, also stored in ``self.initial_confs``.

        NOTE(review): ``output`` defaults to None but ``output.write`` is
        called unconditionally below; callers must pass a writable object.
        '''
        self.mol = AddHs(self.mol, addCoords=True)
        self.initial_confs = EmbedMultipleConfs(
            self.mol,
            numConfs=max_generated_conformers,
            pruneRmsThresh=prune_thresh,
            maxAttempts=maxattempts_per_conformer,
            useRandomCoords=False,
            # Despite what the documentation says -1 is a seed!!
            # It doesn't mean random generation
            numThreads=threads,
            randomSeed=random.randint(1, 10000000))
        if len(self.initial_confs) == 0:
            output.write((f"Generated {len(self.initial_confs)} "
                          "initial confs\n"))
            # NOTE(review): the message reports max_generated_conformers * 10
            # while the call below uses 10 * maxattempts_per_conformer.
            output.write((f"Trying again with {max_generated_conformers * 10} "
                          "attempts and random coords\n"))
            self.initial_confs = EmbedMultipleConfs(
                self.mol,
                numConfs=max_generated_conformers,
                pruneRmsThresh=prune_thresh,
                useRandomCoords=True,
                maxAttempts=10 * maxattempts_per_conformer,
                # Despite what the documentation says -1 is a seed!!
                # It doesn't mean random
                # generation
                numThreads=threads,
                randomSeed=random.randint(1, 10000000))
        output.write("Generated " + str(len(self.initial_confs)) +
                     " initial confs\n")
        return self.initial_confs

    def minimise(self, output=None):
        '''
        Minimises conformers using a force field

        Appends ``(index, energy)`` to ``self.conf_energies`` for each
        conformer and leaves that list sorted by ascending energy.

        :raises ValueError: if ``self.forcefield`` is neither "mmff" nor
            "uff".
        :return: ``self.mol``

        NOTE(review): conformers are addressed by position ``i`` in
        ``self.initial_confs`` (``confId=i``); this presumes conformer ids
        are 0..n-1 -- confirm.
        '''
        if "\\" in self.smiles or "/" in self.smiles:
            output.write(("WARNING: Smiles string contains slashes, "
                          "which specify cis/trans stereochemistry.\n"))
            output.write(("Force-field minimization may change the "
                          "stereochemistry.\n"))
        if self.forcefield != "mmff" and self.forcefield != "uff":
            raise ValueError("Unrecognised force field")
        if self.forcefield == "mmff":
            props = MMFFGetMoleculeProperties(self.mol)
            for i in range(0, len(self.initial_confs)):
                potential = MMFFGetMoleculeForceField(self.mol,
                                                      props,
                                                      confId=i)
                # Fall back to UFF when no MMFF force field is returned.
                if potential is None:
                    output.write("MMFF not available, using UFF\n")
                    potential = UFFGetMoleculeForceField(self.mol, confId=i)
                    assert potential is not None
                output.write(f"Minimising conformer number {i}\n")
                potential.Minimize()
                mmff_energy = potential.CalcEnergy()
                self.conf_energies.append((i, mmff_energy))
        elif self.forcefield == "uff":
            for i in range(0, len(self.initial_confs)):
                potential = UFFGetMoleculeForceField(self.mol, confId=i)
                assert potential is not None
                potential.Minimize()
                uff_energy = potential.CalcEnergy()
                self.conf_energies.append((i, uff_energy))
        self.conf_energies = sorted(self.conf_energies, key=lambda tup: tup[1])
        return self.mol

    def cluster(self,
                rms_tolerance=0.1,
                max_ranked_conformers=10,
                energy_window=5,
                Report_e_tol=10,
                output=None):
        '''
        Removes duplicates after minimization

        Walks ``self.conf_energies`` (ascending energy) and groups
        conformers whose energy difference is <= 1e-3 or whose RMSD on the
        hydrogen-stripped molecule is within ``rms_tolerance``.

        :return: list with the first ``[conformer, energy, rms]`` entry of
            each cluster, limited to ``factormax * max_ranked_conformers``
            clusters.
        '''
        self.counter = 0
        self.factormax = 3
        self.mol_no_h = RemoveHs(self.mol)
        calcs_performed = 0
        self.full_clusters = []
        confs = self.conf_energies[:]
        ignore = []
        ignored = 0
        for i, pair_1 in enumerate(confs):
            # The first (lowest-energy) entry is the reference for the
            # reporting energy window.
            if i == 0:
                index_0, energy_0 = pair_1
            output.write((f"clustering cluster {i} of "
                          f"{len(self.conf_energies)}\n"))
            index_1, energy_1 = pair_1
            if abs(energy_1 - energy_0) > Report_e_tol:
                output.write(("Breaking because hit Report Energy Window, "
                              f"E was {energy_1} kcal/mol "
                              f"and minimum was {energy_0} \n"))
                break
            if i in ignore:
                # NOTE(review): adds i rather than 1, unlike the inner loop
                # below -- looks like a typo; confirm.
                ignored += i
                continue
            self.counter += 1
            if self.counter == self.factormax * max_ranked_conformers:
                output.write('Breaking because hit MaxNConfs \n')
                break
            clustered = [[self.mol.GetConformer(id=index_1), energy_1, 0.00]]
            ignore.append(i)
            for j, pair_2 in enumerate(confs):
                # NOTE(review): entries 0 and 1 are never compared because
                # of this literal 1; possibly meant to be ``j > i`` --
                # confirm.
                if j > 1:
                    index_2, energy_2 = pair_2
                    if j in ignore:
                        ignored += 1
                        continue
                    if abs(energy_1 - energy_2) > energy_window:
                        break
                    if abs(energy_1 - energy_2) <= 1e-3:
                        clustered.append([
                            self.mol.GetConformer(id=index_2), energy_2, 0.00
                        ])
                        ignore.append(j)
                    # NOTE(review): there is no ``continue`` after the
                    # energy match above, so the RMSD test still runs and
                    # the same conformer can be appended twice.
                    rms = GetConformerRMS(self.mol_no_h, index_1, index_2)
                    calcs_performed += 1
                    if rms <= rms_tolerance:
                        clustered.append([
                            self.mol.GetConformer(id=index_2), energy_2, rms
                        ])
                        ignore.append(j)
            self.full_clusters.append(clustered)
        output.write(f"{ignored} ignore passes made\n")
        output.write((f"{calcs_performed} overlays needed out "
                      f"of a possible {len(self.conf_energies) ** 2}\n"))
        ranked_clusters = []
        for i, cluster in enumerate(self.full_clusters):
            if i < self.factormax * max_ranked_conformers:
                ranked_clusters.append(cluster[0])
        return ranked_clusters

    # NOTE(review): ``clustered_confs=[]`` is a mutable default; it is only
    # read here, never mutated.
    def recluster(self,
                  path,
                  rms_tolerance=0.1,
                  max_ranked_conformers=10,
                  energy_window=5,
                  output=None,
                  clustered_confs=[],
                  molecule=None,
                  key=None,
                  fallback_to_align=False):
        '''
        Second duplicate-removal pass over conformers written as
        ``<key>_Conf_<n>.xyz`` files (n is 1-based).

        Pairs within ``energy_window`` are compared with ``obfit_rmsd``
        (optionally falling back to ``align_rmsd``). When the RMSD exceeds
        ``rms_tolerance`` the comparison is repeated against the
        coordinate-negated copy of the second conformer. A conformer whose
        best RMSD is within tolerance has its index recorded in
        ``self.removed`` and its .xyz file deleted. Once
        ``max_ranked_conformers`` conformers have been kept, the files of
        all remaining ones are deleted.

        Each entry of ``clustered_confs`` is indexed as ``[0]`` (conformer
        object) and ``[1]`` (energy). Returns None.
        '''
        self.removed = []
        self.counter = 0
        i = -1
        for conf_a in clustered_confs:
            i += 1
            j = i
            if self.counter == max_ranked_conformers:
                # Enough conformers kept: delete the files of the rest.
                for k in range(i, len(clustered_confs)):
                    if os.path.isfile(key + "_Conf_" + str(k + 1) + ".xyz"):
                        os.remove(key + "_Conf_" + str(k + 1) + ".xyz")
                        output.write("Removed " + key + "_Conf_" +
                                     str(k + 1) + ".xyz\n")
                break
            if i in self.removed:
                continue
            self.counter += 1
            for conf_b in clustered_confs[i + 1:]:
                j += 1
                if conf_b[1] - conf_a[1] > energy_window:
                    break
                if j in self.removed:
                    continue
                try:
                    rms = obfit_rmsd(key + "_Conf_" + str(i + 1),
                                     key + "_Conf_" + str(j + 1),
                                     str(molecule),
                                     path=path)
                except (subprocess.CalledProcessError, ValueError) as e:
                    if fallback_to_align:
                        output.write(
                            'obfit failed, falling back to obabel --align')
                        output.write(f'Exception {e}\n')
                        try:
                            rms = align_rmsd(f"{key}_Conf_{str(i + 1)}",
                                             f"{key}_Conf_{str(j + 1)}", path)
                        except ValueError:
                            continue
                    else:
                        continue
                output.write("Comparing " + str(i + 1) + " " + str(j + 1) +
                             ' RMSD ' + str(rms) + "\n")
                if rms > rms_tolerance:
                    # Retry against the negated coordinates of conf_b
                    # (presumably to catch mirror images -- confirm).
                    pos = _atomic_pos_from_conformer(conf_b[0])
                    elements = _extract_atomic_type(conf_b[0])
                    pos = [[-float(coor[k]) for k in range(3)] for coor in pos]
                    coords = list(zip(elements, pos))
                    filename = os.path.join(
                        path, key + "_Conf_" + str(j + 1) + "_inv.xyz")
                    write_xyz(coords=coords,
                              filename=filename,
                              comment=conf_b[1])
                    # NOTE(review): unlike the first comparison, the calls
                    # below omit ``path`` and the os.remove calls use bare
                    # file names although the inverted file was written
                    # under ``path`` -- confirm the working directory.
                    try:
                        file1 = key + "_Conf_" + str(i + 1)
                        file2 = key + "_Conf_" + str(j + 1) + "_inv"
                        rmsinv = obfit_rmsd(file1, file2, str(molecule))
                    except (subprocess.CalledProcessError, ValueError) as e:
                        if fallback_to_align:
                            output.write(
                                'obfit failed, falling back to obabel --align')
                            output.write(f'Exception {e}\n')
                            try:
                                i_key = f"{key}_Conf_{str(i + 1)}"
                                inv_key = f"{key}_Conf_{str(j + 1)}_inv"
                                rmsinv = align_rmsd(i_key, inv_key)
                            except ValueError:
                                continue
                        else:
                            continue
                    rms = min([rms, rmsinv])
                    os.remove(key + "_Conf_" + str(j + 1) + "_inv.xyz")
                    output.write((f"Comparing {i + 1} {j + 1} "
                                  f"RMSD after checking inversion {rms}\n"))
                if rms <= rms_tolerance:
                    self.removed.append(j)
                    output.write("Removed Conf_" + str(j + 1) + "\n")
                    os.remove(key + "_Conf_" + str(j + 1) + ".xyz")
def process(self):
    """Load the two raw archives, build one data object per sample, then
    collate and save them to ``self.processed_paths[0]``.

    ``self.qm`` selects the ``R_qm``/``D_qm`` arrays instead of
    ``R_mmff``/``D_mmff``. With ``self.sep_heavy_atom`` set, only samples
    whose ``numberHA`` equals ``self.num_heavy_atom`` are processed; they
    come from the first archive when that number is below 14 and from the
    second otherwise.
    """
    def _feed_dict(archive):
        # Tensors passed to _get_ith_data as keyword arguments.
        return {
            'E': torch.as_tensor(archive['E']),
            'N': torch.as_tensor(archive['N']),
            'R': torch.as_tensor(archive['R_qm'] if self.qm else archive['R_mmff']),
            'D': torch.as_tensor(archive['D_qm'] if self.qm else archive['D_mmff']),
            'Q': torch.as_tensor(archive['Q']),
            'Z': torch.as_tensor(archive['Z']),
        }

    data1 = np.load(self.raw_paths[0])
    data2 = np.load(self.raw_paths[1])
    data1_feed_dict = _feed_dict(data1)
    data2_feed_dict = _feed_dict(data2)

    data1_size = data1['E'].shape[0]
    data2_size = data2['E'].shape[0]

    if not self.sep_heavy_atom:
        data_size = data1_size + data2_size
    else:
        in_part1 = (self.num_heavy_atom < 14)
        heavy_atom_data = pd.read_csv(self.raw_paths[2] if in_part1 else self.raw_paths[3])
        num_heavy_atom = torch.as_tensor(heavy_atom_data['numberHA']).long()
        atom_mask = (num_heavy_atom == self.num_heavy_atom)
        atom_mask = atom_mask.view(-1)
        data_dict_used = data1_feed_dict if in_part1 else data2_feed_dict
        for key in data_dict_used.keys():
            data_dict_used[key] = data_dict_used[key][atom_mask]

        # Trick: route every sample through the "first" dictionary so the
        # loop below only touches the filtered data.
        data_size = data_dict_used['E'].shape[0]
        data1_feed_dict = data_dict_used
        # The split point must follow the filtered size; keeping the
        # unfiltered size would send indices beyond it to the second
        # dictionary with a shifted offset.
        data1_size = data_size

    data_array = np.empty(data_size, dtype=Data)
    for i in tqdm(range(data_size)):
        if i < data1_size:
            tmp_data = _get_ith_data(i, **data1_feed_dict)
        else:
            tmp_data = _get_ith_data(i - data1_size, **data2_feed_dict)

        # A fresh placeholder molecule is built per sample, as before;
        # whether pre_transform mutates it is not visible here.
        tmp_data = self.pre_transform(tmp_data, edge_version='cutoff', do_sort_edge=True, cal_efg=False,
                                      cutoff=self.cutoff, boundary_factor=None, use_center=None,
                                      mol=AddHs(MolFromSmiles('C')), cal_3body_term=self.cal_3body_term,
                                      bond_atom_sep=self.bond_atom_sep, record_long_range=self.record_long_range)
        data_array[i] = tmp_data

    data_list = [data_array[i] for i in range(data_size)]

    print('collating...')
    data1, slices = self.collate(data_list)
    print('saving...')
    torch.save((data1, slices), self.processed_paths[0])
def _addHs(mol, explicitOnly=False, addCoords=False):
    """Call ``AddHs`` on ``mol``, forwarding both options by keyword.

    :param mol: molecule passed to ``AddHs``.
    :param explicitOnly: forwarded unchanged.
    :param addCoords: forwarded unchanged.
    :return: whatever ``AddHs`` returns.
    """
    options = {'explicitOnly': explicitOnly, 'addCoords': addCoords}
    return AddHs(mol, **options)
def MolToMol2Block(mol, confId=-1, addHs=False, addCharges=False):
    """Returns a Mol2 string block for a molecule

    ARGUMENTS:

      - mol: the molecule
      - confId: (optional) selects which conformation to output (-1 = default)
                if set to None will return all conformers
      - addHs: (optional) add hydrogens before writing; coordinates are
               generated for them when the default conformer is 3D
      - addCharges: (optional) compute Gasteiger charges before writing

    RETURNS:

      a string with one MOLECULE/ATOM/BOND section per selected conformer
    """

    #
    # References
    # - Format specs http://www.tripos.com/data/support/mol2.pdf
    # - Atom typing http://www.sdsc.edu/CCMS/Packages/cambridge/pluto/atom_types.html
    #

    blocks = []

    # add explicit hydrogens (since mol2 reader requires them)
    if addHs:
        h_coords = mol.GetNumConformers() > 0 and mol.GetConformer(-1).Is3D()
        try:
            mol = AddHs(mol, addCoords=h_coords)
        except RuntimeError:
            mol = AddHs(mol, addCoords=False)

    # ``None`` means "every conformer": collect the ids from the molecule
    # itself (GetNumConformers is an instance method returning a count,
    # not an iterable of ids).
    if confId is None:
        confIds = [conf.GetId() for conf in mol.GetConformers()]
    else:
        confIds = (confId, )

    # compute charges
    if addCharges:
        ComputeGasteigerCharges(mol)

    for confId in confIds:

        # MOLECULE record: name, counts, molecule type and charge type
        # each go on their own line.
        molecule = ("@<TRIPOS>MOLECULE\n"
                    "{}\n"
                    "{} {} 0 0 0\n"
                    "SMALL\n"
                    "GASTEIGER\n\n").format(
            mol.GetProp("_Name") if mol.HasProp("_Name") else "UNK",
            mol.GetNumAtoms(), mol.GetNumBonds())

        # FIXME "USER_CHARGES" could become 'Gasteiger charges'
        # FIXME "SMALL" means small molecule but could become "PROTEIN"

        pos = _get_positions(mol, confId)
        atom_lines = [
            "{:>4} {:>4} {:>13.4f} {:>9.4f} {:>9.4f} {:<5} {} {} {:>7.4f}".
            format(a.GetIdx() + 1,
                   a.GetSymbol(),
                   float(pos[a.GetIdx()][0]),
                   float(pos[a.GetIdx()][1]),
                   float(pos[a.GetIdx()][2]),
                   _sybyl_atom_type(a),
                   1,
                   "UNL",
                   # The stored charge may use a decimal comma.
                   float(a.GetProp('_GasteigerCharge').replace(',', '.'))
                   if a.HasProp('_GasteigerCharge') else 0.0)
            for a in mol.GetAtoms()
        ]
        atom_lines = ["@<TRIPOS>ATOM"] + atom_lines
        atom_lines = "\n".join(atom_lines) + "\n"

        bond_lines = [
            "{:>5} {:>5} {:>5} {:>2}".format(
                bid + 1,
                b.GetBeginAtomIdx() + 1,
                b.GetEndAtomIdx() + 1,
                # aromatic -> "ar", amide -> "am", otherwise the integer order
                "ar" if b.GetBondTypeAsDouble() == 1.5 else
                "am" if _amide_bond(b) else str(int(b.GetBondTypeAsDouble())))
            for bid, (b) in enumerate(mol.GetBonds())
        ]
        bond_lines = ["@<TRIPOS>BOND"] + bond_lines + ["\n"]
        bond_lines = "\n".join(bond_lines)

        block = molecule + atom_lines + bond_lines
        blocks.append(block)
    return "".join(blocks)