def compound_identity(
        query: Molmap,
        target: Optional[Mapping[str, Mol]]) -> Mapping[str, List[str]]:
    """Find, for every query compound, the target compounds identical to it.

    A target is kept for a query only if it is a similarity match at
    threshold 1 for both the "morgan" and the "topological" fingerprint and
    its molecular weight agrees with the query's within a relative
    tolerance of 0.001.

    :param query: mapping of compound name to molecule.
    :param target: mapping of compound name to molecule to compare against;
        when ``None`` the query set is compared against itself.
    :return: mapping of each query name to the list of matching target names.
    """
    self_comparison = target is None
    candidate_names = set((query if self_comparison else target).keys())
    identical = {name: candidate_names.copy() for name in query.keys()}

    # Narrow the candidates with one exact-similarity search per fingerprint.
    for fp_type in ("morgan", "topological"):
        query_arena = make_fingerprint_arena(query, fingerprint_type=fp_type)
        if self_comparison:
            target_arena = None
        else:
            target_arena = make_fingerprint_arena(
                target, fingerprint_type=fp_type)
        hits = find_similarity_matches(query_arena, target_arena, threshold=1)
        for name in identical.keys():
            identical[name] &= set(hits.get(name, {}).keys())

    # Narrow further by molecular weight.
    query_weights = {name: MolWt(mol) for name, mol in query.items()}
    if self_comparison:
        target_weights = query_weights
    else:
        target_weights = {name: MolWt(mol) for name, mol in target.items()}

    for name, candidates in identical.items():
        same_weight = {
            other for other in candidates
            if isclose(query_weights[name], target_weights[other],
                       rel_tol=0.001)
        }
        candidates &= same_weight

    return {name: list(names) for name, names in identical.items()}
def keep_biggest(cls, mol_in):
    """Strip small fragments from compound.

    Returns the compound where only the "biggest" fragment is conserved
    according to (i) the number of non-Hs atoms and if there is tie then
    according to (ii) the molecular weight.

    When the input has a single fragment it is returned as is. Otherwise the
    selected fragment is returned after the properties of the input have
    been copied onto it with ``cls._copy_properties``.

    :param mol_in: RDKit Mol
    :return mol_out: RDKit Mol having only one connected component
    """

    def count_non_hs_atom(mol):
        return sum(1 for atm in mol.GetAtoms() if atm.GetAtomicNum() != 1)

    # Remove "other" molecules
    molfrag = GetMolFrags(mol_in, asMols=True, sanitizeFrags=False)
    mol_out = mol_in
    if len(molfrag) > 1:
        accepted_nbr_atm = 0  # heavy-atom count of the best fragment so far
        # Mass of the best fragment so far. It must be bound before the
        # loop: a first fragment with zero heavy atoms ties on atom count and
        # goes straight to the mass comparison.
        accepted_mass = 0
        for f in molfrag:
            nbr_atm = count_non_hs_atom(f)
            if nbr_atm < accepted_nbr_atm:
                continue
            frag_mass = MolWt(f)
            if nbr_atm > accepted_nbr_atm or frag_mass > accepted_mass:
                accepted_nbr_atm = nbr_atm
                accepted_mass = frag_mass
                mol_out = f  # keep only the biggest fragment
        cls._copy_properties(mol_in, mol_out)  # save the name and stuff
    return mol_out
def calculate_mass_route():
    """Calculate compound molecular mass.
    ---
    post:
      summary: Calculate compound molecular mass.
      requestBody:
        required: true
        content:
          application/json:
            schema: CalculateMassSchema
      responses:
        '200':
          content:
            application/json:
              schema: CalculateMassResultSchema
    """
    payload = CalculateMassSchema().load(request.json)
    molecules, failed = convert_compound_request(payload["compounds"])

    # Best effort: a compound whose weight cannot be computed is reported in
    # "skipped" instead of failing the whole request.
    weights = {}
    for name, mol in molecules.items():
        try:
            weights[name] = MolWt(mol)
        except Exception:
            failed.append(name)

    response = {"mass": weights, "skipped": failed}
    # The value returned by validate() is not inspected here.
    CalculateMassResultSchema().validate(response)
    return response
def evaluate(self, lst_in):
    """
    Evaluate structure alerts on one [SMILES, Name] pair.

    :param lst_in: input list of [SMILES, Name]
    :return: list of [SMILES, Name, status] followed by six descriptor
        values (MolWt, MolLogP, NumHDonors, NumHAcceptors, TPSA,
        CalcNumRotatableBonds). ``status`` is "OK", the description of the
        first alert exceeded, or 'INVALID' (with -999 placeholders) when the
        SMILES cannot be parsed.
    """
    smiles, name = lst_in
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return [smiles, name, 'INVALID'] + [-999] * 6

    descriptor_fns = (MolWt, MolLogP, NumHDonors, NumHAcceptors, TPSA,
                      CalcNumRotatableBonds)
    descriptors = [fn(mol) for fn in descriptor_fns]

    # The first rule whose match count exceeds its limit decides the status.
    status = "OK"
    for patt, max_val, desc in self.rule_list:
        if len(mol.GetSubstructMatches(patt)) > max_val:
            status = desc + " > %d" % (max_val)
            break
    return [smiles, name, status] + descriptors
def __call__(self, smiles: str):
    """Tell whether a SMILES passes every property range and structure alert.

    The descriptors MW, LogP, HBD, HBA and TPSA must each lie inside the
    inclusive ``[low, high]`` range stored under the same key in
    ``self.rule_dict``, and no pattern of ``self.rule_list`` may match more
    often than its allowed maximum.

    :param smiles: SMILES string of the molecule to check
    :return: True when all checks pass; False otherwise, including when the
        SMILES cannot be parsed
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # An unparsable SMILES cannot satisfy any rule.
        return False

    range_checks = (
        ("MW", MolWt),
        ("LogP", MolLogP),
        ("HBD", NumHDonors),
        ("HBA", NumHAcceptors),
        ("TPSA", TPSA),
    )
    for key, descriptor in range_checks:
        bounds = self.rule_dict[key]
        if not (bounds[0] <= descriptor(mol) <= bounds[1]):
            return False

    for patt, max_val, _desc in self.rule_list:
        if len(mol.GetSubstructMatches(patt)) > max_val:
            return False
    return True
def evaluate(self, point: Any) -> float:
    """
    Evaluate a point.

    The point is repeated ``self.batch`` times, decoded to SMILES by
    ``self.generator`` and scored as ``1 - exp(-|target - mean MW|)`` over
    the SMILES whose molecular weight could be computed.

    Args:
        point: point to evaluate.

    Returns:
        evaluation for the given point; 1.0 when no molecular weight could
        be computed.
    """
    latent = torch.tensor([[point]])
    batch_latent = latent.repeat(1, self.batch, 1)
    generated = self.generator.generate_smiles(batch_latent)

    weights = []
    for candidate in generated:
        try:
            weights.append(MolWt(Chem.MolFromSmiles(candidate)))
        except Exception:
            logger.warning("MW calculation failed.")

    if len(weights) == 0:
        return 1.0
    mean_weight = sum(weights) / len(weights)
    return 1.0 - exp(-abs(self.target - mean_weight))
def get_similar_molecules(query_descriptors: list, query_smiles: str,
                          mol_inchi: str, database_binary_path: str,
                          database_smiles_path: str, database_id: int,
                          num_to_keep: int):
    """Celery task that reads database binary files comparing query descriptors

    Runs the external USRCAT reader on the database, then writes one CSV row
    per returned candidate (SMILES, USRCAT score, Morgan/Dice score against
    the query, title with "_1" removed, molecular weight) to
    ``<QUERY_SIMILARS_DIRECTORY>/<mol_inchi>_<database_id>_<num_to_keep>.csv``.

    Args:
        query_descriptors (list): USRCAT descriptors of query; the first 60
            entries are passed to the reader.
        query_smiles (str): SMILES representation of the query molecule.
        mol_inchi (str): Identifier of the query, used in the output file name.
        database_binary_path (str): Binary file path - must be string, not
            path, as Path is non-serialisable.
        database_smiles_path (str): Smiles file path - same as above, must be
            a string. Not read by this function.
        database_id (int): Int representing database ID, used to cache results
            against specific databases
        num_to_keep (int): Number of best candidates requested from the reader.

    Raises:
        ValueError: if ``query_smiles`` cannot be parsed.
    """
    print("Worker running for " + query_smiles)
    mol = Chem.MolFromSmiles(query_smiles)
    if mol is None:
        raise ValueError("Could not parse query SMILES: " + query_smiles)
    # The query fingerprint is the same for every candidate: compute it once.
    query_fingerprint = GetMorganFingerprint(mol, 2)
    query_mol_identifier = mol_inchi + "_" + str(database_id) + "_" + str(
        num_to_keep)

    # CPP program below called for speed of processing
    # Build the command line - arguments must be
    # 0: Executable
    # 1: Binary file location without last .bin extension, so that .bin and
    #    .smi file locations can be derived
    # 2: Number of best to keep
    # 3-62: USRCAT descriptors of query
    binary_stem = database_binary_path
    if binary_stem.endswith(".bin"):
        # Strip only the trailing extension; ".bin" elsewhere in the path
        # (for example in a directory name) must be left alone.
        binary_stem = binary_stem[:-len(".bin")]
    command_line = [
        "/home/ubuntu/similarity_lab/utils/usrcat_binary_reader_similarity_lab",
        binary_stem,
        str(num_to_keep),
    ]
    command_line.extend(str(query_descriptors[i]) for i in range(60))

    process = Popen(command_line, stdout=PIPE)
    output, _ = process.communicate()
    if process.returncode != 0:
        print("Similarity reader exited with code " + str(process.returncode))

    csv_path = Path(app.config['QUERY_SIMILARS_DIRECTORY']) / (
        query_mol_identifier + ".csv")
    with open(csv_path, "w") as outfile:
        outfile.write(
            "Candidate SMILES,USRCAT Score,Morgan Score,eMolecules ID,MW\n")
        for line in output.decode("utf-8").splitlines():
            stripped_line = line.strip()
            if not stripped_line:
                continue  # tolerate blank lines in the reader output
            candidate_smiles, title_comma_score = stripped_line.split(
                " ", maxsplit=1)
            # The score is the last comma-separated field.
            candidate_title, candidate_score = title_comma_score.rsplit(
                ",", 1)
            candidate_score = float(candidate_score)
            candidate_mol = Chem.MolFromSmiles(candidate_smiles)
            if candidate_mol is None:
                continue  # skip candidates that cannot be parsed
            candidate_morgan_score = DiceSimilarity(
                GetMorganFingerprint(candidate_mol, 2), query_fingerprint)
            mw = MolWt(candidate_mol)
            outfile.write(
                f'{candidate_smiles},{round(candidate_score,3)},{round(candidate_morgan_score,3)},{candidate_title.replace("_1","")},{round(mw,3)}\n'
            )
    print("Worker done")
def check_lipinski(mol):
    """Check the donor / acceptor / logP limits for ``mol``.

    The check passes when there are at most 5 H-bond donors, at most 5
    H-bond acceptors and logP is below 5. A passing molecule whose weight is
    at least 450 additionally gets a random 'terminal_fg' functional group
    joined to it.

    :param mol: object exposing ``rdmol`` and ``join``
    :return: tuple ``(passed, False)``; the second element is always False
    """
    fgs = load_functional_groups()
    h_donors = Lipinski.NumHDonors(mol.rdmol)
    h_acceptors = Lipinski.NumHAcceptors(mol.rdmol)
    log_p = MolLogP(mol.rdmol)
    wt = MolWt(mol.rdmol)

    passed = h_donors <= 5 and h_acceptors <= 5 and log_p < 5
    if not passed:
        return False, False
    if wt >= 450:
        mol.join(fgs['terminal_fg'].get_random())
    return True, False
def fragments(self) -> List[Fragment]:
    """Fragments ligand using RDKit.

    The ligand itself is included among the candidates together with every
    product returned by ``Reactor().react``.

    Returns:
        List[Fragment]: Ordered by weight, heaviest first
    """
    reactant = self.rdkit_mol
    candidates = [reactant, *Reactor().react(reactant)]
    by_weight = sorted(candidates, key=lambda m: MolWt(m), reverse=True)
    return [Fragment(self.atomium_mol, mol) for mol in by_weight]
def __call__(self, mol):
    """
    Returns the molecular weight of a SMILES string or a RdKit molecule.

    :param mol: SMILES string or ``rdkit.Chem.rdchem.Mol``
    :return: molecular weight as computed by ``MolWt``
    :raises ValueError: if a SMILES string cannot be parsed
    :raises TypeError: if ``mol`` is neither a string nor an RDKit molecule
    """
    if isinstance(mol, str):
        mol = Chem.MolFromSmiles(mol, sanitize=True)
        if mol is None:
            raise ValueError("Invalid SMILES string.")
    elif not isinstance(mol, rdkit.Chem.rdchem.Mol):
        raise TypeError("Input must be from {str, rdkit.Chem.rdchem.Mol}")
    return MolWt(mol)
def process_by_folder(fd, inpath):
    """Load the ranked designs of one cycle folder and annotate them.

    Reads ``<inpath>/<fd>/ranked_designs.sd`` and sets the properties
    'Cycle', 'MolWeight', 'LogP', 'QED' and 'SAS' on every molecule.

    :param fd: folder name; a leading "cycle_" is removed to get the label
        stored in the 'Cycle' property
    :param inpath: directory that contains the folder ``fd``
    :return: ``(cir_mols, best_mol)`` where ``best_mol`` is the first
        molecule of the ranked file, or ``None`` when the file is missing
    """
    prefix = "cycle_"
    # str.strip() removes a *set of characters* from both ends, so it can
    # also eat characters of the label itself; remove the exact prefix.
    cycle = fd[len(prefix):] if fd.startswith(prefix) else fd
    sd = inpath + '/' + fd + '/ranked_designs.sd'
    if not os.path.exists(sd):
        return None

    cir_mols = [PropertyMol(m) for m in Chem.SDMolSupplier(sd)]
    for m in cir_mols:
        # Calculate properties for each mol
        m.SetProp('Cycle', cycle)
        m.SetProp('MolWeight', str(MolWt(m)))
        m.SetProp('LogP', str(LogP(m)))
        m.SetProp('QED', str(QED(m)))
        m.SetProp('SAS', str(SAS(m)))
    # Select the highest score design in the cycle
    # (the first one in the ranked sd file)
    best_mol = cir_mols[0]
    return cir_mols, best_mol
def drawmol(s):
    """Draw the molecule of SMILES ``s`` with a text caption and save it.

    The caption holds the SMILES, its molecular weight and its TPSA. The
    image is written to ``'xx' + s + '.png'`` in the current directory.

    :param s: SMILES string of the molecule to draw
    """
    bsize = 200  # side of the square area used for the structure
    tsize = 80   # extra height reserved for the caption
    size = (bsize, bsize + tsize)
    m = Chem.MolFromSmiles(s)
    img, canvas, drawer = Draw.MolToImage(m, size=size, returnCanvas=True)
    font = Font(face='sans', size=12)
    pos = bsize / 2, bsize, 0
    canvas.addCanvasText('%s\r\nMolWt: %g\tTPSA: %g' % (s, MolWt(m), TPSA(m)),
                         pos, font)
    # PNG data is binary: the file must be opened in binary mode.
    with open('xx' + s + '.png', 'wb') as f:
        canvas.flush()
        img.save(f)
def smiles_reaction_matrix(smarts, *sources, **kwargs):
    """Yield one text line per reaction product that passes the filters.

    The reaction is built from ``smarts`` and applied to the combinations of
    the molecules loaded from ``sources`` (via ``reaction_matrix``). A
    product is kept when its molecular weight is at most ``molValue`` and
    its logP is below ``logValue``.

    :param smarts: reaction SMARTS
    :param sources: SMILES sources, one per reactant position
    :param kwargs: ``sep`` (field separator, default ' '), ``molValue``
        (maximum molecular weight, converted with ``int``, default 400) and
        ``logValue`` (logP limit, default 4.0)
    :return: generator of lines ``smiles, product_id, weight, logp`` joined
        by ``sep`` and ending with a newline; ``product_id`` is the
        reactants' "_Name" properties joined by '.'
    """
    sep = kwargs.get('sep', ' ')
    molValue = int(kwargs.get('molValue', 400))
    logValue = float(kwargs.get('logValue', 4.0))
    reaction = ReactionFromSmarts(smarts)
    smilesLists = [load_smiles_file(source) for source in sources]
    products = reaction_matrix(reaction, *smilesLists)
    for reactants, product in products:
        product_id = '.'.join(r.GetProp("_Name") for r in reactants)
        for mol in product:
            smiles = MolToSmiles(mol, isomericSmiles=True)
            mol.UpdatePropertyCache(strict=False)
            # The descriptors are computed on `mol` itself; the original
            # also built an explicit-hydrogen copy that was never used.
            mwt = MolWt(mol)
            if mwt > molValue:
                continue
            logp = MolLogP(mol)
            if logp < logValue:
                yield sep.join((smiles, product_id, str(mwt), str(logp))) + "\n"
def cpd_inform(SMILES):
    """
    Get compound information from a SMILES string.

    :param SMILES: SMILES string of the compound
    :return: list of nine floats, in this order: number of C, H, O, N, P, S
        and X as returned by the ``count_*`` helpers, degree of
        unsaturation and molecular weight
    :raises ValueError: if the SMILES string cannot be parsed
    """
    mol = Chem.rdmolfiles.MolFromSmiles(SMILES)
    if mol is None:
        raise ValueError("Could not parse SMILES: %r" % (SMILES,))

    counters = (count_C, count_H, count_O, count_N, count_P, count_S, count_X)
    n_c, n_h, n_o, n_n, n_p, n_s, n_x = (float(count(mol)) for count in counters)
    # Degree of unsaturation: (2*C + 2 + N + P - X - H)/2
    unsaturation = (2 * n_c + 2 + n_n + n_p - n_x - n_h) / 2
    return [n_c, n_h, n_o, n_n, n_p, n_s, n_x, unsaturation, MolWt(mol)]
def get_global_features(mol):
    """Computes global-level features for a molecule.

    Parameters
    ----------
    mol : rdkit mol

    Returns
    -------
    [np.ndarray]
        float32 array holding, in order: molecular weight, TPSA, logP and
        number of H-bond donors
    """
    descriptor_fns = (MolWt, CalcTPSA, MolLogP, NumHDonors)
    features = [fn(mol) for fn in descriptor_fns]
    return np.array(features, dtype=np.float32)
def cal_prop(q, return_dict_prop):
    """Worker loop: compute molecular properties for queued SMILES.

    Items are read from ``q`` until the string 'DONE' is received. Every
    other item is an ``(idx, smi)`` pair; its properties are stored as
    ``return_dict_prop[idx] = [logP, SAS, QED, MW, TPSA]``.

    :param q: queue-like object whose ``get()`` returns the next item
    :param return_dict_prop: mapping that receives the results
    """
    for idx, smi in iter(q.get, 'DONE'):
        mol = Chem.MolFromSmiles(smi)
        log_p = MolLogP(mol)
        sa_score = sascorer.calculateScore(mol)
        qed_score = qed(mol)
        mol_weight = MolWt(mol)
        tpsa = TPSA(mol)
        return_dict_prop[idx] = [log_p, sa_score, qed_score, mol_weight, tpsa]
df["res_stnd_SMILES"] = res_stnd_smi_list  # Salt Removed from Standardized SMILES
# Rows whose standardized SMILES is "" or "-" are masked out
# (presumably failed standardizations - confirm against the producer).
error_mask2 = np.any(
    [df["res_stnd_SMILES"] == "", df["res_stnd_SMILES"] == "-"], axis=0)
df3 = df[np.logical_not(error_mask2)]
# Inspect the duplicates one by one (e.g. through a CSV)
print(df3[df3.duplicated("res_SMILES", False)][["Result", "res_SMILES"]])
# Deduplicate the *filtered* frame (keeps the first row of each group);
# deduplicating `df` would bring the masked rows back. The copy makes the
# column assignment below act on an independent frame.
df3 = df3.drop_duplicates("res_stnd_SMILES").copy()

# 5. Cut by molecular weight
mw_lst = [MolWt(MolFromSmiles(smi)) for smi in df3["res_stnd_SMILES"]]
df3["MW"] = mw_lst
# Fraction of "P" results among the rows below / above the MW window
print(np.mean(df3[df3["MW"] < 40]["Result"] == "P"),
      np.mean(df3[df3["MW"] > 800]["Result"] == "P"))
df3_mw = df3[df3["MW"] > 40]
df3_mw = df3_mw[df3_mw["MW"] < 800]
df3_mw = df3_mw.reset_index(drop=True)

seeds = [1043]  # np.random.randint(1,1e4,10)
for seed in seeds:
    trn, test = split_df(df3_mw, seed)  # df3,df3_mw
    trn.to_csv(f"{data_path}/TG471_train_all_stdn_curated_mw_{seed}.csv")
    test.to_csv(f"{data_path}/TG471_test_all_stdn_curated_mw_{seed}.csv")
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # def str2mol(s): if isinstance(s, str): return Chem.MolFromSmiles(s) else: return s rdkit_funcs = {"QED": lambda x: QED(str2mol(x)), "MOLWT": lambda x: MolWt(str2mol(x)), "SAS": lambda x: calculateScore(str2mol(x)), "LOGP": lambda x: LogP(str2mol(x))} class AttnParams: _params = None def __init__(self): self._params = { "model": None, "data": None, # Data stuff "len_limit": 120, "num_props": 4, "current_epoch": 1, # Training params "epochs": 20,
canvas.addCanvasText('%s\r\nMolWt: %g\tTPSA: %g' % (s, MolWt(m), TPSA(m)), pos, font) with open('xx' + s + '.png', 'w') as f: canvas.flush() img.save(f) if __name__ == '__main__': drawmol('CN1CCC[C@H]1c2cccnc2') drawmol('CC(=O)OC1=CC=CC=C1C(=O)O') drawmol('O1C=C[C@H]([C@H]1O2)c3c2cc(OC)c4c3OC(=O)C5=C4CCC(=O)5') sys.exit(0) # sample code to use new drawing API (older rdkit do not have DrawString) from rdkit.Chem.AllChem import EmbedMolecule assert EmbedMolecule(m) >= 0 x = Draw.rdMolDraw2D.MolDraw2DSVG(200, 250) x.DrawMolecule(m) x.DrawString('Test String', 20, 200) x.FinishDrawing() print(x.GetDrawingText()) # sample code to generate a legend legstr = '' if molname: legstr += molname + '\n' legstr += '%s\nWt=%g LogP=%g TPSA=%g\nHBA=%d HBD=%d RotBond=%d\n' % \ (smiles, MolWt(mol), MolLogP(mol), TPSA(mol), NumHAcceptors(mol), NumHDonors(mol), NumRotatableBonds(mol))
def Girolami(smiles):
    """Estimate the density of a compound with Girolami's method.

    The base density is ``M / (5 * V)`` where ``M`` is the molecular weight
    and ``V`` the sum of per-atom volume contributions chosen by periodic
    row. It is then increased by 10% for each alcohol, acid, amine,
    sulfoxide and non-condensed ring, by 20% for each sulfone and by 7.5%
    for each ring of a condensed system, with the total increase capped
    at 30%.

    :param smiles: SMILES string of the compound
    :return: estimated density (g/cm^3 in Girolami's formulation)
    :raises ValueError: if the SMILES cannot be parsed or the molecule
        contains an element without a defined contribution
    """
    # Get RDKit molecule
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        raise ValueError('Could not parse SMILES: %r' % (smiles,))
    mol = Chem.AddHs(mol)

    # Calculate molecular weight
    M = MolWt(mol)

    # Iterate over all atoms and get group contributions
    group_contributions = 0.0
    for atom in mol.GetAtoms():
        Z = atom.GetAtomicNum()
        if Z == 1:
            group_contributions += 1
        elif 3 <= Z <= 9:
            group_contributions += 2
        elif 11 <= Z <= 17:
            group_contributions += 4
        elif 19 <= Z <= 35:
            group_contributions += 5
        elif 37 <= Z <= 53:
            group_contributions += 7.5
        elif 55 <= Z <= 83:
            group_contributions += 9
        else:
            raise ValueError('The molecule contains atoms for whom contributions are not defined.')

    # Calculate initial density
    rho = M / (5 * group_contributions)

    # Define functional groups for correction
    alcohol = Chem.MolFromSmarts('[OX2H]')
    acid = Chem.MolFromSmarts('[CX3](=O)[OX2H1]')
    amine = Chem.MolFromSmarts('[NX3;H2,H1;!$(NC=O)]')
    sulfoxide = Chem.MolFromSmarts('[$([#16X3]=[OX1]),$([#16X3+][OX1-])]')
    sulfone = Chem.MolFromSmarts('[$([#16X4](=[OX1])=[OX1]),$([#16X4+2]([OX1-])[OX1-])]')

    n_alcohol = len(mol.GetSubstructMatches(alcohol))
    n_acid = len(mol.GetSubstructMatches(acid))
    n_amine = len(mol.GetSubstructMatches(amine))
    n_sulfoxide = len(mol.GetSubstructMatches(sulfoxide))
    n_sulfone = len(mol.GetSubstructMatches(sulfone))

    # Find rings. A ring is condensed when it shares at least two atoms
    # with a *different* ring. Each condensed ring is counted once, so the
    # count can never exceed the number of rings.
    ring_atoms = [set(ring) for ring in Chem.GetSymmSSSR(mol)]
    condensed = set()
    for first, atoms in enumerate(ring_atoms):
        for second in range(first + 1, len(ring_atoms)):
            if len(atoms & ring_atoms[second]) >= 2:
                condensed.add(first)
                condensed.add(second)
    n_condrings = len(condensed)
    n_rings = len(ring_atoms) - n_condrings

    # Add corrections. All terms are non-negative, so capping the total is
    # the same as stopping as soon as a partial sum exceeds the cap.
    correction = (
        0.1 * (n_alcohol + n_acid + n_amine + n_sulfoxide + n_rings)
        + 0.2 * n_sulfone
        + 0.075 * n_condrings
    )
    return (1 + min(correction, 0.3)) * rho
def _calculate_phys_chem_property(self, mol):
    """Return the molecular weight of ``mol`` as computed by ``MolWt``."""
    weight = MolWt(mol)
    return weight
def cal_fp_props(mol):
    """Compute the fingerprint and the property record of one molecule.

    Wrapper function for multiprocessing.

    :param mol: molecule to process
    :return: tuple ``(fp, mol_props)`` where ``fp`` is ``enc.encode_mol(mol)``
        and ``mol_props`` is a ``Props`` built from the SMILES, MolWt, LogP,
        QED and SAS of the molecule
    """
    smiles = MolToSmiles(mol)
    weight = MolWt(mol)
    log_p = LogP(mol)
    qed_score = QED(mol)
    sa_score = SAS(mol)
    mol_props = Props(smiles, weight, log_p, qed_score, sa_score)
    fp = enc.encode_mol(mol)
    return fp, mol_props
def mol_weight_get(smiles):
    """Return the molecular weight for a SMILES string.

    :param smiles: SMILES string to parse
    :return: dict with keys "smiles" and "molWeight", or the tuple
        ``(error message, 500)`` when the SMILES cannot be parsed
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is not None:
        return {"smiles": smiles, "molWeight": MolWt(mol)}
    return ("Could not parse input: " + smiles, 500)
def calculate(self):
    """Return the weight of ``self.mol``.

    ``ExactMolWt`` is used when ``self._exact`` is truthy, ``MolWt``
    otherwise. When ``self._averaged`` is truthy the weight is divided by
    the number of atoms of the molecule.
    """
    weight_fn = ExactMolWt if self._exact else MolWt
    weight = weight_fn(self.mol)
    if not self._averaged:
        return weight
    return weight / self.mol.GetNumAtoms()
def mol_wt(smiles):
    """Get molecular weight (in Daltons) of the molecule given as SMILES."""
    mol = Chem.MolFromSmiles(smiles)
    return MolWt(mol)
# Collect per-molecule descriptors over every dataset, then build one frame
# for plotting. The five lists below stay index-aligned.
mws = []
logps = []
nhdonors = []
values = []
dataset = []
# Every key of LABEL_GUIDE plus the extra "cyp" dataset.
for data in list(LABEL_GUIDE.keys()) + ["cyp"]:
    # NOTE(review): pickle.load can execute arbitrary code; only use it on
    # data files that are trusted.
    with open(os.path.join(DATA_PATH, data, f"data_{data}.pt"), "rb") as handle:
        inchis, v = pickle.load(handle)
    # Presumably `v` holds one value per InChI, which keeps `values` aligned
    # with the per-molecule lists - TODO confirm.
    values.extend(v)
    for inchi in tqdm(inchis):
        mol = MolFromInchi(inchi)
        mws.append(MolWt(mol))
        logps.append(MolLogP(mol))
        nhdonors.append(NumHDonors(mol))
        dataset.append(DATASET_GUIDE[data])
df = pd.DataFrame({
    "Molecular weight (gr./mol)": mws,
    r"aLog$P$": logps,
    "No. hydrogen donors": nhdonors,
    "values": values,
    "dataset": dataset,
})
# One row of three panels.
f, axs = plt.subplots(1, 3, figsize=(18, 6))
axs[0].grid(alpha=0.5)
df = pd.read_hdf(H5_FILE, h5_table) idx = np.loadtxt(good_dirs, dtype=np.str) iidx = [int(i[2:]) for i in idx] smiles = df.loc[iidx]['smiles'] except: raise Exception('lock section error') finally: lock.release() n = 0 for i, smile in zip(idx, smiles): n += 1 # can prepend MolWt with Exact m = Chem.MolFromSmiles(smile) m = Chem.AddHs(m) mw = MolWt(m) if n < 11: print('molecule: {} smile: {} mw: {}'.format(i, smile, mw)) mws.append(mw) print('processed {} molecules!'.format(len(smiles))) # plot historgrams from plot_settings import * def plot_hist(save_path, data, xlabel=None, ylabel=None, label=None,