def penalized_logP(mol_or_smiles, masked=False, default=-5):
    """Score a molecule as ``logP - SA - get_num_rings_6``.

    Args:
        mol_or_smiles: Input handed straight to ``get_mol``.
        masked: When truthy, a molecule rejected by ``mol_passes_filters``
            receives ``default`` instead of its computed score.
        default: Value returned when ``get_mol`` yields ``None`` or when the
            masked filter check fails.

    Returns:
        ``logP(mol) - SA(mol) - get_num_rings_6(mol)``, or ``default`` in the
        two fallback cases above.
    """
    mol = get_mol(mol_or_smiles)
    if mol is None:
        return default

    # NOTE(review): get_num_rings_6 is defined outside this view; presumably
    # it is a ring-size penalty term -- confirm against its definition.
    score = logP(mol) - SA(mol) - get_num_rings_6(mol)

    # The filter is consulted only when masking is requested, and only after
    # the score has been computed.
    rejected = masked and not mol_passes_filters(mol)
    return default if rejected else score
# Post-processing of one optimisation iteration (fragment; the line breaks and
# indentation of the statements below were lost, and the final
# `save_object(scores,` call is cut off in this view).
#
# What the statements do, in order:
#   1. Keep at most the first 50 entries of `valid_smiles`.
#   2. If `new_features` is non-empty, stack it with np.vstack and keep the
#      first 50 rows; otherwise replace it with an empty array of shape
#      (0, X_train.shape[1]).
#   3. Create `args.save_dir` if needed and save `valid_smiles` to
#      "valid_smiles{iteration}.dat" inside it via `save_object`.
#   4. For every SMILES: parse it with MolFromSmiles, compute MolLogP, the
#      negated SA value and max_ring_penalty, standardise each one with the
#      mean / std of `logP_values`, `SA_scores` and `cycle_scores` (arrays
#      defined outside this fragment), sum the three standardised values and
#      append the NEGATED sum to `scores`.
#   5. Print how many molecules were found together with their scores, then
#      start saving `scores` (call truncated).
#
# NOTE(review): np.mean / np.std of SA_scores, logP_values and cycle_scores
# are re-evaluated on every loop iteration although nothing in the loop
# modifies those arrays; they could be computed once before the loop.
# NOTE(review): the MolFromSmiles result is used without a None check;
# presumably `valid_smiles` was validated upstream -- confirm.
valid_smiles = valid_smiles[:50] if len(new_features) != 0: new_features = np.vstack(new_features)[:50] else: new_features = np.zeros((0, X_train.shape[1])) os.makedirs(args.save_dir, exist_ok=True) save_object( valid_smiles, os.path.join(args.save_dir, "valid_smiles{}.dat".format(iteration))) scores = [] for i in range(len(valid_smiles)): mol = MolFromSmiles(valid_smiles[i]) current_log_P_value = Descriptors.MolLogP(mol) current_SA_score = -SA(mol) current_cycle_score = max_ring_penalty(mol) current_SA_score_normalized = (current_SA_score - np.mean(SA_scores)) / np.std(SA_scores) current_log_P_value_normalized = ( current_log_P_value - np.mean(logP_values)) / np.std(logP_values) current_cycle_score_normalized = ( current_cycle_score - np.mean(cycle_scores)) / np.std(cycle_scores) score = (current_SA_score_normalized + current_log_P_value_normalized + current_cycle_score_normalized) scores.append(-score) # target is always minused print(f"{len(valid_smiles)} molecules found. Scores: {scores}") save_object(scores,
# Module-level descriptor setup (fragment; line breaks were lost and the
# `descrittori` dict literal continues past the visible text).
#
# - `nms` collects the first element of every entry in
#   `Descriptors._descList`, and `calc` is a MolecularDescriptorCalculator
#   built over those names.
#   NOTE(review): `_descList` is an underscore-prefixed attribute of
#   rdkit.Chem.Descriptors -- confirm it is stable across the RDKit versions
#   this project supports.
# - `rdkit.Chem.Descriptors` is imported twice: once as `Descriptors` and once
#   under the alias `des`; the lambdas below use the `des` alias.
# - `descrittori` ("descriptors" in Italian) maps a descriptor name to a
#   one-argument callable that forwards its argument to the matching function:
#   `SA` / `QED` from moses.metrics, functions from `des`, and
#   CalcNumAtomStereoCenters / CalcNumAmideBonds from rdMolDescriptors.
#   NOTE(review): every lambda only forwards `x`; presumably `x` is an RDKit
#   molecule -- confirm against the callers.
from rdkit.Chem import Descriptors from rdkit.ML.Descriptors import MoleculeDescriptors nms = [x[0] for x in Descriptors._descList] calc = MoleculeDescriptors.MolecularDescriptorCalculator(nms) from tensorflow.keras.models import load_model from moses.metrics import mol_passes_filters, QED, SA, logP from moses.metrics.utils import get_n_rings, get_mol from rdkit.Chem import Descriptors as des from rdkit.Chem.rdMolDescriptors import CalcNumAtomStereoCenters, CalcNumAmideBonds from desfun import Similarity, User_Fragment descrittori = { 'SA': (lambda x: SA(x)), 'QED': (lambda x: QED(x)), ### rdkit ### 'MolWt': (lambda x: des.MolWt(x)), 'LogP': (lambda x: des.MolLogP(x)), 'MolMR': (lambda x: des.MolMR(x)), 'NumHDonors': (lambda x: des.NumHDonors(x)), 'NumHAcceptors': (lambda x: des.NumHAcceptors(x)), 'HeavyAtomCount': (lambda x: des.HeavyAtomCount(x)), 'NHOHCount': (lambda x: des.NHOHCount(x)), 'NOCount': (lambda x: des.NOCount(x)), 'CalcNumAtomStereoCenters': (lambda x: CalcNumAtomStereoCenters(x)), 'CalcNumAmideBonds': (lambda x: CalcNumAmideBonds(x)), 'NumAliphaticCarbocycles': (lambda x: des.NumAliphaticCarbocycles(x)), 'NumAliphaticHeterocycles': (lambda x: des.NumAliphaticHeterocycles(x)), 'NumAliphaticRings': (lambda x: des.NumAliphaticRings(x)),
# Encode every SMILES string into the model's latent space and build the
# regression target from three per-molecule properties.
# NOTE(review): `latent_points`, `logP_values`, `SA_scores` and `cycle_scores`
# are created outside this fragment; presumably they start as empty lists --
# confirm.

# --- Stage 1: batch the SMILES strings --------------------------------------
print("Preparing dataset...")
dataset = StringDataset(model.vocab, smiles)
collate_pad = partial(collate, pad=model.vocab.pad, return_data=True)
data_loader = DataLoader(
    dataset,
    batch_size=512,
    shuffle=False,
    collate_fn=collate_pad,
)

# --- Stage 2: latent means and raw property values, batch by batch ----------
print("Getting latent codes...")
for batch in tqdm(data_loader):
    # All entries but the last are moved to the device and encoded; the last
    # entry is iterated as SMILES strings further down.
    z = model.encode(batch_to_device(batch[:-1], args.device))
    mu, _ = model.get_mu_std(z)  # only the first returned value is kept
    latent_points.append(mu.detach().cpu().numpy())

    romol = [Chem.MolFromSmiles(smi.strip()) for smi in batch[-1]]
    logP_values.extend([Descriptors.MolLogP(mol) for mol in romol])
    SA_scores.extend([-SA(mol) for mol in romol])  # SA enters negated
    cycle_scores.extend([max_ring_penalty(mol) for mol in romol])

# --- Stage 3: standardise each property and sum them into the target --------
SA_scores, logP_values, cycle_scores = map(
    np.array, (SA_scores, logP_values, cycle_scores)
)
latent_points = np.vstack(latent_points)

SA_scores_normalized = (SA_scores - SA_scores.mean()) / SA_scores.std()
logP_values_normalized = (logP_values - logP_values.mean()) / logP_values.std()
cycle_scores_normalized = (
    cycle_scores - cycle_scores.mean()
) / cycle_scores.std()

targets = SA_scores_normalized + logP_values_normalized + cycle_scores_normalized