def reproduce(seed, samples_path=None, metrics_path=None,
              n_jobs=1, device='cpu', verbose=False,
              samples=30000):
    """Fit the combinatorial generator, sample molecules and score them.

    Args:
        seed: Run index. Sample ``i`` of run ``seed`` is generated with the
            integer seed ``(seed - 1) * samples + i``, so consecutive runs
            use disjoint seed ranges.
        samples_path: If not None, the generated SMILES are written here,
            one per line, after a ``SMILES`` header line.
        metrics_path: If not None, the metrics are written here as
            ``name,value`` lines.
        n_jobs: Forwarded to the generator, to ``mapper`` and to
            ``moses.get_all_metrics``.
        device: Forwarded to ``moses.get_all_metrics``.
        verbose: Print progress messages when true.
        samples: Number of molecules to generate.

    Returns:
        Tuple ``(generated_samples, metrics)``.
    """
    def log(message):
        # Progress output is opt-in.
        if verbose:
            print(message)

    training_set = moses.get_dataset('train')
    generator = CombinatorialGenerator(n_jobs=n_jobs)
    log("Training...")
    generator.fit(training_set)

    log(f"Sampling for seed {seed}")
    # One distinct integer seed per generated molecule.
    per_sample_seeds = list(range((seed - 1) * samples, seed * samples))
    generated = mapper(n_jobs)(generator.generate_one, per_sample_seeds)

    if samples_path is not None:
        with open(samples_path, 'w') as out:
            out.write('SMILES\n')
            for smiles in generated:
                out.write(smiles + '\n')

    log(f"Computing metrics for seed {seed}")
    scores = moses.get_all_metrics(generated, n_jobs=n_jobs, device=device)

    if metrics_path is not None:
        with open(metrics_path, 'w') as out:
            for name, score in scores.items():
                out.write("%s,%f\n" % (name, score))

    return generated, scores
def reproduce(seed, samples_path=None, metrics_path=None,
              n_jobs=1, device='cpu', verbose=False,
              samples=30000):
    """Fit the HMM baseline, sample molecules and score them.

    The model is fitted on the first 100000 entries of the MOSES 'train'
    split.

    Args:
        seed: Passed to the HMM constructor and used to seed NumPy's global
            random state right before sampling.
        samples_path: If not None, the generated SMILES are written here,
            one per line, after a ``SMILES`` header line.
        metrics_path: If not None, the metrics are written here as
            ``name,value`` lines.
        n_jobs: Forwarded to the HMM and to ``moses.get_all_metrics``.
        device: Forwarded to ``moses.get_all_metrics``.
        verbose: Print progress messages and forward the flag to the HMM.
        samples: Number of molecules to generate.

    Returns:
        Tuple ``(generated_samples, metrics)``.
    """
    data = moses.get_dataset('train')[:100000]

    if verbose:
        print("Training...")
    model = HMM(n_jobs=n_jobs, seed=seed, verbose=verbose)
    model.fit(data)

    if verbose:
        print(f"Sampling for seed {seed}")
    # Seed once, immediately before sampling. The original seeded twice in a
    # row with nothing random in between, which had the same effect.
    np.random.seed(seed)
    generated = [model.generate_one() for _ in range(samples)]

    if samples_path is not None:
        with open(samples_path, 'w') as f:
            f.write('SMILES\n')
            for sample in generated:
                f.write(sample + '\n')

    if verbose:
        print(f"Computing metrics for seed {seed}")
    metrics = moses.get_all_metrics(
        generated, n_jobs=n_jobs, device=device)

    if metrics_path is not None:
        # Write to metrics_path. The previous code opened samples_path here,
        # which overwrote the samples file and never produced a metrics file.
        with open(metrics_path, 'w') as f:
            for key, value in metrics.items():
                f.write("%s,%f\n" % (key, value))

    return generated, metrics
def download_moses():
    """Fetch three MOSES splits and save each as a CSV next to this script.

    The 'train', 'test' and 'test_scaffolds' splits are loaded, each wrapped
    in a DataFrame whose column ``0`` is renamed to ``smiles``, and written
    to ``../data/moses_<split>.csv`` relative to this file's directory.
    The head of the scaffold frame is printed before saving.
    """
    here = os.path.dirname(os.path.realpath(__file__))
    split_names = ('train', 'test', 'test_scaffolds')

    print('>>> Loading data from moses')
    raw = {name: moses.get_dataset(name) for name in split_names}
    frames = {
        name: pd.DataFrame(raw[name]).rename(columns={0: 'smiles'})
        for name in split_names
    }
    print(frames['test_scaffolds'].head())

    print('>>> Saving data to csv files in ./data')
    for name in split_names:
        target = os.path.join(here, '../data/moses_' + name + '.csv')
        frames[name].to_csv(target)
def reproduce(seed, samples_path=None, metrics_path=None,
              n_jobs=1, device='cpu', verbose=False,
              samples=30000):
    """Fit the n-gram baseline, sample molecules and score them.

    An ``NGram(10)`` model is fitted on the MOSES 'train' split, NumPy's
    global random state is seeded with ``seed``, and ``samples`` molecules
    are generated with ``l_smooth=0.01``. Metrics are computed before any
    output file is written.

    Args:
        seed: Seed for ``np.random.seed``, applied after fitting.
        samples_path: If not None, the generated SMILES are written here,
            one per line, after a ``SMILES`` header line.
        metrics_path: If not None, the metrics are written here as
            ``name,value`` lines.
        n_jobs: Forwarded to ``moses.get_all_metrics``.
        device: Forwarded to ``moses.get_all_metrics``.
        verbose: Forwarded to the ``NGram`` constructor.
        samples: Number of molecules to generate.

    Returns:
        Tuple ``(generated_smiles, metrics)``.
    """
    training_set = moses.get_dataset('train')
    ngram = NGram(10, verbose=verbose)
    ngram.fit(training_set)

    np.random.seed(seed)
    generated = ngram.generate(samples, l_smooth=0.01)
    scores = moses.get_all_metrics(generated, n_jobs=n_jobs, device=device)

    if samples_path is not None:
        with open(samples_path, 'w') as handle:
            handle.write('SMILES\n')
            for smi in generated:
                handle.write(smi + '\n')

    if metrics_path is not None:
        with open(metrics_path, 'w') as handle:
            for name, score in scores.items():
                handle.write("%s,%f\n" % (name, score))

    return generated, scores
def setUp(self):
    """Load the MOSES 'train' split and keep it on the instance as ``train``."""
    train_split = moses.get_dataset('train')
    self.train = train_split
import moses

# Load the MOSES 'train' split at import time and expose it as `train`.
train = moses.get_dataset(
    'train'
)
default='images/', help='Store images in this folder') return parser if __name__ == "__main__": disable_rdkit_log() parser = get_parser() config, unknown = parser.parse_known_args() if len(unknown) != 0: raise ValueError("Unknown argument " + unknown[0]) os.makedirs(config.img_folder, exist_ok=True) generated = OrderedDict( {'MOSES': pd.DataFrame({'SMILES': get_dataset('test')})}) models = pd.read_csv(config.config) for path, name in zip(models['path'], models['name']): generated[name] = pd.read_csv(path) metrics = {'weight': weight, 'logP': logP, 'SA': SA, 'QED': QED} for s in generated.values(): s['ROMol'] = mapper(config.n_jobs)(get_mol, s['SMILES']) distributions = OrderedDict() for metric_name, metric_fn in metrics.items(): distributions[metric_name] = OrderedDict() for _set, _molecules in generated.items(): distributions[metric_name][_set] = mapper(config.n_jobs)( metric_fn, _molecules['ROMol'].dropna().values)
import moses
from rdkit import Chem
from rdkit.Chem import Draw
import numpy as np
from tqdm import tqdm

train = moses.get_dataset('train')
test = moses.get_dataset('test')
dataset = np.hstack([train, test])  # concatenate the two arrays we have
# test_scaffolds = moses.get_dataset('test_scaffolds')
print("Number of molecules to train: ", len(train), "Number of molecules to test: ", len(test))
smiles_train = train[0]


def image_molecule(smiles, title = "molecule.png"):
    """Render a SMILES string to an image file and return the file name.

    Args:
        smiles: SMILES string, parsed with ``Chem.MolFromSmiles``.
        title: Output file name handed to ``Draw.MolToFile``.

    Returns:
        ``title``, unchanged.
    """
    mol_train = Chem.MolFromSmiles(smiles)
    Draw.MolToFile(mol_train, title)
    return title


def smiles_number_to_smiles_string(input_molecule, dict):
    """Decode a sequence of numeric tokens into a single string.

    Args:
        input_molecule: Iterable of keys to look up.
        dict: Mapping from each key to the string piece it stands for.
            The name shadows the builtin but is kept so that existing
            keyword callers keep working.

    Returns:
        The looked-up pieces joined with no separator.

    Raises:
        KeyError: If an element of ``input_molecule`` is not in ``dict``.
    """
    # The previous body read the names `smile` and `dict_number_to_char`,
    # which are not defined in this script, and ignored both parameters.
    smile_char = [dict[number] for number in input_molecule]
    smile_string = "".join(smile_char)
    return smile_string


molecule1 = image_molecule(smiles_train)