def do_the_job(dset, feats, model, calibration=None, lso=True,
               regression_model=('linreg', LinearRegression),
               results_dir=op.join(MANYSOURCES_DATA_ROOT, 'results', 'loss_by_cooc'),
               n_jobs=None,
               by_source=False):
    """Runs, in parallel, one loss-by-cooccurrence regression job per molecule in the dataset.

    Parameters
    ----------
    dset, feats, model, calibration, lso : experiment coordinates forwarded to each job
    regression_model : (name, factory) pair; the name tags the results dir, the factory builds the regressor
    results_dir : root directory under which a per-coordinates subdirectory is created
    n_jobs : number of parallel workers; defaults to the machine's CPU count
    by_source : if True, jobs regress on source (not molecule) cooccurrences
    """
    reg_name, reg_factory = regression_model
    # Lay out the results directory after the full experimental coordinates
    results_dir = op.join(results_dir,
                          'dset=%s' % dset,
                          'feats=%s' % feats,
                          'model=%s' % model,
                          'calibration=%s' % calibration,
                          'LSO=%r' % lso,
                          'reg_model=%s' % reg_name,
                          'bysource=%r' % by_source)
    ensure_dir(results_dir)
    # We only need the molids here; the jobs re-read the rest themselves
    _, molids, _, _ = molecules_coocurrences_df(dset=dset, feats=feats, model=model, lso=lso)
    if n_jobs is None:
        n_jobs = cpu_count()
    Parallel(n_jobs=n_jobs)(
        delayed(do_for_one_molid)(calibration, dset, feats, lso, model,
                                  molid, results_dir, reg_factory, by_source)
        for molid in sorted(molids))
def rdkdescs(self, keep_with_missing=False):
    """Returns the RDKit descriptors of the dataset molecules, computing and caching them on first access.

    Parameters
    ----------
    keep_with_missing : bool, default False
      If False, molecules (rows) with any non-finite descriptor value are dropped
      from the returned matrix, molids and labels.

    Returns
    -------
    A tuple (molids, fnames, descriptors, y) where descriptors is a 2D array with one
    row per (kept) molecule and fnames names the descriptor columns.

    Fix note: the original cached the already-filtered matrix, so whatever
    keep_with_missing value the *first* call used silently dictated the result of
    every later call. We now cache the raw matrix and apply filtering per call.
    """
    if self._rdkdescs is None:
        rdkdescs_file = op.join(manysources_dataset_root(self.name),
                                '02-rdkdescs',
                                '%s.rdkdescs.h5' % self.name)
        if not op.exists(rdkdescs_file):
            # Cold path: compute the descriptors and persist them (+ their provenance)
            computer = RDKitDescripter()
            descs = computer.compute(self.mols().mols())
            molids = self.mols().molids()  # We assume all mols have descriptors computed, even if missing
            fnames = computer.fnames()
            ensure_dir(op.dirname(rdkdescs_file))
            with h5py.File(rdkdescs_file, 'w-') as h5:
                h5['rdkdescs'] = descs
                h5['molids'] = molids
                h5['fnames'] = fnames
            with open(op.join(op.dirname(rdkdescs_file), 'config.json'), 'w') as writer:
                writer.write(computer.what().id())
        # Warm path: read back the (unfiltered) cached descriptors
        with h5py.File(rdkdescs_file, 'r') as h5:
            self._rdkdescs = h5['rdkdescs'][:]
            self._rdkdescs_molids = h5['molids'][:]
            self._rdkdescs_fnames = h5['fnames'][:]
    descs = self._rdkdescs
    molids = self._rdkdescs_molids
    if not keep_with_missing:
        # Drop molecules with NaN/inf descriptor values; do NOT mutate the cache
        to_keep = np.all(np.isfinite(descs), axis=1)
        descs = descs[to_keep, :]
        molids = molids[to_keep]
    return molids, \
        self._rdkdescs_fnames, \
        descs, \
        self.Y(molids)
def mols(self):
    """Returns the dataset molecules, reading them from the pickle cache or building and caching them on first access."""
    if self._molecules is None:
        pickle_file = op.join(manysources_dataset_root(self.name),
                              '01-molspickle',
                              '%s.pickled.gz' % self.name)
        if op.isfile(pickle_file):
            # Warm path: read the cached molecules back from disk
            self._molecules = MoleculesFromRDKit.from_pickle(pickle_file, compressed=True)
        else:
            # Cold path: build the molecules, then persist them together with their provenance
            self._molecules = MANYSOURCES_MOLECULES.get(self.name, lambda: None)()
            ensure_dir(op.dirname(pickle_file))
            self._molecules.to_pickle(pickle_file, compressed=True)
            with open(op.join(op.dirname(pickle_file), 'config.json'), 'w') as writer:
                writer.write(self._molecules.what().id())
    return self._molecules
def _unfolded_ecfps(self): if self._ecfps is None: ecfp_file = op.join(manysources_dataset_root(self.name), '02-ecfps', '%s.ecfps.h5' % self.name) if not op.exists(ecfp_file): fingerprinter = RDKMorganFingerprinter() for molid, mol in self.mols(): print molid fingerprinter.add_mol(molid, mol) ensure_dir(op.dirname(ecfp_file)) with open(op.join(op.dirname(ecfp_file), 'config.json'), 'w') as writer: writer.write(fingerprinter.what().id()) fingerprinter.fingerprints().save(ecfp_file) self._ecfps = UnfoldedFingerprints.load(ecfp_file) return self._ecfps
def _unfolded_ecfps_nodupes(self): if self._ecfps_no_dupes is None: ecfp_nodupes_file = op.join(manysources_dataset_root(self.name), '03-ecfps-nodupes', '%s.ecfps.h5' % self.name) if not op.exists(ecfp_nodupes_file): ecfps = self._unfolded_ecfps() ufp = UnfoldedFingerprints(ecfps.molids, ecfps.i2s, zero_dupes(ecfps.csr, by_rows=False), # Hack 1, this should be Configurable failed_molids=ecfps.failed_molids) ensure_dir(op.dirname(ecfp_nodupes_file)) with open(op.join(op.dirname(ecfp_nodupes_file), 'config.txt'), 'w') as writer: writer.write('Same as 02-ecfps, but removed columns that have the same value accross all rows.') ufp.save(ecfp_nodupes_file) self._ecfps = UnfoldedFingerprints.load(ecfp_nodupes_file) return self._ecfps
def split_sdf(input_sdf, output_dir, prefix, nb_mols=10000):
    """Splits an sdf file into chunk files of at most nb_mols records each.

    Chunks are written to output_dir as "<prefix>_<N>.sdf" and finally renamed
    with zero-padded numbers so that lexicographic order matches numeric order.

    Parameters
    ----------
    input_sdf : path to the sdf file to split
    output_dir : directory where the chunks are written (created if missing)
    prefix : file-name prefix for the chunks
    nb_mols : maximum number of records per chunk, default 10000
    """
    records = iterate_records_in_text_file(input_sdf)
    ensure_dir(output_dir)

    def chunk_path(number, pad=None):
        # Path of the number-th chunk; pad is a string of leading zeros, if any
        if not pad:
            return op.join(output_dir, "%s_%d.sdf" % (prefix, number))
        return op.join(output_dir, prefix + "_" + pad + str(number) + ".sdf")

    def dump_chunk(path):
        # Writes up to nb_mols records to path; True iff the chunk got completely filled
        written = 0
        with open(path, "w") as writer:
            for record in records:
                writer.write(record)
                writer.write("\n")
                written += 1
                if written == nb_mols:
                    return True
        return False

    num_files = 1
    while dump_chunk(chunk_path(num_files)):
        num_files += 1
    # The last chunk may be empty if the record count is a multiple of nb_mols
    # (not elegant but easy): drop it and adjust the count
    last_file = op.join(output_dir, "%s_%d.sdf" % (prefix, num_files))
    if op.getsize(last_file) == 0:
        os.remove(last_file)
        num_files -= 1
    # Rename the chunks with 0-padding so that we keep numerical order
    width = len(str(num_files))
    for path in glob.glob(op.join(output_dir, "*.sdf")):
        number = path.split("_")[-1].split(".")[0]
        zeros = "0" * (width - len(str(number)))
        if len(zeros) > 0:
            os.rename(path, chunk_path(number, pad=zeros))
def mean_loss_matrix(self, mols_in_rows=True, rows_in_train=False):
    """
    Returns two dataframes: mean loss and number of occurrences.

    These dataframes:
      - have mols or sources in rows (depending of *mols_in_rows*)
      - have mols in columns
      - each entry contains the mean loss (and count) when
          * the mol in col is in test
          * whatever in the row is in train or test (depending on *rows_in_train*)

    They would look like this:
      |-----------------------------------------|
      | index           | data                  |
      |-----------------|-----------------------|
      | source or molid | molid1 | molid2 | ... |
      |-----------------------------------------|
      | molid1          | 0.83   | 0.02   | ... |
      | molid2          | 0.17   | 0.01   | ... |
      | ...             | ...    | ...    | ... |
      |-----------------------------------------|

      |-----------------------------------------|
      | index           | data                  |
      |-----------------|-----------------------|
      | source or molid | molid1 | molid2 | ... |
      |-----------------------------------------|
      | molid1          | 794    | 733    | ... |
      | molid2          | 680    | 667    | ... |
      | ...             | ...    | ...    | ... |
      |-----------------------------------------|

    :rtype: (pandas.DataFrame, pandas.DataFrame)
    """
    # Results are cached on disk, keyed by the two boolean options
    matid = 'mir=%r#rit=%r' % (mols_in_rows, rows_in_train)
    cache_dir = op.join(MANYSOURCES_DATA_ROOT, 'results', 'mlms', self.what().id(maxlength=1))
    ensure_dir(cache_dir)
    cache_file = op.join(cache_dir, matid + '.pkl')
    if not op.isfile(cache_file):
        # row masks: boolean cooccurrence masks, in test by default; negate for "in train"
        rows_df = self.mcoocs() if mols_in_rows else self.scoocs()
        rows_df = ~rows_df if rows_in_train else rows_df
        # col masks: columns are always molecules in test
        cols_df = self.mcoocs()
        # losses: one column per experiment id after transposing
        loss_df = self.squared_losses().T
        # sanity checks for columns
        assert np.all(cols_df.columns == loss_df.index)

        # numba-compiled kernel: accumulate losses and counts for every
        # (row-entity, col-molecule) pair active in one fold
        @jit(nopython=True)
        def update_cooc_cooc(res_matrix, norm_matrix, losses, rows, cols):
            for row in rows:
                for col in cols:
                    res_matrix[row, col] += losses[col]
                    norm_matrix[row, col] += 1

        loss_matrix = np.zeros((len(rows_df.columns), len(cols_df.columns)))
        norm_matrix = np.zeros((len(rows_df.columns), len(cols_df.columns)))
        for expid in loss_df.columns:
            print 'Computing mean loss for expid=%d' % expid
            # Skip experiments with no recorded cooccurrences
            if expid not in rows_df.index.levels[0]:
                print '\t Cannot find coocurrences, skipping...'
                continue
            exp_losses = loss_df[expid]
            #
            # FIXME: this does not seem right now that we have a multilevel df
            #
            # NOTE(review): this pairs row/col masks fold-by-fold within the
            # experiment; confirm both .loc[expid] slices are fold-aligned
            for rowmask, colmask in izip(rows_df.loc[expid].values, cols_df.loc[expid].values):
                update_cooc_cooc(loss_matrix, norm_matrix, exp_losses.values,
                                 np.where(rowmask)[0], np.where(colmask)[0])
        # Mean = accumulated loss / number of accumulations (NaN where count is 0)
        losses_df = pd.DataFrame(loss_matrix / norm_matrix,
                                 index=sorted(rows_df.columns),
                                 columns=cols_df.columns)
        counts_df = pd.DataFrame(norm_matrix,
                                 index=sorted(rows_df.columns),
                                 columns=cols_df.columns)
        pd.to_pickle((losses_df, counts_df), cache_file)
    return pd.read_pickle(cache_file)
def do_for_one_molid(calibration, dset, feats, lso, model, molid, results_dir, rm_factory, by_source=False):
    """Runs the loss-by-cooccurrence regression for one molecule and saves model,
    tables and plots under a per-molecule results directory.

    Parameters mirror do_the_job; rm_factory builds the regression model and
    by_source switches the design matrix from molecule to source cooccurrences.
    """
    print molid
    MRDK = ManysourcesDataset(dset).mols()  # FIXME: this is read on each job, so once per molecule ATM...
    # Train and evaluate the model
    y, expids = get_y(molid, dset, feats, model, calibration, lso)
    if not by_source:
        X = get_X(molid, expids, dset, feats, model, lso)
    else:
        X = get_X_source(molid, expids, dset, feats, model)  # makes no sense to run by source on LSO=False
        X = ~X  # coocurrences in train, less sparse, but better interpretation unless we tweak well the numbers...
    rsquared, feat_weights, trained_model = build_and_validate_regression_model(X, y, model_factory=rm_factory)
    rsquared = float(rsquared)
    # REMOVE moldir shows r2
    moldir = op.join(results_dir, 'r2=%.2f__%s' % (rsquared, molid))
    ensure_dir(moldir)
    # Save the model
    pd.to_pickle(trained_model, op.join(moldir, 'model_trained_rsquare=%.2f.pkl' % rsquared))
    # Save the smiles
    smiles = MolToSmiles(MRDK.molid2mol(molid))
    with open(op.join(moldir, 'smiles.txt'), 'w') as writer:
        writer.write(smiles)
    # Save the molecule-influence table
    if not by_source:
        df = generate_df_results(molid, feat_weights, dset, feats, model, calibration, lso)
        pd.to_pickle(df, op.join(moldir, 'results_df.pkl'))
        # Inject the query molecule itself with a huge importance so it sorts first
        df.loc[molid] = (1E16, rsquared, smiles, np.mean(y))  # FIXME
        df['label'] = map(MRDK.molid2label, df.index)
        df = df[['label', 'relabsimportance', 'importance', 'smiles', 'cooc_loss']]
        df = df.sort('relabsimportance', ascending=False)
        df.head(20).to_html(op.join(moldir, 'results_df.html'))
    else:
        df = generate_df_results_source(molid, feat_weights, dset, feats, model, calibration, lso)
        pd.to_pickle(df, op.join(moldir, 'results_df_bysource.pkl'))
        df = df.sort('relabsimportance', ascending=False)
        df.head(20).to_html(op.join(moldir, 'results_df_bysource.html'))
    # Plot the distribution of losses (y)
    plt.figure()
    seaborn.distplot(y, bins=40)
    plt.xlim((-0.05, 1.05))
    plt.title('molid=%s, r2=%.2f' % (molid, rsquared))
    plt.savefig(op.join(moldir, 'y_dist.png'), bbox_inches='tight')
    plt.close()
    # --- WIP gridspec with chemdeco pics and things like that
    if not by_source:
        show_top = 4
        gs = gridspec.GridSpec(show_top, 2)
        fig = plt.figure(figsize=(24, 16))
        # Plot the molecule itself
        ax_mol = fig.add_subplot(gs[0:show_top / 2, 0])
        ax_mol.grid(False)
        ax_mol.get_xaxis().set_ticks([])
        ax_mol.get_yaxis().set_ticks([])
        mol = MRDK.molid2mol(molid)
        AllChem.Compute2DCoords(mol)
        # Frame color encodes the label of the query molecule
        ax_mol.imshow(artdeco2(rdkit2im(mol, size=(400, 400)),
                               color='red' if df.loc[molid]['label'] == 'INHIBITOR' else 'green',
                               chorrada=5))
        # Plot the distribution of losses
        # NOTE(review): gs[show_top / 2:0, 0] is an empty slice (2:0);
        # probably gs[show_top / 2:, 0] was intended — confirm
        ax_distr = fig.add_subplot(gs[show_top / 2:0, 0])
        seaborn.distplot(y, bins=40, ax=ax_distr)
        # Plot the top (we should align all to a common scaffold and maybe highlight substructures that matter)
        # Skip row 0: that is the query molecule injected above
        for rank, (inf_molid, row) in enumerate(df.iloc[1:show_top + 1].iterrows()):
            ax_influential_mol = fig.add_subplot(gs[rank, 1])
            ax_influential_mol.grid(False)
            ax_influential_mol.get_xaxis().set_ticks([])
            ax_influential_mol.get_yaxis().set_ticks([])
            mol_color = 'red' if row['label'] == 'INHIBITOR' else 'green'
            good_or_bad_color = 'red' if row['importance'] > 0 else 'green'
            # add decos
            mol = MRDK.molid2mol(inf_molid)
            AllChem.Compute2DCoords(mol)
            image = rdkit2im(mol)
            image = artdeco1(image, decos=(('black', good_or_bad_color),))
            image = artdeco2(image, color=mol_color)
            ax_influential_mol.imshow(image)
            ax_influential_mol.set_title('%s, inf=%.4f, cooc_loss=%.4f' %
                                         (inf_molid, row['importance'], row['cooc_loss']))
            # FIXME: cooc_loss also with stddev and standard error
        fig.suptitle('%s, r2=%.2f, cooc_loss=%.4f +/- %.4f' %
                     (molid, rsquared, float(np.mean(y)), float(np.std(y))))
        plt.savefig(op.join(moldir, 'verde_que_te_quiero_verde.png'), bbox_inches='tight')
        plt.close()
def generate_lsocv_results(dset='bcrp', model='logreg1', feats='ecfps1', expids=(0,), compression='lzf', reraise=False):
    """Runs train/test experiments for the "manysources" experiments, saving the results in disk.

    Each individual experiment is saved to an HDF5 file. Afterwards they can be merged.

    The generated HDF5 files have the following structure:
      /dsets
        /bcrp{config}
      /models
        /logreg1{config}
      /featss
        /ecfps1{config}
      /dset='bcrp'
        /feats='ecps1'
          /ids=[molid1, molid2, ..., molidn]
          /expid=3
            /lsocv{config, num_folds}
              /fold0'{config}
                /ext_indices
                /ext_scores
                /y_ext
                /DONE or FAILED
                /model='logreg3'{config, train_time, test_time, auc}
                  /model_data1
                  /model_data2
                  /...
              /fold=1...
              /...
            /crscv#seed=3{config}
              /*same as lsocv*

    Parameters
    ----------
    dset: string, default 'bcrp'
      the name of the dataset

    model: string, default 'logreg1'
      the name of the model configuration

    feats: string, default 'ecfp1'
      the name of the molecular features

    expids: iterable of ints, default (1,)
      the experiment ids to carry

    compression: string, default 'lzf'
      the compression used to store arrays in the HDF5 file; if None, no compression will be used

    reraise: boolean, default False
      if True, exceptions are raised; if not, they are ignored and results keep being generated

    Returns
    -------
    Nothing, but prints a cool DONE and saves the results in disk.
    """
    def train_test(model, X, y, train_indices, trest_indices):
        # Fits a copy of the model on the train split and scores the test split;
        # returns (scores, fitted model, train seconds, test seconds)
        # Split
        Xtrain, ytrain, Xtrest, ytrest = \
            X[train_indices, :], y[train_indices], \
            X[trest_indices, :], y[trest_indices]
        # Train (on a copy, so the passed model config stays untouched)
        model = copy(model)
        start = time()
        model.fit(Xtrain, ytrain)
        train_time = time() - start
        # Test: probability of the positive class
        start = time()
        scores = model.predict_proba(Xtrest)[:, 1]
        test_time = time() - start
        return scores, model, train_time, test_time

    def generate_split_result(model_config, X, y, split_id, splitter, cvgroup, reraise=False):
        # Runs one fold and writes its indices, scores, auc, timings and model
        # data into the HDF5 group; marks the fold DONE or FAILED.
        # Closes over expid and compression from the enclosing scope.
        # Splitting
        train_indices, trest_indices = splitter.split()
        fold_group = cvgroup.require_group(split_id)
        try:
            fold_group.attrs['config'] = splitter.what().id()
            fold_group.create_dataset('test_indices', data=trest_indices, compression=compression)  # uncompressible
            fold_group.create_dataset('y_test', data=y[trest_indices], compression=compression)
        except:
            pass  # Dodgy: best-effort write, data may already be there
        # Model configuration
        model_group = fold_group.require_group('model=%s' % model_config.nickname)
        try:
            # already done?
            if 'DONE' in fold_group.keys():
                print '%s already done, skipping...' % model_group.name
                return
            if 'FAILED' in fold_group.keys():
                print '%s already failed, skipping...' % model_group.name
                return
            # compute the result
            scores, model, train_time, test_time = \
                train_test(model_config.seed_model(expid), X, y, train_indices, trest_indices)
            # save scores, auc, times
            try:
                model_group.attrs['auc'] = roc_auc_score(y[trest_indices], scores)
            except:
                model_group.attrs['auc'] = None  # e.g. single-class test fold
            model_group.attrs['train_time'] = train_time
            model_group.attrs['test_time'] = test_time
            model_group.create_dataset('test_scores', data=scores, compression=compression)
            # save whatever from the model
            model_config.storer.to_hdf5(model, model_group, compression=compression)
            # done
            model_group['DONE'] = 'Finished on %s' % strftime("%c")
        except Exception:
            model_group['FAILED'] = format_exc()
            if reraise:
                raise

    # Dataset hub: accept either a name or an already-built dataset object
    dset = dset if isinstance(dset, ManysourcesDataset) else ManysourcesDataset(dset)
    # Features configuration
    feats_config = MANYSOURCES_FEATS[feats]
    molids, X, y = feats_config.extractor(dset)
    # Model configuration
    model_config = MANYSOURCES_MODELS[model]
    for expid in expids:
        print 'expid=%d' % expid
        lsocv = setup_splitters_lsocv(dset=dset, molids=molids, expids=(expid,))[0]
        h5_file = single_hdf5_per_exp(expid=expid, dset=dset.name, model=model, feats=feats)
        ensure_dir(op.dirname(h5_file))
        with h5py.File(h5_file) as h5:
            try:
                # coordinates
                try:
                    h5.require_group('dsets/%s' % dset.name).attrs['config'] = dset.name
                    h5.require_group('featss/%s' % feats).attrs['config'] = feats_config.name
                    h5.require_group('models/%s' % model).attrs['config'] = model_config_string(model_config.model)
                except:
                    pass  # Dodgy: best-effort write, may already exist
                # dset coordinates
                dset_group = h5.require_group('/dset=%s' % dset.name)
                # features coordinates
                feat_group = dset_group.require_group('feats=%s' % feats)
                try:
                    feat_group.create_dataset('ids', data=molids, compression='lzf')
                except:
                    pass  # Data already there
                # expid group
                expid_group = feat_group.require_group('expid=%d' % expid)
                # LSO-CV (leave-sources-out)
                lsocv_group = expid_group.require_group('lsocv')
                lsocv_group.attrs['config'] = lsocv.what().id()
                lsocv_group.attrs['num_folds'] = lsocv.num_folds
                lsocv_group.attrs['seed'] = lsocv.seed
                for split_num, splitter in enumerate(lsocv.splitters()):
                    generate_split_result(model_config, X, y, 'fold=%d' % split_num, splitter, lsocv_group,
                                          reraise=reraise)
                # CRS-CV: companion cross-validation derived from the LSO split
                crscv = crscv_from_lsocv(lsocv)
                crscv_group = lsocv_group.require_group('crscv#seed=%d' % crscv.seed)
                crscv_group.attrs['config'] = crscv.what().id()
                crscv_group.attrs['num_folds'] = len(crscv.fold_sizes)
                crscv_group.attrs['seed'] = crscv.seed
                for split_num, splitter in enumerate(crscv.splitters()):
                    generate_split_result(model_config, X, y, 'fold=%d' % split_num, splitter, crscv_group,
                                          reraise=reraise)
            except:
                # Record the first top-level failure for this experiment file
                if 'TOP_FAIL' not in h5:
                    h5['TOP_FAIL'] = format_exc()
                if reraise:
                    raise
    print 'DONE'