def generate_df_results(molid, importances, dset, feats, model, calibration, lso):
    """Build the table relating every other molecule to ``molid``.

    Each molecule id returned by ``molecules_coocurrences_df`` other than
    ``molid`` contributes one row with its importance, the value returned by
    ``average_loss`` for the pair (``molid``, other molecule) and its SMILES.

    Parameters
    ----------
    molid
        Identifier of the molecule under study; it gets no row in the table.
    importances
        Indexable collection of weights, addressed by the position of each
        molecule id within the ``molids`` sequence returned by
        ``molecules_coocurrences_df`` (presumably the fitted regression
        weights -- confirm against the caller).
    dset, feats, model, calibration, lso
        Forwarded unchanged to ``molecules_coocurrences_df``, ``read_losses``
        and ``ManysourcesDataset``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``molid`` with the columns ``relabsimportance`` (absolute
        importance divided by the sum of absolute importances),
        ``importance``, ``smiles`` and ``cooc_loss``, in that order.
    """
    # The fold information returned by both readers is not needed here.
    cooccurrences, molids, expids, _ = molecules_coocurrences_df(dset=dset,
                                                                 feats=feats,
                                                                 model=model,
                                                                 lso=lso)
    df_losses, _ = read_losses(dset=dset,
                               feats=feats,
                               model=model,
                               calibration=calibration,
                               lso=lso,
                               also_folds=True)
    mols = ManysourcesDataset(dset).mols()

    # Position of the first occurrence of each molecule id, computed once
    # instead of scanning ``molids`` with np.where for every molecule.
    first_position = {}
    for position, other_molid in enumerate(molids):
        first_position.setdefault(other_molid, position)

    rows = {}
    for other_molid in molids:
        if other_molid == molid:
            continue
        rows[other_molid] = {
            'importance': importances[first_position[other_molid]],
            'cooc_loss': average_loss(molid, other_molid, cooccurrences, molids, expids, df_losses),
            'smiles': MolToSmiles(mols.molid2mol(other_molid)),
        }

    df = pd.DataFrame.from_dict(rows, orient='index')
    df.index.names = ['molid']
    abs_importance = df.importance.abs()
    df['relabsimportance'] = abs_importance / abs_importance.sum()
    return df[['relabsimportance', 'importance', 'smiles', 'cooc_loss']]
def _sort_by_relabsimportance(df):
    """Return ``df`` sorted by decreasing 'relabsimportance'.

    Prefers ``DataFrame.sort_values`` and falls back to the legacy
    ``DataFrame.sort`` on pandas versions that do not provide it.
    """
    if hasattr(df, 'sort_values'):
        return df.sort_values('relabsimportance', ascending=False)
    return df.sort('relabsimportance', ascending=False)


def do_for_one_molid(calibration, dset, feats, lso, model, molid, results_dir, rm_factory, by_source=False):
    """Fit the loss-regression model for one molecule and save all its reports.

    A directory named ``r2=<rsquared>__<molid>`` is created under
    ``results_dir`` and filled with:

    - ``model_trained_rsquare=<rsquared>.pkl``: the pickled trained model;
    - ``smiles.txt``: the SMILES of ``molid``;
    - ``results_df.pkl`` / ``results_df.html`` (or the ``_bysource``
      variants when ``by_source`` is true): the influence table and its
      top 20 rows;
    - ``y_dist.png``: the distribution plot of ``y``;
    - ``verde_que_te_quiero_verde.png`` (only when ``by_source`` is false):
      the molecule, the distribution of ``y`` and the top rows of the table.

    Parameters
    ----------
    calibration, dset, feats, lso, model
        Forwarded to the data-access helpers (``get_y``, ``get_X``,
        ``get_X_source``, ``generate_df_results``...).
    molid
        Identifier of the molecule to analyse.
    results_dir
        Directory under which the per-molecule directory is created.
    rm_factory
        Passed as ``model_factory`` to ``build_and_validate_regression_model``.
    by_source : bool
        When true, features come from ``get_X_source`` and the table from
        ``generate_df_results_source``; the molecule grid is not drawn.

    Returns
    -------
    None
    """
    print(molid)  # parenthesised form prints the same on Python 2 and 3
    MRDK = ManysourcesDataset(dset).mols()  # FIXME: this is read on each job, so once per molecule ATM...

    # --- Train and evaluate the model
    y, expids = get_y(molid, dset, feats, model, calibration, lso)
    if not by_source:
        X = get_X(molid, expids, dset, feats, model, lso)
    else:
        # makes no sense to run by source on LSO=False
        X = get_X_source(molid, expids, dset, feats, model)
    # Co-occurrences in train: less sparse, but better interpretation unless
    # we tweak well the numbers...
    # NOTE(review): the original indentation was lost; the inversion is
    # assumed to apply to both branches -- confirm.
    X = ~X
    rsquared, feat_weights, trained_model = build_and_validate_regression_model(X, y, model_factory=rm_factory)
    rsquared = float(rsquared)  # original note: "REMOVE moldir shows r2"
    moldir = op.join(results_dir, 'r2=%.2f__%s' % (rsquared, molid))
    ensure_dir(moldir)

    # --- Save the model
    pd.to_pickle(trained_model, op.join(moldir, 'model_trained_rsquare=%.2f.pkl' % rsquared))

    # --- Save the smiles
    smiles = MolToSmiles(MRDK.molid2mol(molid))
    with open(op.join(moldir, 'smiles.txt'), 'w') as writer:
        writer.write(smiles)

    # --- Save the molecule-influence table
    if not by_source:
        df = generate_df_results(molid, feat_weights, dset, feats, model, calibration, lso)
        pd.to_pickle(df, op.join(moldir, 'results_df.pkl'))
        # FIXME: the molecule itself is added as a fake row; its huge
        # 'relabsimportance' makes it sort first, which the grid plot below
        # relies on when it skips row 0.
        df.loc[molid] = (1E16, rsquared, smiles, np.mean(y))
        # A real list (not a lazy map object) so the assignment also works on Python 3.
        df['label'] = [MRDK.molid2label(one_molid) for one_molid in df.index]
        df = df[['label', 'relabsimportance', 'importance', 'smiles', 'cooc_loss']]
        df = _sort_by_relabsimportance(df)
        df.head(20).to_html(op.join(moldir, 'results_df.html'))
    else:
        df = generate_df_results_source(molid, feat_weights, dset, feats, model, calibration, lso)
        pd.to_pickle(df, op.join(moldir, 'results_df_bysource.pkl'))
        df = _sort_by_relabsimportance(df)
        df.head(20).to_html(op.join(moldir, 'results_df_bysource.html'))

    # --- Plot the distribution of losses (y)
    plt.figure()
    seaborn.distplot(y, bins=40)
    plt.xlim((-0.05, 1.05))
    plt.title('molid=%s, r2=%.2f' % (molid, rsquared))
    plt.savefig(op.join(moldir, 'y_dist.png'), bbox_inches='tight')
    plt.close()

    # --- WIP gridspec with chemdeco pics and things like that
    if not by_source:
        show_top = 4
        half = show_top // 2  # floor division: grid indices must be integers
        gs = gridspec.GridSpec(show_top, 2)
        fig = plt.figure(figsize=(24, 16))

        # Plot the molecule itself (top half of the left column)
        ax_mol = fig.add_subplot(gs[0:half, 0])
        ax_mol.grid(False)
        ax_mol.get_xaxis().set_ticks([])
        ax_mol.get_yaxis().set_ticks([])
        mol = MRDK.molid2mol(molid)
        AllChem.Compute2DCoords(mol)
        ax_mol.imshow(artdeco2(rdkit2im(mol, size=(400, 400)),
                               color='red' if df.loc[molid]['label'] == 'INHIBITOR' else 'green',
                               chorrada=5))

        # Plot the distribution of losses (bottom half of the left column).
        # The slice used to be "half:0", which selects no rows at all.
        ax_distr = fig.add_subplot(gs[half:, 0])
        seaborn.distplot(y, bins=40, ax=ax_distr)

        # Plot the top (we should align all to a common scaffold and maybe
        # highlight substructures that matter). Row 0 is the molecule itself.
        for rank, (inf_molid, row) in enumerate(df.iloc[1:show_top + 1].iterrows()):
            ax_influential_mol = fig.add_subplot(gs[rank, 1])
            ax_influential_mol.grid(False)
            ax_influential_mol.get_xaxis().set_ticks([])
            ax_influential_mol.get_yaxis().set_ticks([])
            mol_color = 'red' if row['label'] == 'INHIBITOR' else 'green'
            good_or_bad_color = 'red' if row['importance'] > 0 else 'green'
            # add decos
            mol = MRDK.molid2mol(inf_molid)
            AllChem.Compute2DCoords(mol)
            image = rdkit2im(mol)
            image = artdeco1(image, decos=(('black', good_or_bad_color),))
            image = artdeco2(image, color=mol_color)
            ax_influential_mol.imshow(image)
            ax_influential_mol.set_title('%s, inf=%.4f, cooc_loss=%.4f' %
                                         (inf_molid, row['importance'], row['cooc_loss']))

        # FIXME: cooc_loss also with stddev and standard error
        fig.suptitle('%s, r2=%.2f, cooc_loss=%.4f +/- %.4f' %
                     (molid, rsquared, float(np.mean(y)), float(np.std(y))))
        plt.savefig(op.join(moldir, 'verde_que_te_quiero_verde.png'), bbox_inches='tight')
        plt.close()