def squared_losses(self):
    """
    Return the squared losses per experiment and molecule, loading them on first use.

    The result is a pandas dataframe (expid, molid) -> loss, where the loss is
    the squared loss ``(label - score) ** 2`` (usable, e.g., to compute the
    Brier score). It would look like this:

      |----------------------------|
      |        | mol1 | mol2 | ... |
      |----------------------------|
      | expid1 | 0.75 | 0.23 | ... |
      | expid2 | 0.31 | 0.33 | ... |
      |  ...   | ...  | ...  | ... |
      |----------------------------|

    Rows and columns are sorted by their labels. Any column that contains a
    NaN is dropped (``dropna(axis=1)``); the original notes attribute NaNs to
    failed experiments.

    The dataframe is read once through ``read_losses`` and then kept in
    ``self._df_losses``; later calls return that same cached object.

    :rtype: pandas.DataFrame
    """
    # Fast path: the losses were already loaded by a previous call.
    if self._df_losses is not None:
        return self._df_losses

    # read_losses returns a pair; only the first element (the losses) is used.
    losses, _ = read_losses(dset=self.dset_id,
                            expids=self.expids,
                            feats=self.feats,
                            model=self.model,
                            lso=self.lso,
                            calibration=self.score_norm)
    rows_sorted = losses.sort_index(axis=0)
    fully_sorted = rows_sorted.sort_index(axis=1)
    self._df_losses = fully_sorted.dropna(axis=1)
    return self._df_losses
def generate_df_results(molid, importances, dset, feats, model, calibration, lso):
    """
    Build a table relating every other molecule to the molecule ``molid``.

    One row is produced for each entry of the ``molids`` returned by
    ``molecules_coocurrences_df`` that is different from ``molid``. Each row
    holds:

      - ``relabsimportance``: ``abs(importance)`` divided by the sum of the
        absolute importances over all rows of the table
      - ``importance``: the entry of ``importances`` at the position the
        molecule has in ``molids``
      - ``smiles``: ``MolToSmiles`` applied to the molecule object of the dataset
      - ``cooc_loss``: the value returned by ``average_loss`` for the pair
        ``(molid, other_molid)``

    :param molid: identifier of the reference molecule (excluded from the rows)
    :param importances: indexable of importances, aligned by position with ``molids``
    :param dset: dataset identifier, forwarded to the readers
    :param feats: features identifier, forwarded to the readers
    :param model: model identifier, forwarded to the readers
    :param calibration: calibration identifier, forwarded to ``read_losses``
    :param lso: forwarded to the readers
    :return: dataframe indexed by ``molid`` with the columns
             ``relabsimportance``, ``importance``, ``smiles``, ``cooc_loss``
    :rtype: pandas.DataFrame
    """
    cooccurrences, molids, expids, _ = molecules_coocurrences_df(dset=dset, feats=feats, model=model, lso=lso)
    # The folds are requested exactly as before, but they are not needed here.
    df_losses, _ = read_losses(dset=dset, feats=feats, model=model, calibration=calibration, lso=lso,
                               also_folds=True)
    mols = ManysourcesDataset(dset).mols()

    rows = {}
    # enumerate gives the position of each molid directly, instead of
    # re-scanning the whole molids array on every iteration. Skipping molids
    # that already have a row keeps the "first occurrence wins" behaviour.
    for position, other_molid in enumerate(molids):
        if other_molid == molid or other_molid in rows:
            continue
        rows[other_molid] = {
            'importance': importances[position],
            # NOTE(review): average_loss is called with six positional arguments
            # here; confirm that the average_loss in scope accepts them.
            'cooc_loss': average_loss(molid, other_molid, cooccurrences, molids, expids, df_losses),
            'smiles': MolToSmiles(mols.molid2mol(other_molid)),
        }

    df = pd.DataFrame.from_dict(rows, orient='index')
    df.index.names = ['molid']
    abs_importance = df.importance.abs()
    df['relabsimportance'] = abs_importance / abs_importance.sum()
    return df[['relabsimportance', 'importance', 'smiles', 'cooc_loss']]
def get_y(molid, dset, feats, model, calibration=None, lso=True):
    """
    Fetch all the losses recorded for one molecule.

    The loss matrix is retrieved with ``read_losses`` using the given
    experiment coordinates; the column of ``molid`` is then copied into a
    numpy array, with one value per row (expid) of the matrix. The original
    notes mention 4095 expids; the code itself does not enforce any number.

    :param molid: identifier of the molecule (a column of the loss matrix)
    :param dset: dataset identifier, forwarded to ``read_losses``
    :param feats: features identifier, forwarded to ``read_losses``
    :param model: model identifier, forwarded to ``read_losses``
    :param calibration: calibration identifier, forwarded to ``read_losses``
    :param lso: forwarded to ``read_losses``
    :return: a pair ``(y, expids)``: the losses of ``molid`` as a numpy array
             and the index of the loss matrix
    """
    # read_losses returns a pair; only the first element (the losses) is used.
    loss_matrix, _ = read_losses(dset=dset,
                                 feats=feats,
                                 model=model,
                                 calibration=calibration,
                                 lso=lso)
    molecule_losses = loss_matrix[molid]
    return np.array(molecule_losses), loss_matrix.index
def average_loss(dset, feats, model, lso, calibration, num_folds=10):
    """
    Summarise the losses of a set of experiments, overall and per fold.

    At each expid we get one loss per molecule. These are averaged per expid,
    and the per-expid averages are then summarised across all expids. In
    addition, the losses of each expid are averaged separately for each fold
    id in ``range(num_folds)``, using the folds returned by ``read_losses``
    to select which losses belong to which fold.

    :param dset: dataset identifier, forwarded to ``read_losses``
    :param feats: features identifier, forwarded to ``read_losses``
    :param model: model identifier, forwarded to ``read_losses``
    :param lso: forwarded to ``read_losses``
    :param calibration: calibration identifier, forwarded to ``read_losses``
    :param num_folds: number of fold ids (0 .. num_folds - 1) to look for in
                      each expid. Defaults to 10, the value that used to be
                      hard-coded. (expid, fold) pairs without any loss are
                      dropped from the result, so an upper bound is enough.
                      Must be at least 1, since ``pd.concat`` rejects an
                      empty list.
    :return: a tuple ``(tidy_df, total_mean, total_std)`` where

      - ``tidy_df`` is a dataframe with one row per (expid, foldid) and the
        columns ``expid``, ``foldid`` and ``mean loss``
      - ``total_mean`` is the mean of the per-expid mean losses
      - ``total_std`` is the standard deviation of the per-expid mean losses
    """
    df_losses, df_folds = read_losses(dset=dset, feats=feats, model=model, calibration=calibration, lso=lso,
                                      also_folds=True)

    # For each expid, the average loss across all molecules.
    df_mean = df_losses.mean(axis=1)
    total_mean = df_mean.mean(axis=0)
    total_std = df_mean.std(axis=0)

    # What we really want is how each split performed: mask the losses of each
    # expid by fold so as to get (expid, foldid) -> mean loss.
    df_means_by_fold = [df_losses[df_folds == fold].mean(axis=1) for fold in range(num_folds)]

    # One row per expid, one column per fold id.
    big_df = pd.concat(df_means_by_fold, axis=1)
    # Copy the index into a column so that it does not get lost during melting.
    big_df['expid'] = big_df.index
    tidy_df = pd.melt(big_df, value_name='mean loss', var_name='foldid', id_vars='expid').dropna()
    return tidy_df, total_mean, total_std
def generate_df_results_source(molid, importances, dset, feats, model, calibration, lso):
    """
    Build a table relating every other source to the molecule ``molid``.

    One row is produced for each entry of the ``sources`` returned by
    ``sources_coocurrences_df`` that is different from the source of
    ``molid`` (as given by ``molid2source``). Each row holds:

      - ``relabsimportance``: ``abs(importance)`` divided by the sum of the
        absolute importances over all rows of the table
      - ``importance``: the entry of ``importances`` at the position the
        source has in ``sources``
      - ``cooc_loss``: the value returned by ``average_loss_source`` for the
        pair ``(molid, source)``

    :param molid: identifier of the reference molecule; its own source is excluded
    :param importances: indexable of importances, aligned by position with ``sources``
    :param dset: dataset identifier, forwarded to the readers
    :param feats: features identifier, forwarded to the readers
    :param model: model identifier, forwarded to the readers
    :param calibration: calibration identifier, forwarded to ``read_losses``
    :param lso: forwarded to the readers
    :return: dataframe indexed by ``source`` with the columns
             ``relabsimportance``, ``importance``, ``cooc_loss``
    :rtype: pandas.DataFrame
    """
    cooccurrences, sources, expids, _ = sources_coocurrences_df(dset=dset, feats=feats, model=model, lso=lso)
    # The folds are requested exactly as before, but they are not needed here.
    df_losses, _ = read_losses(dset=dset, feats=feats, model=model, calibration=calibration, lso=lso,
                               also_folds=True)
    mols = ManysourcesDataset(dset).mols()
    # The source of the reference molecule does not change inside the loop,
    # so it is looked up once here.
    own_source = mols.molid2source(molid)

    rows = {}
    # enumerate gives the position of each source directly, instead of
    # re-scanning the whole sources array on every iteration. Skipping sources
    # that already have a row keeps the "first occurrence wins" behaviour.
    for position, src in enumerate(sources):
        if src == own_source or src in rows:
            continue
        rows[src] = {
            'importance': importances[position],
            'cooc_loss': average_loss_source(molid, src, cooccurrences, sources, expids, df_losses, dset),
        }

    df = pd.DataFrame.from_dict(rows, orient='index')
    df.index.names = ['source']
    abs_importance = df.importance.abs()
    df['relabsimportance'] = abs_importance / abs_importance.sum()
    return df[['relabsimportance', 'importance', 'cooc_loss']]