Exemple #1
0
    def squared_losses(self):
        """
        Return the (expid, molid) -> squared loss table, loading it on first use.

        The table is a pandas dataframe with one row per experiment id and one
        column per molecule id; each cell holds the squared loss
        (label - score) ** 2 (usable, e.g., to compute a Brier score).

        Both the rows and the columns are sorted by label, and every column
        (molecule) that contains at least one NaN is dropped.

        Schematically:

        |----------------------------|
        |        | mol1 | mol2 | ... |
        |----------------------------|
        | expid1 | 0.75 | 0.23 | ... |
        | expid2 | 0.31 | 0.33 | ... |
        |  ...   |  ... | ...  | ... |
        |----------------------------|

        The result is cached in self._df_losses, so the losses are read only once
        per instance.

        :rtype: pandas.DataFrame
        """
        # Fast path: the table was already loaded by an earlier call.
        if self._df_losses is not None:
            return self._df_losses

        raw_losses, _ = read_losses(dset=self.dset_id,
                                    expids=self.expids,
                                    feats=self.feats,
                                    model=self.model,
                                    lso=self.lso,
                                    calibration=self.score_norm)
        rows_sorted = raw_losses.sort_index(axis=0)
        fully_sorted = rows_sorted.sort_index(axis=1)
        # axis=1: discard whole columns holding any NaN.
        self._df_losses = fully_sorted.dropna(axis=1)
        return self._df_losses
def generate_df_results(molid, importances, dset, feats, model, calibration, lso):
    """
    Build a per-molecule summary table relative to the query molecule `molid`.

    For every molecule id returned by molecules_coocurrences_df, except `molid`
    itself, one row is produced with:
      - 'importance': the entry of `importances` at the position where that
        molecule id first appears in the molids array,
      - 'cooc_loss': the value returned by average_loss for the
        (molid, other molecule) pair,
      - 'smiles': the SMILES string of the other molecule.
    A 'relabsimportance' column is then added: the absolute importance divided
    by the sum of all absolute importances in the table.

    NOTE(review): the positional lookup uses an elementwise `molids == id`
    comparison, so molids is presumably a numpy array aligned with
    `importances` - confirm against molecules_coocurrences_df.

    :return: dataframe indexed by 'molid' with columns
             ['relabsimportance', 'importance', 'smiles', 'cooc_loss']
    :rtype: pandas.DataFrame
    """
    cooccurrences, molids, expids, _ = molecules_coocurrences_df(dset=dset, feats=feats, model=model, lso=lso)
    df_losses, _ = read_losses(dset=dset, feats=feats, model=model,
                               calibration=calibration, lso=lso, also_folds=True)
    mols = ManysourcesDataset(dset).mols()

    rows = defaultdict(dict)
    for other_molid in molids:
        if other_molid != molid:
            # Position of this molecule in molids, used to pick its importance.
            position = np.where(molids == other_molid)[0][0]
            importance = importances[position]
            cooc_loss = average_loss(molid, other_molid, cooccurrences, molids, expids, df_losses)
            smiles = MolToSmiles(mols.molid2mol(other_molid))
            rows[other_molid] = {'importance': importance,
                                 'cooc_loss': cooc_loss,
                                 'smiles': smiles}

    df = pd.DataFrame.from_dict(rows, orient='index')
    df.index.names = ['molid']
    abs_importance = df.importance.abs()
    df['relabsimportance'] = abs_importance / abs_importance.sum()
    return df[['relabsimportance', 'importance', 'smiles', 'cooc_loss']]
def get_y(molid, dset, feats, model, calibration=None, lso=True):
    """
    Fetch the losses of a single molecule across all experiments.

    The loss matrix identified by the experiment coordinates (dset, feats,
    model, calibration, lso) is read with read_losses and the column of
    `molid` is extracted from it.

    :return: a tuple (y, expids) where y is a numpy array with the losses of
             `molid` and expids is the index of the loss matrix (the original
             note mentions 4095 expids).
    """
    loss_matrix, _ = read_losses(dset=dset, feats=feats, model=model, calibration=calibration, lso=lso)
    return np.array(loss_matrix[molid]), loss_matrix.index
def average_loss(dset, feats, model, lso, calibration):
    """
    Summarise the losses of a set of experiments, overall and per fold.

    Each expid has one loss per molecule. These are averaged per expid, and the
    mean and standard deviation of those per-expid averages are computed.
    In addition, for each fold id in 0..9, the losses are masked to the cells
    whose entry in the folds table equals that fold id and averaged per expid.

    :return: a tuple (tidy_df, total_mean, total_std) where
             - tidy_df has columns 'expid', 'foldid' and 'mean loss', one row per
               (expid, foldid) pair that has a non-NaN mean loss,
             - total_mean is the mean of the per-expid average losses,
             - total_std is the standard deviation of the per-expid average losses.
    """
    losses, fold_ids = read_losses(dset=dset, feats=feats, model=model,
                                   calibration=calibration, lso=lso, also_folds=True)

    # One average per expid (across all molecules), then statistics across expids.
    per_expid_mean = losses.mean(axis=1)
    total_mean = per_expid_mean.mean(axis=0)
    total_std = per_expid_mean.std(axis=0)

    # To see how each split performed, keep only the cells belonging to a given
    # fold (everything else becomes NaN) and average each expid over what remains.
    per_fold_means = []
    for fold in range(10):
        in_fold = fold_ids == fold
        per_fold_means.append(losses[in_fold].mean(axis=1))

    # Wide table: expids as rows, one column per fold (at most 10 folds per expid).
    wide = pd.concat(per_fold_means, axis=1)
    wide['expid'] = wide.index   # keep the index as a column so melting does not lose it
    tidy_df = pd.melt(wide, id_vars='expid', var_name='foldid', value_name='mean loss').dropna()

    return tidy_df, total_mean, total_std
def generate_df_results_source(molid, importances, dset, feats, model, calibration, lso):
    """
    Build a per-source summary table relative to the query molecule `molid`.

    For every source returned by sources_coocurrences_df, except the source the
    query molecule itself belongs to, one row is produced with:
      - 'importance': the entry of `importances` at the position where that
        source first appears in the sources array,
      - 'cooc_loss': the value returned by average_loss_source for the
        (molid, source) pair.
    A 'relabsimportance' column is then added: the absolute importance divided
    by the sum of all absolute importances in the table.

    NOTE(review): the positional lookup uses an elementwise `sources == src`
    comparison, so sources is presumably a numpy array aligned with
    `importances` - confirm against sources_coocurrences_df.

    :return: dataframe indexed by 'source' with columns
             ['relabsimportance', 'importance', 'cooc_loss']
    :rtype: pandas.DataFrame
    """
    cooccurrences, sources, expids, _ = sources_coocurrences_df(dset=dset, feats=feats, model=model, lso=lso)
    df_losses, _ = read_losses(dset=dset,
                               feats=feats,
                               model=model,
                               calibration=calibration,
                               lso=lso,
                               also_folds=True)
    dico_for_df = defaultdict(dict)

    MRDK = ManysourcesDataset(dset).mols()
    # The source of the query molecule is the same for every iteration:
    # resolve it once instead of once per candidate source.
    own_source = MRDK.molid2source(molid)
    for src in sources:
        if src == own_source:
            continue
        dico_for_df[src] = {'importance': importances[np.where(sources == src)[0][0]],
                            'cooc_loss': average_loss_source(molid, src, cooccurrences, sources, expids, df_losses,
                                                             dset)}
    df = pd.DataFrame.from_dict(dico_for_df, orient='index')
    df.index.names = ['source']
    abs_importance = df.importance.abs()
    df['relabsimportance'] = abs_importance / abs_importance.sum()
    return df[['relabsimportance', 'importance', 'cooc_loss']]