def get_X_source(molid, expids, dset, feats, model, lso=True):
    """
    Given a molid and an experiment coordinate, retrieve the matrix of source
    cooccurrences for the folds in which the source of that molid was in test.

    :param molid: identifier of the molecule whose source selects the folds.
    :param expids: row selector applied to the filtered cooccurrence matrix
                   (anything numpy accepts as a first-axis index).
    :param dset: dataset identifier, forwarded to ``ManysourcesDataset`` and
                 ``sources_coocurrences_df``.
    :param feats: features identifier, forwarded to ``sources_coocurrences_df``.
    :param model: model identifier, forwarded to ``sources_coocurrences_df``.
    :param lso: forwarded to ``sources_coocurrences_df``
                (presumably "leave sources out" -- TODO confirm).

    :return: the selected rows of the cooccurrence matrix as an integer array.
    :raises IndexError: if the source of ``molid`` is not found in the sources
                        returned by ``sources_coocurrences_df``.
    """
    mols = ManysourcesDataset(dset).mols()
    cooc, sources, _, _ = sources_coocurrences_df(dset=dset, feats=feats, model=model, lso=lso)
    source_of_molid = mols.molid2source(molid)
    # Position of the molecule's source among the cooccurrence columns (first match).
    index_of_source = np.where(sources == source_of_molid)[0][0]
    # The column on which we put the condition: it is used as a row selector,
    # which assumes cooc holds boolean values -- TODO confirm.
    col = cooc[:, index_of_source]
    interesting_cooc = cooc[col]  # the matrix X, before row filtering
    # Filter out the rows where we had troubles validating the model.
    # NOTE(review): expids index the already-filtered matrix here, not the
    # original cooc rows -- confirm this is the intended coordinate system.
    X = interesting_cooc[expids, :]
    # Builtin int instead of np.int: the alias was removed in NumPy 1.24.
    return np.array(X, dtype=int)
def generate_df_results_source(molid, importances, dset, feats, model, calibration, lso):
    """
    Build a per-source summary dataframe for a molecule.

    Every source returned by ``sources_coocurrences_df`` gets one row, except
    the source of ``molid`` itself, which is skipped.

    :param molid: identifier of the molecule under study.
    :param importances: sequence indexed like the sources returned by
                        ``sources_coocurrences_df`` (one value per source).
    :param dset: dataset identifier.
    :param feats: features identifier.
    :param model: model identifier.
    :param calibration: forwarded to ``read_losses``.
    :param lso: forwarded to ``sources_coocurrences_df`` and ``read_losses``.

    :return: a pandas.DataFrame indexed by 'source' with the columns
             'relabsimportance' (absolute importance divided by the sum of
             absolute importances over the rows), 'importance' and 'cooc_loss'
             (the value returned by ``average_loss_source``).
    """
    cooccurrences, sources, expids, _ = sources_coocurrences_df(dset=dset, feats=feats, model=model, lso=lso)
    # read_losses is asked for the folds too (also_folds=True), but only the losses are used here.
    df_losses, _ = read_losses(dset=dset,
                               feats=feats,
                               model=model,
                               calibration=calibration,
                               lso=lso,
                               also_folds=True)

    # The molecule's own source does not depend on the loop variable: look it up once.
    source_of_molid = ManysourcesDataset(dset).mols().molid2source(molid)

    dico_for_df = {}
    for src in sources:
        if src == source_of_molid:
            continue
        # First position of src in sources gives its entry in importances.
        src_index = np.where(sources == src)[0][0]
        dico_for_df[src] = {'importance': importances[src_index],
                            'cooc_loss': average_loss_source(molid, src, cooccurrences, sources, expids, df_losses,
                                                             dset)}
    df = pd.DataFrame.from_dict(dico_for_df, orient='index')
    df.index.names = ['source']
    abs_importance = df.importance.abs()
    df['relabsimportance'] = abs_importance / abs_importance.sum()
    return df[['relabsimportance', 'importance', 'cooc_loss']]
Exemple #3
0
    def scoocs(self):
        """
        Returns a multilevel-indexed dataframe of sources coocurrences in test for each partition train/test.

        The dataframe from this palindromic function has:
          - a sorted index with two levels (expid, fold)
          - a sorted column index, one column per source
          - boolean values
        It would look like this

        |----------------------------------------|
        |     index       |        data          |
        |-----------------|----------------------|
        | expid  | foldid |  src1  | src2  | ... |
        |----------------------------------------|
        |   0    |   0    |  False | False | ... |
        |   1    |   0    |  True  | False | ... |
        | ...    |  ...   |  ...   |  ...  | ... |
        |----------------------------------------|

        :rtype: pandas.DataFrame
        """
        scoocs, sources, expids, folds = sources_coocurrences_df(
            expids=self.expids,
            dset=self.dset_id,
            feats=self.feats,
            model=self.model,
            lso=self.lso
        )
        # Name the levels at construction time: assigning to index.levels[i].name
        # raises RuntimeError on pandas >= 1.0.
        index = MultiIndex.from_arrays(arrays=(expids, folds), names=('expid', 'fold'))
        mcooc_df = pd.DataFrame(data=scoocs,
                                index=index,
                                columns=sources)

        # Sort rows by (expid, fold) and columns by source.
        return mcooc_df.sort_index(axis=0).sort_index(axis=1)