def get_X_source(molid, expids, dset, feats, model, lso=True):
    """
    Given a molid and an experiment coordinate, retrieve the matrix of source co-occurrences restricted to the folds
    in which the source of that particular molid was in test.

    :param molid: id of the molecule whose source selects the rows of the co-occurrence matrix
    :param expids: row selector applied to the already restricted matrix (see the note in the body)
    :param dset: dataset identifier, passed to ManysourcesDataset and sources_coocurrences_df
    :param feats: features identifier, passed to sources_coocurrences_df
    :param model: model identifier, passed to sources_coocurrences_df
    :param lso: passed through to sources_coocurrences_df
    :return: numpy array of integers, one row per selected experiment and one column per source
    :raises IndexError: if the source of molid does not appear in the sources returned by sources_coocurrences_df
    """
    mols = ManysourcesDataset(dset).mols()
    cooc, sources, _, _ = sources_coocurrences_df(dset=dset, feats=feats, model=model, lso=lso)
    source_of_molid = mols.molid2source(molid)
    # Position of the first entry of `sources` equal to the source of the molecule
    index_of_source = np.where(sources == source_of_molid)[0][0]
    # The column on which we put the condition
    # NOTE(review): assumes cooc is a boolean array, so that this column acts as a row mask - confirm
    in_test = cooc[:, index_of_source]
    interesting_cooc = cooc[in_test]  # the matrix X, before selecting experiments
    # Filter out the rows where we had troubles validating the model
    # NOTE(review): expids indexes the rows that survived the mask above, not the rows of cooc - confirm with callers
    X = interesting_cooc[expids, :]
    # The np.int alias was removed in NumPy 1.24; the builtin int is the very same dtype
    return np.array(X, dtype=int)
def average_loss_source(molid, source, scoocs, sources, expids, losses_df, dset):
    """
    Average the losses of the molecules of `source` over a subset of the experiments.

    molid is the target molid (the one for which we have built the model); source is the source for which we know
    its importance for that molid.

    An experiment is kept when the co-occurrence column of the source of molid is set and the co-occurrence column
    of `source` is not set (presumably: the target source is in test while `source` is in train - TODO confirm).

    :param molid: id of the target molecule
    :param source: the source whose molecules select the columns of losses_df
    :param scoocs: co-occurrences array, one column per entry of `sources`
                   (assumed boolean, as it is negated and and-ed - TODO confirm)
    :param sources: array of sources, aligned with the columns of scoocs
    :param expids: array of experiment ids, aligned with the rows of scoocs
    :param losses_df: losses data frame, indexed by experiment id, with one column per molid
    :param dset: dataset identifier, passed to ManysourcesDataset
    :return: the mean, over the molecules of `source`, of the per-molecule mean loss on the kept experiments
    :raises IndexError: if either source is not found in `sources`
    """
    mols = ManysourcesDataset(dset).mols()
    target_source = mols.molid2source(molid)
    # Column positions in scoocs  # FIXME: this should be taken from hub / molecules
    target_col = np.where(sources == target_source)[0][0]
    source_molids = mols.source2molids(source)
    other_col = np.where(sources == source)[0][0]
    # Keep the experiments flagged for the target source and not flagged for the other source
    # FIXME: this must be parameterizable
    keep = scoocs[:, target_col] & ~scoocs[:, other_col]
    selected_losses = losses_df.loc[expids[keep], source_molids]
    # Mean of the per-column means, not the mean over all the cells at once
    per_molecule_mean = selected_losses.mean()
    return per_molecule_mean.mean()
def generate_df_results_source(molid, importances, dset, feats, model, calibration, lso):
    """
    Build a data frame with, for each source other than the source of molid, its importance and the corresponding
    co-occurrence average loss (see average_loss_source).

    :param molid: id of the target molecule; its own source is left out of the result
    :param importances: sequence of importances, indexed like the sources returned by sources_coocurrences_df
    :param dset: dataset identifier
    :param feats: features identifier
    :param model: model identifier
    :param calibration: calibration identifier, passed to read_losses
    :param lso: passed through to sources_coocurrences_df and read_losses
    :return: data frame indexed by 'source', with columns 'relabsimportance' (absolute importance divided by the sum
             of the absolute importances of the rows), 'importance' and 'cooc_loss'
    """
    # The folds are not needed here
    cooccurrences, sources, expids, _ = sources_coocurrences_df(dset=dset, feats=feats, model=model, lso=lso)
    # Only the losses are used; the folds data frame returned alongside them is discarded
    df_losses, _ = read_losses(dset=dset,
                               feats=feats,
                               model=model,
                               calibration=calibration,
                               lso=lso,
                               also_folds=True)

    # The source of the target molecule does not change within the loop, so look it up only once
    source_of_molid = ManysourcesDataset(dset).mols().molid2source(molid)

    dico_for_df = {}
    for src in sources:
        if src == source_of_molid:
            continue
        # Position of the first entry of `sources` equal to src
        src_index = np.where(sources == src)[0][0]
        dico_for_df[src] = {'importance': importances[src_index],
                            'cooc_loss': average_loss_source(molid, src, cooccurrences, sources, expids, df_losses,
                                                             dset)}
    df = pd.DataFrame.from_dict(dico_for_df, orient='index')
    df.index.names = ['source']
    abs_importance = df.importance.abs()
    df['relabsimportance'] = abs_importance / abs_importance.sum()
    return df[['relabsimportance', 'importance', 'cooc_loss']]