def get_X_source(molid, expids, dset, feats, model, lso=True):
    """
    Given a molid and an experiment coordinate, retrieves the matrix of cooccurrences for the
    folds when the source of that particular molid was in test.

    Parameters
    ----------
    molid
        Identifier of the target molecule; it is mapped to its source with `molid2source`.
    expids
        Row selector applied to the matrix that remains *after* keeping only the rows where
        the source of `molid` is flagged. It therefore indexes the filtered matrix, not the
        full co-occurrence matrix.
    dset, feats, model, lso
        Forwarded unchanged to `sources_coocurrences_df`; `dset` is also used to build the
        `ManysourcesDataset` from which the molecules are read.

    Returns
    -------
    numpy.ndarray
        The selected co-occurrence rows converted to an integer array.

    Raises
    ------
    IndexError
        If the source of `molid` does not appear in the sources returned by
        `sources_coocurrences_df`.
    """
    mols = ManysourcesDataset(dset).mols()
    cooc, sources, _, _ = sources_coocurrences_df(dset=dset, feats=feats, model=model, lso=lso)

    source_of_molid = mols.molid2source(molid)
    index_of_source = np.where(sources == source_of_molid)[0][0]

    # The column on which we put the condition.
    # NOTE(review): used as a row mask below, so it is presumably boolean - confirm.
    source_in_test = cooc[:, index_of_source]
    rows_of_interest = cooc[source_in_test]

    # Filter out the rows where we had troubles validating the model.
    X = rows_of_interest[expids, :]

    # `np.int` was only an alias of the builtin `int`; it was deprecated in NumPy 1.20 and
    # removed in 1.24, so the builtin is used to keep the same dtype on every NumPy version.
    return np.array(X, dtype=int)
def average_loss_source(molid, source, scoocs, sources, expids, losses_df, dset):
    """
    Average the losses of the molecules of `source` over a subset of the experiments.

    The experiments kept are those where the column of the source of `molid` (the target
    molecule, the one for which the model was built) is set in `scoocs` and the column of
    `source` is not set.

    NOTE(review): the original description of this function talks about the average loss of
    the target molecule, while the columns read from `losses_df` are the molecules of
    `source` - confirm which one is intended.

    Parameters
    ----------
    molid
        Target molecule identifier; only used to find its source.
    source
        Source whose molecules select the columns of `losses_df`.
    scoocs
        Experiments x sources co-occurrence matrix (presumably boolean, since its columns
        are combined with `~` and `&` - confirm).
    sources
        Array of source names; the position of a name gives its column in `scoocs`.
    expids
        Experiment identifiers, filtered with the mask built from `scoocs`.
    losses_df
        Table indexed by experiment identifier with one column per molecule id.
    dset
        Dataset identifier used to build the `ManysourcesDataset`.

    Returns
    -------
    The mean of the per-column means of the selected losses.

    Raises
    ------
    IndexError
        If either source is not present in `sources`.
    """
    mols = ManysourcesDataset(dset).mols()
    target_source = mols.molid2source(molid)

    # Column positions of both sources in the co-occurrence matrix.
    # FIXME: this should be taken from hub / molecules
    target_col = np.where(sources == target_source)[0][0]
    source_molids = mols.source2molids(source)
    source_col = np.where(sources == source)[0][0]

    # Keep experiments where the target's source is flagged and `source` is not.
    # FIXME: this must be parameterizable
    keep = scoocs[:, target_col] & ~scoocs[:, source_col]

    selected_losses = losses_df.loc[expids[keep], source_molids]
    return selected_losses.mean().mean()
def generate_df_results_source(molid, importances, dset, feats, model, calibration, lso):
    """
    Build a per-source table of importances and co-occurrence losses for one molecule.

    Every source returned by `sources_coocurrences_df` except the source of `molid` itself
    gets one row.

    Parameters
    ----------
    molid
        Target molecule identifier.
    importances
        Sequence indexed by the position of each source in the sources array.
    dset, feats, model, lso
        Forwarded to `sources_coocurrences_df` and `read_losses`; `dset` is also used to
        build the `ManysourcesDataset` and is passed on to `average_loss_source`.
    calibration
        Forwarded to `read_losses`.

    Returns
    -------
    pandas.DataFrame
        Indexed by 'source', with columns 'relabsimportance' (absolute importance divided
        by the sum of absolute importances), 'importance' and 'cooc_loss' (the value
        returned by `average_loss_source` for that source).
    """
    # Only the first three return values / the losses are needed here.
    cooccurrences, sources, expids, _ = sources_coocurrences_df(dset=dset, feats=feats, model=model, lso=lso)
    df_losses, _ = read_losses(dset=dset, feats=feats, model=model, calibration=calibration, lso=lso,
                               also_folds=True)

    mols = ManysourcesDataset(dset).mols()
    # The target's source does not change across iterations: look it up once.
    target_source = mols.molid2source(molid)

    rows = {}
    for src in sources:
        if src == target_source:
            continue
        rows[src] = {
            # Position of the first occurrence of `src` in `sources`.
            'importance': importances[np.where(sources == src)[0][0]],
            'cooc_loss': average_loss_source(molid, src, cooccurrences, sources, expids, df_losses, dset),
        }

    df = pd.DataFrame.from_dict(rows, orient='index')
    df.index.names = ['source']
    abs_importance = df.importance.abs()
    df['relabsimportance'] = abs_importance / abs_importance.sum()
    return df[['relabsimportance', 'importance', 'cooc_loss']]