def generate_df_results(molid, importances, dset, feats, model, calibration, lso):
    """
    Build a summary table describing every molecule other than *molid*.

    For each other molecule the table holds its entry in *importances*, the
    value returned by ``average_loss`` for the pair (molid, other molecule)
    and the SMILES string of the molecule.

    :param molid: the reference molecule; it is excluded from the result
    :param importances: sequence indexed by position in the molids array
                        returned by ``molecules_coocurrences_df``
    :param dset, feats, model, calibration, lso: experiment coordinates,
           forwarded to ``molecules_coocurrences_df`` and ``read_losses``

    :return: dataframe indexed by 'molid' with the columns
             'relabsimportance', 'importance', 'smiles', 'cooc_loss'
    :rtype: pandas.DataFrame
    """
    cooccurrences, molids, expids, folds = molecules_coocurrences_df(dset=dset,
                                                                     feats=feats,
                                                                     model=model,
                                                                     lso=lso)
    df_losses, folds_df = read_losses(dset=dset,
                                      feats=feats,
                                      model=model,
                                      calibration=calibration,
                                      lso=lso,
                                      also_folds=True)
    rows = defaultdict(dict)
    mols = ManysourcesDataset(dset).mols()

    for candidate in molids:
        if candidate != molid:
            # Position of the first occurrence of this molecule in molids
            position = np.where(molids == candidate)[0][0]
            rows[candidate] = {
                'importance': importances[position],
                'cooc_loss': average_loss(molid, candidate, cooccurrences, molids, expids, df_losses),
                'smiles': MolToSmiles(mols.molid2mol(candidate)),
            }

    df = pd.DataFrame.from_dict(rows, orient='index')
    df.index.names = ['molid']
    # Share of each molecule in the total absolute importance
    df['relabsimportance'] = df.importance.abs() / df.importance.abs().sum()

    ordered_columns = ['relabsimportance', 'importance', 'smiles', 'cooc_loss']
    return df[ordered_columns]
def do_the_job(dset, feats, model, calibration=None, lso=True,
               regression_model=('linreg', LinearRegression),
               results_dir=op.join(MANYSOURCES_DATA_ROOT, 'results', 'loss_by_cooc'),
               n_jobs=None, by_source=False):
    """
    Run ``do_for_one_molid`` in parallel for every molecule of an experiment.

    Results go to a subdirectory of *results_dir* whose path encodes all the
    experiment coordinates; the directory is created if needed.

    :param dset, feats, model, calibration, lso: experiment coordinates
    :param regression_model: a pair (name, factory); the name goes into the
                             results path, the factory is handed to each job
    :param results_dir: root directory under which results are stored
    :param n_jobs: number of parallel jobs; None means ``cpu_count()``
    :param by_source: forwarded to ``do_for_one_molid`` and encoded in the path
    """
    rm_name, rm_factory = regression_model

    # One path component per experiment coordinate
    path_components = (
        'dset=%s' % dset,
        'feats=%s' % feats,
        'model=%s' % model,
        'calibration=%s' % calibration,
        'LSO=%r' % lso,
        'reg_model=%s' % rm_name,
        'bysource=%r' % by_source,
    )
    results_dir = op.join(results_dir, *path_components)
    ensure_dir(results_dir)

    _cooc, molids, _expids, _folds = molecules_coocurrences_df(dset=dset, feats=feats, model=model, lso=lso)

    workers = cpu_count() if n_jobs is None else n_jobs
    runner = Parallel(n_jobs=workers)
    runner(
        delayed(do_for_one_molid)(calibration, dset, feats, lso, model, molid,
                                  results_dir, rm_factory, by_source)
        for molid in sorted(molids)
    )
def get_X(molid, expids, dset, feats, model, lso=True):
    """
    Given a molid and an experiment coordinate, retrieves the matrix of
    cooccurrences for the folds when the molid was in test.

    :param molid: the molecule whose column selects the rows of interest
    :param expids: row selector applied to the already filtered matrix
                   (presumably the rows for which the model could be
                   validated; confirm against the callers)
    :param dset, feats, model, lso: experiment coordinates, forwarded to
           ``molecules_coocurrences_df``

    :return: the selected cooccurrences as an integer numpy array
    :raises IndexError: if *molid* is not present in the molids array
    """
    cooc, molids, _, _ = molecules_coocurrences_df(dset=dset, feats=feats, model=model, lso=lso)
    index_of_molid = np.where(molids == molid)[0][0]
    # The column on which we put the condition.
    # NOTE(review): using it as a row mask assumes cooc holds booleans.
    molid_column = cooc[:, index_of_molid]
    rows_with_molid = cooc[molid_column]
    # Filter out the rows where we had troubles validating the model
    X = rows_with_molid[expids, :]
    # The np.int alias was removed from NumPy (1.24); it was the builtin int.
    return np.array(X, dtype=int)
def get_xy(dset, feats, model, lso, y_df):
    """
    Assemble the (X, y) pair aligned on the rows of *y_df*.

    For each row of *y_df*, the cooccurrence row registered for its
    (expid, foldid) pair becomes the corresponding row of X, and its
    'class1' value becomes the corresponding entry of y.

    :param dset, feats, model, lso: experiment coordinates, forwarded to
           ``molecules_coocurrences_df``
    :param y_df: table providing the columns 'expid', 'foldid' and 'class1'

    :return: a tuple (X, y); X is an integer numpy array with one row per
             row of *y_df*, y is a numpy array of the 'class1' values
    """
    from manysources.analyses.cooccurrences import molecules_coocurrences_df

    # Cooccurrences of compounds, along with the corresponding expids and fold ids
    cooc, molids, expids, folds = molecules_coocurrences_df(dset, feats=feats, model=model, lso=lso)

    # Map each (expid, fold) pair to its cooccurrence row. The defaultdict is
    # kept so that a pair absent from the cooccurrences still maps to an
    # empty list, exactly as before.
    cooccurrences_dict = defaultdict(list)
    for i, cooc_row in enumerate(cooc):
        cooccurrences_dict[(expids[i], folds[i])] = cooc_row

    y = np.array(y_df['class1'])
    rows = [cooccurrences_dict[(expid, foldid)]
            for expid, foldid in zip(y_df['expid'], y_df['foldid'])]
    # The np.int alias was removed from NumPy (1.24); it was the builtin int.
    X = np.array(rows, dtype=int)
    return X, y
def mcoocs(self):
    """
    Returns a multilevel-indexed dataframe of molecules coocurrences in test
    for each partition train/test.

    The dataframe returned by this function has
      - a sorted index with two levels (expid, fold)
      - a sorted column index, one column per molecule
      - boolean values

    It would look like this

    |----------------------------------------|
    |      index      |         data         |
    |-----------------|----------------------|
    | expid  | fold   | mol1  | mol2  | ...  |
    |----------------------------------------|
    |   0    |   0    | False | False | ...  |
    |   1    |   0    | True  | False | ...  |
    |  ...   |  ...   | ...   | ...   | ...  |
    |----------------------------------------|

    :rtype: pandas.DataFrame
    """
    mcoocs, molids, expids, folds = molecules_coocurrences_df(
        expids=self.expids,
        dset=self.dset_id,
        feats=self.feats,
        model=self.model,
        lso=self.lso
    )
    # Name the levels at construction time: assigning to index.levels[i].name
    # does not name the levels on recent pandas (it raises a RuntimeError).
    index = MultiIndex.from_arrays(arrays=(expids, folds), names=('expid', 'fold'))
    mcooc_df = pd.DataFrame(data=mcoocs, index=index, columns=molids)
    return mcooc_df.sort_index(axis=0).sort_index(axis=1)