def generate_df_results(molid, importances, dset, feats, model, calibration, lso):
    """
    Build a per-molecule summary dataframe relative to the reference molecule `molid`.

    For every molecule returned by molecules_coocurrences_df other than `molid`, the result holds:
      - 'importance': the entry of `importances` at that molecule's position in `molids`
      - 'cooc_loss': the value of average_loss for the pair (molid, other molecule)
      - 'smiles': the SMILES string of the other molecule
      - 'relabsimportance': |importance| divided by the sum of |importance| over all rows

    :param molid: identifier of the reference molecule; it gets no row in the result
    :param importances: sequence indexable by position in `molids`
                        (presumably one value per molecule -- TODO confirm against callers)
    :param dset: dataset identifier, forwarded to the project helpers
    :param feats: features identifier, forwarded to the project helpers
    :param model: model identifier, forwarded to the project helpers
    :param calibration: calibration identifier, forwarded to read_losses
    :param lso: forwarded to molecules_coocurrences_df and read_losses
    :return: pandas.DataFrame indexed by 'molid' with columns
             ['relabsimportance', 'importance', 'smiles', 'cooc_loss']
    """
    cooccurrences, molids, expids, _ = molecules_coocurrences_df(dset=dset, feats=feats, model=model, lso=lso)
    # also_folds=True makes read_losses return a pair; only the losses are needed here.
    df_losses, _ = read_losses(dset=dset,
                               feats=feats,
                               model=model,
                               calibration=calibration,
                               lso=lso,
                               also_folds=True)

    # Position of the first occurrence of each molid, computed once instead of scanning
    # `molids` with np.where on every loop iteration (which made the loop quadratic).
    first_position = {}
    for position, known_molid in enumerate(molids):
        first_position.setdefault(known_molid, position)

    mols = ManysourcesDataset(dset).mols()
    rows = {}
    for other_molid in molids:
        if other_molid == molid:
            continue
        rows[other_molid] = {'importance': importances[first_position[other_molid]],
                             'cooc_loss': average_loss(molid,
                                                       other_molid,
                                                       cooccurrences,
                                                       molids,
                                                       expids,
                                                       df_losses),
                             'smiles': MolToSmiles(mols.molid2mol(other_molid))}

    df = pd.DataFrame.from_dict(rows, orient='index')
    df.index.names = ['molid']
    abs_importance = df.importance.abs()
    df['relabsimportance'] = abs_importance / abs_importance.sum()
    return df[['relabsimportance', 'importance', 'smiles', 'cooc_loss']]
def do_the_job(dset,
               feats,
               model,
               calibration=None,
               lso=True,
               regression_model=('linreg', LinearRegression),
               results_dir=op.join(MANYSOURCES_DATA_ROOT, 'results', 'loss_by_cooc'),
               n_jobs=None,
               by_source=False):
    """
    Run do_for_one_molid for every molecule of an experiment configuration, in parallel.

    The output directory is `results_dir` extended with one sub-directory per configuration
    value (dset, feats, model, calibration, LSO, regression model name, bysource); it is
    passed to ensure_dir before any job is dispatched.

    :param dset: dataset identifier
    :param feats: features identifier
    :param model: model identifier
    :param calibration: calibration identifier (None by default)
    :param lso: forwarded to molecules_coocurrences_df and do_for_one_molid
    :param regression_model: pair (name, factory); the name goes into the results path,
                             the factory is handed to do_for_one_molid
    :param results_dir: root directory under which the configuration-specific directory lives
    :param n_jobs: number of parallel jobs; None means cpu_count()
    :param by_source: forwarded to do_for_one_molid and recorded in the results path
    """
    rm_name, rm_factory = regression_model

    # One path component per configuration value, in a fixed order.
    config_parts = (
        'dset=%s' % dset,
        'feats=%s' % feats,
        'model=%s' % model,
        'calibration=%s' % calibration,
        'LSO=%r' % lso,
        'reg_model=%s' % rm_name,
        'bysource=%r' % by_source,
    )
    results_dir = op.join(results_dir, *config_parts)
    ensure_dir(results_dir)

    # Only the molecule ids are needed from the cooccurrences data.
    _cooc, molids, _expids, _folds = molecules_coocurrences_df(dset=dset, feats=feats, model=model, lso=lso)

    workers = cpu_count() if n_jobs is None else n_jobs

    runner = Parallel(n_jobs=workers)
    tasks = (
        delayed(do_for_one_molid)(calibration,
                                  dset, feats, lso, model,
                                  one_molid, results_dir, rm_factory, by_source)
        for one_molid in sorted(molids)
    )
    runner(tasks)
def get_X(molid, expids, dset, feats, model, lso=True):
    """
    Given a molid and an experiment coordinate, retrieves the matrix of cooccurrences for the folds when the molid
    was in test.

    :param molid: identifier of the molecule of interest; must be present in the molids returned by
                  molecules_coocurrences_df
    :param expids: row selector applied to the sub-matrix of rows where `molid` was in test
                   (NOTE(review): these index the already-filtered rows, not the full matrix -- confirm
                   this is what callers intend)
    :param dset: dataset identifier, forwarded to molecules_coocurrences_df
    :param feats: features identifier, forwarded to molecules_coocurrences_df
    :param model: model identifier, forwarded to molecules_coocurrences_df
    :param lso: forwarded to molecules_coocurrences_df
    :return: numpy integer array with the selected cooccurrence rows
    :raises IndexError: if `molid` is not found in molids
    """
    cooc, molids, _, _ = molecules_coocurrences_df(dset=dset, feats=feats, model=model, lso=lso)
    index_of_molid = np.where(molids == molid)[0][0]
    # The column of `molid` is used as a row selector; presumably cooc is boolean, so this keeps
    # exactly the partitions in which `molid` was in test -- TODO confirm dtype.
    in_test = cooc[:, index_of_molid]
    rows_with_molid = cooc[in_test]  # the matrix X before row filtering
    # filter out the rows where we had troubles validating the model
    X = rows_with_molid[expids, :]
    # `np.int` was only an alias of the builtin int and no longer exists in NumPy >= 1.24.
    return np.array(X, dtype=int)
def get_xy(dset, feats, model, lso, y_df):
    """
    Build the (X, y) pair aligning cooccurrence rows with the rows of `y_df`.

    Each row of the cooccurrence matrix is keyed by its (expid, fold) pair; X then stacks, for every
    row of `y_df`, the cooccurrence row matching its ('expid', 'foldid') values.

    :param dset: dataset identifier, forwarded to molecules_coocurrences_df
    :param feats: features identifier, forwarded to molecules_coocurrences_df
    :param model: model identifier, forwarded to molecules_coocurrences_df
    :param lso: forwarded to molecules_coocurrences_df
    :param y_df: table with columns 'expid', 'foldid' and 'class1'
    :return: tuple (X, y): X is an integer numpy array with one cooccurrence row per row of `y_df`,
             y is a numpy array built from the 'class1' column
    """
    from manysources.analyses.cooccurrences import molecules_coocurrences_df
    # get cooccurrences of compounds, along with the corresponding expids and fold ids
    cooc, _, expids, folds = molecules_coocurrences_df(dset, feats=feats, model=model, lso=lso)

    # Kept as a defaultdict(list): an (expid, foldid) pair absent from the cooccurrences
    # resolves to an empty row, exactly as before.
    rows_by_partition = defaultdict(list)
    for expid, fold, row in zip(expids, folds, cooc):
        rows_by_partition[(expid, fold)] = row

    expids_in_y = y_df['expid']
    folds_in_y = y_df['foldid']
    y = np.array(y_df['class1'])
    X = [rows_by_partition[(expid, foldid)] for expid, foldid in zip(expids_in_y, folds_in_y)]
    # `np.int` was only an alias of the builtin int and no longer exists in NumPy >= 1.24.
    X = np.array(X, dtype=int)
    return X, y
Example #5
0
    def mcoocs(self):
        """
        Returns a multilevel-indexed dataframe of molecules coocurrences in test for each partition train/test.

        The dataframe returned by this function has
          - a sorted index with two levels named (expid, fold)
          - a sorted column index, one column per molecule
          - boolean values
        It would look like this

        |----------------------------------------|
        |     index       |        data          |
        |-----------------|----------------------|
        | expid  | fold   |  mol1  | mol2  | ... |
        |----------------------------------------|
        |   0    |   0    |  False | False | ... |
        |   1    |   0    |  True  | False | ... |
        | ...    |  ...   |  ...   |  ...  | ... |
        |----------------------------------------|

        :rtype: pandas.DataFrame
        """
        mcoocs, molids, expids, folds = molecules_coocurrences_df(
            expids=self.expids,
            dset=self.dset_id,
            feats=self.feats,
            model=self.model,
            lso=self.lso
        )
        # Name the levels at construction time. Assigning `.name` on the objects returned by
        # `index.levels` does not name the MultiIndex levels on current pandas (it is rejected
        # in favour of MultiIndex.set_names / the `names` argument).
        index = MultiIndex.from_arrays(arrays=(expids, folds), names=('expid', 'fold'))
        mcooc_df = pd.DataFrame(data=mcoocs,
                                index=index,
                                columns=molids)

        # Sort rows by (expid, fold), then columns by molecule id.
        return mcooc_df.sort_index(axis=0).sort_index(axis=1)