def do_the_job(dset,
               feats,
               model,
               calibration=None,
               lso=True,
               regression_model=('linreg', LinearRegression),
               results_dir=op.join(MANYSOURCES_DATA_ROOT, 'results', 'loss_by_cooc'),
               n_jobs=None,
               by_source=False):
    """Fits one explanatory regression per molecule, in parallel, storing results on disk.

    Parameters mirror the experiment coordinates (dataset, features, model,
    calibration, LSO split) plus the (name, factory) pair for the regression
    model used to explain losses. Results go to one subdirectory per full
    configuration under *results_dir*.
    """
    reg_name, reg_factory = regression_model

    # One results subdirectory per full experimental configuration.
    results_dir = op.join(results_dir,
                          'dset=%s' % dset,
                          'feats=%s' % feats,
                          'model=%s' % model,
                          'calibration=%s' % calibration,
                          'LSO=%r' % lso,
                          'reg_model=%s' % reg_name,
                          'bysource=%r' % by_source)
    ensure_dir(results_dir)

    _, molids, _, _ = molecules_coocurrences_df(dset=dset, feats=feats, model=model, lso=lso)

    # Default to one job per core.
    n_jobs = cpu_count() if n_jobs is None else n_jobs

    Parallel(n_jobs=n_jobs)(
        delayed(do_for_one_molid)(calibration,
                                  dset, feats, lso, model,
                                  molid, results_dir, reg_factory, by_source)
        for molid in sorted(molids))
# Beispiel #2  (scraped-example separator; not Python code)
# 0
    def rdkdescs(self, keep_with_missing=False):
        """Returns (molids, feature_names, descriptors, labels) for RDKit descriptors.

        Descriptors are computed once and cached to HDF5; subsequent calls read
        the cache. Unless *keep_with_missing* is True, molecules with any
        non-finite descriptor value are dropped (rows only; feature names are
        kept as-is).
        """
        if self._rdkdescs is None:
            cache_file = op.join(manysources_dataset_root(self.name),
                                 '02-rdkdescs',
                                 '%s.rdkdescs.h5' % self.name)
            # Compute and persist the descriptors on first use.
            if not op.exists(cache_file):
                computer = RDKitDescripter()
                descriptors = computer.compute(self.mols().mols())
                # We assume all mols have descriptors computed, even if missing
                all_molids = self.mols().molids()
                feature_names = computer.fnames()
                ensure_dir(op.dirname(cache_file))
                with h5py.File(cache_file, 'w-') as h5:
                    h5['rdkdescs'] = descriptors
                    h5['molids'] = all_molids
                    h5['fnames'] = feature_names
                # Record the computer's configuration next to the cache.
                with open(op.join(op.dirname(cache_file), 'config.json'), 'w') as writer:
                    writer.write(computer.what().id())
            # Always load from the cache so in-memory and on-disk data agree.
            with h5py.File(cache_file, 'r') as h5:
                self._rdkdescs = h5['rdkdescs'][:]
                self._rdkdescs_molids = h5['molids'][:]
                self._rdkdescs_fnames = h5['fnames'][:]
                if not keep_with_missing:
                    finite_rows = np.all(np.isfinite(self._rdkdescs), axis=1)
                    self._rdkdescs = self._rdkdescs[finite_rows, :]
                    self._rdkdescs_molids = self._rdkdescs_molids[finite_rows]

        return (self._rdkdescs_molids,
                self._rdkdescs_fnames,
                self._rdkdescs,
                self.Y(self._rdkdescs_molids))
# Beispiel #3  (scraped-example separator; not Python code)
# 0
 def mols(self):
     """Returns the dataset molecules, loading/creating a compressed pickle cache on first use."""
     if self._molecules is not None:
         return self._molecules
     pickle_file = op.join(manysources_dataset_root(self.name),
                           '01-molspickle',
                           '%s.pickled.gz' % self.name)
     if op.isfile(pickle_file):
         # Fast path: cached pickle already exists.
         self._molecules = MoleculesFromRDKit.from_pickle(pickle_file, compressed=True)
     else:
         # Slow path: build from the registered reader and persist for next time.
         self._molecules = MANYSOURCES_MOLECULES.get(self.name, lambda: None)()
         ensure_dir(op.dirname(pickle_file))
         self._molecules.to_pickle(pickle_file, compressed=True)
         with open(op.join(op.dirname(pickle_file), 'config.json'), 'w') as writer:
             writer.write(self._molecules.what().id())
     return self._molecules
# Beispiel #4  (scraped-example separator; not Python code)
# 0
 def _unfolded_ecfps(self):
     """Returns the unfolded ECFP fingerprints, computing and caching them to HDF5 on first use."""
     if self._ecfps is None:
         cache_file = op.join(manysources_dataset_root(self.name), '02-ecfps', '%s.ecfps.h5' % self.name)
         if not op.exists(cache_file):
             # Fingerprint every molecule and persist the result.
             fper = RDKMorganFingerprinter()
             for molid, mol in self.mols():
                 print(molid)
                 fper.add_mol(molid, mol)
             ensure_dir(op.dirname(cache_file))
             # Record the fingerprinter's configuration next to the cache.
             with open(op.join(op.dirname(cache_file), 'config.json'), 'w') as writer:
                 writer.write(fper.what().id())
             fper.fingerprints().save(cache_file)
         # Always (re)load from disk so in-memory and cached data agree.
         self._ecfps = UnfoldedFingerprints.load(cache_file)
     return self._ecfps
# Beispiel #5  (scraped-example separator; not Python code)
# 0
 def _unfolded_ecfps_nodupes(self):
     """Returns the unfolded ECFP fingerprints with constant (duplicated) columns zeroed out.

     Computed once from the plain unfolded fingerprints, cached to HDF5 and
     memoized in *self._ecfps_no_dupes*.
     """
     if self._ecfps_no_dupes is None:
         ecfp_nodupes_file = op.join(manysources_dataset_root(self.name),
                                     '03-ecfps-nodupes', '%s.ecfps.h5' % self.name)
         if not op.exists(ecfp_nodupes_file):
             ecfps = self._unfolded_ecfps()
             ufp = UnfoldedFingerprints(ecfps.molids,
                                        ecfps.i2s,
                                        zero_dupes(ecfps.csr, by_rows=False),  # Hack 1, this should be Configurable
                                        failed_molids=ecfps.failed_molids)
             ensure_dir(op.dirname(ecfp_nodupes_file))
             with open(op.join(op.dirname(ecfp_nodupes_file), 'config.txt'), 'w') as writer:
                 writer.write('Same as 02-ecfps, but removed columns that have the same value accross all rows.')
             ufp.save(ecfp_nodupes_file)
         # BUG FIX: this previously assigned and returned self._ecfps, which both
         # clobbered the with-dupes fingerprint cache and defeated the
         # memoization guard above (self._ecfps_no_dupes stayed None forever).
         self._ecfps_no_dupes = UnfoldedFingerprints.load(ecfp_nodupes_file)
     return self._ecfps_no_dupes
# Beispiel #6  (scraped-example separator; not Python code)
# 0
def split_sdf(input_sdf, output_dir, prefix, nb_mols=10000):
    """Splits an SDF file into chunk files of at most *nb_mols* records each.

    Chunks are written to *output_dir* as "<prefix>_<n>.sdf" (n starting at 1)
    and finally renamed with zero-padded numbers so that lexicographic order
    matches numeric order.
    """
    # Single shared iterator: each write_n_mols call resumes where the last stopped.
    mol_iterator = iterate_records_in_text_file(input_sdf)
    ensure_dir(output_dir)

    def write_n_mols(fn):
        # Writes up to nb_mols records to *fn*; returns True iff the quota was
        # filled (so there may be more records left), False when the iterator
        # was exhausted first.
        mols_counter = 0
        with open(fn, "w") as writer:
            for mol in mol_iterator:
                writer.write(mol)
                writer.write("\n")
                mols_counter += 1
                if mols_counter == nb_mols:
                    return True
        return False

    def fn(fnum, pad=None):
        # Chunk-file path builder; with *pad*, fnum may be a string and the
        # zero padding is prepended to it.
        if not pad:
            return op.join(output_dir, "%s_%d.sdf" % (prefix, fnum))
        else:
            return op.join(output_dir, prefix + "_" + pad + str(fnum) + ".sdf")

    file_counter = 1
    while True:
        if not write_n_mols(fn(file_counter)):
            break
        file_counter += 1

    # check if last file is empty (not elegant but easy for me)
    # (happens when the record count is an exact multiple of nb_mols)
    last_file = op.join(output_dir, "%s_%d.sdf" % (prefix, file_counter))
    if op.getsize(last_file) == 0:
        os.remove(last_file)
        file_counter -= 1

    # rename the files with 0 padding so that we keep numerical order
    # NOTE(review): the glob also picks up any pre-existing *.sdf in
    # output_dir, which would get renamed too — confirm the dir is dedicated.
    padding = len(str(file_counter))
    files_to_rename = glob.glob(op.join(output_dir, "*.sdf"))
    for f in files_to_rename:
        file_nb = f.split("_")[-1]
        file_nb = file_nb.split(".")[0]
        pad = "0" * (padding - len(str(file_nb)))
        if len(pad) > 0:
            os.rename(f, fn(file_nb, pad=pad))
# Beispiel #7  (scraped-example separator; not Python code)
# 0
    def mean_loss_matrix(self, mols_in_rows=True, rows_in_train=False):
        """
        Returns two dataframes: mean loss and number of occurrences.

        These dataframes:
          - have mols or sources in rows (depending of *mols_in_rows*)
          - have mols in columns
          - each entry contains the mean loss (and count) when
            * the mol in col is in test
            * whatever in the row is in train or test (depending on *rows_in_train*)

        They would look like this:

        |-----------------------------------------|
        |     index       |        data           |
        |-----------------|-----------------------|
        | source or molid | molid1 | molid2 | ... |
        |-----------------------------------------|
        |   molid1        |  0.83  |  0.02  | ... |
        |   molid2        |  0.17  |  0.01  | ... |
        | ...             |  ...   |  ...   | ... |
        |-----------------------------------------|

        |-----------------------------------------|
        |     index       |        data           |
        |-----------------|-----------------------|
        | source or molid | molid1 | molid2 | ... |
        |-----------------------------------------|
        |   molid1        |  794   |  733   | ... |
        |   molid2        |  680   |  667   | ... |
        | ...             |  ...   |  ...   | ... |
        |-----------------------------------------|

        :rtype: (pandas.DataFrame, pandas.DataFrame)
        """

        # The pair of matrices is cached on disk, keyed by the two flags.
        matid = 'mir=%r#rit=%r' % (mols_in_rows, rows_in_train)
        cache_dir = op.join(MANYSOURCES_DATA_ROOT, 'results', 'mlms', self.what().id(maxlength=1))
        ensure_dir(cache_dir)
        cache_file = op.join(cache_dir, matid + '.pkl')

        if not op.isfile(cache_file):

            # row masks (co-occurrence-in-test masks; negated to mean "in train")
            rows_df = self.mcoocs() if mols_in_rows else self.scoocs()
            rows_df = ~rows_df if rows_in_train else rows_df
            # col masks
            cols_df = self.mcoocs()
            # losses
            loss_df = self.squared_losses().T

            # sanity checks for columns
            assert np.all(cols_df.columns == loss_df.index)

            # Accumulates loss sums and counts for every (row-entity, col-mol)
            # pair active in a single split; compiled with numba for speed.
            @jit(nopython=True)
            def update_cooc_cooc(res_matrix, norm_matrix, losses, rows, cols):
                for row in rows:
                    for col in cols:
                        res_matrix[row, col] += losses[col]
                        norm_matrix[row, col] += 1

            loss_matrix = np.zeros((len(rows_df.columns), len(cols_df.columns)))
            norm_matrix = np.zeros((len(rows_df.columns), len(cols_df.columns)))

            for expid in loss_df.columns:
                print 'Computing mean loss for expid=%d' % expid
                if expid not in rows_df.index.levels[0]:
                    print '\t Cannot find coocurrences, skipping...'
                    continue
                exp_losses = loss_df[expid]
                #
                # FIXME: this does not seem right now that we have a multilevel df
                #
                for rowmask, colmask in izip(rows_df.loc[expid].values,
                                             cols_df.loc[expid].values):
                    update_cooc_cooc(loss_matrix,
                                     norm_matrix,
                                     exp_losses.values,
                                     np.where(rowmask)[0],
                                     np.where(colmask)[0])

            # 0/0 entries (pairs never observed together) become NaN here.
            # NOTE(review): the matrix rows follow the positional order of
            # rows_df.columns, but the index is sorted(rows_df.columns) — this
            # only lines up if the columns are already sorted; confirm.
            losses_df = pd.DataFrame(loss_matrix / norm_matrix,
                                     index=sorted(rows_df.columns),
                                     columns=cols_df.columns)

            counts_df = pd.DataFrame(norm_matrix,
                                     index=sorted(rows_df.columns),
                                     columns=cols_df.columns)

            pd.to_pickle((losses_df, counts_df), cache_file)

        return pd.read_pickle(cache_file)
def do_for_one_molid(calibration, dset, feats, lso, model, molid, results_dir, rm_factory, by_source=False):
    """Builds and saves the explanatory regression + diagnostic plots for one molecule.

    Fits a regression of per-experiment losses (*y*) on train co-occurrence
    indicators (*X*), then dumps model, smiles, influence tables and figures
    into a per-molecule directory under *results_dir*.
    """
    print molid
    MRDK = ManysourcesDataset(dset).mols()  # FIXME: this is read on each job, so once per molecule ATM...
    # Train and evaluate the model
    y, expids = get_y(molid, dset, feats, model, calibration, lso)
    if not by_source:
        X = get_X(molid, expids, dset, feats, model, lso)
    else:
        X = get_X_source(molid, expids, dset, feats, model) # makes no sense to run by source on LSO=False
    X = ~X  # coocurrences in train, less sparse, but better interpretation unless we tweak well the numbers...
    rsquared, feat_weights, trained_model = build_and_validate_regression_model(X, y, model_factory=rm_factory)
    rsquared = float(rsquared)
    # REMOVE moldir shows r2
    moldir = op.join(results_dir, 'r2=%.2f__%s' % (rsquared, molid))
    ensure_dir(moldir)
    # Save the model
    pd.to_pickle(trained_model, op.join(moldir, 'model_trained_rsquare=%.2f.pkl' % rsquared))
    # Save the smiles
    smiles = MolToSmiles(MRDK.molid2mol(molid))
    with open(op.join(moldir, 'smiles.txt'), 'w') as writer:
        writer.write(smiles)
    # Save the molecule-influence table
    if not by_source:
        df = generate_df_results(molid, feat_weights, dset, feats, model, calibration, lso)
        pd.to_pickle(df, op.join(moldir, 'results_df.pkl'))
        # Inject the molecule itself with a huge importance so it sorts first.
        df.loc[molid] = (1E16, rsquared, smiles, np.mean(y))  # FIXME
        df['label'] = map(MRDK.molid2label, df.index)
        df = df[['label', 'relabsimportance', 'importance', 'smiles', 'cooc_loss']]
        # NOTE(review): DataFrame.sort is the old (pre-0.17 pandas) API.
        df = df.sort('relabsimportance', ascending=False)
        df.head(20).to_html(op.join(moldir, 'results_df.html'))
    else:
        df = generate_df_results_source(molid, feat_weights, dset, feats, model, calibration, lso)
        pd.to_pickle(df, op.join(moldir, 'results_df_bysource.pkl'))
        df = df.sort('relabsimportance', ascending=False)
        df.head(20).to_html(op.join(moldir, 'results_df_bysource.html'))
    # Plot the distribution of losses (y)
    plt.figure()
    seaborn.distplot(y, bins=40)
    plt.xlim((-0.05, 1.05))
    plt.title('molid=%s, r2=%.2f' % (molid, rsquared))
    plt.savefig(op.join(moldir, 'y_dist.png'), bbox_inches='tight')
    plt.close()
    # --- WIP gridspec with chemdeco pics and things like that
    if not by_source:
        show_top = 4
        gs = gridspec.GridSpec(show_top, 2)
        fig = plt.figure(figsize=(24, 16))
        # Plot the molecule itself
        ax_mol = fig.add_subplot(gs[0:show_top / 2, 0])
        ax_mol.grid(False)
        ax_mol.get_xaxis().set_ticks([])
        ax_mol.get_yaxis().set_ticks([])
        mol = MRDK.molid2mol(molid)
        AllChem.Compute2DCoords(mol)
        ax_mol.imshow(artdeco2(rdkit2im(mol, size=(400, 400)), color='red' if df.loc[molid]['label'] == 'INHIBITOR' else 'green', chorrada=5))
        # Plot the distribution of losses
        # NOTE(review): gs[show_top / 2:0, 0] is an empty slice (2:0);
        # presumably gs[show_top / 2:, 0] was intended — confirm before fixing.
        ax_distr = fig.add_subplot(gs[show_top / 2:0, 0])
        seaborn.distplot(y, bins=40, ax=ax_distr)
        # Plot the top (we should align all to a common scaffold and maybe highlight substructures that matter)
        # (iloc[1:...] skips row 0, which is the molecule itself injected above)
        for rank, (inf_molid, row) in enumerate(df.iloc[1:show_top + 1].iterrows()):
            ax_influential_mol = fig.add_subplot(gs[rank, 1])
            ax_influential_mol.grid(False)
            ax_influential_mol.get_xaxis().set_ticks([])
            ax_influential_mol.get_yaxis().set_ticks([])
            mol_color = 'red' if row['label'] == 'INHIBITOR' else 'green'
            good_or_bad_color = 'red' if row['importance'] > 0 else 'green'
            # add decos
            mol = MRDK.molid2mol(inf_molid)
            AllChem.Compute2DCoords(mol)
            image = rdkit2im(mol)
            image = artdeco1(image, decos=(('black', good_or_bad_color),))
            image = artdeco2(image, color=mol_color)
            ax_influential_mol.imshow(image)
            ax_influential_mol.set_title('%s, inf=%.4f, cooc_loss=%.4f' %
                                         (inf_molid, row['importance'], row['cooc_loss']))
            # FIXME: cooc_loss also with stddev and standard error

        fig.suptitle('%s, r2=%.2f, cooc_loss=%.4f +/- %.4f' % (molid, rsquared, float(np.mean(y)), float(np.std(y))))
        plt.savefig(op.join(moldir, 'verde_que_te_quiero_verde.png'), bbox_inches='tight')
        plt.close()
# Beispiel #9  (scraped-example separator; not Python code)
# 0
def generate_lsocv_results(dset='bcrp',
                           model='logreg1',
                           feats='ecfps1',
                           expids=(0,),
                           compression='lzf',
                           reraise=False):
    """Runs train/test experiments for the "manysources" experiments, saving the results in disk.

    Each individual experiment is saved to an HDF5 file. Afterwards they can be merged.

    The generated HDF5 files have the following structure:
      /dsets
        /bcrp{config}
      /models
        /logreg1{config}
      /featss
        /ecfps1{config}
      /dset='bcrp'
        /feats='ecps1'
          /ids=[molid1, molid2, ..., molidn]
          /expid=3
            /lsocv{config, num_folds}
              /fold0'{config}
                /ext_indices
                /ext_scores
                /y_ext
                /DONE or FAILED
                /model='logreg3'{config, train_time, test_time, auc}
                  /model_data1
                  /model_data2
                  /...
              /fold=1...
              /...
            /crscv#seed=3{config}
                /*same as lsocv*

    Parameters
    ----------
    dset: string, default 'bcrp'
        the name of the dataset
    model: string, default 'logreg1'
        the name of the model configuration
    feats: string, default 'ecfp1'
        the name of the molecular features
    expids: iterable of ints, default (1,)
        the experiment ids to carry
    compression: string, default 'lzf'
        the compression used to store arrays in the HDF5 file; if None, no compression will be used
    reraise: boolean, default False
        if True, exceptions are raised; if not, they are ignored and results keep being generated

    Returns
    -------
    Nothing, but prints a cool DONE and saves the results in disk.
    """

    # "trest" is this codebase's name for the held-out (test) partition.
    def train_test(model, X, y, train_indices, trest_indices):
        # Fits a copy of *model* and scores the held-out rows; returns
        # (scores, fitted model, train seconds, test seconds).
        # Split
        Xtrain, ytrain, Xtrest, ytrest = \
            X[train_indices, :], y[train_indices], \
            X[trest_indices, :], y[trest_indices]
        # Train
        model = copy(model)  # copy so the shared config object is never mutated
        start = time()
        model.fit(Xtrain, ytrain)
        train_time = time() - start
        # Test
        start = time()
        scores = model.predict_proba(Xtrest)[:, 1]
        test_time = time() - start
        return scores, model, train_time, test_time

    def generate_split_result(model_config, X, y, split_id, splitter, cvgroup, reraise=False):
        # Runs one fold: trains/tests and records everything under
        # cvgroup/<split_id>/model=<nickname> in the open HDF5 file.
        # Splitting
        train_indices, trest_indices = splitter.split()
        fold_group = cvgroup.require_group(split_id)
        try:
            fold_group.attrs['config'] = splitter.what().id()
            fold_group.create_dataset('test_indices', data=trest_indices, compression=compression)  # uncompressible
            fold_group.create_dataset('y_test', data=y[trest_indices], compression=compression)
        except:
            pass  # Dodgy
        # Model configuration
        model_group = fold_group.require_group('model=%s' % model_config.nickname)
        try:
            # already done?
            # NOTE(review): DONE/FAILED are looked up in fold_group here but
            # written under model_group below, so the skip never triggers for
            # markers written by this function — confirm which was intended.
            if 'DONE' in fold_group.keys():
                print '%s already done, skipping...' % model_group.name
                return
            if 'FAILED' in fold_group.keys():
                print '%s already failed, skipping...' % model_group.name
                return
            # compute the result
            scores, model, train_time, test_time = \
                train_test(model_config.seed_model(expid), X, y, train_indices, trest_indices)
            # save scores, auc, times
            try:
                model_group.attrs['auc'] = roc_auc_score(y[trest_indices], scores)
            except:
                model_group.attrs['auc'] = None  # e.g. single-class test fold
            model_group.attrs['train_time'] = train_time
            model_group.attrs['test_time'] = test_time
            model_group.create_dataset('test_scores', data=scores, compression=compression)
            # save whatever from the model
            model_config.storer.to_hdf5(model, model_group, compression=compression)
            # done
            model_group['DONE'] = 'Finished on %s' % strftime("%c")
        except Exception:
            # Record the traceback in the file so failed folds are inspectable.
            model_group['FAILED'] = format_exc()
            if reraise:
                raise

    # Dataset hub
    dset = dset if isinstance(dset, ManysourcesDataset) else ManysourcesDataset(dset)
    # Features configuration
    feats_config = MANYSOURCES_FEATS[feats]
    molids, X, y = feats_config.extractor(dset)
    # Model configuration
    model_config = MANYSOURCES_MODELS[model]

    for expid in expids:
        print 'expid=%d' % expid
        lsocv = setup_splitters_lsocv(dset=dset, molids=molids, expids=(expid,))[0]
        h5_file = single_hdf5_per_exp(expid=expid, dset=dset.name, model=model, feats=feats)
        ensure_dir(op.dirname(h5_file))
        with h5py.File(h5_file) as h5:
            try:
                # coordinates
                try:
                    h5.require_group('dsets/%s' % dset.name).attrs['config'] = dset.name
                    h5.require_group('featss/%s' % feats).attrs['config'] = feats_config.name
                    h5.require_group('models/%s' % model).attrs['config'] = model_config_string(model_config.model)
                except:
                    pass  # Dodgy
                # dset coordinates
                dset_group = h5.require_group('/dset=%s' % dset.name)
                # features coordinates
                feat_group = dset_group.require_group('feats=%s' % feats)
                try:
                    feat_group.create_dataset('ids', data=molids, compression='lzf')
                except:
                    pass  # Data already there
                # expid group
                expid_group = feat_group.require_group('expid=%d' % expid)
                # LSO-CV (leave-sources-out cross-validation folds)
                lsocv_group = expid_group.require_group('lsocv')
                lsocv_group.attrs['config'] = lsocv.what().id()
                lsocv_group.attrs['num_folds'] = lsocv.num_folds
                lsocv_group.attrs['seed'] = lsocv.seed
                for split_num, splitter in enumerate(lsocv.splitters()):
                    generate_split_result(model_config, X, y,
                                          'fold=%d' % split_num, splitter, lsocv_group,
                                          reraise=reraise)
                # CRS-CV (size-matched random splits derived from the LSO folds)
                crscv = crscv_from_lsocv(lsocv)
                crscv_group = lsocv_group.require_group('crscv#seed=%d' % crscv.seed)
                crscv_group.attrs['config'] = crscv.what().id()
                crscv_group.attrs['num_folds'] = len(crscv.fold_sizes)
                crscv_group.attrs['seed'] = crscv.seed
                for split_num, splitter in enumerate(crscv.splitters()):
                    generate_split_result(model_config, X, y,
                                          'fold=%d' % split_num, splitter,
                                          crscv_group, reraise=reraise)
            except:
                # Record a top-level failure once per file; keep going unless asked to reraise.
                if 'TOP_FAIL' not in h5:
                    h5['TOP_FAIL'] = format_exc()
                if reraise:
                    raise

    print 'DONE'