def experiment_no_folding_01(compute_scores_unlabelled=True, compute_scores_screening=False, cv=True,
                             regularization='l1', c=1.0, keep_ambiguous=False,
                             dest_dir=op.join(MALARIA_EXPS_ROOT, 'folding_rdkit')):
    # Train the same models but without folding, using binary presence (0/1) instead of counts
    dest_dir = op.join(dest_dir, 'no_folding')
    X, y, classifier, _ = start_exp(dest_dir, regularization, c, keep_ambiguous)
    X.data = np.ones(X.data.shape)
    Xunl = None
    Xscr = None
    if compute_scores_unlabelled:
        mfpunl = MalariaFingerprintsManager(dset='unl')
        Xunl = mfpunl.X()
        Xunl = csr_matrix((np.ones(Xunl.data.shape), Xunl.indices, Xunl.indptr), shape=(Xunl.shape[0], X.shape[1]))
    if compute_scores_screening:
        mfscr = MalariaFingerprintsManager(dset='scr')
        Xscr = mfscr.X()
        Xscr = csr_matrix((np.ones(Xscr.data.shape), Xscr.indices, Xscr.indptr), shape=(Xscr.shape[0], X.shape[1]))
    if cv:
        print(X.shape, None if Xunl is None else Xunl.shape)
        print('Cross-validating the model...')
        print(run_cv(dest_dir, X, y, classifier, Xunl, Xscr))

    # Train the full model and predict if necessary the unlabelled and screening sets
    global_dir = op.join(dest_dir, 'full_model')
    ensure_dir(global_dir)
    print('Training the global classifier...')
    print(run_full_model(global_dir, X, y, classifier, Xunl, Xscr))
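
# Aside: a minimal, self-contained sketch (toy data, not project code) of the
# in-place binarization used above. Overwriting the CSR `.data` with ones
# keeps the sparsity pattern but replaces counts with presence/absence.
import numpy as np
from scipy.sparse import csr_matrix

counts = csr_matrix(np.array([[0, 2, 5],
                              [1, 0, 3]]))
counts.data = np.ones_like(counts.data)
print(counts.toarray())  # -> [[0 1 1], [1 0 1]]
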
def master_experiment(fold_sizes=513, compute_scores_unlabelled=True, compute_scores_screening=False, cv=True,
                      regularization='l1', c=1.0, keep_ambiguous=False,
                      dest_dir=op.join(MALARIA_EXPS_ROOT, 'folding_rdkit')):
    """
    Computes a scikit-learn logistic regression for a given folding size of the RDKit ECFP fingerprints.
    If asked, we do a 10-fold cross-validation and also apply the trained model to the screening and/or unlabelled sets.
    """
    dest_dir = op.join(dest_dir, 'fs=%i' % fold_sizes)
    print('Starting the experiment...')
    _, y, classifier, labelled = start_exp(dest_dir, regularization, c, keep_ambiguous)
    X = get_folded_fpt_sparse('lab', folding_size=fold_sizes)
    X.data = np.ones(X.data.shape)
    X = X[labelled, :]
    print(X.shape)
    print('Got the training set.')
    Xunl = None
    Xscr = None
    if compute_scores_unlabelled:
        Xunl = get_folded_fpt_sparse('unl', folding_size=fold_sizes)
        print('Got the unlabelled set.')
        print(Xunl.shape)
    if compute_scores_screening:
        Xscr = get_folded_fpt_sparse('scr', folding_size=fold_sizes)
    if cv:
        print('Cross-validating the model...')
        print(run_cv(dest_dir, X, y, classifier, Xunl, Xscr))

    # Train the full model and predict if necessary the unlabelled and screening sets
    global_dir = op.join(dest_dir, 'full_model')
    ensure_dir(global_dir)
    print('Training the global classifier...')
    print(run_full_model(global_dir, X, y, classifier, Xunl, Xscr))

def run_cv(dest_dir, X, y, classifier, Xunl, Xscr):
    folds = give_cross_val_folds(y, 10)
    aucs = []
    for i, fold in enumerate(folds):
        fold_dir = op.join(dest_dir, 'fold=%i' % i)
        ensure_dir(fold_dir)
        scores_unl = None
        scores_scr = None
        train_indices = np.setdiff1d(np.arange(len(y)), fold)  # complement of the test fold
        yte = y[fold]
        ytr = y[train_indices]
        Xte = X[fold, :]
        Xtr = X[train_indices, :]
        print('Training the classifier...')
        classifier.fit(Xtr, ytr)
        scores = classifier.predict_proba(Xte)[:, 1]
        if Xunl is not None:
            print('Scoring the unlabelled dataset...')
            scores_unl = classifier.predict_proba(Xunl)
        if Xscr is not None:
            print('Scoring the screening dataset...')
            scores_scr = classifier.predict_proba(Xscr)
        auc = roc_auc_score(yte, scores)
        aucs.append(auc)
        print('AUC for fold %i: %.2f' % (i, auc))
        print('********************')
        result = [classifier, scores, fold, auc, scores_unl, scores_scr]
        with open(op.join(fold_dir, 'results.pkl'), 'wb') as writer:
            pickle.dump(result, writer)
    # noinspection PyStringFormat
    print('Average AUC: %.2f +/- %.2f' % (np.mean(np.array(aucs)), np.std(np.array(aucs))))
    print('********************')
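
`give_cross_val_folds` is not shown on this page; a minimal stand-in that deals shuffled indices into num_folds disjoint test folds (an assumption about its contract, not the project's implementation) could look like this:

import numpy as np

def give_cross_val_folds_sketch(y, num_folds, seed=0):
    # Shuffle once, then deal indices round-robin into disjoint test folds;
    # the real helper may additionally stratify by label.
    rng = np.random.RandomState(seed)
    indices = rng.permutation(len(y))
    return [indices[i::num_folds] for i in range(num_folds)]
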
Example #4
    def __init__(self, molid2i, root, prefix, data2molid, chunksize=10000):
        super(Chihuahua, self).__init__()
        self.chunksize = chunksize
        self.molid2i = molid2i
        self.num_mols = len(self.molid2i)
        self.data2molid = data2molid
        self.root = root
        self.prefix = prefix
        ensure_dir(self.root)  # the directory must exist before the chunk files are opened
        self.temp_fns = [op.join(root, '%s-%d' % (prefix, base)) for base in range(0, self.num_mols, chunksize)]
        self.temp_files = [open(fn, 'w') for fn in self.temp_fns]
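
# Aside: the class above shards molecules into temp files of `chunksize`
# entries each, so locating the file that owns a molecule is one integer
# division. A hypothetical helper (not part of the original class):
def chunk_file_for(molid, molid2i, temp_files, chunksize=10000):
    i = molid2i[molid]  # dense index of the molecule
    return temp_files[i // chunksize]  # one handle per chunksize block
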
def start_exp(dest_dir, regularization, c, keep_ambiguous):
    """
    Define the classifier. Get the non-folded data.
    """
    ensure_dir(dest_dir)
    classifier = LogisticRegression(penalty=regularization, C=c, class_weight='auto')
    mfp = MalariaFingerprintsManager(dset='lab')
    X, y = mfp.Xy()
    if not keep_ambiguous:
        labelled = ~np.isnan(y)
        X = X[labelled, :]
        y = y[labelled]
    else:
        labelled = np.ones(len(y), dtype=bool)  # boolean mask keeping everything
    return X, y, classifier, labelled
Example #6
def ecfps_mp(numjobs=None, dest_dir=None):
    """Python-parallel computation of ECFPs.
    Parameters:
      - numjobs: the number of parallel jobs to use (None = one per core in the machine).
      - dest_dir: the directory to which the fingerprints will be written, in weird fp format(TM).
    """
    dest_dir = _MALARIA_ECFPS_PARALLEL_RESULTS_DIR if dest_dir is None else dest_dir
    ensure_dir(dest_dir)
    numjobs = cpu_count() if numjobs is None else int(numjobs)
    Parallel(n_jobs=numjobs)(delayed(_molidsmiles_it_ecfp)
                             (start=start,
                              step=numjobs,
                              output_file=op.join(dest_dir, 'all__fcfp=%r__start=%d__step=%d.weirdfps' %
                                                            (fcfp, start, numjobs)),
                              fcfp=fcfp)
                             for start, fcfp in product(range(numjobs), (True, False)))
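
Each worker above processes the interleaved slice start, start + step, start + 2 * step, ... of the molecule stream, so the shards are disjoint and roughly balanced. A self-contained sketch of that round-robin partitioning (toy work function, not the original `_molidsmiles_it_ecfp`):

from joblib import Parallel, delayed

def process_shard(items, start, step):
    # Worker `start` sees items[start], items[start + step], ...
    return [item * item for item in items[start::step]]

items = list(range(10))
numjobs = 3
shards = Parallel(n_jobs=numjobs)(
    delayed(process_shard)(items, start, numjobs) for start in range(numjobs))
# shards == [[0, 9, 36, 81], [1, 16, 49], [4, 25, 64]]
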
Example #7
def ecfps(start=0, step=46, mols='lab', output_file=None, fcfp=True):
    """Entry point for the command line to generate fingerprints.
    Parameters:
      - start: the index of the first molecule to consider
      - step: how many molecules are skipped in each iteration
      - mols: an iterator over pairs (molid, smiles) or a string
              ('lab'|'unl'|'scr'|'all') to use one of TDT malaria's iterators
      - fcfp: generate FCFPs or ECFPs
      - output_file: the file to which the fingerprints will be written, in
                     weird fp format(TM).
    """
    if isinstance(mols, str):
        mols = MOLS2MOLS[mols]()
    ensure_dir(op.dirname(output_file))
    _molidsmiles_it(start=start, step=step,
                    mols=mols,
                    processor=_ecfp_writer(output_file=output_file, fcfp=fcfp))
Example #8
def _rdkfeats_writer(output_file=None, features=None):
    """Returns a (molindex, molid, smiles) processor that computes descriptors using RDKit and stores then in a h5 file.

    Parameters:
      - output_file: where the descriptors will be written; this file will be overwritten.
      - features: a list of the names of the RDKit features that will be computed
                  (by default all the descriptors exposed by the Descriptor class in RDKit)

    Returns:
      - a processor function ready to be used as a parameter to _molidsmiles_it.

    The h5 file has the following data:
      - 'rdkdescs': a float matrix num_mols x num_descs
                    this will all be nans if the computation failed completely
      - 'fnames': the name of the feature in each column (num_cols)
      - 'molids': the molid corresponding to each row (num_rows)
    """
    ensure_dir(op.dirname(output_file))
    h5 = h5py.File(output_file, mode='w')  # h5py.File takes no dtype; set it per dataset
    computer = RDKitDescriptorsComputer(features)
    fnames = computer.fnames()
    nf = len(fnames)
    descs = h5.create_dataset('rdkdescs', (0, nf), maxshape=(None, nf), dtype=np.float32, compression='lzf')
    str_type = h5py.special_dtype(vlen=str)  # h5py.new_vlen is deprecated
    h5.create_dataset('fnames', data=fnames)
    molids = h5.create_dataset('molids', shape=(0,), maxshape=(None,), dtype=str_type)

    def process(molid, smiles):
        if molid is _END_MOLID:
            h5.close()
            return
        ne = len(molids)
        # Grow both datasets up front so the except branch can safely write nans
        molids.resize((ne + 1,))
        molids[ne] = molid
        descs.resize((ne + 1, nf))
        try:
            mol = to_rdkit_mol(smiles)
            descs[ne, :] = computer.compute(mol)[0]
        except Exception:
            info('Failed molecule %s: %s' % (molid, smiles))
            descs[ne, :] = [np.nan] * nf

    return process
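
The writer above leans on resizable HDF5 datasets (maxshape=(None, ...)) to append one row per molecule. A minimal standalone sketch of that append pattern:

import h5py
import numpy as np

with h5py.File('append-demo.h5', mode='w') as h5:
    descs = h5.create_dataset('descs', shape=(0, 4), maxshape=(None, 4),
                              dtype=np.float32, compression='lzf')
    for row in np.random.rand(3, 4):
        ne = descs.shape[0]
        descs.resize((ne + 1, 4))  # grow by one row, then write it
        descs[ne, :] = row
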
Example #9
    def __init__(self, root_dir):
        """Quick random access to collections of molecules in disk, using molids.
        Caveat: great for random access, not suitable for streaming purposes.
                All read molecules stay in memory until (all) handles to this memmap are closed.
        """
        # Where the index resides...
        self._root = root_dir
        ensure_dir(self._root)

        # Index {molid -> (start, numbytes)}
        self._molids_file = op.join(self._root, 'molids.txt')
        self._coords_file = op.join(self._root, 'coords.npy')
        self._molids = None
        self._coords = None
        self._molid2coords = None

        # The serialized molecules
        self._data_file = op.join(self._root, 'molsdata')
        self._filehandle = None
        self._molsdata = None
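
The point of the {molid -> (start, numbytes)} index is that any molecule's serialized bytes can be fetched with a single seek into the data file. A sketch of that lookup, assuming the coordinates are loaded (hypothetical helper, not a method of the class above):

def read_mol_bytes(filehandle, molid2coords, molid):
    # molid2coords maps molid -> (byte offset, record length) in the data file
    start, numbytes = molid2coords[molid]
    filehandle.seek(start)
    return filehandle.read(numbytes)
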
Example #10
"""
import logging
import os.path as op
from minioscail.common.misc import ensure_dir

__version__ = '0.2-dev0'

# --- Paths and other constants.

# Make everything relative to the source location...
_THIS_PATH = op.abspath(op.dirname(__file__))  # maybe jump to pkgutils?
# Where the data resides
MALARIA_DATA_ROOT = op.abspath(op.join(_THIS_PATH, '..', '..', 'data'))
# The original downloaded files will come here
MALARIA_ORIGINAL_DATA_ROOT = op.join(MALARIA_DATA_ROOT, 'original')
ensure_dir(MALARIA_ORIGINAL_DATA_ROOT)
# Different indices (like molid -> smiles) come here
MALARIA_INDICES_ROOT = op.join(MALARIA_DATA_ROOT, 'indices')
ensure_dir(MALARIA_INDICES_ROOT)
# Experiment results come here
MALARIA_EXPS_ROOT = op.join(MALARIA_DATA_ROOT, 'experiments')
ensure_dir(MALARIA_EXPS_ROOT)

# --- Common logger for the malaria code.

_logger = logging.getLogger('malaria')
_logger.setLevel(logging.DEBUG)
debug = _logger.debug
info = _logger.info
warning = _logger.warning
error = _logger.error
Example #11
def fit_logregs(dest_dir=MALARIA_LOGREGS_EXPERIMENT_ROOT,
                # Logreg params
                logreg_penalty='l1',
                logreg_C=1.0,
                logreg_class_weight_auto=False,
                logreg_dual=False,
                logreg_tol=1e-4,
                logreg_fit_intercept=True,
                logreg_intercept_scaling=1,
                # CV params
                num_cv_folds=10,
                cv_seeds=(0,),
                save_unlabelled_predictions=False,
                save_fold_model=False,
                min_fold_auc=0.88,
                # Fingerprint folding params
                fingerprint_folder_seed=0,
                fingerprint_fold_size=1023,
                # Computational requirements params
                force=False,
                chunksize=1000000):
    """Logistic regression experiment using the liblinear wrapper in sklearn.
    Generates cross-val results
    """

    ### TODO Remove
    if logreg_tol < 1E-5:
        info('Ignoring low-tolerance experiments (they run for too long)')
        return

    info('Malaria logregs experiment')

    # Command line type inference is rotten...
    logreg_C = float(logreg_C)
    logreg_tol = float(logreg_tol)
    logreg_intercept_scaling = float(logreg_intercept_scaling)
    num_cv_folds = int(num_cv_folds)
    min_fold_auc = float(min_fold_auc)
    fingerprint_folder_seed = int(fingerprint_folder_seed)
    fingerprint_fold_size = int(fingerprint_fold_size)
    chunksize = int(chunksize)

    # Example providers
    folder = None if fingerprint_fold_size < 1 else MurmurFolder(seed=fingerprint_folder_seed,
                                                                 fold_size=fingerprint_fold_size)
    rf_lab, rf_amb, rf_unl, rf_scr = malaria_logreg_fpt_providers(folder)
    info('Data description: %s' % rf_lab.configuration().id(full=True))

    # Experiment context: data
    data_id = rf_lab.configuration().id(full=True)
    data_dir = op.join(dest_dir, data_id)
    ensure_dir(data_dir)

    for cv_seed in cv_seeds:

        # Command line type inference is rotten...
        cv_seed = int(cv_seed)

        # Deterministic randomness
        my_rng = np.random.RandomState(seed=cv_seed)

        # Experiment context: model
        logreg_params = OrderedDict((
            ('penalty', logreg_penalty),
            ('C', logreg_C),
            ('class_weight', 'auto' if logreg_class_weight_auto else None),
            ('dual', logreg_dual),
            ('tol', logreg_tol),
            ('fit_intercept', logreg_fit_intercept),
            ('intercept_scaling', logreg_intercept_scaling),
            ('random_state', my_rng.randint(low=0, high=1000 ** 4)),
        ))
        model_setup = LogisticRegression(**logreg_params)
        model_id = 'skllogreg__%s' % '__'.join(['%s=%s' % (k, str(v)) for k, v in logreg_params.items()])
        model_dir = op.join(data_dir, model_id)
        ensure_dir(model_dir)
        info('Model: %s' % model_id)

        # Experiment context: eval
        eval_id = 'cv__cv_seed=%d__num_folds=%d' % (cv_seed, num_cv_folds)
        eval_dir = op.join(model_dir, eval_id)
        ensure_dir(eval_dir)
        info('Eval: %d-fold cross validation (seed=%d)' % (num_cv_folds, cv_seed))

        # Already done?
        info_file = op.join(eval_dir, 'info.json')
        if op.isfile(info_file) and not force:
            info('\tAlready done, skipping...')
            return  # Oh well, a lot has been done up to here... rework somehow

        # Anytime we see this file, we know we need to stop
        stop_computing_file = op.join(eval_dir, 'STOP_BAD_FOLD')

        #---------
        #--------- Time to work!
        #---------

        # Save model config
        joblib.dump(model_setup, op.join(model_dir, 'model_setup.pkl'), compress=3)

        # Read labelled data in
        info('Reading data...')
        X, y = rf_lab.Xy()
        info('ne=%d; nf=%d' % rf_lab.X().shape)

        # Save molids... a bit too ad-hoc...
        save_molids(data_dir, 'lab', rf_lab.ids())
        if save_unlabelled_predictions:
            save_molids(data_dir, 'unl', rf_unl.ids())
            save_molids(data_dir, 'scr', rf_scr.ids())
            save_molids(data_dir, 'amb', rf_amb.ids())

        # Save folding information.
        # By now, all the folds have already been computed:
        #   - because we cached X
        #   - and in this case we are guaranteed that no new unfolded features will appear at test time
        if folder is not None:
            info('Saving the map folded_features -> unfolded_feature...')
            folded2unfolded_file = op.join(data_dir, 'folded2unfolded.h5')
            if not op.isfile(folded2unfolded_file):
                with h5py.File(folded2unfolded_file, 'w') as h5:
                    h5['f2u'] = folder.folded2unfolded()
            folder_light_file = op.join(data_dir, 'folder.pkl')
            if not op.isfile(folder_light_file):
                folder_light = copy(folder)  # Shallow copy
                folder_light.clear_cache()
                joblib.dump(folder_light, folder_light_file, compress=3)

        # Cross-val splitter
        cver = cv_splits(num_points=len(y),
                         Y=y,
                         num_folds=num_cv_folds,
                         rng=my_rng,
                         stratify=True)

        # Fit and classify
        for cv_fold_num in range(num_cv_folds):

            fold_info_file = op.join(eval_dir, 'fold=%d__info.json' % cv_fold_num)
            if op.isfile(fold_info_file):
                info('Fold %d already done, skipping' % cv_fold_num)
                continue

            if op.isfile(stop_computing_file):
                info('Bad fold detected, no more computations required')
                break

            # Split into train/test
            train_i, test_i = cver(cv_fold_num)
            Xtrain, ytrain = X[train_i, :], y[train_i]
            Xtest, ytest = X[test_i, :], y[test_i]

            # Copy the model...
            model = clone(model_setup)

            start = time()
            info('Training...')
            model.fit(Xtrain, ytrain)
            train_time = time() - start
            info('Model fitting has taken %.2f seconds' % train_time)

            if save_fold_model:
                info('Saving trained model')
                joblib.dump(model, op.join(eval_dir, 'fold=%d__fitmodel.pkl' % cv_fold_num), compress=3)

            info('Predicting and saving results...')
            with h5py.File(op.join(eval_dir, 'fold=%d__scores.h5' % cv_fold_num), 'w') as h5:

                start = time()

                # Test indices
                h5['test_indices'] = test_i

                # Model
                h5['logreg_coef'] = model.coef_
                h5['logreg_intercept'] = model.intercept_

                # Test examples
                info('Scoring test...')
                scores_test = model.predict_proba(Xtest)
                fold_auc = roc_auc_score(ytest, scores_test[:, 1])
                fold_enrichment5 = enrichment_at(ytest, scores_test[:, 1], percentage=0.05)
                info('Fold %d ROCAUC: %.3f' % (cv_fold_num, fold_auc))
                info('Fold %d Enrichment at 5%%: %.3f' % (cv_fold_num, fold_enrichment5))
                h5['test'] = scores_test.astype(np.float32)

                if save_unlabelled_predictions:
                    predict_malaria_unlabelled(model,
                                               h5,
                                               rf_amb=rf_amb,
                                               rf_scr=rf_scr,
                                               rf_unl=rf_unl,
                                               chunksize=chunksize)

                test_time = time() - start
                info('Predicting has taken %.2f seconds' % test_time)

                # Finally save meta-information for the fold
                metainfo = mlexp_info_helper(
                    title='malaria-logregs-cv',
                    data_setup=data_id,
                    model_setup=model_id,
                    exp_function=giveupthefunc(),
                )
                metainfo.update((
                    ('train_time', train_time),
                    ('test_time', test_time),
                    ('auc', fold_auc),
                    ('enrichment5', fold_enrichment5),
                ))
                with open(fold_info_file, 'w') as writer:
                    json.dump(metainfo, writer, indent=2, sort_keys=False)

                # One last thing, should we stop now?
                if fold_auc < min_fold_auc:
                    stop_message = 'The fold %d was bad (auc %.3f < %.3f), skipping the rest of the folds' % \
                                   (cv_fold_num, fold_auc, min_fold_auc)
                    info(stop_message)
                    with open(stop_computing_file, 'w') as writer:
                        writer.write(stop_message)

        # Summarize cross-val in the info file
        metainfo = mlexp_info_helper(
            title='malaria-logregs-cv',
            data_setup=data_id,
            model_setup=model_id,
            exp_function=giveupthefunc(),
        )
        metainfo.update((
            ('num_cv_folds', num_cv_folds),
            ('cv_seed', cv_seed),
        ))
        metainfo.update(logreg_params.items())
        with open(info_file, 'w') as writer:
            json.dump(metainfo, writer, indent=2, sort_keys=False)
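
MurmurFolder is not shown here, but the essence of fingerprint folding is hashing each unfolded feature index into a fixed-size table and accepting collisions. A minimal sketch under that assumption (a plain modulus stands in for the seeded MurmurHash, so this is illustration only):

from scipy.sparse import csr_matrix

def fold_columns(X, fold_size=1023):
    # Map every column index into [0, fold_size); colliding entries are summed
    # by the csr_matrix constructor, the usual semantics of folding.
    X = X.tocoo()
    folded_cols = X.col % fold_size
    return csr_matrix((X.data, (X.row, folded_cols)),
                      shape=(X.shape[0], fold_size))
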
Example #12
def merge_submissions(calibrate=False,
                      select_top_scr=None,
                      with_bug=False,
                      dest_dir=MALARIA_EXPS_ROOT):
    """Very ad-hoc merge of submissions obtained with trees and logistic regressors."""

    #####
    # 0 Preparations
    #####

    # Avoid circular imports
    from ccl_malaria.logregs_fit import MALARIA_LOGREGS_EXPERIMENT_ROOT
    from ccl_malaria.logregs_analysis import malaria_logreg_file_prefix
    from ccl_malaria.trees_fit import MALARIA_TREES_EXPERIMENT_ROOT

    mc = MalariaCatalog()

    ensure_dir(dest_dir)

    def save_submission(sub, outfile, select_top=500):
        # Get the smiles
        smiles = mc.molids2smiless(sub.index)

        # Rankings
        ranks, (sscores, smolids, ssmiles) = \
            rank_sort(sub.values, (sub.values,
                                   sub.index.values,
                                   smiles), reverse=True, select_top=select_top)
        # Save for submission
        with open(outfile, 'w') as writer:
            for molid, smi, score in zip(smolids, ssmiles, sscores):
                writer.write('%s,%s,%.6f\n' % (molid, smi, score))

    #####
    # 1 Robust merge using pandas
    #####
    def read_average_merge(root, prefix):
        hit = pd.read_pickle(op.join(root, '%s_hitSelection.pkl' % prefix))
        labels = mc.molids2labels(hit.index, as01=True)
        lab = hit[~np.isnan(labels)]
        amb = hit[np.isnan(labels)]
        unl = pd.read_pickle(op.join(root, '%s_unl-averaged.pkl' % prefix))
        scr = pd.read_pickle(op.join(root, '%s_scr-averaged.pkl' % prefix))
        return lab, amb, unl, scr

    tlab, tamb, tunl, tscr = read_average_merge(MALARIA_TREES_EXPERIMENT_ROOT, 'trees')
    llab, lamb, lunl, lscr = read_average_merge(MALARIA_LOGREGS_EXPERIMENT_ROOT,
                                                malaria_logreg_file_prefix(with_bug=with_bug))

    lab = DataFrame({'trees': tlab, 'logregs': llab})
    lab['labels'] = mc.molids2labels(lab.index, as01=True)
    assert np.sum(np.isnan(lab['labels'])) == 0
    amb = DataFrame({'trees': tamb, 'logregs': lamb})
    unl = DataFrame({'trees': tunl, 'logregs': lunl})
    scr = DataFrame({'trees': tscr, 'logregs': lscr})

    # ATM we take it easy and just drop any NA
    lab.dropna(inplace=True)
    amb.dropna(inplace=True)
    unl.dropna(inplace=True)
    scr.dropna(inplace=True)

    #####
    # 2 Calibration on labelling - careful with overfitting for hitList, do it in cross-val fashion
    #####
    def calibrate_col(col):
        # isotonic not the best here, and faces numerical issues
        calibrator = IsotonicRegression(y_min=0, y_max=1)
        x = lab[~np.isnan(lab[col])][col].values
        y = lab[~np.isnan(lab[col])]['labels'].values
        # This worked with old sklearn
        try:
            # Old sklearn
            calibrator.fit(x.reshape(-1, 1), y)
            lab[col] = calibrator.predict(lab[col].values.reshape(-1, 1))
            amb[col] = calibrator.predict(amb[col].values.reshape(-1, 1))
            unl[col] = calibrator.predict(unl[col].values.reshape(-1, 1))
            scr[col] = calibrator.predict(scr[col].values.reshape(-1, 1))
        except ValueError:
            # Newer sklearn
            calibrator.fit(x.ravel(), y)
            lab[col] = calibrator.predict(lab[col].values.ravel())
            amb[col] = calibrator.predict(amb[col].values.ravel())
            unl[col] = calibrator.predict(unl[col].values.ravel())
            scr[col] = calibrator.predict(scr[col].values.ravel())

    if calibrate:
        calibrate_col('trees')
        calibrate_col('logregs')

    #####
    # 3 Average for the submission in lab-amb
    #####
    submission_lab = (lab.trees + lab.logregs) / 2
    submission_amb = (amb.trees + amb.logregs) / 2
    submission_hts = pd.concat((submission_lab, submission_amb))

    submission_options = '%s-%s' % (
        'calibrated' if calibrate else 'nonCalibrated',
        'lastFold' if with_bug else 'averageFolds')

    outfile = op.join(dest_dir, 'final-merged-%s-hitSelection.csv' % submission_options)
    save_submission(submission_hts, outfile)

    #####
    # 4 Average predictions for unlabelled
    #####
    submission_unl_avg = (unl.trees + unl.logregs) / 2
    outfile = op.join(dest_dir, 'final-%s-avg-unl.csv' % submission_options)
    save_submission(submission_unl_avg, outfile, select_top=None)

    submission_scr_avg = (scr.trees + scr.logregs) / 2
    outfile = op.join(dest_dir, 'final-%s-avg-scr.csv' % submission_options)
    save_submission(submission_scr_avg, outfile, select_top=select_top_scr)

    #####
    # 5 Stacked (linear regression) for unlabelled
    #####
    stacker = LinearRegression()
    stacker.fit(lab[['trees', 'logregs']], lab['labels'])

    def robust_predict(X):
        X = np.asarray(X)
        row_is_finite = np.all(np.isfinite(X), axis=1)
        scores = np.full(len(X), fill_value=np.nan)
        scores[row_is_finite] = stacker.predict(X[row_is_finite])
        return scores

    # noinspection PyArgumentList
    submission_unl_st = Series(data=robust_predict(unl[['trees', 'logregs']]), index=unl.index)
    outfile = op.join(dest_dir, 'final-%s-stacker=linr-unl.csv' % submission_options)
    save_submission(submission_unl_st, outfile, select_top=None)

    # noinspection PyArgumentList
    submission_scr_st = Series(data=robust_predict(scr[['trees', 'logregs']]), index=scr.index)
    outfile = op.join(dest_dir, 'final-%s-stacker=linr-scr.csv' % submission_options)
    save_submission(submission_scr_st, outfile, select_top=select_top_scr)
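
A standalone sketch of the per-column calibration from step 2, stripped of the dataframe plumbing (toy scores; as the comment there warns, fitting and predicting on the same labelled data risks overfitting):

import numpy as np
from sklearn.isotonic import IsotonicRegression

raw_scores = np.array([0.10, 0.40, 0.35, 0.80, 0.90])
labels = np.array([0, 0, 1, 1, 1])
calibrator = IsotonicRegression(y_min=0, y_max=1, out_of_bounds='clip')
calibrated = calibrator.fit(raw_scores, labels).predict(raw_scores)
# calibrated is monotone in raw_scores and squashed into [0, 1]
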
Example #13
def fit(dest_dir=MALARIA_TREES_EXPERIMENT_ROOT,
        seeds=(0, 1, 2, 3, 4),
        num_treess=(10, 6000, 4000, 2000, 1000, 500, 20, 50, 100),
        save_trained_models=False,
        chunksize=200000,
        num_threads=None,
        force=False):

    # Generates OOB results

    info('Malaria trees experiment')

    # Guess the number of threads
    if num_threads is None:
        num_threads = cpu_count()
    info('Will use %d threads' % num_threads)

    # Example providers
    info('Reading data...')
    rf_lab = MalariaRDKFsExampleSet()
    X, y = rf_lab.Xy()
    rf_unl = MalariaRDKFsExampleSet(dset='unl', remove_ambiguous=False)
    rf_scr = MalariaRDKFsExampleSet(dset='scr', remove_ambiguous=False)
    rf_amb = MalariaRDKFsExampleSet(dset='amb')
    # A bit of logging
    info('Data description: %s' % rf_lab.configuration().id(nonids_too=True))
    info('ne=%d; nf=%d' % rf_lab.X().shape)

    # Experiment context: data
    data_id = rf_lab.configuration().id(nonids_too=True)  # TODO: bring hashing from oscail
    data_dir = op.join(dest_dir, data_id)
    ensure_dir(data_dir)

    # Save molids... a bit too ad-hoc...
    info('Saving molids...')

    save_molids(data_dir, 'lab', rf_lab.ids())
    save_molids(data_dir, 'unl', rf_unl.ids())
    save_molids(data_dir, 'scr', rf_scr.ids())
    save_molids(data_dir, 'amb', rf_amb.ids())

    # Main loop - TODO: robustify with try and continue
    for etc, seed, num_trees in product((True, False), seeds, num_treess):

        # Configure the model
        if etc:
            model = ExtraTreesClassifier(n_estimators=num_trees,
                                         n_jobs=num_threads,
                                         bootstrap=True,
                                         oob_score=True,
                                         random_state=seed)
        else:
            model = RandomForestClassifier(n_estimators=num_trees,
                                           n_jobs=num_threads,
                                           oob_score=True,
                                           random_state=seed)

        # Experiment context: model
        model_id = 'trees__etc=%r__num_trees=%d__seed=%d' % (etc, num_trees, seed)  # TODO: bring self-id from oscail
        model_dir = op.join(data_dir, model_id)
        ensure_dir(model_dir)
        info('Model: %s' % model_id)

        # Experiment context: eval
        eval_id = 'oob'
        eval_dir = op.join(model_dir, eval_id)
        ensure_dir(eval_dir)
        info('Eval: OOB (Out Of Bag)')

        # Already done?
        info_file = op.join(eval_dir, 'info.json')
        if op.isfile(info_file) and not force:
            info('\tAlready done, skipping...')
            continue

        # Save model config
        joblib.dump(model, op.join(model_dir, 'model_setup.pkl'), compress=3)

        # Train-full
        info('Training...')
        start = time()
        model.fit(X, y)
        train_time = time() - start  # This is also test-time, as per OOB=True

        # Save trained model? - yeah, lets do it under oob
        if save_trained_models:
            joblib.dump(model, op.join(eval_dir, 'model_trained.pkl'), compress=3)

        # OOB score, auc and enrichment
        oob_score = model.oob_score_
        oob_scores = model.oob_decision_function_
        oob_scores_not_missing = fill_missing_scores(oob_scores[:, 1])

        auc = roc_auc_score(y, oob_scores_not_missing)
        enrichment5 = enrichment_at(y, oob_scores_not_missing, percentage=0.05)

        info('OOB AUC: %.2f' % auc)
        info('OOB Enrichment at 5%%: %.2f' % enrichment5)
        info('OOB Accuracy: %.2f' % oob_score)

        # Save scores and importances
        info('Saving results...')
        with h5py.File(op.join(eval_dir, 'oob_auc=%.2f__scores.h5' % auc), 'w') as h5:

            start = time()

            # Feature importances
            h5['f_names'] = rf_lab.fnames()
            h5['f_importances'] = model.feature_importances_

            # Labelled (development) examples
            info('Scoring lab...')
            h5['lab'] = oob_scores.astype(np.float32)

            info('Scoring amb...')
            h5['amb'] = model.predict_proba(rf_amb.X()).astype(np.float32)

            # Unlabelled (competition) examples
            info('Scoring unl...')
            h5['unl'] = model.predict_proba(rf_unl.X()).astype(np.float32)

            # Unlabelled (screening) examples
            info('Scoring scr...')
            if chunksize <= 0:
                h5['scr'] = model.predict_proba(rf_scr.X()).astype(np.float32)
            else:
                scr = h5.create_dataset('scr', shape=(rf_scr.ne_stream(), 2), dtype=np.float32)
                for i, x in enumerate(rf_scr.X_stream(chunksize=chunksize)):
                    base = i * chunksize
                    info('\t num_scr_examples: %d' % base)
                    scr[base:base + chunksize] = model.predict_proba(x)

            test_time = time() - start

        # Finally save meta-information
        metainfo = mlexp_info_helper(
            title='malaria-trees-oob',
            data_setup=data_id,
            model_setup=model_id,
            exp_function=fit,
        )
        metainfo.update((
            ('train_time', train_time),
            ('test_time', test_time),
            ('oob_auc', auc),
            ('oob_enrichment5', enrichment5),
            ('oob_accuracy', oob_score),
        ))
        with open(info_file, 'w') as writer:
            json.dump(metainfo, writer, indent=2, sort_keys=False)
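
Because oob_score=True scores each training example only with the trees whose bootstrap sample excluded it, a single fit yields both the model and an honest validation estimate. A minimal self-contained sketch of that pattern:

from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

X, y = make_classification(n_samples=500, random_state=0)
model = RandomForestClassifier(n_estimators=200, oob_score=True,
                               random_state=0).fit(X, y)
# Column 1 of oob_decision_function_ holds the out-of-bag probability of the
# positive class; with few trees some rows can be NaN (never out-of-bag),
# hence the fill_missing_scores call above.
print('OOB AUC: %.3f' % roc_auc_score(y, model.oob_decision_function_[:, 1]))
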
Example #14
def summary():
    """An example on how to manage OOB results."""
    # for result in results:
    #     print result.model_setup_id(), result.oob_auc()
    #     molids = result.ids('lab') + result.ids('amb')
    #     scores = np.vstack((result.scores('lab'), result.scores('amb')))
    #     print len(molids), len(scores)

    df = trees_results_to_pandas()
    directory = op.join(MALARIA_TREES_EXPERIMENT_ROOT, 'analysis')
    pics_dir = op.join(directory, 'figures')
    ensure_dir(directory)
    ensure_dir(pics_dir)

    print(df.columns)

    def aucs(df):
        aucss = []
        models = []
        stds = []
        for numtrees, gdf in df.groupby('model_num_trees'):
            auc = gdf.oob_auc.mean()
            std = gdf.oob_auc.std()
            print('numtrees=%d, AUC=%.3f +/- %.3f' % (int(numtrees), auc, std))
            models.append(numtrees)
            aucss.append(auc)
            stds.append(std)
        return np.array(models), np.array(aucss), np.array(stds)

    def enrichments(df):
        enrichs = []
        models = []
        stds = []
        for numtrees, gdf in df.groupby('model_num_trees'):
            enrich = gdf.oob_enrichment5.mean()
            std = gdf.oob_enrichment5.std()
            print('numtrees=%d, Enrichment=%.3f +/- %.3f' % (int(numtrees), enrich, std))
            models.append(numtrees)
            enrichs.append(enrich)
            stds.append(std)
        return np.array(models), np.array(enrichs), np.array(stds)

    def importances(df):
        f_names = df.result[0].f_names()
        f_importances = [res.f_importances() for res in df.result]
        return f_names, f_importances

    # noinspection PyUnusedLocal
    def f_importances_variability():
        # Do the f_importances change a lot in different seeds?
        f_names, f_importances = importances(df[df.model_num_trees == 6000])
        kendalltau_all(scores=list(enumerate(f_importances)))
        # What about the ranking of the molecules?
        kendalltau_all(scores=list(enumerate(res.scores(dset='lab')[:, 1] for res in
                                             df[((df.model_num_trees == 6000) & (df.model_seed < 2)) |
                                                ((df.model_num_trees == 100) & (df.model_seed < 2))].result)))

    # noinspection PyUnusedLocal
    def plot_auc_f_num_trees(df, show=True):
        # How does the AUC vary when we increase the number of trees?
        # How does it vary across the different seeds?
        num_trees, aucss, stds = aucs(df)
        import matplotlib.pyplot as plt
        plt.errorbar(num_trees, aucss, yerr=stds)
        plt.ylim((0.6, 1))
        plt.xlabel('Number of trees')
        plt.ylabel('Average AUC for several random seeds')
        # Now let's add a little zoom to check what happens between AUC=0.9 and 1
        a = plt.axes([0.35, .25, .5, .3], facecolor='w')  # axisbg was removed in matplotlib 2.0
        plt.errorbar(num_trees[aucss >= 0.92], aucss[aucss >= 0.92], yerr=stds[aucss >= 0.92])
        plt.setp(a, xticks=np.arange(0, np.max(num_trees[aucss >= 0.92]) + 100, 1000),
                 yticks=np.arange(0.92, np.max(aucss[aucss >= 0.92]) + 0.01, 0.02))
        plt.savefig(op.join(pics_dir, 'AUC_f_numtrees.png'), bbox_inches='tight')
        plt.savefig(op.join(pics_dir, 'AUC_f_numtrees.svg'), bbox_inches='tight')
        if show:
            plt.show()  # show last: it can clear the figure, so save first

    # noinspection PyUnusedLocal
    def plot_auc_enrichment_f_num_trees(df, show=True):
        num_trees, aucss, stds = aucs(df)
        _, enrichs, stds_enrich = enrichments(df)
        import matplotlib.pyplot as plt
        plt.errorbar(num_trees, aucss, yerr=stds)
        plt.errorbar(num_trees, enrichs, yerr=stds_enrich)
        plt.xlabel('Number of trees')
        plt.legend(['AUC', 'Enrichment'], loc='lower right')
        plt.savefig(op.join(pics_dir, 'AUC_and_enrichment_f_numtrees.png'), bbox_inches='tight')
        plt.savefig(op.join(pics_dir, 'AUC_and_enrichment_f_numtrees.svg'), bbox_inches='tight')
        if show:
            plt.show()

    # What will be the top molecules?
    # We will use the mean of uncalibrated scores for num_trees = 6000
    # noinspection PyUnusedLocal
    def final_scores(dset):
        results = df.result[df.model_num_trees == 6000]
        scores = np.mean([res.scores(dset) for res in results], axis=0)
        return scores

    def top_n_important_feats(df, num_trees=6000, n=10):
        f_names, f_importances = importances(df[df.model_num_trees == num_trees])
        # Average over the different seeds:
        f_importances = np.mean(f_importances, axis=0)
        # Little normalization to better see the differences in importances
        f_importances = (f_importances - np.min(f_importances)) / (np.max(f_importances) - np.min(f_importances))
        order = np.argsort(f_importances)
        f_names = np.array(f_names)
        f_names = f_names[order]
        f_importances = f_importances[order]
        return f_names[-n:], f_importances[-n:]

    # noinspection PyUnusedLocal
    def plot_how_many_times_in_top_n(df, n=10, show=True):
        num_experiments = 0
        occurrences_in_top_n = defaultdict(int)
        for numtrees, gdf in df.groupby('model_num_trees'):
            num_experiments += 1
            f_names, _ = top_n_important_feats(df, num_trees=numtrees, n=n)
            for fn in f_names:
                occurrences_in_top_n[fn] += 1
        occurring_features = occurrences_in_top_n.keys()
        from matplotlib import pyplot as plt
        plt.plot(np.arange(1, len(occurring_features) + 1),
                 [occurrences_in_top_n[of]/float(num_experiments) for of in occurring_features], 'o')
        plt.ylim((0, 1.1))
        plt.xticks(np.arange(1, len(occurring_features) + 1), [of[6:] for of in occurring_features], rotation=25)
        plt.ylabel('Fraction of experiments where the feature is in the top %i' % n)
        figure = plt.gcf()  # get current figure
        figure.set_size_inches(16, 6)
        plt.savefig(op.join(pics_dir, 'occurrences_features_top%i.png' % n), bbox_inches='tight', dpi=100)
        plt.savefig(op.join(pics_dir, 'occurrences_features_top%i.svg' % n), bbox_inches='tight', dpi=100)
        if show:
            plt.show()  # show last: it can clear the figure, so save first

    def plot_average_feat_importances(df, show=True):
        importancess = []
        f_names = None
        for numtrees, gdf in df.groupby('model_num_trees'):
            f_names, f_importances = importances(df[df.model_num_trees == numtrees])
            # Average over the different seeds:
            f_importances = np.mean(f_importances, axis=0)
            # Little normalization to better see the differences in importances
            f_importances = (f_importances - np.min(f_importances)) / (np.max(f_importances) - np.min(f_importances))
            importancess.append(f_importances)
        av_imps = np.mean(np.array(importancess), axis=0)
        stds = np.std(np.array(importancess), axis=0)
        # Now we sort the features by importances, to get a nicer plot
        order = np.argsort(av_imps)
        av_imps = av_imps[order]
        stds = stds[order]
        f_names = f_names[order]
        import matplotlib.pyplot as plt
        plt.errorbar(np.arange(len(av_imps)), av_imps, yerr=stds, fmt='o')
        plt.xticks(np.arange(len(av_imps)), [f_name[6:] for f_name in f_names], rotation=90)
        plt.ylabel('Average normalized importance score')
        figure = plt.gcf()  # get current figure
        figure.set_size_inches(25, 17)
        plt.savefig(op.join(pics_dir, 'mean_feat_importances.png'))
        plt.savefig(op.join(pics_dir, 'mean_feat_importances.svg'))
        if show:
            plt.show()  # show last: it can clear the figure, so save first

    plot_average_feat_importances(df, show=True)