def experiment_no_folding_01(compute_scores_unlabelled=True,
                             compute_scores_screening=False,
                             cv=True,
                             regularization='l1',
                             c=1.0,
                             keep_ambiguous=False,
                             dest_dir=op.join(MALARIA_EXPS_ROOT, 'folding_rdkit')):
    # Train the same models but without folding, using binary presence (0/1) instead of counts
    dest_dir = op.join(dest_dir, 'no_folding')
    X, y, classifier, _ = start_exp(dest_dir, regularization, c, keep_ambiguous)
    X.data = np.ones(X.data.shape)
    Xunl = None
    Xscr = None
    if compute_scores_unlabelled:
        mfpunl = MalariaFingerprintsManager(dset='unl')
        Xunl = mfpunl.X()
        Xunl = csr_matrix((np.ones(Xunl.data.shape), Xunl.indices, Xunl.indptr),
                          shape=(Xunl.shape[0], X.shape[1]))
    if compute_scores_screening:
        mfscr = MalariaFingerprintsManager(dset='scr')
        Xscr = mfscr.X()
        Xscr = csr_matrix((np.ones(Xscr.data.shape), Xscr.indices, Xscr.indptr),
                          shape=(Xscr.shape[0], X.shape[1]))
    if cv:
        print(X.shape, None if Xunl is None else Xunl.shape)
        print('Cross-validating the model...')
        print(run_cv(dest_dir, X, y, classifier, Xunl, Xscr))
    # Train the full model and predict if necessary the unlabelled and screening sets
    global_dir = op.join(dest_dir, 'full_model')
    ensure_dir(global_dir)
    print('Training the global classifier...')
    print(run_full_model(global_dir, X, y, classifier, Xunl, Xscr))
def master_experiment(fold_sizes=513,
                      compute_scores_unlabelled=True,
                      compute_scores_screening=False,
                      cv=True,
                      regularization='l1',
                      c=1.0,
                      keep_ambiguous=False,
                      dest_dir=op.join(MALARIA_EXPS_ROOT, 'folding_rdkit')):
    """
    Trains a scikit-learn logistic regression on RDKit ECFP fingerprints folded to different sizes.
    If asked, we do a 10-fold cross-validation and also apply the built model
    to the screening and/or unlabelled sets.
    """
    dest_dir = op.join(dest_dir, 'fs=%i' % fold_sizes)
    print('Starting the experiment...')
    _, y, classifier, labelled = start_exp(dest_dir, regularization, c, keep_ambiguous)
    X = get_folded_fpt_sparse('lab', folding_size=fold_sizes)
    X.data = np.ones(X.data.shape)
    X = X[labelled, :]
    print(X.shape)
    print('Got the training set.')
    Xunl = None
    Xscr = None
    if compute_scores_unlabelled:
        Xunl = get_folded_fpt_sparse('unl', folding_size=fold_sizes)
        print('Got the unlabelled set.')
        print(Xunl.shape)
    if compute_scores_screening:
        Xscr = get_folded_fpt_sparse('scr', folding_size=fold_sizes)
    if cv:
        print('Cross-validating the model...')
        print(run_cv(dest_dir, X, y, classifier, Xunl, Xscr))
    # Train the full model and predict if necessary the unlabelled and screening sets
    global_dir = op.join(dest_dir, 'full_model')
    ensure_dir(global_dir)
    print('Training the global classifier...')
    print(run_full_model(global_dir, X, y, classifier, Xunl, Xscr))
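# Hedged sketch: `get_folded_fpt_sparse` is defined elsewhere in the project. The helper below
# only illustrates the kind of modulo-folding it is assumed to perform (hashing unfolded column
# indices into `folding_size` buckets of a sparse matrix). The name `_example_fold_sparse_counts`
# and this behaviour are assumptions for illustration, not the project's actual implementation.
def _example_fold_sparse_counts(X, folding_size=513):
    """Folds the columns of a sparse matrix into `folding_size` columns by index modulo."""
    from scipy.sparse import csr_matrix, coo_matrix
    X = X.tocoo()
    folded = coo_matrix((X.data, (X.row, X.col % folding_size)),
                        shape=(X.shape[0], folding_size))
    # Duplicate (row, col) entries are summed when converting to CSR, so counts accumulate
    return csr_matrix(folded)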
def run_cv(dest_dir, X, y, classifier, Xunl, Xscr):
    folds = give_cross_val_folds(y, 10)
    aucs = []
    for i, fold in enumerate(folds):
        fold_dir = op.join(dest_dir, 'fold=%i' % i)
        ensure_dir(fold_dir)
        scores_unl = None
        scores_scr = None
        train_indices = np.array([j for j in range(len(y)) if j not in fold])
        yte = y[fold]
        ytr = y[train_indices]
        Xte = X[fold, :]
        Xtr = X[train_indices, :]
        print('Training the classifier...')
        classifier.fit(Xtr, ytr)
        scores = classifier.predict_proba(Xte)[:, 1]
        if Xunl is not None:
            print('Scoring the unlabelled dataset...')
            scores_unl = classifier.predict_proba(Xunl)
        if Xscr is not None:
            print('Scoring the screening dataset...')
            scores_scr = classifier.predict_proba(Xscr)
        auc = roc_auc_score(yte, scores)
        aucs.append(auc)
        print('AUC for fold %i: %.2f' % (i, auc))
        print('********************')
        result = [classifier, scores, fold, auc, scores_unl, scores_scr]
        # Pickle in binary mode so the file reads back correctly on any platform / Python version
        with open(op.join(fold_dir, 'results.pkl'), 'wb') as writer:
            pickle.dump(result, writer)
    # noinspection PyStringFormat
    print('Average AUC: %.2f +/- %.2f' % (np.mean(np.array(aucs)), np.std(np.array(aucs))))
    print('********************')
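# Hedged sketch: how the per-fold 'results.pkl' files written by run_cv above could be read back
# to recompute the cross-validation summary. Only the list layout saved by run_cv is assumed;
# the function name is hypothetical.
def _example_collect_fold_aucs(dest_dir, num_folds=10):
    import pickle
    import os.path as op
    import numpy as np
    aucs = []
    for i in range(num_folds):
        with open(op.join(dest_dir, 'fold=%i' % i, 'results.pkl'), 'rb') as reader:
            _classifier, _scores, _fold, auc, _scores_unl, _scores_scr = pickle.load(reader)
        aucs.append(auc)
    return np.mean(aucs), np.std(aucs)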
def __init__(self, molid2i, root, prefix, data2molid, chunksize=10000):
    super(Chihuahua, self).__init__()
    self.chunksize = chunksize
    self.molid2i = molid2i
    self.num_mols = len(self.molid2i)
    self.data2molid = data2molid
    self.root = root
    self.prefix = prefix
    # Make sure the root directory exists before opening the temporary chunk files
    ensure_dir(self.root)
    self.temp_fns = [op.join(root, '%s-%d' % (prefix, base))
                     for base in xrange(0, self.num_mols, chunksize)]
    self.temp_files = [open(fn, 'w') for fn in self.temp_fns]
def start_exp(dest_dir, regularization, c, keep_ambiguous):
    """Defines the classifier and gets the non-folded data."""
    ensure_dir(dest_dir)
    classifier = LogisticRegression(penalty=regularization, C=c, class_weight='auto')
    mfp = MalariaFingerprintsManager(dset='lab')
    X, y = mfp.Xy()
    if not keep_ambiguous:
        labelled = ~np.isnan(y)
        X = X[labelled, :]
        y = y[labelled]
    else:
        # Boolean mask (select everything), so callers can always use it for row selection
        labelled = np.ones(len(y), dtype=bool)
    return X, y, classifier, labelled
def ecfps_mp(numjobs=None, dest_dir=None):
    """Python-parallel computation of ECFPs.

    Parameters:
      - numjobs: the number of worker processes to use (None = all cores in the machine).
      - dest_dir: the directory to which the fingerprints will be written, in weird fp format(TM).
    """
    dest_dir = _MALARIA_ECFPS_PARALLEL_RESULTS_DIR if dest_dir is None else dest_dir
    ensure_dir(dest_dir)
    numjobs = cpu_count() if numjobs is None else int(numjobs)
    Parallel(n_jobs=numjobs)(delayed(_molidsmiles_it_ecfp)
                             (start=start,
                              step=numjobs,
                              output_file=op.join(dest_dir,
                                                  'all__fcfp=%r__start=%d__step=%d.weirdfps' %
                                                  (fcfp, start, numjobs)),
                              fcfp=fcfp)
                             for start, fcfp in product(range(numjobs), (True, False)))
def ecfps(start=0, step=46, mols='lab', output_file=None, fcfp=True):
    """Entry point for the command line to generate fingerprints.

    Parameters:
      - start: the index of the first molecule to consider
      - step: how many molecules are skipped in each iteration
      - mols: an iterator over pairs (molid, smiles) or a string ('lab'|'unl'|'scr'|'all')
              to use one of TDT malaria's iterators
      - fcfp: generate FCFPs or ECFPs
      - output_file: the file to which the fingerprints will be written, in weird fp format(TM).
    """
    if isinstance(mols, basestring):
        mols = MOLS2MOLS[mols]()
    ensure_dir(op.dirname(output_file))
    _molidsmiles_it(start=start, step=step, mols=mols,
                    processor=_ecfp_writer(output_file=output_file, fcfp=fcfp))
def _rdkfeats_writer(output_file=None, features=None):
    """Returns a (molindex, molid, smiles) processor that computes descriptors using RDKit
    and stores them in an h5 file.

    Parameters:
      - output_file: where the descriptors will be written; this file will be overwritten.
      - features: a list of the names of the RDKit features that will be computed
                  (by default, all the descriptors exposed by the Descriptors class in RDKit)

    Returns:
      - a processor function ready to be used as a parameter to _molidsmiles_it.

    The h5 file contains the following datasets:
      - 'rdkdescs': a float matrix num_mols x num_descs
                    (all nans for a molecule if the computation failed completely)
      - 'fnames': the name of the feature in each column (num_cols)
      - 'molids': the molid corresponding to each row (num_rows)
    """
    ensure_dir(op.dirname(output_file))
    h5 = h5py.File(output_file, mode='w')
    computer = RDKitDescriptorsComputer(features)
    fnames = computer.fnames()
    nf = len(fnames)
    # dtype belongs to the dataset, not to the file
    descs = h5.create_dataset('rdkdescs', (0, nf), maxshape=(None, nf),
                              dtype=np.float32, compression='lzf')
    str_type = h5py.new_vlen(str)
    h5.create_dataset('fnames', data=fnames)
    molids = h5.create_dataset('molids', shape=(0,), maxshape=(None,), dtype=str_type)

    def process(molid, smiles):
        if molid is _END_MOLID:
            h5.close()
            return
        ne = len(molids)
        try:
            molids.resize((ne + 1,))
            molids[ne] = molid
            mol = to_rdkit_mol(smiles)
            descs.resize((ne + 1, nf))
            descs[ne, :] = computer.compute(mol)[0]
        except Exception:
            info('Failed molecule %s: %s' % (molid, smiles))
            # Make sure the row exists before filling it with nans
            descs.resize((ne + 1, nf))
            descs[ne, :] = [np.nan] * nf

    return process
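# Hedged sketch: reading back the descriptors file written by the processor above. Only the
# dataset names documented in the docstring ('rdkdescs', 'fnames', 'molids') are assumed;
# the function name is hypothetical.
def _example_read_rdkdescs(h5_file):
    import h5py
    import numpy as np
    with h5py.File(h5_file, mode='r') as h5:
        descs = h5['rdkdescs'][:]   # (num_mols, num_descs) float matrix
        fnames = [f.decode() if isinstance(f, bytes) else f for f in h5['fnames'][:]]
        molids = [m.decode() if isinstance(m, bytes) else m for m in h5['molids'][:]]
    # Rows that are all-nan correspond to molecules whose computation failed completely
    failed = np.isnan(descs).all(axis=1)
    return descs, fnames, molids, failed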
def __init__(self, root_dir):
    """Quick random access to collections of molecules on disk, using molids.

    Caveat: great for random access, not suitable for streaming purposes.
    All read molecules stay in memory until (all) handles to this memmap are closed.
    """
    # Where the index resides...
    self._root = root_dir
    ensure_dir(self._root)
    # Index {molid -> (start, numbytes)}
    self._molids_file = op.join(self._root, 'molids.txt')
    self._coords_file = op.join(self._root, 'coords.npy')
    self._molids = None
    self._coords = None
    self._molid2coords = None
    # The serialized molecules
    self._data_file = op.join(self._root, 'molsdata')
    self._filehandle = None
    self._molsdata = None
""" import logging import os.path as op from minioscail.common.misc import ensure_dir __version__ = '0.2-dev0' # --- Paths and other constants. # Make everything relative to the source location... _THIS_PATH = op.abspath(op.dirname(__file__)) # maybe jump to pkgutils? # Where the data resides MALARIA_DATA_ROOT = op.abspath(op.join(_THIS_PATH, '..', '..', 'data')) # The original downloaded files will come here MALARIA_ORIGINAL_DATA_ROOT = op.join(MALARIA_DATA_ROOT, 'original') ensure_dir(MALARIA_ORIGINAL_DATA_ROOT) # Different indices (like molid -> smiles) come here MALARIA_INDICES_ROOT = op.join(MALARIA_DATA_ROOT, 'indices') ensure_dir(MALARIA_INDICES_ROOT) # Experiment results come here MALARIA_EXPS_ROOT = op.join(MALARIA_DATA_ROOT, 'experiments') ensure_dir(MALARIA_EXPS_ROOT) # --- Common logger for the malaria code. _logger = logging.getLogger('malaria') _logger.setLevel(logging.DEBUG) debug = _logger.debug info = _logger.info warning = _logger.warning error = _logger.error
def fit_logregs(dest_dir=MALARIA_LOGREGS_EXPERIMENT_ROOT,
                # Logreg params
                logreg_penalty='l1',
                logreg_C=1.0,
                logreg_class_weight_auto=False,
                logreg_dual=False,
                logreg_tol=1e-4,
                logreg_fit_intercept=True,
                logreg_intercept_scaling=1,
                # CV params
                num_cv_folds=10,
                cv_seeds=(0,),
                save_unlabelled_predictions=False,
                save_fold_model=False,
                min_fold_auc=0.88,
                # Fingerprint folding params
                fingerprint_folder_seed=0,
                fingerprint_fold_size=1023,
                # Computational requirements params
                force=False,
                chunksize=1000000):
    """Logistic regression experiment using the liblinear wrapper in sklearn.

    Generates cross-validation results.
    """

    # TODO Remove
    if logreg_tol < 1E-5:
        info('Ignoring low-tolerance experiments (they take too long)')
        return

    info('Malaria logregs experiment')

    # Command line type inference is rotten...
    logreg_C = float(logreg_C)
    logreg_tol = float(logreg_tol)
    logreg_intercept_scaling = float(logreg_intercept_scaling)
    num_cv_folds = int(num_cv_folds)
    min_fold_auc = float(min_fold_auc)
    fingerprint_folder_seed = int(fingerprint_folder_seed)
    fingerprint_fold_size = int(fingerprint_fold_size)
    chunksize = int(chunksize)

    # Example providers
    folder = None if fingerprint_fold_size < 1 else MurmurFolder(seed=fingerprint_folder_seed,
                                                                 fold_size=fingerprint_fold_size)
    rf_lab, rf_amb, rf_unl, rf_scr = malaria_logreg_fpt_providers(folder)
    info('Data description: %s' % rf_lab.configuration().id(full=True))

    # Experiment context: data
    data_id = rf_lab.configuration().id(full=True)
    data_dir = op.join(dest_dir, data_id)
    ensure_dir(data_dir)

    for cv_seed in cv_seeds:

        # Command line type inference is rotten...
        cv_seed = int(cv_seed)

        # Deterministic randomness
        my_rng = np.random.RandomState(seed=cv_seed)

        # Experiment context: model
        logreg_params = OrderedDict((
            ('penalty', logreg_penalty),
            ('C', logreg_C),
            ('class_weight', 'auto' if logreg_class_weight_auto else None),
            ('dual', logreg_dual),
            ('tol', logreg_tol),
            ('fit_intercept', logreg_fit_intercept),
            ('intercept_scaling', logreg_intercept_scaling),
            ('random_state', my_rng.randint(low=0, high=1000 ** 4)),
        ))
        model_setup = LogisticRegression(**logreg_params)
        model_id = 'skllogreg__%s' % '__'.join(['%s=%s' % (k, str(v))
                                                for k, v in logreg_params.iteritems()])
        model_dir = op.join(data_dir, model_id)
        ensure_dir(model_dir)
        info('Model: %s' % model_id)

        # Experiment context: eval
        eval_id = 'cv__cv_seed=%d__num_folds=%d' % (cv_seed, num_cv_folds)
        eval_dir = op.join(model_dir, eval_id)
        ensure_dir(eval_dir)
        info('Eval: %d-fold cross validation (seed=%d)' % (num_cv_folds, cv_seed))

        # Already done?
        info_file = op.join(eval_dir, 'info.json')
        if op.isfile(info_file) and not force:
            info('\tAlready done, skipping...')
            return  # Oh well, a lot has been done up to here... rework somehow

        # Anytime we see this file, we know we need to stop
        stop_computing_file = op.join(eval_dir, 'STOP_BAD_FOLD')

        # ---------
        # --------- Time to work!
        # ---------

        # Save model config
        joblib.dump(model_setup, op.join(model_dir, 'model_setup.pkl'), compress=3)

        # Read labelled data in
        info('Reading data...')
        X, y = rf_lab.Xy()
        info('ne=%d; nf=%d' % rf_lab.X().shape)

        # Save molids... a bit too ad-hoc...
        save_molids(data_dir, 'lab', rf_lab.ids())
        if save_unlabelled_predictions:
            save_molids(data_dir, 'unl', rf_unl.ids())
            save_molids(data_dir, 'scr', rf_scr.ids())
            save_molids(data_dir, 'amb', rf_amb.ids())

        # Save folding information.
        # By now, all the folds have already been computed:
        #   - because we cached X
        #   - and in this case we are warranted that no new unfolded features will appear at test time
        if folder is not None:
            info('Saving the map folded_features -> unfolded_feature...')
            folded2unfolded_file = op.join(data_dir, 'folded2unfolded.h5')
            if not op.isfile(folded2unfolded_file):
                with h5py.File(folded2unfolded_file, 'w') as h5:
                    h5['f2u'] = folder.folded2unfolded()
            folder_light_file = op.join(data_dir, 'folder.pkl')
            if not op.isfile(folder_light_file):
                folder_light = copy(folder)  # Shallow copy
                folder_light.clear_cache()
                joblib.dump(folder_light, folder_light_file, compress=3)

        # Cross-val splitter
        cver = cv_splits(num_points=len(y),
                         Y=y,
                         num_folds=num_cv_folds,
                         rng=my_rng,
                         stratify=True)

        # Fit and classify
        for cv_fold_num in xrange(num_cv_folds):

            fold_info_file = op.join(eval_dir, 'fold=%d__info.json' % cv_fold_num)
            if op.isfile(fold_info_file):
                info('Fold %d already done, skipping' % cv_fold_num)
                continue

            if op.isfile(stop_computing_file):
                info('Bad fold detected, no more computations required')
                break

            # Split into train/test
            train_i, test_i = cver(cv_fold_num)
            Xtrain, ytrain = X[train_i, :], y[train_i]
            Xtest, ytest = X[test_i, :], y[test_i]

            # Copy the model...
            model = clone(model_setup)

            start = time()
            info('Training...')
            model.fit(Xtrain, ytrain)
            train_time = time() - start
            info('Model fitting has taken %.2f seconds' % train_time)

            if save_fold_model:
                info('Saving trained model')
                joblib.dump(model, op.join(eval_dir, 'fold=%d__fitmodel.pkl' % cv_fold_num), compress=3)

            info('Predicting and saving results...')
            with h5py.File(op.join(eval_dir, 'fold=%d__scores.h5' % cv_fold_num), 'w') as h5:

                start = time()

                # Test indices
                h5['test_indices'] = test_i

                # Model
                h5['logreg_coef'] = model.coef_
                h5['logreg_intercept'] = model.intercept_

                # Test examples
                info('Scoring test...')
                scores_test = model.predict_proba(Xtest)
                fold_auc = roc_auc_score(ytest, scores_test[:, 1])
                fold_enrichment5 = enrichment_at(ytest, scores_test[:, 1], percentage=0.05)
                info('Fold %d ROCAUC: %.3f' % (cv_fold_num, fold_auc))
                info('Fold %d Enrichment at 5%%: %.3f' % (cv_fold_num, fold_enrichment5))
                h5['test'] = scores_test.astype(np.float32)

                if save_unlabelled_predictions:
                    predict_malaria_unlabelled(model,
                                               h5,
                                               rf_amb=rf_amb,
                                               rf_scr=rf_scr,
                                               rf_unl=rf_unl,
                                               chunksize=chunksize)

                test_time = time() - start
                info('Predicting has taken %.2f seconds' % test_time)

            # Finally save meta-information for the fold
            metainfo = mlexp_info_helper(
                title='malaria-trees-oob',
                data_setup=data_id,
                model_setup=model_id,
                exp_function=giveupthefunc(),
            )
            metainfo.update((
                ('train_time', train_time),
                ('test_time', test_time),
                ('auc', fold_auc),
                ('enrichment5', fold_enrichment5),
            ))
            with open(fold_info_file, 'w') as writer:
                json.dump(metainfo, writer, indent=2, sort_keys=False)

            # One last thing, should we stop now?
            if fold_auc < min_fold_auc:
                stop_message = 'The fold %d was bad (auc %.3f < %.3f), skipping the rest of the folds' % \
                               (cv_fold_num, fold_auc, min_fold_auc)
                info(stop_message)
                with open(stop_computing_file, 'w') as writer:
                    writer.write(stop_message)

        # Summarize cross-val in the info file
        metainfo = mlexp_info_helper(
            title='malaria-trees-oob',
            data_setup=data_id,
            model_setup=model_id,
            exp_function=giveupthefunc(),
        )
        metainfo.update((
            ('num_cv_folds', num_cv_folds),
            ('cv_seed', cv_seed),
        ))
        metainfo.update(logreg_params.items())
        with open(info_file, 'w') as writer:
            json.dump(metainfo, writer, indent=2, sort_keys=False)
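# Hedged sketch: aggregating the per-fold metadata written above. Only the file naming scheme
# ('fold=%d__info.json') and the 'auc' / 'enrichment5' keys saved in this module are assumed;
# the function name is hypothetical.
def _example_summarize_cv_folds(eval_dir):
    import glob
    import json
    import os.path as op
    import numpy as np
    aucs, enrichments = [], []
    for fold_info_file in sorted(glob.glob(op.join(eval_dir, 'fold=*__info.json'))):
        with open(fold_info_file, 'r') as reader:
            fold_info = json.load(reader)
        aucs.append(fold_info['auc'])
        enrichments.append(fold_info['enrichment5'])
    return np.mean(aucs), np.std(aucs), np.mean(enrichments), np.std(enrichments)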
def merge_submissions(calibrate=False,
                      select_top_scr=None,
                      with_bug=False,
                      dest_dir=MALARIA_EXPS_ROOT):
    """Very ad-hoc merge of submissions obtained with trees and logistic regressors."""

    #####
    # 0 Preparations
    #####

    # Avoid circular imports
    from ccl_malaria.logregs_fit import MALARIA_LOGREGS_EXPERIMENT_ROOT
    from ccl_malaria.logregs_analysis import malaria_logreg_file_prefix
    from ccl_malaria.trees_fit import MALARIA_TREES_EXPERIMENT_ROOT

    mc = MalariaCatalog()

    ensure_dir(dest_dir)

    def save_submission(sub, outfile, select_top=500):
        # Get the smiles
        smiles = mc.molids2smiless(sub.index)
        # Rankings
        ranks, (sscores, smolids, ssmiles) = rank_sort(sub.values,
                                                       (sub.values, sub.index.values, smiles),
                                                       reverse=True,
                                                       select_top=select_top)
        # Save for submission
        with open(outfile, 'w') as writer:
            for molid, smiles, score in zip(smolids, ssmiles, sscores):
                writer.write('%s,%s,%.6f\n' % (molid, smiles, score))

    #####
    # 1 Robust merge using pandas
    #####

    def read_average_merge(root, prefix):
        hit = pd.read_pickle(op.join(root, '%s_hitSelection.pkl' % prefix))
        labels = mc.molids2labels(hit.index, as01=True)
        lab = hit[~np.isnan(labels)]
        amb = hit[np.isnan(labels)]
        unl = pd.read_pickle(op.join(root, '%s_unl-averaged.pkl' % prefix))
        scr = pd.read_pickle(op.join(root, '%s_scr-averaged.pkl' % prefix))
        return lab, amb, unl, scr

    tlab, tamb, tunl, tscr = read_average_merge(MALARIA_TREES_EXPERIMENT_ROOT, 'trees')
    llab, lamb, lunl, lscr = read_average_merge(MALARIA_LOGREGS_EXPERIMENT_ROOT,
                                                malaria_logreg_file_prefix(with_bug=with_bug))

    lab = DataFrame({'trees': tlab, 'logregs': llab})
    lab['labels'] = mc.molids2labels(lab.index, as01=True)
    assert np.sum(np.isnan(lab['labels'])) == 0
    amb = DataFrame({'trees': tamb, 'logregs': lamb})
    unl = DataFrame({'trees': tunl, 'logregs': lunl})
    scr = DataFrame({'trees': tscr, 'logregs': lscr})

    # ATM we take it easy and just drop any NA
    lab.dropna(inplace=True)
    amb.dropna(inplace=True)
    unl.dropna(inplace=True)
    scr.dropna(inplace=True)

    #####
    # 2 Calibration on labelling - careful with overfitting for hitList, do it in cross-val fashion
    #####

    def calibrate_col(col):
        # isotonic not the best here, and faces numerical issues
        calibrator = IsotonicRegression(y_min=0, y_max=1)
        x = lab[~np.isnan(lab[col])][col].values
        y = lab[~np.isnan(lab[col])]['labels'].values
        # This worked with old sklearn
        try:
            # Old sklearn
            calibrator.fit(x.reshape(-1, 1), y)
            lab[col] = calibrator.predict(lab[col].values.reshape(-1, 1))
            amb[col] = calibrator.predict(amb[col].values.reshape(-1, 1))
            unl[col] = calibrator.predict(unl[col].values.reshape(-1, 1))
            scr[col] = calibrator.predict(scr[col].values.reshape(-1, 1))
        except ValueError:
            # Newer sklearn
            calibrator.fit(x.ravel(), y)
            lab[col] = calibrator.predict(lab[col].values.ravel())
            amb[col] = calibrator.predict(amb[col].values.ravel())
            unl[col] = calibrator.predict(unl[col].values.ravel())
            scr[col] = calibrator.predict(scr[col].values.ravel())

    if calibrate:
        calibrate_col('trees')
        calibrate_col('logregs')

    #####
    # 3 Average for the submission in lab-amb
    #####

    submission_lab = (lab.trees + lab.logregs) / 2
    submission_amb = (amb.trees + amb.logregs) / 2
    submission_hts = pd.concat((submission_lab, submission_amb))
    submission_options = '%s-%s' % ('calibrated' if calibrate else 'nonCalibrated',
                                    'lastFold' if with_bug else 'averageFolds')
    outfile = op.join(dest_dir, 'final-merged-%s-hitSelection.csv' % submission_options)
    save_submission(submission_hts, outfile)

    #####
    # 4 Average predictions for unlabelled
    #####

    submission_unl_avg = (unl.trees + unl.logregs) / 2
    outfile = op.join(dest_dir, 'final-%s-avg-unl.csv' % submission_options)
    save_submission(submission_unl_avg, outfile, select_top=None)

    submission_scr_avg = (scr.trees + scr.logregs) / 2
    outfile = op.join(dest_dir, 'final-%s-avg-scr.csv' % submission_options)
    save_submission(submission_scr_avg, outfile, select_top=select_top_scr)

    #####
    # 5 Stacked (linear regression) for unlabelled
    #####

    stacker = LinearRegression()
    stacker.fit(lab[['trees', 'logregs']], lab['labels'])

    def robust_predict(X):
        X = np.asarray(X)
        row_is_finite = np.all(np.isfinite(X), axis=1)
        scores = np.full(len(X), fill_value=np.nan)
        scores[row_is_finite] = stacker.predict(X[row_is_finite])
        return scores

    # noinspection PyArgumentList
    submission_unl_st = Series(data=robust_predict(unl[['trees', 'logregs']]), index=unl.index)
    outfile = op.join(dest_dir, 'final-%s-stacker=linr-unl.csv' % submission_options)
    save_submission(submission_unl_st, outfile, select_top=None)

    # noinspection PyArgumentList
    submission_scr_st = Series(data=robust_predict(scr[['trees', 'logregs']]), index=scr.index)
    outfile = op.join(dest_dir, 'final-%s-stacker=linr-scr.csv' % submission_options)
    save_submission(submission_scr_st, outfile, select_top=select_top_scr)
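# Hedged sketch: loading one of the submission CSVs written by save_submission above
# (three headerless columns: molid, smiles, score). The default file name is just an example of
# the pattern produced with calibrate=False and with_bug=False; the function name is hypothetical.
def _example_load_submission(csv_file='final-merged-nonCalibrated-averageFolds-hitSelection.csv'):
    import pandas as pd
    return pd.read_csv(csv_file, header=None, names=['molid', 'smiles', 'score'])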
def fit(dest_dir=MALARIA_TREES_EXPERIMENT_ROOT,
        seeds=(0, 1, 2, 3, 4),
        num_treess=(10, 6000, 4000, 2000, 1000, 500, 20, 50, 100),
        save_trained_models=False,
        chunksize=200000,
        num_threads=None,
        force=False):
    """Generates OOB results for random forests and extremely randomized trees."""

    info('Malaria trees experiment')

    # Guess the number of threads
    if num_threads is None:
        num_threads = cpu_count()
    info('Will use %d threads' % num_threads)

    # Example providers
    info('Reading data...')
    rf_lab = MalariaRDKFsExampleSet()
    X, y = rf_lab.Xy()
    rf_unl = MalariaRDKFsExampleSet(dset='unl', remove_ambiguous=False)
    rf_scr = MalariaRDKFsExampleSet(dset='scr', remove_ambiguous=False)
    rf_amb = MalariaRDKFsExampleSet(dset='amb')
    # A bit of logging
    info('Data description: %s' % rf_lab.configuration().id(nonids_too=True))
    info('ne=%d; nf=%d' % rf_lab.X().shape)

    # Experiment context: data
    data_id = rf_lab.configuration().id(nonids_too=True)  # TODO: bring hashing from oscail
    data_dir = op.join(dest_dir, data_id)
    ensure_dir(data_dir)

    # Save molids... a bit too ad-hoc...
    info('Saving molids...')
    save_molids(data_dir, 'lab', rf_lab.ids())
    save_molids(data_dir, 'unl', rf_unl.ids())
    save_molids(data_dir, 'scr', rf_scr.ids())
    save_molids(data_dir, 'amb', rf_amb.ids())

    # Main loop - TODO: robustify with try and continue
    for etc, seed, num_trees in product((True, False), seeds, num_treess):

        # Configure the model
        if etc:
            model = ExtraTreesClassifier(n_estimators=num_trees,
                                         n_jobs=num_threads,
                                         bootstrap=True,
                                         oob_score=True,
                                         random_state=seed)
        else:
            model = RandomForestClassifier(n_estimators=num_trees,
                                           n_jobs=num_threads,
                                           oob_score=True,
                                           random_state=seed)

        # Experiment context: model
        model_id = 'trees__etc=%r__num_trees=%d__seed=%d' % (etc, num_trees, seed)  # TODO: bring self-id from oscail
        model_dir = op.join(data_dir, model_id)
        ensure_dir(model_dir)
        info('Model: %s' % model_id)

        # Experiment context: eval
        eval_id = 'oob'
        eval_dir = op.join(model_dir, eval_id)
        ensure_dir(eval_dir)
        info('Eval: OOB (Out Of Bag)')

        # Already done?
        info_file = op.join(eval_dir, 'info.json')
        if op.isfile(info_file) and not force:
            info('\tAlready done, skipping...')
            continue

        # Save model config
        joblib.dump(model, op.join(model_dir, 'model_setup.pkl'), compress=3)

        # Train-full
        info('Training...')
        start = time()
        model.fit(X, y)
        train_time = time() - start  # This is also test-time, as per OOB=True

        # Save trained model? - yeah, let's do it under oob
        if save_trained_models:
            joblib.dump(model, op.join(eval_dir, 'model_trained.pkl'), compress=3)

        # OOB score, auc and enrichment
        oob_score = model.oob_score_
        oob_scores = model.oob_decision_function_
        oob_scores_not_missing = fill_missing_scores(oob_scores[:, 1])

        auc = roc_auc_score(y, oob_scores_not_missing)
        enrichment5 = enrichment_at(y, oob_scores_not_missing, percentage=0.05)

        info('OOB AUC: %.2f' % auc)
        info('OOB Enrichment at 5%%: %.2f' % enrichment5)
        info('OOB Accuracy: %.2f' % oob_score)

        # Save scores and importances
        info('Saving results...')
        with h5py.File(op.join(eval_dir, 'oob_auc=%.2f__scores.h5' % auc), 'w') as h5:
            start = time()
            # Feature importances
            h5['f_names'] = rf_lab.fnames()
            h5['f_importances'] = model.feature_importances_
            # Labelled (development) examples
            info('Scoring lab...')
            h5['lab'] = oob_scores.astype(np.float32)
            info('Scoring amb...')
            h5['amb'] = model.predict_proba(rf_amb.X()).astype(np.float32)
            # Unlabelled (competition) examples
            info('Scoring unl...')
            h5['unl'] = model.predict_proba(rf_unl.X()).astype(np.float32)
            # Unlabelled (screening) examples
            info('Scoring scr...')
            if chunksize <= 0:
                # Probabilities, so keep them as float32 like the other datasets
                h5['scr'] = model.predict_proba(rf_scr.X()).astype(np.float32)
            else:
                scr = h5.create_dataset('scr', shape=(rf_scr.ne_stream(), 2), dtype=np.float32)
                for i, x in enumerate(rf_scr.X_stream(chunksize=chunksize)):
                    base = i * chunksize
                    info('\t num_scr_examples: %d' % base)
                    scr[base:base + chunksize] = model.predict_proba(x)
            test_time = time() - start

        # Finally save meta-information
        metainfo = mlexp_info_helper(
            title='malaria-trees-oob',
            data_setup=data_id,
            model_setup=model_id,
            exp_function=fit,
        )
        metainfo.update((
            ('train_time', train_time),
            ('test_time', test_time),
            ('oob_auc', auc),
            ('oob_enrichment5', enrichment5),
            ('oob_accuracy', oob_score),
        ))
        with open(info_file, 'w') as writer:
            json.dump(metainfo, writer, indent=2, sort_keys=False)
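# Hedged sketch: reading back one of the OOB score files written above. Only the dataset names
# created in this function ('lab', 'f_names', 'f_importances', ...) are assumed; the file name
# follows the 'oob_auc=%.2f__scores.h5' pattern and the function name is hypothetical.
def _example_read_oob_scores(h5_file):
    import h5py
    with h5py.File(h5_file, mode='r') as h5:
        # OOB probability of the positive class for the labelled set
        lab_scores = h5['lab'][:, 1]
        f_names = [f.decode() if isinstance(f, bytes) else f for f in h5['f_names'][:]]
        importances = dict(zip(f_names, h5['f_importances'][:]))
    return lab_scores, importances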
def summary():
    """An example on how to manage OOB results."""

    # for result in results:
    #     print result.model_setup_id(), result.oob_auc()
    #     molids = result.ids('lab') + result.ids('amb')
    #     scores = np.vstack((result.scores('lab'), result.scores('amb')))
    #     print len(molids), len(scores)

    df = trees_results_to_pandas()

    directory = op.join(MALARIA_TREES_EXPERIMENT_ROOT, 'analysis')
    pics_dir = op.join(directory, 'figures')
    ensure_dir(directory)
    ensure_dir(pics_dir)

    print(df.columns)

    def aucs(df):
        aucss = []
        models = []
        stds = []
        for numtrees, gdf in df.groupby(['model_num_trees']):
            auc = gdf.oob_auc.mean()
            std = gdf.oob_auc.std()
            print('numtrees=%d, AUC=%.3f +/- %.3f' % (int(numtrees), auc, std))
            models.append(numtrees)
            aucss.append(auc)
            stds.append(std)
        return np.array(models), np.array(aucss), np.array(stds)

    def enrichments(df):
        enrichs = []
        models = []
        stds = []
        for numtrees, gdf in df.groupby(['model_num_trees']):
            enrich = gdf.oob_enrichment5.mean()
            std = gdf.oob_enrichment5.std()
            print('numtrees=%d, Enrichment=%.3f +/- %.3f' % (int(numtrees), enrich, std))
            models.append(numtrees)
            enrichs.append(enrich)
            stds.append(std)
        return np.array(models), np.array(enrichs), np.array(stds)

    def importances(df):
        f_names = df.result[0].f_names()
        f_importances = [res.f_importances() for res in df.result]
        return f_names, f_importances

    # noinspection PyUnusedLocal
    def f_importances_variability():
        # Do the f_importances change a lot with different seeds?
        f_names, f_importances = importances(df[df.model_num_trees == 6000])
        kendalltau_all(scores=list(enumerate(f_importances)))
        # What about the ranking of the molecules?
        kendalltau_all(scores=list(enumerate(
            res.scores(dset='lab')[:, 1]
            for res in df[((df.model_num_trees == 6000) & (df.model_seed < 2)) |
                          ((df.model_num_trees == 100) & (df.model_seed < 2))].result)))

    # noinspection PyUnusedLocal
    def plot_auc_f_num_trees(df, show=True):
        # How does the AUC vary when we increase the number of trees?
        # How does it vary across the different seeds?
        num_trees, aucss, stds = aucs(df)
        import matplotlib.pyplot as plt
        plt.errorbar(num_trees, aucss, yerr=stds)
        plt.ylim((0.6, 1))
        plt.xlabel('Number of trees')
        plt.ylabel('Average AUC for several random seeds')
        # Now let's add a little zoom to check what happens between AUC=0.9 and 1
        a = plt.axes([0.35, .25, .5, .3], axisbg='w')
        plt.errorbar(num_trees[aucss >= 0.92], aucss[aucss >= 0.92], yerr=stds[aucss >= 0.92])
        plt.setp(a,
                 xticks=np.arange(0, np.max(num_trees[aucss >= 0.92]) + 100, 1000),
                 yticks=np.arange(0.92, np.max(aucss[aucss >= 0.92]) + 0.01, 0.02))
        if show:
            plt.show()
        plt.savefig(op.join(pics_dir, 'AUC_f_numtrees.png'), bbox_inches='tight')
        plt.savefig(op.join(pics_dir, 'AUC_f_numtrees.svg'), bbox_inches='tight')

    # noinspection PyUnusedLocal
    def plot_auc_enrichment_f_num_trees(df, show=True):
        num_trees, aucss, stds = aucs(df)
        _, enrichs, stds_enrich = enrichments(df)
        import matplotlib.pyplot as plt
        plt.errorbar(num_trees, aucss, yerr=stds)
        plt.errorbar(num_trees, enrichs, yerr=stds_enrich)
        plt.xlabel('Number of trees')
        plt.legend(['AUC', 'Enrichment'], loc='lower right')
        plt.savefig(op.join(pics_dir, 'AUC_and_enrichment_f_numtrees.png'), bbox_inches='tight')
        plt.savefig(op.join(pics_dir, 'AUC_and_enrichment_f_numtrees.svg'), bbox_inches='tight')
        if show:
            plt.show()

    # What will be the top molecules?
    # We will use the mean of uncalibrated scores for num_trees = 6000
    # noinspection PyUnusedLocal
    def final_scores(dset):
        results = df.result[df.model_num_trees == 6000]
        scores = np.mean([res.scores(dset) for res in results], axis=0)
        return scores

    def top_n_important_feats(df, num_trees=6000, n=10):
        f_names, f_importances = importances(df[df.model_num_trees == num_trees])
        # Average over the different seeds:
        f_importances = np.mean(f_importances, axis=0)
        # Little normalization to better see the differences in importances
        f_importances = (f_importances - np.min(f_importances)) / \
                        (np.max(f_importances) - np.min(f_importances))
        order = np.argsort(f_importances)
        f_names = np.array(f_names)
        f_names = f_names[order]
        f_importances = f_importances[order]
        return f_names[-n:], f_importances[-n:]

    # noinspection PyUnusedLocal
    def plot_how_many_times_in_top_n(df, n=10, show=True):
        num_experiments = 0
        occurrences_in_top_n = defaultdict(int)
        for numtrees, gdf in df.groupby(['model_num_trees']):
            num_experiments += 1
            f_names, _ = top_n_important_feats(df, num_trees=numtrees, n=n)
            for fn in f_names:
                occurrences_in_top_n[fn] += 1
        occurring_features = occurrences_in_top_n.keys()
        from matplotlib import pyplot as plt
        plt.plot(np.arange(1, len(occurring_features) + 1),
                 [occurrences_in_top_n[of] / float(num_experiments) for of in occurring_features],
                 'o')
        plt.ylim((0, 1.1))
        plt.xticks(np.arange(1, len(occurring_features) + 1),
                   [of[6:] for of in occurring_features],
                   rotation=25)
        plt.ylabel('Percentage of presence among the top %i features' % n)
        if show:
            plt.show()
        figure = plt.gcf()  # get current figure
        figure.set_size_inches(16, 6)
        plt.savefig(op.join(pics_dir, 'occurrences_features_top%i.png' % n), bbox_inches='tight', dpi=100)
        plt.savefig(op.join(pics_dir, 'occurrences_features_top%i.svg' % n), bbox_inches='tight', dpi=100)

    def plot_average_feat_importances(df, show=True):
        importancess = []
        f_names = None
        for numtrees, gdf in df.groupby(['model_num_trees']):
            f_names, f_importances = importances(df[df.model_num_trees == numtrees])
            # Average over the different seeds:
            f_importances = np.mean(f_importances, axis=0)
            # Little normalization to better see the differences in importances
            f_importances = (f_importances - np.min(f_importances)) / \
                            (np.max(f_importances) - np.min(f_importances))
            importancess.append(f_importances)
        av_imps = np.mean(np.array(importancess), axis=0)
        stds = np.std(np.array(importancess), axis=0)
        # Now we sort the features by importances, to get a nicer plot
        order = np.argsort(av_imps)
        av_imps = av_imps[order]
        stds = stds[order]
        f_names = np.array(f_names)[order]
        import matplotlib.pyplot as plt
        plt.errorbar(np.arange(len(av_imps)), av_imps, yerr=stds, fmt='o')
        plt.xticks(np.arange(len(av_imps)), [f_name[6:] for f_name in f_names], rotation=90)
        plt.ylabel('Average normalized importance score')
        if show:
            plt.show()
        figure = plt.gcf()  # get current figure
        figure.set_size_inches(25, 17)
        plt.savefig(op.join(pics_dir, 'mean_feat_importances.png'))
        plt.savefig(op.join(pics_dir, 'mean_feat_importances.svg'))

    plot_average_feat_importances(df, show=True)