def fit_logregs(dest_dir=MALARIA_LOGREGS_EXPERIMENT_ROOT,
                # Logreg params
                logreg_penalty='l1',
                logreg_C=1.0,
                logreg_class_weight_auto=False,
                logreg_dual=False,
                logreg_tol=1e-4,
                logreg_fit_intercept=True,
                logreg_intercept_scaling=1,
                # CV params
                num_cv_folds=10,
                cv_seeds=(0,),
                save_unlabelled_predictions=False,
                save_fold_model=False,
                min_fold_auc=0.88,
                # Fingerprint folding params
                fingerprint_folder_seed=0,
                fingerprint_fold_size=1023,
                # Computational requirements params
                force=False,
                chunksize=1000000):
    """Logistic regression experiment using the liblinear wrapper in sklearn.

    Generates cross-validation results. For each seed in `cv_seeds` the
    results are stored under `dest_dir/<data_id>/<model_id>/<eval_id>/`:
      - `fold=<k>__scores.h5`: test indices, model coefficients/intercept and
        test scores of fold k (plus unlabelled predictions if requested)
      - `fold=<k>__info.json`: timings, AUC and enrichment at 5% of fold k
      - `info.json`: summary written once the fold loop finishes
      - `STOP_BAD_FOLD`: written when a fold has AUC < `min_fold_auc`

    Numeric parameters may come in as strings (command line) and are coerced.

    Parameters
    ----------
    dest_dir: root directory for the results.
    logreg_penalty, logreg_C, logreg_dual, logreg_tol, logreg_fit_intercept,
    logreg_intercept_scaling: passed to LogisticRegression as `penalty`, `C`,
        `dual`, `tol`, `fit_intercept` and `intercept_scaling`.
    logreg_class_weight_auto: if true, `class_weight='auto'`; otherwise None.
    num_cv_folds: number of (stratified) cross-validation folds.
    cv_seeds: seeds; each one drives both the splitter and the model's
        `random_state` and gets its own evaluation directory.
    save_unlabelled_predictions: also save the molecule ids of, and the
        predictions for, the unl/scr/amb example sets.
    save_fold_model: pickle the fitted model of each fold.
    min_fold_auc: folds with an AUC under this threshold stop the
        computation of the remaining folds for that seed.
    fingerprint_folder_seed, fingerprint_fold_size: MurmurFolder setup;
        no folding is used if `fingerprint_fold_size` < 1.
    force: if false, seeds that already have an `info.json` are skipped.
        Note that folds with an existing `fold=<k>__info.json` are skipped
        regardless of this flag.
    chunksize: passed through to `predict_malaria_unlabelled`.

    Returns
    -------
    None
    """
    # Coerce the tolerance *before* the guard below: when it comes as a string
    # from the command line the comparison would otherwise be meaningless
    # (always False in python 2, a TypeError in python 3).
    logreg_tol = float(logreg_tol)

    ### TODO Remove
    if logreg_tol < 1E-5:
        info('Ignoring long intolerant experiments')
        return

    info('Malaria logregs experiment')

    # Command line type inference is rotten...
    logreg_C = float(logreg_C)
    logreg_intercept_scaling = float(logreg_intercept_scaling)
    num_cv_folds = int(num_cv_folds)
    min_fold_auc = float(min_fold_auc)
    fingerprint_folder_seed = int(fingerprint_folder_seed)
    fingerprint_fold_size = int(fingerprint_fold_size)
    chunksize = int(chunksize)

    # Example providers
    folder = None if fingerprint_fold_size < 1 else MurmurFolder(seed=fingerprint_folder_seed,
                                                                 fold_size=fingerprint_fold_size)
    rf_lab, rf_amb, rf_unl, rf_scr = malaria_logreg_fpt_providers(folder)
    info('Data description: %s' % rf_lab.configuration().id(full=True))

    # Experiment context: data
    data_id = rf_lab.configuration().id(full=True)
    data_dir = op.join(dest_dir, data_id)
    ensure_dir(data_dir)

    for cv_seed in cv_seeds:

        # Command line type inference is rotten...
        cv_seed = int(cv_seed)

        # Deterministic randomness
        my_rng = np.random.RandomState(seed=cv_seed)

        # Experiment context: model
        logreg_params = OrderedDict((
            ('penalty', logreg_penalty),
            ('C', logreg_C),
            ('class_weight', 'auto' if logreg_class_weight_auto else None),
            ('dual', logreg_dual),
            ('tol', logreg_tol),
            ('fit_intercept', logreg_fit_intercept),
            ('intercept_scaling', logreg_intercept_scaling),
            ('random_state', my_rng.randint(low=0, high=1000 ** 4)),
        ))
        model_setup = LogisticRegression(**logreg_params)
        # The model id doubles as directory name, so it must stay stable
        model_id = 'skllogreg__%s' % '__'.join(['%s=%s' % (k, str(v)) for k, v in logreg_params.items()])
        model_dir = op.join(data_dir, model_id)
        ensure_dir(model_dir)
        info('Model: %s' % model_id)

        # Experiment context: eval
        eval_id = 'cv__cv_seed=%d__num_folds=%d' % (cv_seed, num_cv_folds)
        eval_dir = op.join(model_dir, eval_id)
        ensure_dir(eval_dir)
        info('Eval: %d-fold cross validation (seed=%d)' % (num_cv_folds, cv_seed))

        # Already done?
        info_file = op.join(eval_dir, 'info.json')
        if op.isfile(info_file) and not force:
            info('\tAlready done, skipping...')
            # Skip only this seed; the remaining seeds have their own eval dir
            # and might still need to be computed.
            continue  # Oh well, a lot have been done up to here... rework somehow

        # Anytime we see this file, we know we need to stop
        stop_computing_file = op.join(eval_dir, 'STOP_BAD_FOLD')

        #---------
        #--------- Time to work!
        #---------

        # Save model config
        joblib.dump(model_setup, op.join(model_dir, 'model_setup.pkl'), compress=3)

        # Read labelled data in
        info('Reading data...')
        X, y = rf_lab.Xy()
        info('ne=%d; nf=%d' % rf_lab.X().shape)

        # Save molids... a bit too ad-hoc...
        save_molids(data_dir, 'lab', rf_lab.ids())
        if save_unlabelled_predictions:
            save_molids(data_dir, 'unl', rf_unl.ids())
            save_molids(data_dir, 'scr', rf_scr.ids())
            save_molids(data_dir, 'amb', rf_amb.ids())

        # Save folding information.
        # By now, all the folds have already been computed:
        #   - because we cached X
        #   - and in this case we are warranted that no new unfolded features will appear at test time
        if folder is not None:
            info('Saving the map folded_features -> unfolded_feature...')
            folded2unfolded_file = op.join(data_dir, 'folded2unfolded.h5')
            if not op.isfile(folded2unfolded_file):
                # Explicit write mode: the h5py default mode is read-only in
                # recent versions, which fails for a file that does not exist.
                with h5py.File(folded2unfolded_file, 'w') as h5:
                    h5['f2u'] = folder.folded2unfolded()
            folder_light_file = op.join(data_dir, 'folder.pkl')
            if not op.isfile(folder_light_file):
                folder_light = copy(folder)  # Shallow copy
                folder_light.clear_cache()
                joblib.dump(folder_light, folder_light_file, compress=3)

        # Cross-val splitter
        cver = cv_splits(num_points=len(y),
                         Y=y,
                         num_folds=num_cv_folds,
                         rng=my_rng,
                         stratify=True)

        # Fit and classify
        for cv_fold_num in range(num_cv_folds):

            fold_info_file = op.join(eval_dir, 'fold=%d__info.json' % cv_fold_num)
            if op.isfile(fold_info_file):
                info('Fold %d already done, skipping' % cv_fold_num)
                continue

            if op.isfile(stop_computing_file):
                info('Bad fold detected, no more computations required')
                break

            # Split into train/test
            train_i, test_i = cver(cv_fold_num)
            Xtrain, ytrain = X[train_i, :], y[train_i]
            Xtest, ytest = X[test_i, :], y[test_i]

            # Copy the model...
            model = clone(model_setup)

            start = time()
            info('Training...')
            model.fit(Xtrain, ytrain)
            train_time = time() - start
            info('Model fitting has taken %.2f seconds' % train_time)

            if save_fold_model:
                info('Saving trained model')
                joblib.dump(model, op.join(eval_dir, 'fold=%d__fitmodel.pkl' % cv_fold_num), compress=3)

            info('Predicting and saving results...')
            with h5py.File(op.join(eval_dir, 'fold=%d__scores.h5' % cv_fold_num), 'w') as h5:

                start = time()

                # Test indices
                h5['test_indices'] = test_i

                # Model
                h5['logreg_coef'] = model.coef_
                h5['logreg_intercept'] = model.intercept_

                # Test examples
                info('Scoring test...')
                scores_test = model.predict_proba(Xtest)
                fold_auc = roc_auc_score(ytest, scores_test[:, 1])
                fold_enrichment5 = enrichment_at(ytest, scores_test[:, 1], percentage=0.05)
                info('Fold %d ROCAUC: %.3f' % (cv_fold_num, fold_auc))
                info('Fold %d Enrichment at 5%%: %.3f' % (cv_fold_num, fold_enrichment5))
                h5['test'] = scores_test.astype(np.float32)

                if save_unlabelled_predictions:
                    predict_malaria_unlabelled(model,
                                               h5,
                                               rf_amb=rf_amb,
                                               rf_scr=rf_scr,
                                               rf_unl=rf_unl,
                                               chunksize=chunksize)

                test_time = time() - start
                info('Predicting has taken %.2f seconds' % test_time)

            # Finally save meta-information for the fold.
            # The fold info file is what marks a fold as done (see the check at
            # the top of the loop), so it is written after the scores file.
            # NOTE(review): the title looks copy-pasted from the trees
            # experiment; kept as is because stored results may rely on it.
            metainfo = mlexp_info_helper(
                title='malaria-trees-oob',
                data_setup=data_id,
                model_setup=model_id,
                exp_function=giveupthefunc(),
            )
            metainfo.update((
                ('train_time', train_time),
                ('test_time', test_time),
                ('auc', fold_auc),
                ('enrichment5', fold_enrichment5),
            ))
            with open(fold_info_file, 'w') as writer:
                json.dump(metainfo, writer, indent=2, sort_keys=False)

            # One last thing, should we stop now?
            # The stop file is picked up at the beginning of the next iteration.
            if fold_auc < min_fold_auc:
                stop_message = 'The fold %d was bad (auc %.3f < %.3f), skipping the rest of the folds' % \
                               (cv_fold_num, fold_auc, min_fold_auc)
                info(stop_message)
                with open(stop_computing_file, 'w') as writer:
                    writer.write(stop_message)

        # Summarize cross-val in the info file
        # (this is written even if the fold loop stopped early on a bad fold)
        # NOTE(review): same copy-pasted title as in the per-fold info.
        metainfo = mlexp_info_helper(
            title='malaria-trees-oob',
            data_setup=data_id,
            model_setup=model_id,
            exp_function=giveupthefunc(),
        )
        metainfo.update((
            ('num_cv_folds', num_cv_folds),
            ('cv_seed', cv_seed),
        ))
        metainfo.update(logreg_params.items())
        with open(info_file, 'w') as writer:
            json.dump(metainfo, writer, indent=2, sort_keys=False)
def fit(dest_dir=MALARIA_TREES_EXPERIMENT_ROOT,
        seeds=(0, 1, 2, 3, 4),
        num_treess=(10, 6000, 4000, 2000, 1000, 500, 20, 50, 100),
        save_trained_models=False,
        chunksize=200000,
        num_threads=None,
        force=False):
    """Tree ensembles experiment (extra-trees and random forests), generates OOB results.

    A model is trained on the full labelled set for each combination of
    (extra-trees or random forest, seed, number of trees). The results are
    stored under `dest_dir/<data_id>/<model_id>/oob/`:
      - `oob_auc=<auc>__scores.h5`: feature names and importances, OOB scores
        for the labelled examples ('lab') and predicted probabilities for the
        'amb', 'unl' and 'scr' example sets
      - `info.json`: timings, OOB AUC, enrichment at 5% and accuracy

    Parameters
    ----------
    dest_dir: root directory for the results.
    seeds: values used as `random_state` for the models.
    num_treess: values used as `n_estimators` for the models.
    save_trained_models: pickle each fitted model under its eval directory.
    chunksize: number of screening examples scored at a time; if <= 0 the
        screening set is scored in one go.
    num_threads: `n_jobs` for the models; defaults to `cpu_count()` if None.
    force: if false, configurations that already have an `info.json`
        are skipped.

    Returns
    -------
    None
    """
    info('Malaria trees experiment')

    # Guess the number of threads
    if num_threads is None:
        num_threads = cpu_count()
    info('Will use %d threads' % num_threads)

    # Example providers
    info('Reading data...')
    rf_lab = MalariaRDKFsExampleSet()
    X, y = rf_lab.Xy()
    rf_unl = MalariaRDKFsExampleSet(dset='unl', remove_ambiguous=False)
    rf_scr = MalariaRDKFsExampleSet(dset='scr', remove_ambiguous=False)
    rf_amb = MalariaRDKFsExampleSet(dset='amb')
    # A bit of logging
    info('Data description: %s' % rf_lab.configuration().id(nonids_too=True))
    info('ne=%d; nf=%d' % rf_lab.X().shape)

    # Experiment context: data
    data_id = rf_lab.configuration().id(nonids_too=True)  # TODO: bring hashing from oscail
    data_dir = op.join(dest_dir, data_id)
    ensure_dir(data_dir)

    # Save molids... a bit too ad-hoc...
    info('Saving molids...')
    save_molids(data_dir, 'lab', rf_lab.ids())
    save_molids(data_dir, 'unl', rf_unl.ids())
    save_molids(data_dir, 'scr', rf_scr.ids())
    save_molids(data_dir, 'amb', rf_amb.ids())

    # Main loop - TODO: robustify with try and continue
    for etc, seed, num_trees in product((True, False), seeds, num_treess):

        # Configure the model
        if etc:
            # Extra-trees need bootstrap to be explicitly enabled to get OOB estimates
            model = ExtraTreesClassifier(n_estimators=num_trees,
                                         n_jobs=num_threads,
                                         bootstrap=True,
                                         oob_score=True,
                                         random_state=seed)
        else:
            model = RandomForestClassifier(n_estimators=num_trees,
                                           n_jobs=num_threads,
                                           oob_score=True,
                                           random_state=seed)

        # Experiment context: model
        model_id = 'trees__etc=%r__num_trees=%d__seed=%d' % (etc, num_trees, seed)  # TODO: bring self-id from oscail
        model_dir = op.join(data_dir, model_id)
        ensure_dir(model_dir)
        info('Model: %s' % model_id)

        # Experiment context: eval
        eval_id = 'oob'
        eval_dir = op.join(model_dir, eval_id)
        ensure_dir(eval_dir)
        info('Eval: OOB (Out Of Bag)')

        # Already done?
        info_file = op.join(eval_dir, 'info.json')
        if op.isfile(info_file) and not force:
            info('\tAlready done, skipping...')
            continue

        # Save model config
        joblib.dump(model, op.join(model_dir, 'model_setup.pkl'), compress=3)

        # Train-full
        info('Training...')
        start = time()
        model.fit(X, y)
        train_time = time() - start  # This is also test-time, as per OOB=True

        # Save trained model? - yeah, lets do it under oob
        if save_trained_models:
            joblib.dump(model, op.join(eval_dir, 'model_trained.pkl'), compress=3)

        # OOB score, auc and enrichment
        oob_score = model.oob_score_
        oob_scores = model.oob_decision_function_
        oob_scores_not_missing = fill_missing_scores(oob_scores[:, 1])
        auc = roc_auc_score(y, oob_scores_not_missing)
        enrichment5 = enrichment_at(y, oob_scores_not_missing, percentage=0.05)
        info('OOB AUC: %.2f' % auc)
        info('OOB Enrichment at 5%%: %.2f' % enrichment5)
        info('OOB Accuracy: %.2f' % oob_score)

        # Save scores and importances
        info('Saving results...')
        with h5py.File(op.join(eval_dir, 'oob_auc=%.2f__scores.h5' % auc), 'w') as h5:

            start = time()

            # Feature importances
            h5['f_names'] = rf_lab.fnames()
            h5['f_importances'] = model.feature_importances_

            # Labelled (development) examples
            info('Scoring lab...')
            h5['lab'] = oob_scores.astype(np.float32)

            info('Scoring amb...')
            h5['amb'] = model.predict_proba(rf_amb.X()).astype(np.float32)

            # Unlabelled (competition) examples
            info('Scoring unl...')
            h5['unl'] = model.predict_proba(rf_unl.X()).astype(np.float32)

            # Unlabelled (screening) examples
            info('Scoring scr...')
            if chunksize <= 0:
                # Probabilities must be stored as floats (as in the chunked
                # branch); an integer cast would truncate them to 0/1.
                h5['scr'] = model.predict_proba(rf_scr.X()).astype(np.float32)
            else:
                # Score the screening set chunk by chunk, writing each chunk
                # of predictions into its slice of a preallocated dataset
                scr = h5.create_dataset('scr', shape=(rf_scr.ne_stream(), 2), dtype=np.float32)
                for i, x in enumerate(rf_scr.X_stream(chunksize=chunksize)):
                    base = i * chunksize
                    info('\t num_scr_examples: %d' % base)
                    scr[base:base + chunksize] = model.predict_proba(x)

            test_time = time() - start

        # Finally save meta-information
        # (the presence of this file is what marks the configuration as done)
        metainfo = mlexp_info_helper(
            title='malaria-trees-oob',
            data_setup=data_id,
            model_setup=model_id,
            exp_function=fit,
        )
        metainfo.update((
            ('train_time', train_time),
            ('test_time', test_time),
            ('oob_auc', auc),
            ('oob_enrichment5', enrichment5),
            ('oob_accuracy', oob_score),
        ))
        with open(info_file, 'w') as writer:
            json.dump(metainfo, writer, indent=2, sort_keys=False)