def check_folding(classifier, check_instance=True, has_staged_pp=True, has_importances=True):
    X, y, sample_weight = generate_classification_data(distance=0.6)

    assert classifier == classifier.fit(X, y, sample_weight=sample_weight)
    assert list(classifier.features) == list(X.columns)

    check_classification_model(classifier, X, y, check_instance=check_instance, has_staged_pp=has_staged_pp,
                               has_importances=has_importances)

    def mean_vote(x):
        return numpy.mean(x, axis=0)

    labels = classifier.predict(X, mean_vote)
    proba = classifier.predict_proba(X, mean_vote)
    assert numpy.all(proba == classifier.predict_proba(X, mean_vote))

    score = accuracy_score(y, labels)
    print(score)
    assert score > 0.7

    assert numpy.allclose(proba.sum(axis=1), 1), 'probabilities do not sum to 1'
    assert numpy.all(proba >= 0.), 'negative probabilities'

    auc_score = roc_auc_score(y, proba[:, 1])
    print(auc_score)
    assert auc_score > 0.8

    if has_staged_pp:
        for p in classifier.staged_predict_proba(X, mean_vote):
            assert p.shape == (len(X), 2)
        # checking that last iteration coincides with previous
        assert numpy.all(p == proba)
def test_quality(n_samples=3000):
    testX, testY = generate_sample(n_samples, 10, 0.6)
    trainX, trainY = generate_sample(n_samples, 10, 0.6)

    params = {
        'n_neighbors': 10,
        'n_estimators': 10,
        'uniform_variables': ['column0'],
        'base_estimator': DecisionTreeClassifier(min_samples_leaf=20, max_depth=5),
    }

    for algorithm in ['SAMME', 'SAMME.R']:
        uboost_classifier = uBoostClassifier(algorithm=algorithm, efficiency_steps=5, **params)
        bdt_classifier = uBoostBDT(algorithm=algorithm, **params)

        for classifier in [bdt_classifier, uboost_classifier]:
            classifier.fit(trainX, trainY)
            predict_proba = classifier.predict_proba(testX)
            predict = classifier.predict(testX)
            assert roc_auc_score(testY, predict_proba[:, 1]) > 0.7, "quality is awful"
            print("Accuracy = %.3f" % accuracy_score(testY, predict))
def test_categorical_gb(n_samples=100000, n_features=10, p=0.7):
    y = numpy.random.random(n_samples) > 0.5
    X = numpy.random.randint(40, size=[n_samples, n_features]) * 2
    X += numpy.random.random(size=[n_samples, n_features]) > p
    X += y[:, numpy.newaxis]

    from sklearn.cross_validation import train_test_split
    trainX, testX, trainY, testY = train_test_split(X, y)

    boosters = {
        'old': GradientBoostingClassifier(n_estimators=100, min_samples_split=50, max_depth=5),
        'cat': CommonGradientBoosting(loss=AdaLossFunction(), subsample=0.5, dtype=int,
                                      base_estimator=CategoricalTreeRegressor()),
        'cat2': TreeGradientBoostingClassifier(loss=BinomialDeviance(), dtype='int', update_tree=False,
                                               base_estimator=SimpleCategoricalRegressor(n_features=2, n_attempts=3,
                                                                                         method='cv')),
        'cat3': TreeGradientBoostingClassifier(loss=BinomialDeviance(), dtype='int', update_tree=False,
                                               base_estimator=ObliviousCategoricalRegressor(n_features=10,
                                                                                            n_categories_power=5,
                                                                                            splits=1, pfactor=0.5)),
        'cat2-2': TreeGradientBoostingClassifier(loss=BinomialDeviance(), dtype='int', update_tree=False, n_threads=2,
                                                 base_estimator=SimpleCategoricalRegressor(n_features=2, n_attempts=1)),
        'cat-linear': CategoricalLinearClassifier(),
    }

    for name, booster in boosters.items():
        start = time.time()
        booster.fit(trainX, trainY)
        auc = roc_auc_score(testY, booster.predict_proba(testX)[:, 1])
        print(name, "spent:{:3.2f} auc:{}".format(time.time() - start, auc))
def test_factory():
    factory = ClassifiersFactory()
    try:
        from rep.estimators.tmva import TMVAClassifier
        factory.add_classifier('tmva', TMVAClassifier())
    except ImportError:
        pass
    factory.add_classifier('rf', RandomForestClassifier(n_estimators=10))
    factory.add_classifier('ada', AdaBoostClassifier(n_estimators=20))

    X, y, sample_weight = generate_classification_data()
    assert factory == factory.fit(X, y, sample_weight=sample_weight, features=list(X.columns),
                                  parallel_profile='threads-4')
    for cl in factory.values():
        assert list(cl.features) == list(X.columns)

    proba = factory.predict_proba(X, parallel_profile='threads-4')
    labels = factory.predict(X, parallel_profile='threads-4')
    for key, val in labels.items():
        score = accuracy_score(y, val)
        print(key, score)
        assert score > 0.7, key

    for key, val in proba.items():
        assert numpy.allclose(val.sum(axis=1), 1), 'probabilities do not sum to 1'
        assert numpy.all(val >= 0.), 'negative probabilities'

        auc_score = roc_auc_score(y, val[:, 1])
        print(auc_score)
        assert auc_score > 0.8

    for key, iterator in factory.staged_predict_proba(X).items():
        assert key != 'tmva', 'tmva does not support staged pp'
        for p in iterator:
            assert p.shape == (len(X), 2)
        # checking that last iteration coincides with previous
        assert numpy.all(p == proba[key])

    # testing picklability
    dump_string = cPickle.dumps(factory)
    clf_loaded = cPickle.loads(dump_string)
    assert type(factory) == type(clf_loaded)

    probs1 = factory.predict_proba(X)
    probs2 = clf_loaded.predict_proba(X)
    for key, val in probs1.items():
        assert numpy.all(val == probs2[key]), 'something strange was loaded'

    report = ClassificationReport({'rf': factory['rf']}, LabeledDataStorage(X, y, sample_weight))
    report.feature_importance_shuffling(roc_auc_score_mod).plot(new_plot=True, figsize=(18, 3))
    report = factory.test_on_lds(LabeledDataStorage(X, y, sample_weight))
    report = factory.test_on(X, y, sample_weight=sample_weight)
    val = numpy.mean(X['column0'])
    check_report_with_mask(report, "column0 > %f" % (val / 2.), X)
    check_report_with_mask(report, lambda x: numpy.array(x['column0']) < val * 2., X)
    check_report_with_mask(report, None, X)
def test_workability(n_samples=10000, n_features=10, distance=0.5):
    trainX, trainY = generate_sample(n_samples=n_samples, n_features=n_features, distance=distance)
    testX, testY = generate_sample(n_samples=n_samples, n_features=n_features, distance=distance)

    for booster in [FoldingGBClassifier, TreeGradientBoostingClassifier]:
        for loss in [BinomialDeviance(), AdaLossFunction()]:
            for update in [True, False]:
                for base in [FastTreeRegressor(max_depth=3), FastNeuroTreeRegressor(max_depth=3)]:
                    if numpy.random.random() > 0.7:
                        clf = booster(loss=loss, n_estimators=100, base_estimator=base, update_tree=update)
                        clf.fit(trainX, trainY)
                        auc = roc_auc_score(testY, clf.predict_proba(testX)[:, 1])
                        print('booster', booster, loss, 'update=', update, ' base=', base.__class__, ' quality=', auc)
                        assert auc > 0.8
def generate_split_result(model_config, X, y, split_id, splitter, cvgroup, reraise=False):
    # Splitting
    train_indices, test_indices = splitter.split()
    fold_group = cvgroup.require_group(split_id)
    try:
        fold_group.attrs['config'] = splitter.what().id()
        fold_group.create_dataset('test_indices', data=test_indices, compression=compression)  # uncompressible
        fold_group.create_dataset('y_test', data=y[test_indices], compression=compression)
    except:
        pass  # Dodgy

    # Model configuration
    model_group = fold_group.require_group('model=%s' % model_config.nickname)

    try:
        # already done?
        if 'DONE' in model_group.keys():
            print '%s already done, skipping...' % model_group.name
            return
        if 'FAILED' in model_group.keys():
            print '%s already failed, skipping...' % model_group.name
            return

        # compute the result
        scores, model, train_time, test_time = \
            train_test(model_config.seed_model(expid), X, y, train_indices, test_indices)

        # save scores, auc, times
        try:
            model_group.attrs['auc'] = roc_auc_score(y[test_indices], scores)
        except:
            model_group.attrs['auc'] = None
        model_group.attrs['train_time'] = train_time
        model_group.attrs['test_time'] = test_time
        model_group.create_dataset('test_scores', data=scores, compression=compression)

        # save whatever from the model
        model_config.storer.to_hdf5(model, model_group, compression=compression)

        # done
        model_group['DONE'] = 'Finished on %s' % strftime("%c")
    except Exception:
        model_group['FAILED'] = format_exc()
        if reraise:
            raise
def test_gb_quality(n_samples=10000, n_features=10, distance=0.5):
    trainX, trainY = generate_sample(n_samples=n_samples, n_features=n_features, distance=distance)
    testX, testY = generate_sample(n_samples=n_samples, n_features=n_features, distance=distance)

    # Multiplying by random matrix
    multiplier = numpy.random.normal(size=[n_features, n_features])
    shift = numpy.random.normal(size=[1, n_features]) * 5
    trainX = numpy.dot(trainX.values, multiplier) + shift
    testX = numpy.dot(testX.values, multiplier) + shift

    boosters = {
        'old_boost': GradientBoostingClassifier(n_estimators=100, min_samples_split=50, max_depth=5, subsample=0.3),
        'fast+old_tree': CommonGradientBoosting(n_estimators=100,
                                                base_estimator=DecisionTreeRegressor(min_samples_split=50, max_depth=5)),
        'fast+neuro': TreeGradientBoostingClassifier(n_estimators=100, update_tree=True,
                                                     base_estimator=FastNeuroTreeRegressor()),
        'fold+tree': FoldingGBClassifier(loss=BinomialDeviance(), n_estimators=10, update_tree=True,
                                         base_estimator=FastNeuroTreeRegressor()),
        'ugb': uGradientBoostingClassifier(loss=AdaLossFunction(), n_estimators=100, min_samples_split=50,
                                           max_depth=5, update_tree=True, subsample=0.3),
    }

    for criterion in ['mse',
                      # 'fmse',
                      # 'pvalue',
                      # 'significance',
                      'significance2',
                      # 'gini',
                      'entropy',
                      'poisson',
                      ]:
        boosters['fast-' + criterion[:4]] = TreeGradientBoostingClassifier(
            n_estimators=100, update_tree=True, base_estimator=FastTreeRegressor(criterion=criterion))

    for name, booster in boosters.items():
        start = time.time()
        booster.fit(trainX, trainY)
        auc = roc_auc_score(testY, booster.predict_proba(testX)[:, 1])
        print(name, "spent:{:3.2f} auc:{}".format(time.time() - start, auc))
def test_refitting(n_samples=10000, n_features=10, distance=0.5):
    trainX, trainY = generate_sample(n_samples=n_samples, n_features=n_features, distance=distance)
    testX, testY = generate_sample(n_samples=n_samples, n_features=n_features, distance=distance)

    booster = TreeGradientBoostingClassifier(n_estimators=100, update_tree=True, base_estimator=FastTreeRegressor())
    booster.fit(trainX, trainY)
    print(roc_auc_score(testY, booster.predict_proba(testX)[:, 1]))
    print(roc_auc_score(trainY, booster.predict_proba(trainX)[:, 1]))

    booster.refit_trees(trainX, trainY)
    print(roc_auc_score(testY, booster.predict_proba(testX)[:, 1]))
    print(roc_auc_score(trainY, booster.predict_proba(trainX)[:, 1]))

    booster.refit_trees(testX, testY)
    print(roc_auc_score(testY, booster.predict_proba(testX)[:, 1]))
    print(roc_auc_score(trainY, booster.predict_proba(trainX)[:, 1]))
def fit_logregs(dest_dir=MALARIA_LOGREGS_EXPERIMENT_ROOT,
                # Logreg params
                logreg_penalty='l1',
                logreg_C=1.0,
                logreg_class_weight_auto=False,
                logreg_dual=False,
                logreg_tol=1e-4,
                logreg_fit_intercept=True,
                logreg_intercept_scaling=1,
                # CV params
                num_cv_folds=10,
                cv_seeds=(0,),
                save_unlabelled_predictions=False,
                save_fold_model=False,
                min_fold_auc=0.88,
                # Fingerprint folding params
                fingerprint_folder_seed=0,
                fingerprint_fold_size=1023,
                # Computational requirements params
                force=False,
                chunksize=1000000):
    """Logistic regression experiment using the liblinear wrapper in sklearn.
    Generates cross-val results.
    """

    ### TODO Remove
    if logreg_tol < 1E-5:
        info('Ignoring long intolerant experiments')
        return

    info('Malaria logregs experiment')

    # Command line type inference is rotten...
    logreg_C = float(logreg_C)
    logreg_tol = float(logreg_tol)
    logreg_intercept_scaling = float(logreg_intercept_scaling)
    num_cv_folds = int(num_cv_folds)
    min_fold_auc = float(min_fold_auc)
    fingerprint_folder_seed = int(fingerprint_folder_seed)
    fingerprint_fold_size = int(fingerprint_fold_size)
    chunksize = int(chunksize)

    # Example providers
    folder = None if fingerprint_fold_size < 1 else MurmurFolder(seed=fingerprint_folder_seed,
                                                                 fold_size=fingerprint_fold_size)
    rf_lab, rf_amb, rf_unl, rf_scr = malaria_logreg_fpt_providers(folder)
    info('Data description: %s' % rf_lab.configuration().id(full=True))

    # Experiment context: data
    data_id = rf_lab.configuration().id(full=True)
    data_dir = op.join(dest_dir, data_id)
    ensure_dir(data_dir)

    for cv_seed in cv_seeds:

        # Command line type inference is rotten...
        cv_seed = int(cv_seed)

        # Deterministic randomness
        my_rng = np.random.RandomState(seed=cv_seed)

        # Experiment context: model
        logreg_params = OrderedDict((
            ('penalty', logreg_penalty),
            ('C', logreg_C),
            ('class_weight', 'auto' if logreg_class_weight_auto else None),
            ('dual', logreg_dual),
            ('tol', logreg_tol),
            ('fit_intercept', logreg_fit_intercept),
            ('intercept_scaling', logreg_intercept_scaling),
            ('random_state', my_rng.randint(low=0, high=1000 ** 4)),
        ))
        model_setup = LogisticRegression(**logreg_params)
        model_id = 'skllogreg__%s' % '__'.join(['%s=%s' % (k, str(v)) for k, v in logreg_params.iteritems()])
        model_dir = op.join(data_dir, model_id)
        ensure_dir(model_dir)
        info('Model: %s' % model_id)

        # Experiment context: eval
        eval_id = 'cv__cv_seed=%d__num_folds=%d' % (cv_seed, num_cv_folds)
        eval_dir = op.join(model_dir, eval_id)
        ensure_dir(eval_dir)
        info('Eval: %d-fold cross validation (seed=%d)' % (num_cv_folds, cv_seed))

        # Already done?
        info_file = op.join(eval_dir, 'info.json')
        if op.isfile(info_file) and not force:
            info('\tAlready done, skipping...')
            return  # Oh well, a lot have been done up to here... rework somehow

        # Anytime we see this file, we know we need to stop
        stop_computing_file = op.join(eval_dir, 'STOP_BAD_FOLD')

        #---------
        #--------- Time to work!
        #---------

        # Save model config
        joblib.dump(model_setup, op.join(model_dir, 'model_setup.pkl'), compress=3)

        # Read labelled data in
        info('Reading data...')
        X, y = rf_lab.Xy()
        info('ne=%d; nf=%d' % rf_lab.X().shape)

        # Save molids... a bit too ad-hoc...
        save_molids(data_dir, 'lab', rf_lab.ids())
        if save_unlabelled_predictions:
            save_molids(data_dir, 'unl', rf_unl.ids())
            save_molids(data_dir, 'scr', rf_scr.ids())
            save_molids(data_dir, 'amb', rf_amb.ids())

        # Save folding information.
        # By now, all the folds have already been computed:
        #   - because we cached X
        #   - and in this case we are warranted that no new unfolded features will appear at test time
        if folder is not None:
            info('Saving the map folded_features -> unfolded_feature...')
            folded2unfolded_file = op.join(data_dir, 'folded2unfolded.h5')
            if not op.isfile(folded2unfolded_file):
                with h5py.File(folded2unfolded_file) as h5:
                    h5['f2u'] = folder.folded2unfolded()
            folder_light_file = op.join(data_dir, 'folder.pkl')
            if not op.isfile(folder_light_file):
                folder_light = copy(folder)  # Shallow copy
                folder_light.clear_cache()
                joblib.dump(folder_light, folder_light_file, compress=3)

        # Cross-val splitter
        cver = cv_splits(num_points=len(y), Y=y, num_folds=num_cv_folds, rng=my_rng, stratify=True)

        # Fit and classify
        for cv_fold_num in xrange(num_cv_folds):

            fold_info_file = op.join(eval_dir, 'fold=%d__info.json' % cv_fold_num)
            if op.isfile(fold_info_file):
                info('Fold %d already done, skipping' % cv_fold_num)
                continue
            if op.isfile(stop_computing_file):
                info('Bad fold detected, no more computations required')
                break

            # Split into train/test
            train_i, test_i = cver(cv_fold_num)
            Xtrain, ytrain = X[train_i, :], y[train_i]
            Xtest, ytest = X[test_i, :], y[test_i]

            # Copy the model...
            model = clone(model_setup)

            start = time()
            info('Training...')
            model.fit(Xtrain, ytrain)
            train_time = time() - start
            info('Model fitting has taken %.2f seconds' % train_time)
            if save_fold_model:
                info('Saving trained model')
                joblib.dump(model, op.join(eval_dir, 'fold=%d__fitmodel.pkl' % cv_fold_num), compress=3)

            info('Predicting and saving results...')
            with h5py.File(op.join(eval_dir, 'fold=%d__scores.h5' % cv_fold_num), 'w') as h5:

                start = time()

                # Test indices
                h5['test_indices'] = test_i

                # Model
                h5['logreg_coef'] = model.coef_
                h5['logreg_intercept'] = model.intercept_

                # Test examples
                info('Scoring test...')
                scores_test = model.predict_proba(Xtest)
                fold_auc = roc_auc_score(ytest, scores_test[:, 1])
                fold_enrichment5 = enrichment_at(ytest, scores_test[:, 1], percentage=0.05)
                info('Fold %d ROCAUC: %.3f' % (cv_fold_num, fold_auc))
                info('Fold %d Enrichment at 5%%: %.3f' % (cv_fold_num, fold_enrichment5))
                h5['test'] = scores_test.astype(np.float32)

                if save_unlabelled_predictions:
                    predict_malaria_unlabelled(model, h5,
                                               rf_amb=rf_amb, rf_scr=rf_scr, rf_unl=rf_unl,
                                               chunksize=chunksize)

                test_time = time() - start
                info('Predicting has taken %.2f seconds' % test_time)

            # Finally save meta-information for the fold
            metainfo = mlexp_info_helper(
                title='malaria-trees-oob',
                data_setup=data_id,
                model_setup=model_id,
                exp_function=giveupthefunc(),
            )
            metainfo.update((
                ('train_time', train_time),
                ('test_time', test_time),
                ('auc', fold_auc),
                ('enrichment5', fold_enrichment5),
            ))
            with open(fold_info_file, 'w') as writer:
                json.dump(metainfo, writer, indent=2, sort_keys=False)

            # One last thing, should we stop now?
            if fold_auc < min_fold_auc:
                stop_message = 'The fold %d was bad (auc %.3f < %.3f), skipping the rest of the folds' % \
                               (cv_fold_num, fold_auc, min_fold_auc)
                info(stop_message)
                with open(stop_computing_file, 'w') as writer:
                    writer.write(stop_message)

        # Summarize cross-val in the info file
        metainfo = mlexp_info_helper(
            title='malaria-trees-oob',
            data_setup=data_id,
            model_setup=model_id,
            exp_function=giveupthefunc(),
        )
        metainfo.update((
            ('num_cv_folds', num_cv_folds),
            ('cv_seed', cv_seed),
        ))
        metainfo.update(logreg_params.items())
        with open(info_file, 'w') as writer:
            json.dump(metainfo, writer, indent=2, sort_keys=False)
def roc_auc_score_mod(y_true, prob, sample_weight=None):
    return roc_auc_score(y_true, prob[:, 1], sample_weight=sample_weight)
nb.fit(train_dtm, y_train)
y_pred = nb.predict(test_dtm)

from sklearn import metrics
print metrics.accuracy_score(y_test, y_pred)  # 92% accuracy

# Task 6
# Map rating 5 to 1 and rating 1 to 0
y_test[y_test == 1] = 0
y_test[y_test == 5] = 1
y_pred_prob = nb.predict_proba(test_dtm)[:, 1]
print metrics.roc_auc_score(y_test, y_pred_prob)

# Task 7
import matplotlib.pyplot as plt
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred_prob)
plt.plot(fpr, tpr)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.xlabel('False Positive Rate (1 - Specificity)')
plt.ylabel('True Positive Rate (Sensitivity)')

# Task 8
print metrics.confusion_matrix(y_test, y_pred)
sensitivity = 126 / float(25 + 126)
specificity = 813 / float(813 + 58)

# Task 9
false_positives = X_test[y_test < y_pred]  # false positives