Example #1
def check_folding(classifier, check_instance=True, has_staged_pp=True, has_importances=True):
    X, y, sample_weight = generate_classification_data(distance=0.6)

    assert classifier == classifier.fit(X, y, sample_weight=sample_weight)
    assert list(classifier.features) == list(X.columns)

    check_classification_model(classifier, X, y, check_instance=check_instance, has_staged_pp=has_staged_pp,
                               has_importances=has_importances)

    def mean_vote(x):
        return numpy.mean(x, axis=0)

    labels = classifier.predict(X, mean_vote)
    proba = classifier.predict_proba(X, mean_vote)
    assert numpy.all(proba == classifier.predict_proba(X, mean_vote))

    score = accuracy_score(y, labels)
    print(score)
    assert score > 0.7
    assert numpy.allclose(proba.sum(axis=1), 1), 'probabilities do not sum to 1'
    assert numpy.all(proba >= 0.), 'negative probabilities'

    auc_score = roc_auc_score(y, proba[:, 1])
    print(auc_score)
    assert auc_score > 0.8
    if has_staged_pp:
        for p in classifier.staged_predict_proba(X, mean_vote):
            assert p.shape == (len(X), 2)
        # the last staged iteration must coincide with the final predict_proba
        assert numpy.all(p == proba)
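
# Usage sketch (hedged): check_folding is presumably exercised with REP's
# FoldingClassifier; the import and constructor below are assumptions about
# that setup rather than part of this snippet.
# from rep.metaml import FoldingClassifier
# from sklearn.ensemble import AdaBoostClassifier
# check_folding(FoldingClassifier(AdaBoostClassifier(), n_folds=2))
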
def test_quality(n_samples=3000):
    testX, testY = generate_sample(n_samples, 10, 0.6)
    trainX, trainY = generate_sample(n_samples, 10, 0.6)

    params = {
        'n_neighbors': 10,
        'n_estimators': 10,
        'uniform_variables': ['column0'],
        'base_estimator':
            DecisionTreeClassifier(min_samples_leaf=20, max_depth=5)
    }

    for algorithm in ['SAMME', 'SAMME.R']:
        uboost_classifier = uBoostClassifier(
            algorithm=algorithm, efficiency_steps=5, **params)

        bdt_classifier = uBoostBDT(algorithm=algorithm, **params)

        for classifier in [bdt_classifier, uboost_classifier]:
            classifier.fit(trainX, trainY)
            predict_proba = classifier.predict_proba(testX)
            predict = classifier.predict(testX)
            assert roc_auc_score(testY, predict_proba[:, 1]) > 0.7, \
                "quality is awful"
            print("Accuracy = %.3f" % accuracy_score(testY, predict))

def test_categorical_gb(n_samples=100000, n_features=10, p=0.7):
    y = numpy.random.random(n_samples) > 0.5
    X = numpy.random.randint(40, size=[n_samples, n_features]) * 2
    X += numpy.random.random(size=[n_samples, n_features]) > p
    X += y[:, numpy.newaxis]

    from sklearn.model_selection import train_test_split

    trainX, testX, trainY, testY = train_test_split(X, y)
    boosters = {
        'old': GradientBoostingClassifier(n_estimators=100, min_samples_split=50, max_depth=5),
        'cat': CommonGradientBoosting(loss=AdaLossFunction(), subsample=0.5, dtype=int,
            base_estimator=CategoricalTreeRegressor()),
        'cat2': TreeGradientBoostingClassifier(loss=BinomialDeviance(), dtype='int', update_tree=False,
            base_estimator=SimpleCategoricalRegressor(n_features=2, n_attempts=3, method='cv')),
        'cat3': TreeGradientBoostingClassifier(loss=BinomialDeviance(), dtype='int', update_tree=False,
            base_estimator=ObliviousCategoricalRegressor(n_features=10, n_categories_power=5, splits=1, pfactor=0.5)),
        'cat2-2': TreeGradientBoostingClassifier(loss=BinomialDeviance(), dtype='int', update_tree=False, n_threads=2,
            base_estimator=SimpleCategoricalRegressor(n_features=2, n_attempts=1)),
        'cat-linear': CategoricalLinearClassifier(),
    }
    for name, booster in boosters.items():
        start = time.time()
        booster.fit(trainX, trainY)
        auc = roc_auc_score(testY, booster.predict_proba(testX)[:, 1])
        print(name, "spent:{:3.2f} auc:{}".format(time.time() - start, auc))
Example #4
def test_factory():
    factory = ClassifiersFactory()
    try:
        from rep.estimators.tmva import TMVAClassifier
        factory.add_classifier('tmva', TMVAClassifier())
    except ImportError:
        pass
    factory.add_classifier('rf', RandomForestClassifier(n_estimators=10))
    factory.add_classifier('ada', AdaBoostClassifier(n_estimators=20))

    X, y, sample_weight = generate_classification_data()
    assert factory == factory.fit(X, y, sample_weight=sample_weight, features=list(X.columns), parallel_profile='threads-4')
    for cl in factory.values():
        assert list(cl.features) == list(X.columns)
    proba = factory.predict_proba(X, parallel_profile='threads-4')
    labels = factory.predict(X, parallel_profile='threads-4')
    for key, val in labels.items():
        score = accuracy_score(y, val)
        print(key, score)
        assert score > 0.7, key

    for key, val in proba.items():
        assert numpy.allclose(val.sum(axis=1), 1), 'probabilities do not sum to 1'
        assert numpy.all(val >= 0.), 'negative probabilities'

        auc_score = roc_auc_score(y, val[:, 1])
        print(auc_score)
        assert auc_score > 0.8

    for key, iterator in factory.staged_predict_proba(X).items():
        assert key != 'tmva', 'tmva does not support staged pp'
        for p in iterator:
            assert p.shape == (len(X), 2)

        # the last staged iteration must coincide with the final predict_proba
        assert numpy.all(p == proba[key])

    # testing picklability (cPickle is presumably six.moves.cPickle, for Python 2/3 compatibility)
    dump_string = cPickle.dumps(factory)
    clf_loaded = cPickle.loads(dump_string)

    assert type(factory) == type(clf_loaded)

    probs1 = factory.predict_proba(X)
    probs2 = clf_loaded.predict_proba(X)
    for key, val in probs1.items():
        assert numpy.all(val == probs2[key]), 'something strange was loaded'

    report = ClassificationReport({'rf': factory['rf']}, LabeledDataStorage(X, y, sample_weight))
    report.feature_importance_shuffling(roc_auc_score_mod).plot(new_plot=True, figsize=(18, 3))
    report = factory.test_on_lds(LabeledDataStorage(X, y, sample_weight))
    report = factory.test_on(X, y, sample_weight=sample_weight)
    val = numpy.mean(X['column0'])
    check_report_with_mask(report, "column0 > %f" % (val / 2.), X)
    check_report_with_mask(report, lambda x: numpy.array(x['column0']) < val * 2., X)
    check_report_with_mask(report, None, X)

def test_workability(n_samples=10000, n_features=10, distance=0.5):
    trainX, trainY = generate_sample(n_samples=n_samples, n_features=n_features, distance=distance)
    testX, testY = generate_sample(n_samples=n_samples, n_features=n_features, distance=distance)
    for booster in [FoldingGBClassifier, TreeGradientBoostingClassifier]:
        for loss in [BinomialDeviance(), AdaLossFunction()]:
            for update in [True, False]:
                for base in [FastTreeRegressor(max_depth=3), FastNeuroTreeRegressor(max_depth=3)]:
                    # randomly run only ~30% of the configuration grid to keep the test fast
                    if numpy.random.random() > 0.7:
                        clf = booster(loss=loss, n_estimators=100,
                                      base_estimator=base, update_tree=update)
                        clf.fit(trainX, trainY)
                        auc = roc_auc_score(testY, clf.predict_proba(testX)[:, 1])
                        print('booster', booster, loss, 'update=', update, ' base=', base.__class__,
                              ' quality=', auc)
                        assert auc > 0.8
Example #6
def generate_split_result(model_config, X, y, split_id, splitter, cvgroup, reraise=False):
    # Splitting
    train_indices, test_indices = splitter.split()
    fold_group = cvgroup.require_group(split_id)
    try:
        fold_group.attrs['config'] = splitter.what().id()
        fold_group.create_dataset('test_indices', data=test_indices, compression=compression)  # uncompressible
        fold_group.create_dataset('y_test', data=y[test_indices], compression=compression)
    except Exception:
        pass  # Dodgy: tolerate datasets left over from a previous run
    # Model configuration
    model_group = fold_group.require_group('model=%s' % model_config.nickname)
    try:
        # already done? (the DONE/FAILED flags are written to model_group below)
        if 'DONE' in model_group.keys():
            print('%s already done, skipping...' % model_group.name)
            return
        if 'FAILED' in model_group.keys():
            print('%s already failed, skipping...' % model_group.name)
            return
        # compute the result (expid is assumed to be defined in the enclosing scope)
        scores, model, train_time, test_time = \
            train_test(model_config.seed_model(expid), X, y, train_indices, test_indices)
        # save scores, auc, times
        try:
            model_group.attrs['auc'] = roc_auc_score(y[test_indices], scores)
        except Exception:
            model_group.attrs['auc'] = None
        model_group.attrs['train_time'] = train_time
        model_group.attrs['test_time'] = test_time
        model_group.create_dataset('test_scores', data=scores, compression=compression)
        # save whatever from the model
        model_config.storer.to_hdf5(model, model_group, compression=compression)
        # done
        model_group['DONE'] = 'Finished on %s' % strftime("%c")
    except Exception:
        model_group['FAILED'] = format_exc()
        if reraise:
            raise
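
# Usage sketch (hedged): cvgroup is presumably a group in an open HDF5 results
# file, and splitter an object exposing .split() and .what().id(); the file
# name and group layout below are assumptions, not taken from this snippet.
# with h5py.File('results.h5', 'a') as h5:
#     generate_split_result(model_config, X, y, 'split=0', splitter, h5.require_group('cv'))
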
def test_gb_quality(n_samples=10000, n_features=10, distance=0.5):
    trainX, trainY = generate_sample(n_samples=n_samples, n_features=n_features, distance=distance)
    testX, testY = generate_sample(n_samples=n_samples, n_features=n_features, distance=distance)

    # Multiplying by random matrix
    multiplier = numpy.random.normal(size=[n_features, n_features])
    shift = numpy.random.normal(size=[1, n_features]) * 5
    trainX = numpy.dot(trainX.values, multiplier) + shift
    testX = numpy.dot(testX.values, multiplier) + shift

    boosters = {
        'old_boost': GradientBoostingClassifier(n_estimators=100, min_samples_split=50, max_depth=5, subsample=0.3),
        'fast+old_tree': CommonGradientBoosting(n_estimators=100,
            base_estimator=DecisionTreeRegressor(min_samples_split=50, max_depth=5)),
        'fast+neuro': TreeGradientBoostingClassifier(n_estimators=100, update_tree=True,
                                                     base_estimator=FastNeuroTreeRegressor()),
        'fold+tree': FoldingGBClassifier(loss=BinomialDeviance(), n_estimators=10, update_tree=True,
                                         base_estimator=FastNeuroTreeRegressor()),
        'ugb': uGradientBoostingClassifier(loss=AdaLossFunction(),
            n_estimators=100, min_samples_split=50, max_depth=5, update_tree=True, subsample=0.3)
    }

    for criterion in ['mse', # 'fmse', # 'pvalue',
                      # 'significance',
                      'significance2',
                      # 'gini',
                      'entropy',
                      'poisson'
    ]:
        boosters['fast-' + criterion[:4]] = TreeGradientBoostingClassifier(n_estimators=100, update_tree=True,
            base_estimator=FastTreeRegressor(criterion=criterion))

    for name, booster in boosters.items():
        start = time.time()
        booster.fit(trainX, trainY)
        auc = roc_auc_score(testY, booster.predict_proba(testX)[:, 1])
        print(name, "spent:{:3.2f} auc:{}".format(time.time() - start, auc))

def test_refitting(n_samples=10000, n_features=10, distance=0.5):
    trainX, trainY = generate_sample(n_samples=n_samples, n_features=n_features, distance=distance)
    testX, testY = generate_sample(n_samples=n_samples, n_features=n_features, distance=distance)

    booster = TreeGradientBoostingClassifier(n_estimators=100, update_tree=True,
                                             base_estimator=FastTreeRegressor())
    booster.fit(trainX, trainY)
    print(roc_auc_score(testY, booster.predict_proba(testX)[:, 1]))
    print(roc_auc_score(trainY, booster.predict_proba(trainX)[:, 1]))

    # refit the trees on the training data and compare train/test AUC
    booster.refit_trees(trainX, trainY)
    print(roc_auc_score(testY, booster.predict_proba(testX)[:, 1]))
    print(roc_auc_score(trainY, booster.predict_proba(trainX)[:, 1]))


    # refitting on the *test* set deliberately leaks test labels; it illustrates
    # the effect of refit_trees rather than giving an honest evaluation
    booster.refit_trees(testX, testY)
    print(roc_auc_score(testY, booster.predict_proba(testX)[:, 1]))
    print(roc_auc_score(trainY, booster.predict_proba(trainX)[:, 1]))
Example #9
def fit_logregs(dest_dir=MALARIA_LOGREGS_EXPERIMENT_ROOT,
                # Logreg params
                logreg_penalty='l1',
                logreg_C=1.0,
                logreg_class_weight_auto=False,
                logreg_dual=False,
                logreg_tol=1e-4,
                logreg_fit_intercept=True,
                logreg_intercept_scaling=1,
                # CV params
                num_cv_folds=10,
                cv_seeds=(0,),
                save_unlabelled_predictions=False,
                save_fold_model=False,
                min_fold_auc=0.88,
                # Fingerprint folding params
                fingerprint_folder_seed=0,
                fingerprint_fold_size=1023,
                # Computational requirements params
                force=False,
                chunksize=1000000):
    """Logistic regression experiment using the liblinear wrapper in sklearn.
    Generates cross-val results
    """

    ### TODO Remove
    if logreg_tol < 1E-5:
        info('Ignoring tight-tolerance experiments (they run too long)')
        return

    info('Malaria logregs experiment')

    # Command line type inference is rotten...
    logreg_C = float(logreg_C)
    logreg_tol = float(logreg_tol)
    logreg_intercept_scaling = float(logreg_intercept_scaling)
    num_cv_folds = int(num_cv_folds)
    min_fold_auc = float(min_fold_auc)
    fingerprint_folder_seed = int(fingerprint_folder_seed)
    fingerprint_fold_size = int(fingerprint_fold_size)
    chunksize = int(chunksize)

    # Example providers
    folder = None if fingerprint_fold_size < 1 else MurmurFolder(seed=fingerprint_folder_seed,
                                                                 fold_size=fingerprint_fold_size)
    rf_lab, rf_amb, rf_unl, rf_scr = malaria_logreg_fpt_providers(folder)
    info('Data description: %s' % rf_lab.configuration().id(full=True))

    # Experiment context: data
    data_id = rf_lab.configuration().id(full=True)
    data_dir = op.join(dest_dir, data_id)
    ensure_dir(data_dir)

    for cv_seed in cv_seeds:

        # Command line type inference is rotten...
        cv_seed = int(cv_seed)

        # Deterministic randomness
        my_rng = np.random.RandomState(seed=cv_seed)

        # Experiment context: model
        logreg_params = OrderedDict((
            ('penalty', logreg_penalty),
            ('C', logreg_C),
            ('class_weight', 'auto' if logreg_class_weight_auto else None),
            ('dual', logreg_dual),
            ('tol', logreg_tol),
            ('fit_intercept', logreg_fit_intercept),
            ('intercept_scaling', logreg_intercept_scaling),
            ('random_state', my_rng.randint(low=0, high=1000 ** 4)),
        ))
        model_setup = LogisticRegression(**logreg_params)
        model_id = 'skllogreg__%s' % '__'.join(['%s=%s' % (k, str(v)) for k, v in logreg_params.items()])
        model_dir = op.join(data_dir, model_id)
        ensure_dir(model_dir)
        info('Model: %s' % model_id)

        # Experiment context: eval
        eval_id = 'cv__cv_seed=%d__num_folds=%d' % (cv_seed, num_cv_folds)
        eval_dir = op.join(model_dir, eval_id)
        ensure_dir(eval_dir)
        info('Eval: %d-fold cross validation (seed=%d)' % (num_cv_folds, cv_seed))

        # Already done?
        info_file = op.join(eval_dir, 'info.json')
        if op.isfile(info_file) and not force:
            info('\tAlready done, skipping...')
            return  # Oh well, a lot have been done up to here... rework somehow

        # Anytime we see this file, we know we need to stop
        stop_computing_file = op.join(eval_dir, 'STOP_BAD_FOLD')

        #---------
        #--------- Time to work!
        #---------

        # Save model config
        joblib.dump(model_setup, op.join(model_dir, 'model_setup.pkl'), compress=3)

        # Read labelled data in
        info('Reading data...')
        X, y = rf_lab.Xy()
        info('ne=%d; nf=%d' % rf_lab.X().shape)

        # Save molids... a bit too ad-hoc...
        save_molids(data_dir, 'lab', rf_lab.ids())
        if save_unlabelled_predictions:
            save_molids(data_dir, 'unl', rf_unl.ids())
            save_molids(data_dir, 'scr', rf_scr.ids())
            save_molids(data_dir, 'amb', rf_amb.ids())

        # Save folding information.
        # By now, all the folds have already been computed:
        #   - because we cached X
        #   - and in this case we are guaranteed that no new unfolded features will appear at test time
        if folder is not None:
            info('Saving the map folded_features -> unfolded_feature...')
            folded2unfolded_file = op.join(data_dir, 'folded2unfolded.h5')
            if not op.isfile(folded2unfolded_file):
                with h5py.File(folded2unfolded_file, 'w') as h5:
                    h5['f2u'] = folder.folded2unfolded()
            folder_light_file = op.join(data_dir, 'folder.pkl')
            if not op.isfile(folder_light_file):
                folder_light = copy(folder)  # Shallow copy
                folder_light.clear_cache()
                joblib.dump(folder_light, folder_light_file, compress=3)

        # Cross-val splitter
        cver = cv_splits(num_points=len(y),
                         Y=y,
                         num_folds=num_cv_folds,
                         rng=my_rng,
                         stratify=True)

        # Fit and classify
        for cv_fold_num in range(num_cv_folds):

            fold_info_file = op.join(eval_dir, 'fold=%d__info.json' % cv_fold_num)
            if op.isfile(fold_info_file):
                info('Fold %d already done, skipping' % cv_fold_num)
                continue

            if op.isfile(stop_computing_file):
                info('Bad fold detected, no more computations required')
                break

            # Split into train/test
            train_i, test_i = cver(cv_fold_num)
            Xtrain, ytrain = X[train_i, :], y[train_i]
            Xtest, ytest = X[test_i, :], y[test_i]

            # Copy the model...
            model = clone(model_setup)

            start = time()
            info('Training...')
            model.fit(Xtrain, ytrain)
            train_time = time() - start
            info('Model fitting has taken %.2f seconds' % train_time)

            if save_fold_model:
                info('Saving trained model')
                joblib.dump(model, op.join(eval_dir, 'fold=%d__fitmodel.pkl' % cv_fold_num), compress=3)

            info('Predicting and saving results...')
            with h5py.File(op.join(eval_dir, 'fold=%d__scores.h5' % cv_fold_num), 'w') as h5:

                start = time()

                # Test indices
                h5['test_indices'] = test_i

                # Model
                h5['logreg_coef'] = model.coef_
                h5['logreg_intercept'] = model.intercept_

                # Test examples
                info('Scoring test...')
                scores_test = model.predict_proba(Xtest)
                fold_auc = roc_auc_score(ytest, scores_test[:, 1])
                fold_enrichment5 = enrichment_at(ytest, scores_test[:, 1], percentage=0.05)
                info('Fold %d ROCAUC: %.3f' % (cv_fold_num, fold_auc))
                info('Fold %d Enrichment at 5%%: %.3f' % (cv_fold_num, fold_enrichment5))
                h5['test'] = scores_test.astype(np.float32)

                if save_unlabelled_predictions:
                    predict_malaria_unlabelled(model,
                                               h5,
                                               rf_amb=rf_amb,
                                               rf_scr=rf_scr,
                                               rf_unl=rf_unl,
                                               chunksize=chunksize)

                test_time = time() - start
                info('Predicting has taken %.2f seconds' % test_time)

                # Finally save meta-information for the fold
                metainfo = mlexp_info_helper(
                    title='malaria-trees-oob',
                    data_setup=data_id,
                    model_setup=model_id,
                    exp_function=giveupthefunc(),
                )
                metainfo.update((
                    ('train_time', train_time),
                    ('test_time', test_time),
                    ('auc', fold_auc),
                    ('enrichment5', fold_enrichment5),
                ))
                with open(fold_info_file, 'w') as writer:
                    json.dump(metainfo, writer, indent=2, sort_keys=False)

                # One last thing, should we stop now?
                if fold_auc < min_fold_auc:
                    stop_message = 'The fold %d was bad (auc %.3f < %.3f), skipping the rest of the folds' % \
                                   (cv_fold_num, fold_auc, min_fold_auc)
                    info(stop_message)
                    with open(stop_computing_file, 'w') as writer:
                        writer.write(stop_message)

        # Summarize cross-val in the info file
        metainfo = mlexp_info_helper(
            title='malaria-trees-oob',
            data_setup=data_id,
            model_setup=model_id,
            exp_function=giveupthefunc(),
        )
        metainfo.update((
            ('num_cv_folds', num_cv_folds),
            ('cv_seed', cv_seed),
        ))
        metainfo.update(logreg_params.items())
        with open(info_file, 'w') as writer:
            json.dump(metainfo, writer, indent=2, sort_keys=False)
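
# Example invocation (a sketch; these parameter values are illustrative, not from the source):
# fit_logregs(logreg_penalty='l1', logreg_C=1.0, num_cv_folds=10, cv_seeds=(0, 1, 2))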
Example #10
def roc_auc_score_mod(y_true, prob, sample_weight=None):
    """Adapter so roc_auc_score can consume a full (n_samples, 2) predict_proba matrix."""
    return roc_auc_score(y_true, prob[:, 1], sample_weight=sample_weight)
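
# Setup sketch for the lines below (assumptions, not part of the original snippet):
# `nb` is taken to be a MultinomialNB, and train_dtm/test_dtm document-term matrices
# built with CountVectorizer from raw train/test texts defined upstream.
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

vect = CountVectorizer()
train_dtm = vect.fit_transform(X_train)  # X_train/X_test/y_train/y_test assumed defined upstream
test_dtm = vect.transform(X_test)
nb = MultinomialNB()
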
nb.fit(train_dtm, y_train)

y_pred = nb.predict(test_dtm)

from sklearn import metrics

print(metrics.accuracy_score(y_test, y_pred))
# ~92% accuracy
# Task 6
# Map rating 1 to class 0 and rating 5 to class 1
y_test[y_test == 1] = 0
y_test[y_test == 5] = 1


y_pred_prob = nb.predict_proba(test_dtm)[:,1]
print(metrics.roc_auc_score(y_test, y_pred_prob))
# Task 7
import matplotlib.pyplot as plt
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred_prob)
plt.plot(fpr, tpr)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.xlabel('False Positive Rate (1 - Specificity)')
plt.ylabel('True Positive Rate (Sensitivity)')

# Task 8
print(metrics.confusion_matrix(y_test, y_pred))
sensitivity = 126 / float(25 + 126)  # TP / (TP + FN), values from the matrix above
specificity = 813 / float(813 + 58)  # TN / (TN + FP)
# Task 9
false_positives = X_test[y_test < y_pred]  # predicted positive but actually negative