Code Example #1
    def _fit_stage(self, X, y, rmTolerance):
        """
        fits one stage of gradient boosting
        @param X:
        @param y:
        @param rmTolerance: tolerance for 1D optimization
        @return: nothing
        """

        residuals = self.lossFunction.negative_gradient(y, self._currentPrediction)
        trainX, trainY, _, _ = splitTrainTest(X, residuals, 1-self.subsample)   # stochastic boosting. train only on a portion of the data

        if len(np.unique(trainY)) == 1:    # residuals are constant: no model needed beyond the majority value
            hm = MajorityPredictor().fit(trainY)
        else:
            cvObj = KFold(n=len(trainX), n_folds=self.cvNumFolds, indices=False, shuffle=True, random_state=self.randomState)

            # find the h that best mimics the negative gradient
            if self.n_jobs > 1:  # parallel
                # each learner's inner CV gets a share of the total processes, capped at the fold count
                n_jobs = min(max(1, self.n_jobs / len(self.learners)), self.cvNumFolds)
                pool = MyPool(processes=self.n_jobs, initializer=gbjjInit, initargs=(trainX, trainY, self.lossFunction, n_jobs, cvObj))
                temp = pool.map_async(gbjjInnerLoop, self.learners)
                temp.wait()
                h_res = temp.get()
                pool.close()
                pool.join()

            else:   # single thread
                h_res = []

                for learner in self.learners:
                    if self.verbosity >= 2:
                        print 'Fitting learner:', learner
                    l = clone(learner)
                    scores = jjcross_val_score(l, trainX, trainY, score_func=self.lossFunction, n_jobs=1, cv=cvObj)
                    h_res.append(scores.mean())

            hm = clone(self.learners[np.argmin(h_res)])    # learner with the lowest mean CV loss

        if self.verbosity >= 1:
            print 'The best classifier is', hm.__class__

        # 1D line search for the stage weight rm
        hm.fit(trainX, trainY)
        hmx = hm.predict(X)
        rm = minimize_scalar(lambda r: self.lossFunction(y, self._currentPrediction + r*hmx), tol=rmTolerance).x

        # append estimator and weight
        self._estimators.append((hm, rm))
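
For context, _fit_stage above performs one step of the standard gradient-boosting recipe: fit a base learner hm to the negative gradient of the loss, then run a 1D line search for its stage weight rm. Below is a minimal, self-contained sketch of the full stage-wise loop, specialized to squared-error loss; the name fit_boosting_stages and the toy data are illustrative assumptions, not project code.

# Illustrative sketch (not project code): the stage-wise loop that _fit_stage
# implements one step of, specialized to squared-error loss.
import numpy as np
from scipy.optimize import minimize_scalar
from sklearn.base import clone
from sklearn.tree import DecisionTreeRegressor

def fit_boosting_stages(X, y, base_learner, n_stages=10, tol=1e-4):
    current = np.zeros(len(y))               # F_0: start from the zero prediction
    stages = []
    for _ in range(n_stages):
        residuals = y - current              # negative gradient of squared-error loss
        hm = clone(base_learner).fit(X, residuals)
        hmx = hm.predict(X)
        # 1D line search for the stage weight rm, as in _fit_stage above
        rm = minimize_scalar(lambda r: np.mean((y - (current + r * hmx)) ** 2), tol=tol).x
        current = current + rm * hmx         # F_m = F_{m-1} + rm * hm
        stages.append((hm, rm))
    return stages

# toy usage on synthetic data
rng = np.random.RandomState(0)
X = rng.rand(200, 3)
y = 2 * X[:, 0] + np.sin(6 * X[:, 1]) + rng.normal(scale=0.1, size=200)
stages = fit_boosting_stages(X, y, DecisionTreeRegressor(max_depth=2))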
Code Example #2
File: utilities.py  Project: jennyyuejin/Kaggle
def jjcross_val_score(clf, X, y, score_func, cv, y_test=None, n_jobs=cpu_count(), use_predProb_instead=False,
                      fit_params=None, weights=None, verbose=True):
    """

    @param clf:
    @param X: np.array
    @param y: np.array
    @param y_test: np.array. If not None then the Y's used for testing are different from the ones used for training.
    @param score_func: a score function of the form func(y_true, y_pred)
    @param cv: either an integer indicating the number of StratifiedKFold folds, or an iterable
    @param n_jobs:
    @param fit_params: parameters to pass to the estimator's fit method
    @param socre_params: parameters to pass to score_func
    @return: array of scores
    """

    cv = check_cv(cv, X, y, classifier=is_classifier(clf))

    fit_params = fit_params if fit_params is not None else {}

    if n_jobs > 1:
        # no point spawning more processes than there are folds
        n_jobs = min(n_jobs, getNumCvFolds(cv))
        pool = MyPool(n_jobs, initializer=jjcross_val_score_init,
                      initargs=(X, y, clf, score_func, fit_params, weights, y_test, use_predProb_instead))
        data = [[trainInds, testInds] for trainInds, testInds in cv]
        temp = pool.map_async(jjcross_val_score_inner, data)
        temp.wait()
        scores = temp.get()
        pool.close()
        pool.join()
    else:
        scores = []
        for trainInds, testInds in cv:
            trainX = X[trainInds]
            trainY = y[trainInds]
            testX = X[testInds]
            testY = (y if y_test is None else y_test)[testInds]

            if weights is not None:
                trainWeights = weights[trainInds]
                testWeights = weights[testInds]

            if len(np.unique(trainY)) == 1:
                # degenerate fold: the training target is constant, so just predict that constant
                yPred = np.repeat(trainY[0], len(testY))
            else:
                clonedClf = clone(clf)

                if weights is not None and 'sample_weight' in clonedClf.fit.func_code.co_varnames:
                    try:
                        clonedClf.fit(trainX, trainY, sample_weight=trainWeights, **fit_params)
                    except Exception:   # fall back to fitting without sample weights
                        clonedClf.fit(trainX, trainY, **fit_params)
                else:
                    clonedClf.fit(trainX, trainY, **fit_params)

                yPred = clonedClf.predict_proba(testX)[:, 0] if use_predProb_instead else clonedClf.predict(testX)

            if weights is None:
                score = score_func(testY, yPred)
            else:
                score = score_func(testY, yPred, sample_weight=testWeights)

            scores.append(score)

    if verbose:
        for i, score in enumerate(scores):
            print 'Fold %d, score = %f' % (i + 1, score)

        print '>>>>>>>> %d-fold score (mean, coefficient of variation) = (%f, %f)' % (len(cv), np.mean(scores), np.std(scores) / np.mean(scores))

    return np.array(scores)
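
A minimal usage sketch for jjcross_val_score, assuming this utilities module is importable; the toy data and the choice of estimator and score function are illustrative assumptions.

# Usage sketch (hypothetical data; assumes jjcross_val_score is importable from this module).
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

rng = np.random.RandomState(0)
X = rng.rand(100, 5)
y = (X[:, 0] + X[:, 1] > 1).astype(int)

# 5 stratified folds (cv=5 for a classifier), single process, plain accuracy as score_func
scores = jjcross_val_score(RandomForestClassifier(n_estimators=10, random_state=0),
                           X, y, score_func=accuracy_score, cv=5, n_jobs=1)
print 'mean accuracy: %f' % scores.mean()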