def gbjjInnerLoop(learner):
    """
    Cross-validates one candidate learner inside a worker process.
    @param learner: candidate base learner (scikit-learn style estimator); it is cloned before fitting
    @return: mean cv score of the learner
    """
    # these globals are set in each worker by the pool initializer (gbjjInit)
    global trainX, trainY, lf, n_jobs, cvObj
    l = clone(learner)
    scores = jjcross_val_score(l, trainX, trainY, score_func=lf, n_jobs=n_jobs, cv=cvObj)
    return scores.mean()
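# gbjjInnerLoop reads module-level globals that the worker pool's initializer (gbjjInit,
# referenced from _fit_stage below) is expected to populate. gbjjInit itself is not shown
# in this section; the definition below is only a minimal sketch, assumed from how it is
# called (initargs=(trainX, trainY, lossFunction, n_jobs, cvObj)) and from the globals
# declared above. If the original module already defines gbjjInit, that definition applies.
def gbjjInit(pTrainX, pTrainY, pLossFunction, pNJobs, pCvObj):
    # store the shared data as module-level globals for gbjjInnerLoop (assumed behavior)
    global trainX, trainY, lf, n_jobs, cvObj
    trainX, trainY = pTrainX, pTrainY
    lf = pLossFunction
    n_jobs = pNJobs
    cvObj = pCvObj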
def _fit_stage(self, X, y, rmTolerance):
    """
    Fits one stage of gradient boosting.
    @param X: training features
    @param y: training targets
    @param rmTolerance: tolerance for the 1D optimization of the stage weight rm
    @return: nothing
    """
    residuals = self.lossFunction.negative_gradient(y, self._currentPrediction)
    # stochastic boosting: train only on a random portion of the data
    trainX, trainY, _, _ = splitTrainTest(X, residuals, 1 - self.subsample)

    if len(np.unique(trainY)) == 1:
        # degenerate case: all residuals identical, no point cross-validating learners
        hm = MajorityPredictor().fit(trainY)
    else:
        cvObj = KFold(n=len(trainX), n_folds=self.cvNumFolds, indices=False,
                      shuffle=True, random_state=self.randomState)

        # find the h that best mimics the negative gradient
        if self.n_jobs > 1:     # parallel
            n_jobs = max(1, self.n_jobs / len(self.learners), self.cvNumFolds)
            # n_jobs = 1
            pool = MyPool(processes=self.n_jobs, initializer=gbjjInit,
                          initargs=(trainX, trainY, self.lossFunction, n_jobs, cvObj))
            temp = pool.map_async(gbjjInnerLoop, self.learners)
            temp.wait()
            h_res = temp.get()
            pool.close()
            pool.join()
        else:                   # single thread
            h_res = []
            for learner in self.learners:
                if self.verbosity >= 2:
                    print 'Fitting learner:', learner
                l = clone(learner)
                scores = jjcross_val_score(l, trainX, trainY, score_func=self.lossFunction,
                                           n_jobs=1, cv=cvObj)
                h_res.append(scores.mean())

        # the learner with the smallest cv loss best mimics the negative gradient
        hm = clone(self.learners[np.argsort(h_res)[0]])

    if self.verbosity >= 1:
        print "The best classifier is", hm.__class__

    # find rm by a 1D line search on the stage weight
    hm.fit(trainX, trainY)
    hmx = hm.predict(X)
    rm = minimize_scalar(lambda r: self.lossFunction(y, self._currentPrediction + r * hmx),
                         tol=rmTolerance).x

    # append estimator and weight
    self._estimators.append((hm, rm))
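# The stage weight rm in _fit_stage is obtained by a 1D line search over the loss along the
# direction of the fitted learner's predictions. Below is a self-contained sketch of just that
# step, using a mean-squared-error loss as a stand-in for self.lossFunction; the function and
# variable names here are hypothetical and for illustration only.
import numpy as np
from scipy.optimize import minimize_scalar


def _demo_stage_weight_search(rmTolerance=1e-6):
    rng = np.random.RandomState(0)
    y = rng.randn(100)                     # targets
    currentPrediction = np.zeros(100)      # F_{m-1}(x), the boosting model so far
    hmx = y + 0.1 * rng.randn(100)         # h_m(x): the new learner's predictions

    def squaredLoss(target, pred):         # stand-in for self.lossFunction
        return np.mean((target - pred) ** 2)

    # same call pattern as in _fit_stage: minimize the loss as a function of the scalar r
    rm = minimize_scalar(lambda r: squaredLoss(y, currentPrediction + r * hmx),
                         tol=rmTolerance).x
    return rm                              # close to 1 for this toy setup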