def __init__(self, calibrationTable, score_func=accuracy_score):
    """
    Calibrate the imputer by selecting the best-scoring pipeline.

    Every candidate returned by make_pipes() is tuned with
    fitClfWithGridSearch on the data produced by
    impute_risk_factors(calibrationTable). A clone of the highest-scoring
    pipeline, configured with its best parameters, is stored in
    self._imputer (no fit is called on the clone here).

    @param calibrationTable: a pandas data frame
    @param score_func: scoring function handed to the search; a higher score
                       is treated as better (the search runs with
                       minimize=False)
    """
    print('--------- Calibrating Imputer -----------')
    X_cal, y_cal, _, _ = impute_risk_factors(calibrationTable)
    # The array conversion does not depend on the candidate, so do it once.
    X_arr = np.array(X_cal)

    # Start from -inf rather than -1: score_func is caller-supplied, and a
    # scorer that yields values <= -1 would otherwise never select a pipeline.
    bestScore = float('-inf')
    bestPipe = None
    bestParams = None

    for name, (pipe, params) in make_pipes().items():
        print('%s %s %s' % ('>' * 10, name, '<' * 10))

        _, cur_bestParams, cur_bestScore = fitClfWithGridSearch(
            name + '_risk', pipe, params, DatasetPair(X_arr, y_cal),
            saveToDir='/home/jj/code/Kaggle/allstate/output/gridSearchOutput',
            useJJ=True, score_func=score_func, n_jobs=N_JOBS, verbosity=0,
            minimize=False, cvSplitNum=5,
            maxLearningSteps=10,
            numConvergenceSteps=4, convergenceTolerance=0, eliteProportion=0.1,
            parentsProportion=0.4, mutationProportion=0.1, mutationProbability=0.1,
            mutationStdDev=None, populationSize=6)

        if cur_bestScore > bestScore:
            bestScore = cur_bestScore
            # Keep an independent copy so the candidate pipe is not mutated.
            bestPipe = clone(pipe)
            bestPipe.set_params(**cur_bestParams)
            bestParams = cur_bestParams

    print('----> best score: %s' % (bestScore,))
    pprint(bestParams)

    # Remains None when make_pipes() yields no candidates.
    self._imputer = bestPipe
def fitClassifiers(trainData, useJJ, n_jobs=23, selectedClfs=None, overwriteSavedResult=True, verbose=True,
                   cvSplitNum=10, test_size=0.25, random_states=[None], **fitArgs):
    """
    fits a list of classifiers by searching for the best parameters using
    fitClfWithGridSearch

    A classifier whose fit raises is reported on stdout and skipped; the
    remaining classifiers are still fitted.

    @type trainData DatasetPair
    @param selectedClfs: which classifiers to fit: a single name or an
                         iterable of names. if None (or empty), fits all.
    @param random_states: forwarded unchanged to fitClfWithGridSearch; the
                          default list is never mutated in this function
    @param fitArgs: extra keyword arguments forwarded to fitClfWithGridSearch
    @return: (a dictionary of {classifier name: classifier}, the best classifier)
             where the best classifier is a (name, classifier, score) tuple,
             or an empty tuple if nothing was fitted successfully
    """
    res = {}
    # -inf rather than 0, so that a run whose scores are all non-positive
    # still reports its best classifier.
    bestScore = float('-inf')
    bestClf = ()

    # A bare string is itself Iterable; without this explicit check a single
    # name would not be wrapped and `name not in selectedClfs` below would
    # turn into a substring test.
    stringTypes = (str, type(u''))
    if selectedClfs and (isinstance(selectedClfs, stringTypes)
                         or not isinstance(selectedClfs, Iterable)):
        selectedClfs = [selectedClfs]

    intermediateResdir = os.path.join(rootdir, 'intermediate results')

    # ------ fit using gridsearchcv -----------
    for name, clfSpec in classifiersToTry.items():
        if selectedClfs and name not in selectedClfs:
            continue

        pipe, paramsDict = makePipe([fillertoTry, normalizerToTry, (name, clfSpec)])

        try:
            _, bestParams, score = fitClfWithGridSearch(name, pipe, paramsDict, trainData, intermediateResdir,
                                                        useJJ=useJJ, n_jobs=n_jobs,
                                                        overwriteSavedResult=overwriteSavedResult, verbose=verbose,
                                                        cvSplitNum=cvSplitNum, test_size=test_size,
                                                        random_states=random_states, **fitArgs)
            cleanPipe = pipe.set_params(**bestParams)
            res[name] = cleanPipe

            # check if it's the best classifier
            # NOTE(review): assumes a higher score is better; confirm this
            # still holds if callers pass minimize=True through fitArgs.
            if score > bestScore:
                bestScore = score
                bestClf = (name, cleanPipe, score)
        except Exception as e:
            # Deliberate best-effort: one failing classifier must not abort
            # the whole sweep.
            print('Fitting %s caused an error: %s' % (name, e))

    return res, bestClf
# Tune `pipe` on (trainX, trainY) with the search helper, report metrics for
# the selected configuration, then fit it on the full training data.
trainY = np.array(fullTrainY)
data = DatasetPair(trainX, trainY)

randomStates = [0, 1]  # try multiple random states for better calibration
popSize = 8 if simple else 15

# Stratify on zero vs. non-zero targets. The labels do not depend on the
# random state, so build them once instead of once per splitter.
stratifyLabels = [0 if y == 0 else 1 for y in data.Y]
cvObjs = [StratifiedShuffleSplit(stratifyLabels, n_iter=5, test_size=0.25, random_state=randomState)
          for randomState in randomStates]

# One random index per parameter for every population member. Iterate over
# the values: iterating over items() yields (key, value) pairs whose length
# is always 2, which capped every index at 1.
# NOTE(review): presumably each value of `params` is a sequence of candidate
# settings - confirm. initPop is not passed to the search call below.
initPop = [[np.random.randint(len(v)) for v in params.values()] for _ in range(popSize)]

dt = datetime.now()
_, bestParams, score = fitClfWithGridSearch(
    name, pipe, params, data,
    saveToDir='/home/jj/code/Kaggle/Loan Default Prediction/output/gridSearchOutput',
    useJJ=True, score_func=mean_absolute_error, n_jobs=20, verbosity=3,
    minimize=True, cvObjs=cvObjs, maxLearningSteps=10,
    numConvergenceSteps=5, convergenceTolerance=0, eliteProportion=0.1,
    parentsProportion=0.4, mutationProportion=0.1, mutationProbability=0.1,
    mutationStdDev=None, populationSize=popSize)

# Configure a fresh clone so the pipe handed to the search is left untouched.
bestPipe = clone(pipe)
bestPipe.set_params(**bestParams)

print('CV Took %s' % (datetime.now() - dt,))

# Shortcut for reusing a previously saved search result instead of re-running it:
# bestPipe = loadObject('/home/jj/code/Kaggle/Loan Default Prediction/output/gridSearchOutput/GBC_25fts_simple.pk')['best_estimator']

bestPipe.classification_metrics(trainX, trainY, n_iter=10)

# ---------- learn the full training data
dt = datetime.now()
bestPipe.fit(fullTrainX, fullTrainY)
# For every output column, tune each candidate pipeline against that column
# and append a configured clone of the best-scoring one to indivClfs.

# Search settings shared by every (column, pipeline) combination.
searchSettings = dict(
    saveToDir='/home/jj/code/Kaggle/allstate/output/gridSearchOutput',
    useJJ=True,
    score_func=accuracy_score,
    n_jobs=N_JOBS,
    verbosity=0,
    minimize=False,
    cvSplitNum=5,
    maxLearningSteps=10,
    numConvergenceSteps=4,
    convergenceTolerance=0,
    eliteProportion=0.1,
    parentsProportion=0.4,
    mutationProportion=0.1,
    mutationProbability=0.1,
    mutationStdDev=None,
    populationSize=6,
)

for col in outputTable_cal.columns:
    print('%s %s %s' % ('>' * 20, col, '<' * 20))

    colTargets = np.array(outputTable_cal[col])
    bestScore, bestPipe, bestParams = -1, None, None

    for name, (pipe, params) in make_pipes().items():
        print('%s %s %s' % ('>' * 10, name, '<' * 10))

        searchName = '_'.join([name, col, calibrationName])
        searchResult = fitClfWithGridSearch(searchName, pipe, params,
                                            DatasetPair(X_cal, colTargets),
                                            **searchSettings)
        _, candParams, candScore = searchResult

        # Only a strictly better score replaces the current best.
        if not candScore > bestScore:
            continue

        bestScore = candScore
        bestParams = candParams
        bestPipe = clone(pipe)
        bestPipe.set_params(**candParams)

    # One entry per column; None if no candidate beat the initial score.
    indivClfs.append(bestPipe)
    print('%s %s %s %s' % ('---->', col, '<----', bestScore))
    pprint(bestParams)