# Example 1
    def __init__(self, calibrationTable, score_func=accuracy_score):
        """
        calibrate a classifier
        @param calibrationTable: a pandas data frame
        """

        print '--------- Calibrating Imputer -----------'
        X_cal, y_cal, _, _ = impute_risk_factors(calibrationTable)

        bestScore = -1
        bestPipe = None
        bestParams = None

        for name, (pipe, params) in make_pipes().iteritems():
            print '>'*10, name, '<'*10

            _, cur_bestParams, cur_bestScore = fitClfWithGridSearch(
                name + '_risk', pipe, params, DatasetPair(np.array(X_cal), y_cal),
                saveToDir='/home/jj/code/Kaggle/allstate/output/gridSearchOutput',
                useJJ=True, score_func=score_func, n_jobs=N_JOBS, verbosity=0,
                minimize=False, cvSplitNum=5,
                maxLearningSteps=10,
                numConvergenceSteps=4, convergenceTolerance=0, eliteProportion=0.1,
                parentsProportion=0.4, mutationProportion=0.1, mutationProbability=0.1,
                mutationStdDev=None, populationSize=6)

            if cur_bestScore > bestScore:

                bestScore = cur_bestScore
                bestPipe = clone(pipe)
                bestPipe.set_params(**cur_bestParams)
                bestParams = cur_bestParams

        print '----> best score:', bestScore
        pprint(bestParams)

        self._imputer = bestPipe
# Example 2
def fitClassifiers(trainData, useJJ, n_jobs=23, selectedClfs=None, overwriteSavedResult=True, verbose=True,
                   cvSplitNum=10, test_size=0.25, random_states=(None,), **fitArgs):
    """ fits a list of classifiers by searching for the best parameters using GridSearchCV
    @type trainData DatasetPair
    @param selectedClfs: which classifiers to fit. if None, fits all.
    @param random_states: random states to try during CV. Default is the
        immutable tuple (None,) rather than the mutable list [None] to avoid
        the shared-mutable-default-argument pitfall; it iterates identically.
    @return: (a dictionary of {classifier name: classifier}, the best classifier
        as a (name, pipe, score) tuple — or the empty tuple () if nothing
        scored above 0)
    """

    res = {}
    bestScore = 0       # NOTE: classifiers scoring <= 0 are never selected as "best"
    bestClf = ()
    # allow a single classifier name to be passed instead of a collection
    if selectedClfs and not isinstance(selectedClfs, Iterable): selectedClfs = [selectedClfs]
    intermediateResdir = os.path.join(rootdir, 'intermediate results')

    # ------ fit using gridsearchcv -----------
    for name, v in classifiersToTry.iteritems():
        if selectedClfs and name not in selectedClfs: continue

        # v is classifiersToTry[name] — avoid the redundant dict lookup
        pipe, paramsDict = makePipe([fillertoTry, normalizerToTry, (name, v)])

        try:
            newpipe, bestParams, score = fitClfWithGridSearch(name, pipe, paramsDict, trainData, intermediateResdir, useJJ=useJJ,
                                                              n_jobs=n_jobs, overwriteSavedResult=overwriteSavedResult, verbose=verbose,
                                                              cvSplitNum=cvSplitNum, test_size=test_size, random_states=random_states, **fitArgs)

            cleanPipe = pipe.set_params(**bestParams)
            res[name] = cleanPipe

            # check if it's the best classifier
            if score > bestScore:
                bestScore = score
                bestClf = (name, cleanPipe, score)

        except Exception as e:
            # best-effort sweep: a failing classifier must not abort the others
            print 'Fitting', name, 'caused an error:', e

    return res, bestClf
# Example 3
        trainY = np.array(fullTrainY)
    # bundle features and labels for the grid-search helper
    data = DatasetPair(trainX, trainY)

    randomStates = [0, 1]       # try multiple random states for better calibration
    popSize = 8 if simple else 15   # smaller GA population in "simple" mode

    # stratify on default-vs-nondefault (y == 0 vs y != 0) so every split keeps the class balance
    cvObjs = [StratifiedShuffleSplit([0 if y == 0 else 1 for y in data.Y], n_iter=5, test_size=0.25,
                                   random_state=randomState) for randomState in randomStates]
    # NOTE(review): v iterates (key, value) tuples, so len(v) is always 2 —
    # presumably randint over the number of candidate values per parameter was
    # intended; initPop also appears unused below. TODO confirm.
    initPop = [[np.random.randint(len(v)) for v in params.items()] for _ in range(popSize)]

    dt = datetime.now()

    # GA-driven grid search; minimize=True because the score is mean absolute error
    _, bestParams, score = fitClfWithGridSearch(name, pipe, params, data,
                                                saveToDir='/home/jj/code/Kaggle/Loan Default Prediction/output/gridSearchOutput',
                                                useJJ=True, score_func=mean_absolute_error, n_jobs=20, verbosity=3,
                                                minimize=True, cvObjs=cvObjs, maxLearningSteps=10,
                                                numConvergenceSteps=5, convergenceTolerance=0, eliteProportion=0.1,
                                                parentsProportion=0.4, mutationProportion=0.1, mutationProbability=0.1,
                                                mutationStdDev=None, populationSize=popSize)

    # configure a fresh clone with the winning parameters (the searched pipe may have been mutated)
    bestPipe = clone(pipe)
    bestPipe.set_params(**bestParams)

    print 'CV Took', datetime.now() - dt

    # bestPipe = loadObject('/home/jj/code/Kaggle/Loan Default Prediction/output/gridSearchOutput/GBC_25fts_simple.pk')['best_estimator']

    # report CV classification metrics on the (sub)training data
    bestPipe.classification_metrics(trainX, trainY, n_iter=10)
    # ---------- learn the full training data
    dt = datetime.now()
    bestPipe.fit(fullTrainX, fullTrainY)
# Example 4
for col in outputTable_cal.columns:
    print '>'*20, col, '<'*20
    cur_y = np.array(outputTable_cal[col])

    bestScore = -1
    bestPipe = None
    bestParams = None

    for name, (pipe, params) in make_pipes().iteritems():
        print '>'*10, name, '<'*10

        _, cur_bestParams, cur_bestScore = fitClfWithGridSearch(
            '_'.join([name, col, calibrationName]), pipe, params, DatasetPair(X_cal, cur_y),
            saveToDir='/home/jj/code/Kaggle/allstate/output/gridSearchOutput',
            useJJ=True, score_func=accuracy_score, n_jobs=N_JOBS, verbosity=0,
            minimize=False, cvSplitNum=5,
            maxLearningSteps=10,
            numConvergenceSteps=4, convergenceTolerance=0, eliteProportion=0.1,
            parentsProportion=0.4, mutationProportion=0.1, mutationProbability=0.1,
            mutationStdDev=None, populationSize=6)

        if cur_bestScore > bestScore:

            bestScore = cur_bestScore
            bestPipe = clone(pipe)
            bestPipe.set_params(**cur_bestParams)
            bestParams = cur_bestParams

    indivClfs.append(bestPipe)
    print '---->', col, '<----', bestScore
    pprint(bestParams)