Python makePipe Examples, Kaggle.utilities.makePipe Python Examples

Example #1

0

Show file

File: pipes.py Project: jennyyuejin/Kaggle

def regressorPipes(simple, usePCA = True, useRF = True):
    """
    Assemble the regression pipe: optional PCA and random-forest reducer
    steps followed by a gradient boosting regressor.
    @param simple: whether a small range of parameters are desired
    @param usePCA: whether the PCA reducer step is placed in the pipe
    @param useRF: whether the random-forest reducer step is placed in the pipe
    @return: pipe, params (whatever makePipe produces for the chosen steps)
    """

    # Each step is a (step name, (estimator, parameter grid)) pair.
    # All three steps are always constructed; the flags only decide which
    # of them are handed to makePipe.
    if simple:
        pcaStep = ('PCAer', (PcaEr(total_var=0.85), {'whiten': [True, False]}))
        rfStep = ('RFer', (RandomForester(n_estimators=25, num_features=99),
                           {'num_features':[15, 25]}))
        gbrStep = ('GBR', (GradientBoostingRegressor(loss='lad', max_features='auto', learning_rate=0.1,
                                                     n_estimators=5, subsample=0.7),
                           {'learning_rate': [0.1, 0.5]}))
    else:
        pcaStep = ('PCAer', (PcaEr(total_var=0.999),
                             {'total_var': [0.85, 0.9], 'whiten': [True, False]}))
        rfStep = ('RFer', (RandomForester(num_features=999, n_estimators=999),
                           {'num_features':[0.5, 15, 25], 'n_estimators':[25, 50]}))
        gbrStep = ('GBR', (GradientBoostingRegressor(loss='lad'),
                           {'max_features': ['auto', 'sqrt', 'log2'],
                            'subsample': [0.7, 0.85, 1],
                            'learning_rate': [0.01, 0.1, 0.5, 1],
                            'max_depth': [3, 5, 7],
                            'n_estimators': [5, 10, 25, 50, 100]}))

    # Reducers (when requested) go first, PCA before RF; the regressor is always last.
    chosenSteps = []
    if usePCA:
        chosenSteps.append(pcaStep)
    if useRF:
        chosenSteps.append(rfStep)
    chosenSteps.append(gbrStep)

    return makePipe(chosenSteps)

Example #2

0

Show file

File: pipes.py Project: jennyyuejin/Kaggle

def make_pipes():
    """ makes a bunch of single-classifier pipes with corresponding pipes for calibration purposes

    Every entry is built by calling makePipe on a single
    (step name, (classifier, parameter grid)) step.
    @return: a dictionary of {classifier name: result of makePipe for that classifier}
    """

    return {
        'GBC': makePipe([('GBC', (GradientBoostingClassifier(),
                                  {'learning_rate': [0.01, 0.1, 0.5, 1],
                                   'n_estimators': [5, 10, 25, 50, 100],
                                   'subsample': [0.7, 0.85, 1]}))]),

        'RF': makePipe([('RF', (RandomForestClassifier(n_jobs = N_JOBS),
                                {'n_estimators': [5, 10, 25, 50, 100],
                                 'max_features': [3, 0.7, 'auto', 'log2', None]
                                }))]),

        'SVC': makePipe([('SVC', (SVC(),
                                  {'C': [0.01, 0.1, 0.25, 0.5, 1],
                                   # each kernel appears exactly once: a repeated value adds
                                   # identical candidates to the grid without adding information
                                   'kernel': ['rbf', 'linear', 'poly', 'sigmoid'],
                                   'gamma': [0, 1, 10],
                                   'shrinking': [True, False],
                                   'tol': [1e-5, 1e-3, 0.1]
                                  }))]),

        'LR': makePipe([('LR', (LogisticRegression(),
                                {'penalty': ['l1', 'l2'],
                                 'C': [0.01, 0.1, 0.5, 1, 3, 10]
                                }))]),

        # SGD is currently disabled:
        # 'SGD': makePipe([('SGD', (SGDClassifier(n_jobs=20),
        #                           {'loss': ['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron'],
        #                            'alpha': [1e-5, 1e-4, 1e-3, 0.01, 0.1],
        #                            'learning_rate': ['constant', 'optimal', 'invscaling']
        #                           }))])
    }

Example #3

0

Show file

File: pipes.py Project: jennyyuejin/Kaggle

def prepPipes(simple, useImputer=True, useNormalizer=True):
    """
    the preprocessing pipes
    @param simple: whether a small range of parameters are desired
    @param useImputer: whether the imputer ('filler') step is included
    @param useNormalizer: whether the normalizer step is included
    @return: pipe, params (the result of makePipe for the selected steps)
    @raise Exception: if neither the imputer nor the normalizer is requested
    """

    # Each step is a (step name, (transformer, parameter grid)) pair.
    imputerToTry = ('filler', (Imputer(strategy='mean'), {})) if simple else \
        ('filler', (Imputer(), {'strategy': ['mean', 'median', 'most_frequent']}))

    normalizerToTry = ('normalizer', (Normalizer(), {})) if simple else \
        ('normalizer', (Normalizer(), {'method': ['standardize', 'rescale']}))

    # The imputer, when used, always precedes the normalizer.
    stepsToUse = []
    if useImputer:
        stepsToUse.append(imputerToTry)
    if useNormalizer:
        stepsToUse.append(normalizerToTry)

    if not stepsToUse:
        raise Exception('Have to use at least one of imputer and normalizer.')

    # Every non-empty combination returns the pipe, including normalizer-only.
    return makePipe(stepsToUse)

Example #4

0

Show file

File: pipes.py Project: jennyyuejin/Kaggle

def classifierPipes(simple, name, usePCA = True, useRF = True):
    """
    Assemble the classification pipe: optional PCA and random-forest reducer
    steps followed by the requested classifier.
    @param simple: whether a small range of parameters are desired
    @param name: which classifier to use. one of {'GBC', 'logistic'}
    @param usePCA: whether the PCA reducer step is placed in the pipe
    @param useRF: whether the random-forest reducer step is placed in the pipe
    @return: pipe, params (whatever makePipe produces for the chosen steps)
    @raise Exception: if name is not a supported classifier
    """

    # Reducer steps: (step name, (estimator, parameter grid)).
    # They are built up front, before the classifier name is checked.
    if simple:
        pcaStep = ('PCAer', (PcaEr(total_var=0.85), {'whiten': [True, False]}))
        rfStep = ('RFer', (RandomForester(n_estimators=25, num_features=99),
                           {'num_features':[15, 25]}))
    else:
        pcaStep = ('PCAer', (PcaEr(total_var=0.999),
                             {'total_var': [0.85, 0.9], 'whiten': [True, False]}))
        rfStep = ('RFer', (RandomForester(num_features=999, n_estimators=999),
                           {'num_features':[0.5, 15, 25], 'n_estimators':[25, 50]}))

    # Final step: the classifier together with its parameter grid.
    if name == 'GBC':
        label = 'GBC'
        if simple:
            estimator = GradientBoostingClassifier(subsample=0.7, n_estimators=25)
            grid = {'learning_rate': [0.1, 0.5]}
        else:
            estimator = GradientBoostingClassifier()
            grid = {'max_features': ['auto', 'sqrt', 'log2'],
                    'learning_rate': [0.01, 0.1, 0.5, 1],
                    'n_estimators': [5, 10, 25, 50, 100],
                    'subsample': [0.7, 0.85, 1],
                    'max_depth': [3, 5, 7]}
    elif name == 'logistic':
        label = 'logistic'
        # the same starting estimator is used for both the small and the large grid
        estimator = LogisticRegression(penalty='l2', C=1.0)
        if simple:
            grid = {'tol': [0.01, 0.0001]}
        else:
            grid = {'penalty': ['l1', 'l2'],
                    'C': [0.001, 0.1, 0.25, 0.5, 0.7, 1.0],
                    'tol': [0.01, 0.001, 0.0001]}
    else:
        raise Exception('Classifier %s is not supported.' % name)

    classifierStep = (label, (estimator, grid))

    # Reducers (when requested) go first, PCA before RF; the classifier is always last.
    chosenSteps = []
    if usePCA:
        chosenSteps.append(pcaStep)
    if useRF:
        chosenSteps.append(rfStep)
    chosenSteps.append(classifierStep)

    return makePipe(chosenSteps)

Example #5

0

Show file

File: titanicutilities.py Project: jennyyuejin/Kaggle

def fitClassifiers(trainData, useJJ, n_jobs=23, selectedClfs=None, overwriteSavedResult=True, verbose=True,
                   cvSplitNum=10, test_size=0.25, random_states=None, **fitArgs):
    """ fits a list of classifiers by searching for the best parameters using GridSearchCV
    @type trainData DatasetPair
    @param useJJ: forwarded unchanged to fitClfWithGridSearch
    @param n_jobs: forwarded unchanged to fitClfWithGridSearch
    @param selectedClfs: which classifiers to fit. if None, fits all.
    @param overwriteSavedResult: forwarded unchanged to fitClfWithGridSearch
    @param verbose: forwarded unchanged to fitClfWithGridSearch
    @param cvSplitNum: forwarded unchanged to fitClfWithGridSearch
    @param test_size: forwarded unchanged to fitClfWithGridSearch
    @param random_states: forwarded to fitClfWithGridSearch; None (the default) stands for [None]
    @param fitArgs: extra keyword arguments forwarded to fitClfWithGridSearch
    @return: (a dictionary of {classifier name: pipe with its best parameters set},
              the best classifier as a (name, pipe, score) tuple, or () if no classifier was fitted)
    """

    # A None sentinel instead of a list default: a list default would be one
    # shared object reused across calls.
    if random_states is None:
        random_states = [None]

    res = {}
    # Start below any real score so that a classifier scoring 0 or less can
    # still be reported as the best one.
    bestScore = float('-inf')
    bestClf = ()
    if selectedClfs and not isinstance(selectedClfs, Iterable): selectedClfs = [selectedClfs]
    intermediateResdir = os.path.join(rootdir, 'intermediate results')

    # ------ fit using gridsearchcv -----------
    # NOTE(review): classifiersToTry, fillertoTry and normalizerToTry are not
    # defined in this function; presumably module-level -- confirm.
    for name, clfSpec in classifiersToTry.iteritems():
        if selectedClfs and name not in selectedClfs: continue

        pipe, paramsDict = makePipe([fillertoTry, normalizerToTry, (name, clfSpec)])

        try:
            # the first element of the result is not needed here
            _, bestParams, score = fitClfWithGridSearch(name, pipe, paramsDict, trainData, intermediateResdir, useJJ=useJJ,
                                                        n_jobs=n_jobs, overwriteSavedResult=overwriteSavedResult, verbose=verbose,
                                                        cvSplitNum=cvSplitNum, test_size=test_size, random_states=random_states, **fitArgs)

            cleanPipe = pipe.set_params(**bestParams)
            res[name] = cleanPipe

            # check if it's the best classifier
            if score > bestScore:
                bestScore = score
                bestClf = (name, cleanPipe, score)

        except Exception as e:
            # best effort: one failing classifier must not abort the others
            print('Fitting %s caused an error: %s' % (name, e))

    return res, bestClf