Code example #1
File: main.py Project: jennyyuejin/Kaggle
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.metrics import roc_auc_score

from Kaggle.utilities import jjcross_val_score


def validate_classifier(clf, X, Y_train, Y_validate, cv, use_predProb_instead, scoreFunc=roc_auc_score, n_jobs=16, test_size=0.25):
    """
    @param cv: an object (list of (trainInds, testInds)) or an integer (number of folds)
    @return: list of cv scores
    """

    cvObj = StratifiedShuffleSplit(Y_validate, n_iter=cv, test_size=test_size) if isinstance(cv, int) else cv

    scores = jjcross_val_score(clf, X, Y_train, scoreFunc, cvObj, Y_validate, n_jobs=n_jobs, use_predProb_instead=use_predProb_instead)

    return scores
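
A minimal usage sketch of the two forms the `cv` parameter accepts. It assumes scikit-learn < 0.18, where `sklearn.cross_validation.StratifiedShuffleSplit` takes the label array directly (as on the `cvObj` line above), and that `jjcross_val_score` is importable from `Kaggle.utilities` as in example #5; the toy data is illustrative only.

import numpy as np
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.ensemble import RandomForestClassifier

X = np.random.rand(200, 5)                 # toy data, illustrative only
y = np.random.randint(0, 2, 200)
clf = RandomForestClassifier(n_estimators=10)

# cv as an integer: the function builds a StratifiedShuffleSplit internally
scores = validate_classifier(clf, X, y, y, cv=5, use_predProb_instead=True, n_jobs=1)

# cv as an explicit list of (trainInds, testInds) pairs
folds = list(StratifiedShuffleSplit(y, n_iter=3, test_size=0.25))
scores = validate_classifier(clf, X, y, y, cv=folds, use_predProb_instead=True, n_jobs=1)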
Code example #2
File: utilities.py Project: jennyyuejin/Kaggle
import numpy as np
from pprint import pprint
from sklearn.cross_validation import KFold

from Kaggle.utilities import jjcross_val_score
from evaluation import normalized_weighted_gini


def gridSearch(clf, cvOutputFname, x_train, y_train, weights, innerclf=False, num_folds=10):
    """

    :param clf:
    :param cvOutputFname:
    :param x_train:
    :param y_train:
    :param weights:
    :param innerclf: if true, the clf parameter is an "outter" object that wraps a "regressor" attribute of type Ridge;
                    if false, clf is the Ridge itself
    :param num_folds:
    :return:
    """
    print '================== Grid Search for the Best Parameter =================='

    cvOutputFile = open(cvOutputFname, 'w')
    res = {}
    cvObj = KFold(len(y_train), n_folds=num_folds, shuffle=True, random_state=0)

    for tolerance in [0.00001, 0.0001, 0.001, 0.01, 0.1, 0.2, 0.5]:
        for alpha in np.arange(0.01, 5, 0.1):
            print '>>>> alpha=', alpha, ', tolerance =', tolerance

            if innerclf:
                clf.regressor.set_params(alpha=alpha, tol=tolerance)
            else:
                clf.set_params(alpha=alpha, tol=tolerance)

            scores = jjcross_val_score(clf, x_train, y_train, normalized_weighted_gini, cvObj, weights=weights,
                                       verbose=False)
            meanScore = np.mean(scores)
            stdScore = np.std(scores)
            s = 'alpha = %f, tolerance = %f, mean = %f, std = %f\n' % (alpha, tolerance, meanScore, stdScore)
            print s
            res[(alpha, tolerance)] = (meanScore, stdScore)
            cvOutputFile.write(s)
    print '>>>>>> Result sorted by mean score:'
    pprint(sorted(res.items(), key=lambda x: -x[1][0]))
    cvOutputFile.close()

    return res
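
The innerclf=True branch only requires that clf expose the Ridge being tuned under a `regressor` attribute. A minimal hypothetical wrapper satisfying that contract; the class name and the fit/predict pass-through are assumptions, not the project's actual outer model.

from sklearn.linear_model import Ridge


class RidgeWrapper(object):
    """Hypothetical outer object exposing a `regressor` attribute of type Ridge."""

    def __init__(self, **ridge_params):
        self.regressor = Ridge(**ridge_params)

    def fit(self, X, y):
        self.regressor.fit(X, y)
        return self

    def predict(self, X):
        return self.regressor.predict(X)


# gridSearch then tunes the inner Ridge via clf.regressor.set_params(alpha=..., tol=...):
# gridSearch(RidgeWrapper(), 'cvRes.txt', x_train, y_train, weights, innerclf=True)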
Code example #3
File: main.py Project: jennyyuejin/Kaggle
            parentsProportion=0.4, mutationProportion=0.1, mutationProbability=0.1,
            mutationStdDev=None, populationSize=6)

        if cur_bestScore > bestScore:

            bestScore = cur_bestScore
            bestPipe = clone(pipe)
            bestPipe.set_params(**cur_bestParams)
            bestParams = cur_bestParams

    indivClfs.append(bestPipe)
    print '---->', col, '<----', bestScore
    pprint(bestParams)

combinedClf = CombinedClassifier(indivClfs)
print 'OVERALL CV SCORE:', np.mean(jjcross_val_score(combinedClf, X_cal, y_cal, accuracy_score, cv=5, n_jobs=N_JOBS)) # validate classifier

print '====== TRAINING'

_, inputTable_train, outputTable_train, _ = condense_data(trainingName, isTraining=True, readFromFiles = True)
pdf(inputTable_train)

temp = riskFactorImp.fit_transform(inputTable_train)  # impute the risk_factor column first
assert np.isnan(temp.risk_factor).sum() == 0  # no missing risk_factor values remain
X_train = Normalizer().fit_transform(Imputer().fit_transform(temp))
# X_train = Normalizer().fit_transform(Imputer().fit_transform(inputTable_train))
y_train = CombinedClassifier.combine_outputs(np.array(outputTable_train))

combinedClf.fit(X_train, y_train)
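
The bestPipe bookkeeping earlier in this snippet relies on sklearn's clone, which copies an estimator's structure without its fitted state, so the winning parameters can be re-applied to a fresh pipeline. A self-contained toy version of the same pattern; the search space and scoring are illustrative, using the pre-0.18 sklearn.cross_validation API to match the surrounding code.

import numpy as np
from sklearn.base import clone
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Normalizer
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import cross_val_score

X = np.random.rand(100, 4)                # toy data, illustrative only
y = np.random.randint(0, 2, 100)
pipe = Pipeline([('norm', Normalizer()), ('clf', LogisticRegression())])

bestScore, bestPipe, bestParams = -np.inf, None, None
for C in (0.1, 1.0, 10.0):
    cur_params = {'clf__C': C}
    pipe.set_params(**cur_params)
    cur_score = cross_val_score(pipe, X, y, cv=5).mean()
    if cur_score > bestScore:
        bestScore = cur_score
        bestPipe = clone(pipe)            # unfitted copy of the pipeline structure
        bestPipe.set_params(**cur_params)
        bestParams = cur_params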

Code example #4
File: main.py Project: jennyyuejin/Kaggle
    #                        verbose=1)

    # clf = SVR()

    # ================== CORRELATION ==================
    # print '================== CORRELATION =================='
    # print x_train.shape
    # numFields = 30
    # x_train, newCols = create_new_features(x_train, columns=columns_train)
    # corrs = calculate_y_corrs(x_train, y_train)[0]
    # ord = corrs.argsort()[::-1][:numFields]
    # x_train = x_train[:, ord]

    # ================== CV ==================
    print '================== CV =================='
    scores = jjcross_val_score(regressor, x_train, y_train, normalized_weighted_gini,
                               KFold(len(y_train), n_folds=5, shuffle=True, random_state=0), weights=weights)#, n_jobs=1)

    # ================== Grid Search for the Best Parameter ==================
    # gridSearch(clf, '/home/jj/code/Kaggle/Fire/cvRes/RidgeGroupThenRegress.txt', x_train, y_train, weights, innerclf=True)

    # ================== train ==================
    # print '================== train =================='
    # clf.fit(x_train, y_train, sample_weight=weights)
    # #
    # # # ================== predict ==================
    # print '================== predict =================='
    # x_test, _, ids_pred, _, _, _ = process_data('/home/jj/code/Kaggle/Fire/Data/test.csv',
    #                                          impute=True, imputeDataDir='/home/jj/code/Kaggle/Fire/intermediateOutput', imputeStrategy='median',
    #                                          fieldsToUse=columns_train)
    # pred = clf.predict(x_test)
    # pandas.DataFrame({'id': ids_pred, 'target': pred}).\
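
The normalized_weighted_gini scorer used above comes from the project's own evaluation module. For reference, a sketch of the formulation commonly shared for the Fire Peril competition, rewritten in plain numpy; the project's implementation may differ in details.

import numpy as np

def weighted_gini_sketch(actual, pred, weight):
    order = np.argsort(pred)[::-1]                    # highest predictions first
    actual, weight = actual[order], weight[order]
    cum_weight = np.cumsum(weight) / weight.sum()     # cumulative weight share
    cum_loss = np.cumsum(actual * weight) / np.sum(actual * weight)
    # signed area between the weighted Lorenz curve and the weight diagonal
    return np.sum(cum_loss[1:] * cum_weight[:-1]) - np.sum(cum_loss[:-1] * cum_weight[1:])

def normalized_weighted_gini_sketch(actual, pred, weight):
    # normalize by the Gini of a perfect ranking (predicting the actuals themselves)
    return weighted_gini_sketch(actual, pred, weight) / weighted_gini_sketch(actual, actual, weight)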
Code example #5
File: classify.py Project: jennyyuejin/Kaggle
from multiprocessing import cpu_count

from sklearn.linear_model import Ridge
from sklearn.svm import SVR
from sklearn.cross_validation import KFold
from sklearn.decomposition import PCA
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier

from Kaggle.utilities import plot_histogram, plot_feature_importances, jjcross_val_score
from globalVars import *
from evaluation import normalized_weighted_gini
from utilities import process_data
from correlations import *



x_train, y_regress, _, columns_train, weights, y_class = \
    process_data('/home/jj/code/Kaggle/Fire/Data/train.csv',
                 impute=True, imputeDataDir='/home/jj/code/Kaggle/Fire/intermediateOutput', imputeStrategy='median',
                 fieldsToUse=FIELDS_CLASS_GBC_TOP100[:5])

# print '==================== feature importances =================='
# plot_feature_importances(x_train, y_class, columns_train, numTopFeatures=0.95, numEstimators=50, num_jobs=11)

print '==================== CV =================='
# clf = GradientBoostingClassifier(learning_rate=0.1, loss='deviance')
clf = RandomForestClassifier(n_estimators=50, n_jobs=cpu_count()-2)
# clf.fit(x_train, y_class)
jjcross_val_score(clf, x_train, y_class, roc_auc_score,
                  KFold(len(y_class), n_folds=5, shuffle=True, random_state=0),
                  weights=weights)#, n_jobs=1)
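
For comparison, the same five-fold CV with stock scikit-learn's cross_val_score (pre-0.18 API, matching the imports above). Note that cross_val_score has no per-sample weights argument, so the `weights` that jjcross_val_score accepts is dropped here.

from sklearn.cross_validation import cross_val_score

cv_scores = cross_val_score(clf, x_train, y_class, scoring='roc_auc',
                            cv=KFold(len(y_class), n_folds=5, shuffle=True, random_state=0))
print 'mean AUC:', cv_scores.mean()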
Code example #6
File: helpers.py Project: jennyyuejin/Kaggle
from sklearn.metrics import mean_absolute_error

from Kaggle.utilities import jjcross_val_score


def quick_score(clf, X, y, cv=5, n_jobs=20):
    """ returns the mean of the cross-validated MAE scores of an estimator
    """

    return jjcross_val_score(clf, X, y, mean_absolute_error, cv, n_jobs=n_jobs).mean()
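
A hypothetical usage sketch comparing two regressors by their mean CV MAE. It assumes jjcross_val_score also accepts an integer fold count, since this helper passes cv straight through rather than converting it first as example #1 does; the data is illustrative only.

import numpy as np
from sklearn.linear_model import Ridge
from sklearn.svm import SVR

X = np.random.rand(100, 5)                 # toy regression data, illustrative only
y = np.random.rand(100)

print 'Ridge MAE:', quick_score(Ridge(alpha=1.0), X, y, cv=5, n_jobs=1)
print 'SVR MAE:', quick_score(SVR(), X, y, cv=5, n_jobs=1)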