Python getDataFrameの例、transform.getDataFrame Pythonの例

コード例 #1

0

ファイルを表示

def standartModel():
    """Fit a 100-tree random forest and write predictions to a CSV file.

    Rows of the frame returned by ``transform.getDataFrame()`` whose
    ``shot_made_flag`` is null are the rows to predict. The rows that
    survive ``dropna()`` are used for fitting. The second column of
    ``predict_proba`` is written next to each row's ``shot_id`` to
    ``data/resultsX1.csv``.
    """
    # Columns removed from the feature matrix (the target is among them).
    excluded = [
        'team_name', 'shot_type', 'shot_zone_area', 'combined_shot_type',
        'shot_zone_basic', 'shot_zone_range', 'matchup', 'opponent',
        'action_type', 'team_id', 'season', 'shot_made_flag'
    ]

    frame = transform.getDataFrame()

    # Take the unlabeled rows first: dropna() below would discard them.
    unlabeled = frame[frame['shot_made_flag'].isnull()]
    ids = unlabeled['shot_id']
    labeled = frame.dropna()

    targets = labeled['shot_made_flag'].values
    features = labeled.drop(excluded, axis=1).values
    # The rows to score are passed on as a DataFrame, not as an array.
    to_score = unlabeled.drop(excluded, axis=1)

    model = RandomForestClassifier(n_estimators=100).fit(features, targets)
    proba = model.predict_proba(to_score)

    out = pd.DataFrame({})
    out['shot_id'] = ids
    # Second predict_proba column; presumably P(shot made) -- confirm the
    # class order against the fitted model's classes_.
    out['shot_made_flag'] = proba[:, 1]
    out.to_csv('data/resultsX1.csv', sep=',', index=False)

コード例 #2

0

ファイルを表示

def crossValScore():
    """Cross-validate an XGBoost classifier, then write its predictions.

    The frame from ``transform.getDataFrame()`` is split into rows with a
    null ``shot_made_flag`` (to be predicted) and the rows left after
    ``dropna()`` (used for fitting). The mean of an 8-fold
    ``cross_val_score`` is printed, and the second ``predict_proba`` column
    for the rows to predict is written with their ``shot_id`` to
    ``data/resultsX1.csv``.
    """
    # Columns excluded from the feature matrix (the target is among them).
    drop_cols = [
        'team_name', 'shot_type', 'shot_zone_area', 'combined_shot_type',
        'shot_zone_basic', 'shot_zone_range', 'matchup', 'opponent',
        'action_type', 'team_id', 'season', 'shot_made_flag'
    ]

    df = transform.getDataFrame()
    # Take the unlabeled rows first: dropna() below would discard them.
    dfTest = df[df['shot_made_flag'].isnull()]
    shotSeries = dfTest['shot_id']

    df = df.dropna()

    # .values is used instead of .as_matrix(), which newer pandas removed.
    y = df['shot_made_flag'].values
    X = df.drop(drop_cols, axis=1).values
    testX = dfTest.drop(drop_cols, axis=1).values

    clfX = xgb.XGBClassifier(n_estimators=100, max_depth=6)
    clfX = clfX.fit(X, y)

    # NOTE(review): newer scikit-learn releases spell this scorer
    # "neg_log_loss"; the original name is kept -- confirm the target version.
    print(cross_val_score(clfX, X, y, scoring="log_loss", cv=8).mean())

    predicted = clfX.predict_proba(testX)

    dfPredicted = pd.DataFrame({})
    dfPredicted['shot_id'] = shotSeries
    dfPredicted['shot_made_flag'] = predicted[:, 1]
    dfPredicted.to_csv('data/resultsX1.csv', sep=',', index=False)

コード例 #3

0

ファイルを表示

def submission():
    """Fit three classifiers and write their weighted-average prediction.

    XGBoost, random-forest and gradient-boosting classifiers are fitted on
    the rows of ``transform.getDataFrame()`` that survive ``dropna()``.
    For the rows whose ``shot_made_flag`` is null, the second
    ``predict_proba`` column of each model is combined as
    ``(2 * XGBoost + 2 * GradientBoosting + RandomForest) / 5`` and written
    with ``shot_id`` to ``data/resultsXG-RF-GB-matchup2.csv``.
    """
    # Columns excluded from the feature matrix (the target is among them).
    drop_cols = [
        'game_event_id', 'lat', 'lon', 'team_name', 'shot_type', 'game_id',
        'shot_zone_area', 'combined_shot_type', 'minutes_remaining',
        'seconds_remaining', 'shot_zone_basic', 'shot_zone_range', 'matchup',
        'opponent', 'action_type', 'team_id', 'season', 'shot_made_flag'
    ]

    df = transform.getDataFrame()
    # Take the unlabeled rows first: dropna() below would discard them.
    dfTest = df[df['shot_made_flag'].isnull()]
    shotSeries = dfTest['shot_id']

    df = df.dropna()

    # .values is used throughout instead of mixing it with .as_matrix(),
    # which newer pandas removed.
    y = df['shot_made_flag'].values
    X = df.drop(drop_cols, axis=1).values
    testX = dfTest.drop(drop_cols, axis=1).values

    clfRF = RandomForestClassifier(n_estimators=100)
    clfX = xgb.XGBClassifier(n_estimators=50, max_depth=5)
    clfGB = GradientBoostingClassifier(n_estimators=80)

    clfX = clfX.fit(X, y)
    clfRF = clfRF.fit(X, y)
    clfGB = clfGB.fit(X, y)

    # Keep only the second probability column of each model.
    probaX = clfX.predict_proba(testX)[:, 1]
    probaRF = clfRF.predict_proba(testX)[:, 1]
    probaGB = clfGB.predict_proba(testX)[:, 1]

    # Weighted average: the two boosted models count double.
    blended = (2 * probaX + 2 * probaGB + probaRF) / 5

    dfPredicted = pd.DataFrame({})
    dfPredicted['shot_id'] = shotSeries
    dfPredicted['shot_made_flag'] = blended
    dfPredicted.to_csv('data/resultsXG-RF-GB-matchup2.csv',
                       sep=',',
                       index=False)

コード例 #4

0

ファイルを表示

def XGBoost():
    """Fit an XGBoost classifier on a 90/10 split and print its log loss.

    The rows of ``transform.getDataFrame()`` that survive ``dropna()`` are
    split with ``test_size=0.1`` and ``random_state=0``. The classifier is
    fitted on the training part, and ``log_loss`` of its ``predict_proba``
    output on the held-out part is printed.
    """
    df = transform.getDataFrame()
    df = df.dropna()

    # Capture the target BEFORE dropping it from the frame. Previously X and
    # y were never defined, so the split below raised NameError.
    y = df['shot_made_flag'].values
    X = df.drop([
        'team_name', 'shot_type', 'shot_zone_area', 'combined_shot_type',
        'shot_zone_basic', 'shot_zone_range', 'matchup', 'opponent',
        'action_type', 'team_id', 'season', 'shot_made_flag'
    ],
                axis=1).values

    # NOTE(review): `cross_validation` is presumably the legacy
    # sklearn.cross_validation module -- confirm against the file's imports.
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(
        X, y, test_size=0.1, random_state=0)
    clfX = xgb.XGBClassifier(n_estimators=100, max_depth=6)
    clfX = clfX.fit(X_train, y_train)
    predicted = clfX.predict_proba(X_test)
    print(log_loss(y_test, predicted))

コード例 #5

0

ファイルを表示

def splitData():
    """Return an 80/20 train/test split of the labeled data.

    The rows of ``transform.getDataFrame()`` that survive ``dropna()`` are
    converted to a feature array and a ``shot_made_flag`` target array,
    then split with ``test_size=0.2`` and ``random_state=0``.

    Returns:
        The tuple ``(X_train, X_test, y_train, y_test)`` produced by
        ``train_test_split``.
    """
    df = transform.getDataFrame()
    df = df.dropna()

    # Capture the target before it is dropped from the feature frame.
    y = df['shot_made_flag'].values
    X = df.drop([
        'team_name', 'shot_type', 'shot_zone_area', 'combined_shot_type',
        'shot_zone_basic', 'shot_zone_range', 'matchup', 'opponent',
        'action_type', 'team_id', 'season', 'shot_made_flag'
    ],
                axis=1).values

    # NOTE(review): `cross_validation` is presumably the legacy
    # sklearn.cross_validation module -- confirm against the file's imports.
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(
        X, y, test_size=0.2, random_state=0)

    return X_train, X_test, y_train, y_test

コード例 #6

0

ファイルを表示

def KfoldItems():
    """Print per-fold and mean log loss of a gradient-boosting classifier.

    The rows of ``transform.getDataFrame()`` that survive ``dropna()`` are
    evaluated with a shuffled 10-fold split. For every fold the classifier
    is refitted, and ``transform.logloss`` is computed on the second
    ``predict_proba`` column for the held-out rows. The list of fold losses
    and their mean are printed at the end.
    """
    df = transform.getDataFrame()
    df = df.dropna()

    # .values is used instead of .as_matrix(), which newer pandas removed.
    y = df['shot_made_flag'].values
    X = df.drop([
        'team_name', 'shot_type', 'game_id', 'shot_zone_area',
        'combined_shot_type', 'shot_zone_basic', 'shot_zone_range', 'matchup',
        'opponent', 'action_type', 'team_id', 'season', 'shot_made_flag'
    ],
                axis=1).values

    # NOTE(review): KFold(n, n_folds=...) iterated directly looks like the
    # legacy sklearn.cross_validation API -- confirm against the imports.
    kf = KFold(X.shape[0], n_folds=10, shuffle=True)

    clfX = GradientBoostingClassifier(n_estimators=250)

    probs = []

    for train_index, test_index in kf:
        # A single pre-formatted string prints the same text on Python 2
        # and Python 3.
        print("TRAIN: {} TEST: {}".format(train_index, test_index))
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        clfX = clfX.fit(X_train, y_train)
        prediction = clfX.predict_proba(X_test)

        print("{} {}".format(y_test.shape, type(y_test)))
        print("{} {}".format(prediction[:, 1].shape, type(prediction[:, 1])))

        loss = transform.logloss(y_test, prediction[:, 1])
        probs.append(loss)

    print(probs)
    print(np.array(probs).mean())

コード例 #7

0

ファイルを表示

def ensemble():
    """Compare three classifiers and their weighted blend over 6 folds.

    The rows of ``transform.getDataFrame()`` that survive ``dropna()`` are
    evaluated with a shuffled 6-fold split. In every fold XGBoost,
    random-forest and gradient-boosting classifiers are refitted, and
    ``transform.logloss`` is computed for each model's second
    ``predict_proba`` column and for the blend
    ``(2 * XGBoost + 2 * GradientBoosting + RandomForest) / 5``.

    Four lines are printed, each holding the list of fold losses followed
    by their mean: blend, XGBoost, random forest, gradient boosting.
    """
    df = transform.getDataFrame()
    df = df.dropna()

    # .values is used instead of .as_matrix(), which newer pandas removed.
    y = df['shot_made_flag'].values
    X = df.drop([
        'game_event_id', 'lat', 'lon', 'team_name', 'shot_type', 'game_id',
        'shot_zone_area', 'combined_shot_type', 'minutes_remaining',
        'seconds_remaining', 'shot_zone_basic', 'shot_zone_range', 'matchup',
        'opponent', 'action_type', 'team_id', 'season', 'shot_made_flag'
    ],
                axis=1).values

    # NOTE(review): KFold(n, n_folds=...) iterated directly looks like the
    # legacy sklearn.cross_validation API -- confirm against the imports.
    kf = KFold(X.shape[0], n_folds=6, shuffle=True)

    clfX = xgb.XGBClassifier(n_estimators=50, max_depth=5)
    clfRF = RandomForestClassifier(n_estimators=80)
    clfGB = GradientBoostingClassifier(n_estimators=80)

    probs = []
    probsXG = []
    probsRF = []
    probsGB = []

    for train_index, test_index in kf:
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        clfX = clfX.fit(X_train, y_train)
        clfRF = clfRF.fit(X_train, y_train)
        clfGB = clfGB.fit(X_train, y_train)

        # Keep only the second probability column of each model.
        probaX = clfX.predict_proba(X_test)[:, 1]
        probaRF = clfRF.predict_proba(X_test)[:, 1]
        probaGB = clfGB.predict_proba(X_test)[:, 1]

        # Weighted average: the two boosted models count double.
        blended = (2 * probaX + 2 * probaGB + probaRF) / 5

        probs.append(transform.logloss(y_test, blended))
        probsXG.append(transform.logloss(y_test, probaX))
        probsRF.append(transform.logloss(y_test, probaRF))
        probsGB.append(transform.logloss(y_test, probaGB))

    # A single pre-formatted string prints the same text on Python 2 and
    # Python 3.
    print("{} {}".format(probs, np.array(probs).mean()))
    print("{} {}".format(probsXG, np.array(probsXG).mean()))
    print("{} {}".format(probsRF, np.array(probsRF).mean()))
    print("{} {}".format(probsGB, np.array(probsGB).mean()))