Example #1
0
def findPlayerFeatures(year,seasonStatsYear,careerData,nonRookiesTrain,
        rookiesTrain,nonRookiesScaler,rookiesScaler):
    """
    Build the (scaled) feature row of every player in one season's stats table.

    A player is classified by the index label of the row in his career table
    whose 'Season' equals the target season (e.g. '2013-14' for year=2014):
        -rookies have a season index <= 1
        -nonRookies have a season index > 1

    Parameters:
        year: integer calendar year in which the target season ends.
        seasonStatsYear: DataFrame with at least 'URL' and 'Player' columns,
            one row per player to process.
        careerData: mapping URL -> career DataFrame with 'Season' and '3PA'
            columns and an integer index.
        nonRookiesTrain: mapping whose 'X' entry is a 2-D array; the mean of
            column i replaces a NaN in feature i of a non-rookie.
        rookiesTrain: not used here; kept so existing callers keep working.
        nonRookiesScaler, rookiesScaler: objects exposing transform(), applied
            to the raw feature vectors.

    Returns:
        (nonRookies, rookies): two DataFrames with columns
        ['URL','Player','Features','3PA'].  'Features' holds either the scaled
        feature vector or a status string ('No data available' /
        'Low-Volume 3-Point Shooter'); '3PA' is NaN whenever 'Features' is a
        status string.

    Raises:
        IndexError: if a player's career table has no row for the season.
    """
    columns = ['URL','Player','Features','3PA']
    nonRookies = pd.DataFrame(columns=columns)
    rookies = pd.DataFrame(columns=columns)
    # Integer codes returned by getRookieFeatures instead of a feature vector.
    rookieStatus = {1: 'No data available', 2: 'Low-Volume 3-Point Shooter'}
    # The season label is identical for every player, so build it once.
    season = str(year-1)+'-'+str(year)[-2:]

    for _,row in seasonStatsYear.iterrows():
        url = row['URL']
        player = row['Player']
        playerCareer = careerData[url]
        seasonIndex = playerCareer[playerCareer['Season']==season].index[0]

        if seasonIndex <= 1:
            rookieFeatures = getFeatures.getRookieFeatures(url)
            if isinstance(rookieFeatures,int):
                # Any integer code other than 1 or 2 adds no row at all.
                if rookieFeatures in rookieStatus:
                    rookies.loc[len(rookies)] = [url,player,rookieStatus[rookieFeatures],np.nan]
            else:
                scaledRookieFeats = rookiesScaler.transform(rookieFeatures)
                # The third raw rookie feature is what gets stored under '3PA'.
                rookies.loc[len(rookies)] = [url,player,scaledRookieFeats,rookieFeatures[2]]
            continue

        # .loc slices by label and includes the end label, so this sums the
        # rows labelled 0..seasonIndex-1 (the former .ix behaviour on an
        # integer index; .ix no longer exists in current pandas).
        priorAttempts = np.sum(playerCareer.loc[:seasonIndex-1,'3PA'])
        if priorAttempts/(seasonIndex-1) > 10:
            feat = getFeatures.getNonRookieFeatures(careerData,row,seasonIndex)
            # Impute missing features with the training-set column mean.
            for i,value in enumerate(feat):
                if np.isnan(value): feat[i] = np.mean(nonRookiesTrain['X'][:,i])
            scaledFeat = nonRookiesScaler.transform(feat)
            nonRookies.loc[len(nonRookies)] = [url,player,scaledFeat,priorAttempts]
        else:
            nonRookies.loc[len(nonRookies)] = [url,player,'Low-Volume 3-Point Shooter',np.nan]
    return(nonRookies,rookies)
Example #2
0
def getCrossVal(allData,careerData):
    """Validate several 3P% prediction techniques for veterans, using each of
    the 2010-2014 seasons in turn as the test set.

    For every test year the models are trained on all seasons from 2000 up to
    (but not including) that year.  Three learned models (k-nearest
    neighbours, SVR, random forest) are compared with three baselines:
    'career' (first non-rookie feature), 'lastYear' (second non-rookie
    feature) and 'leagueAverage' (mean of the first training feature column).

    Parameters:
        allData: mapping year -> {'X': 2-D feature array, 'y': targets}.
        careerData: mapping URL -> career DataFrame with 'Season', '3PA' and
            '3P%' columns and an integer index.

    Returns:
        (res, preds): res[year][technique] is {'MSE': ..., 'MAE': ...}
        computed on values multiplied by 100; preds[year] holds the raw
        predictions of every technique plus 'actual' and '3PA'.
    """
    res = {}
    preds = {}
    for year in range(2010,2015):
        print(year)
        try:
            seasonStats = getData.seasonStatsOnline(year)
        except Exception:
            print("Webpage didn't read properly, waiting 30 seconds then trying again")
            time.sleep(30)
            # Actually retry: without this, seasonStats would be undefined or
            # still hold the previous year's table.  A second failure
            # propagates to the caller.
            seasonStats = getData.seasonStatsOnline(year)

        # Training set: every season from 2000 through year-1.
        X = np.vstack([allData[i]['X'] for i in range(2000,year)])
        # NOTE(review): assumes each 'y' is a list, so + concatenates - confirm.
        y = allData[2000]['y']
        for i in range(2001,year):
            y = y+allData[i]['y']
        train = {'X': X,'y':np.array(y)}
        test = {'X':allData[year]['X'],'y':np.array(allData[year]['y'])}

        # Only the features are standardised; the scaler is fit on train only.
        scaler = preprocessing.StandardScaler()
        scaledTrain = {'X':scaler.fit_transform(train['X']),'y':train['y']}
        scaledTest = {'X':scaler.transform(test['X']),'y':test['y']}

        # Every grid holds a single value per parameter, so each search just
        # fits one fixed configuration.
        # NOTE(review): the grid_search module and the 'mean_squared_error'
        # scorer name look like older scikit-learn API - confirm against the
        # installed version.
        knnGrid = grid_search.GridSearchCV(neighbors.KNeighborsRegressor(),
            param_grid={'n_neighbors':[100],'leaf_size':[1]},
            scoring='mean_squared_error')
        svmGrid = grid_search.GridSearchCV(svm.SVR(),
            param_grid={'C':[.15],'gamma':[.015],'epsilon':[.05]},
            scoring='mean_squared_error')
        rfGrid = grid_search.GridSearchCV(ensemble.RandomForestRegressor(),
            param_grid={'n_estimators':[500],'min_samples_split':[125]},
            scoring='mean_squared_error')
        knnGrid.fit(scaledTrain['X'],scaledTrain['y'])
        svmGrid.fit(scaledTrain['X'],scaledTrain['y'])
        rfGrid.fit(scaledTrain['X'],scaledTrain['y'])
        print(knnGrid.best_estimator_)
        print(svmGrid.best_estimator_)
        print(rfGrid.best_estimator_)
        knnPreds = knnGrid.predict(scaledTest['X'])
        svmPreds = svmGrid.predict(scaledTest['X'])
        rfPreds = rfGrid.predict(scaledTest['X'])

        career = []; lastYear = []; ThreePA = []
        leagueAverage = np.array([np.mean(train['X'][:,0])]*len(test['y']))

        # The season label is identical for every player, so build it once.
        season = str(year-1)+'-'+str(year)[-2:]
        # NOTE(review): the metrics below assume the players kept by this
        # loop line up one-to-one with the rows of allData[year] - confirm.
        for _,row in seasonStats.iterrows():
            playerCareer = careerData[row['URL']]
            seasonIndex = playerCareer[playerCareer['Season']==season].index[0]
            if seasonIndex <= 1:
                continue  # rookies are not part of the veteran evaluation
            rowData = getFeatures.getNonRookieFeatures(careerData,row,seasonIndex)
            # .loc slices by label and includes the end label (the former .ix
            # behaviour on an integer index; .ix no longer exists in pandas).
            priorAttempts = np.sum(playerCareer.loc[:seasonIndex-1,'3PA'])
            if priorAttempts/(seasonIndex-1) > 10 and not np.isnan(playerCareer.loc[seasonIndex,'3P%']):
                career.append(rowData[0])
                lastYear.append(rowData[1])
                ThreePA.append(priorAttempts)

        career = np.array(career); lastYear=np.array(lastYear);ThreePA=np.array(ThreePA)
        preds[year] = {'knn':knnPreds,'svm':svmPreds,'rf':rfPreds,
                        'career':career,'lastYear':lastYear,'leagueAverage':leagueAverage,
                        'actual':scaledTest['y'],'3PA':ThreePA}

        # Errors are reported on values multiplied by 100.
        actualPct = scaledTest['y']*100
        techniques = [('knn',knnPreds),('svm',svmPreds),('rf',rfPreds),
                      ('career',career),('leagueAverage',leagueAverage),
                      ('lastYear',lastYear)]
        yearRes = {}
        for name,values in techniques:
            yearRes[name] = {'MSE':metrics.mean_squared_error(actualPct,values*100),
                             'MAE':metrics.mean_absolute_error(actualPct,values*100)}
        res[year] = yearRes
        print(writeResToPandas({year:res[year]},'nonRookies'))
    return(res,preds)