def findPlayerFeatures(year,seasonStatsYear,careerData,nonRookiesTrain,
                       rookiesTrain,nonRookiesScaler,rookiesScaler):
    """Collect (scaled) model features for every player in one season.

    Each row of ``seasonStatsYear`` is routed by the row label of the target
    season inside that player's career table:

    - label <= 1: the player is handled as a rookie (<= 1 year of NBA
      experience) and features come from ``getFeatures.getRookieFeatures``.
    - label > 1: the player is handled as a non-rookie (> 1 year of NBA
      experience) and features come from ``getFeatures.getNonRookieFeatures``.

    Parameters
    ----------
    year : int
        Ending year of the season; the season label looked up in the career
        tables is ``'<year-1>-<last two digits of year>'`` (e.g. ``'2009-10'``).
    seasonStatsYear : pandas.DataFrame
        One row per player; the ``'URL'`` and ``'Player'`` columns are read.
    careerData : mapping
        Maps a player's URL to a DataFrame with ``'Season'`` and ``'3PA'``
        columns.  NOTE(review): the code compares index labels with integers,
        so an integer row index is assumed - confirm against the loader.
    nonRookiesTrain : mapping
        ``nonRookiesTrain['X']`` is a 2-D array; its column means replace NaN
        non-rookie features.
    rookiesTrain : object
        Unused here; kept for interface compatibility.
    nonRookiesScaler, rookiesScaler : objects with ``transform``
        Applied to the raw feature vectors before they are stored.

    Returns
    -------
    (nonRookies, rookies) : tuple of pandas.DataFrame
        Both have columns ``['URL', 'Player', 'Features', '3PA']``.
        ``'Features'`` holds either the scaled features or one of the marker
        strings ``'No data available'`` / ``'Low-Volume 3-Point Shooter'``
        (in which case ``'3PA'`` is NaN).

    Raises
    ------
    IndexError
        If a player's career table has no row for the requested season.
    """
    columns = ['URL','Player','Features','3PA']
    nonRookies = pd.DataFrame(columns=columns)
    rookies = pd.DataFrame(columns=columns)

    # The season label does not depend on the player, so build it once.
    season = str(year - 1) + '-' + str(year)[-2:]

    for _, row in seasonStatsYear.iterrows():
        url = row['URL']
        player = row['Player']
        career = careerData[url]
        seasonIndex = career[career['Season'] == season].index[0]

        if seasonIndex <= 1:
            rookieFeatures = getFeatures.getRookieFeatures(url)
            if isinstance(rookieFeatures, int):
                # Integer results are status codes rather than features.
                # Codes other than 1 and 2 add no row at all.
                if rookieFeatures == 1:
                    rookies.loc[len(rookies)] = [url, player, 'No data available', np.nan]
                elif rookieFeatures == 2:
                    rookies.loc[len(rookies)] = [url, player, 'Low-Volume 3-Point Shooter', np.nan]
            else:
                scaledRookieFeats = rookiesScaler.transform(rookieFeatures)
                rookies.loc[len(rookies)] = [url, player, scaledRookieFeats, rookieFeatures[2]]
            continue

        # `.loc` slices by label and includes the end label, which is what the
        # removed `.ix` indexer did on an integer index.
        prior3PA = np.sum(career.loc[:seasonIndex-1]['3PA'])
        if prior3PA/(seasonIndex-1) > 10:
            feat = getFeatures.getNonRookieFeatures(careerData, row, seasonIndex)
            # Impute missing features with the training-set mean of that column.
            for i, value in enumerate(feat):
                if np.isnan(value):
                    feat[i] = np.mean(nonRookiesTrain['X'][:,i])
            scaledFeat = nonRookiesScaler.transform(feat)
            nonRookies.loc[len(nonRookies)] = [url, player, scaledFeat, prior3PA]
        else:
            nonRookies.loc[len(nonRookies)] = [url, player, 'Low-Volume 3-Point Shooter', np.nan]

    return (nonRookies, rookies)
def _errorMetrics(actual, predicted):
    """Return the MSE and MAE between two arrays, both scaled by 100."""
    return {'MSE': metrics.mean_squared_error(actual*100, predicted*100),
            'MAE': metrics.mean_absolute_error(actual*100, predicted*100)}


def getCrossVal(allData,careerData):
    """Validate several 3P% prediction techniques for veterans.

    Each of the 2010-2014 seasons is used in turn as the test set, with every
    season from 2000 up to (but excluding) the test year as training data.
    Three regressors (k-nearest neighbours, SVR, random forest) are compared
    with three baselines: ``career``, ``lastYear`` and ``leagueAverage``.

    Parameters
    ----------
    allData : mapping
        ``allData[season]['X']`` is a 2-D feature array and
        ``allData[season]['y']`` the matching targets, for seasons 2000-2014.
        NOTE(review): targets are joined with ``+``, which concatenates only
        if they are Python lists - confirm they are not arrays.
    careerData : mapping
        Maps a player's URL to a DataFrame with ``'Season'``, ``'3PA'`` and
        ``'3P%'`` columns (integer row index assumed - TODO confirm).

    Returns
    -------
    (res, preds) : tuple of dict
        ``res[year][method]`` is ``{'MSE': ..., 'MAE': ...}`` computed on
        values multiplied by 100.  ``preds[year]`` holds the raw predictions
        per method plus ``'actual'`` and ``'3PA'``.

    Notes
    -----
    NOTE(review): ``grid_search`` and ``scoring='mean_squared_error'`` come
    from an older scikit-learn API; they are left untouched here because the
    file's imports are outside this block.
    The ``career`` / ``lastYear`` baselines are rebuilt from the online season
    stats and are presumably aligned row-for-row with ``allData[year]`` -
    verify that both use the same player filter.
    """
    methods = ('knn','svm','rf','career','leagueAverage','lastYear')
    res = {}
    preds = {}
    for year in range(2010,2015):
        print(year)
        try:
            seasonStats = getData.seasonStatsOnline(year)
        except Exception:
            print("Webpage didn't read properly, waiting 30 seconds then trying again")
            time.sleep(30)
            # Actually retry: without this the loop would continue with an
            # undefined or previous-year `seasonStats`.
            seasonStats = getData.seasonStatsOnline(year)

        # Training set: all seasons before the test year.
        X = np.vstack([allData[s]['X'] for s in range(2000,year)])
        y = allData[2000]['y']
        for s in range(2001,year):
            y = y + allData[s]['y']
        train = {'X': X,'y': np.array(y)}
        test = {'X': allData[year]['X'],'y': np.array(allData[year]['y'])}

        scaler = preprocessing.StandardScaler()
        scaledTrain = {'X': scaler.fit_transform(train['X']),'y': train['y']}
        scaledTest = {'X': scaler.transform(test['X']),'y': test['y']}

        knnGrid = grid_search.GridSearchCV(neighbors.KNeighborsRegressor(),
                                           param_grid={'n_neighbors':[100],'leaf_size':[1]},
                                           scoring='mean_squared_error')
        svmGrid = grid_search.GridSearchCV(svm.SVR(),
                                           param_grid={'C':[.15],'gamma':[.015],'epsilon':[.05]},
                                           scoring='mean_squared_error')
        rfGrid = grid_search.GridSearchCV(ensemble.RandomForestRegressor(),
                                          param_grid={'n_estimators':[500],'min_samples_split':[125]},
                                          scoring='mean_squared_error')
        grids = (knnGrid, svmGrid, rfGrid)
        for grid in grids:
            grid.fit(scaledTrain['X'],scaledTrain['y'])
        for grid in grids:
            print(grid.best_estimator_)
        knnPreds = knnGrid.predict(scaledTest['X'])
        svmPreds = svmGrid.predict(scaledTest['X'])
        rfPreds = rfGrid.predict(scaledTest['X'])

        # Baseline: mean of the first (unscaled) training feature, repeated
        # for every test sample.
        leagueAverage = np.array([np.mean(train['X'][:,0])]*len(test['y']))

        career = []
        lastYear = []
        ThreePA = []
        season = str(year - 1) + '-' + str(year)[-2:]
        for _, row in seasonStats.iterrows():
            playerCareer = careerData[row['URL']]
            seasonIndex = playerCareer[playerCareer['Season'] == season].index[0]
            if seasonIndex <= 1:
                # Rookies are not part of the veteran evaluation.
                continue
            rowData = getFeatures.getNonRookieFeatures(careerData,row,seasonIndex)
            # `.loc` is label-based and end-inclusive, matching what the
            # removed `.ix` indexer did on an integer index.
            prior3PA = np.sum(playerCareer.loc[:seasonIndex-1]['3PA'])
            if (prior3PA/(seasonIndex-1) > 10
                    and not np.isnan(playerCareer.loc[seasonIndex]['3P%'])):
                career.append(rowData[0])
                lastYear.append(rowData[1])
                ThreePA.append(prior3PA)
        career = np.array(career)
        lastYear = np.array(lastYear)
        ThreePA = np.array(ThreePA)

        preds[year] = {'knn':knnPreds,'svm':svmPreds,'rf':rfPreds,
                       'career':career,'lastYear':lastYear,'leagueAverage':leagueAverage,
                       'actual':scaledTest['y'],'3PA':ThreePA}
        res[year] = {}
        for name in methods:
            res[year][name] = _errorMetrics(scaledTest['y'], preds[year][name])
        print(writeResToPandas({year:res[year]},'nonRookies'))
    return (res,preds)