def getTrainData(careerData=None,seasonStats=None):
    """
    Return the machine learning formatted data for every season from 2014
    back to 2000.

    Inputs:
        careerData - dictionary, optional (default = None)
            The saved career data. When it is None or empty it is built by
            getData.getCareerStats from the 2015 season stats.
        seasonStats - dictionary, optional (default = None)
            Season stats keyed by year. Any year that is needed but not
            present is downloaded with getData.seasonStatsOnline. The
            dictionary passed in is not modified.
    Outputs:
        nonRookieData - dictionary
            Keyed by year. Each value is the first item returned by the
            getMLDataYear module for that year.
        rookieData - dictionary
            Keyed by year. Each value is the second item returned by the
            getMLDataYear module for that year.
        careerData - dictionary
            The updated career data logs
    """
    # Work on a copy so on-demand downloads never touch the caller's dict.
    seasonStats = dict(seasonStats) if seasonStats else {}

    def statsForYear(year):
        # Previously only 2015 was ever populated by default, so looking up
        # any other season raised KeyError; fetch missing seasons lazily.
        if year not in seasonStats:
            seasonStats[year] = getData.seasonStatsOnline(year)
        return seasonStats[year]

    if not careerData:
        careerData = getData.getCareerStats(statsForYear(2015))

    nonRookieData = {}
    rookieData = {}
    # Newest season first; careerData is threaded through each call because
    # getMLDataYear hands back an updated copy.
    for year in range(2014, 1999, -1):
        nonRookieData[year], rookieData[year], careerData, _ = getMLDataYear(
            year, careerData, statsForYear(year))
    return (nonRookieData, rookieData, careerData)
def _fetchSeasonStatsWithRetry(year, waitSeconds=30):
    """Download one season's stats, retrying once after a pause on failure."""
    try:
        return getData.seasonStatsOnline(year)
    except Exception:
        print("Webpage didn't read properly, waiting 30 seconds then trying again")
        time.sleep(waitSeconds)
        # A second failure propagates instead of leaving the caller with
        # missing (or stale, previous-year) season stats.
        return getData.seasonStatsOnline(year)


def _errorMetrics(actual, predicted):
    """Return the MSE and MAE of two arrays after multiplying both by 100."""
    return {'MSE': metrics.mean_squared_error(actual*100, predicted*100),
            'MAE': metrics.mean_absolute_error(actual*100, predicted*100)}


def getCrossVal(allData,careerData):
    """
    This module validates different techniques for 3P% prediction for
    veterans using the 2010-2014 seasons as test sets.

    For each test year a KNN, an SVR and a random forest regressor are
    fitted on standardized features from every season from 2000 up to the
    year before the test year, and compared against three baselines
    ('career', 'lastYear' and 'leagueAverage').

    Inputs:
        allData - dictionary
            Keyed by year (2000 through 2014 are read). Each value holds a
            feature matrix under 'X' and targets under 'y'. The 'y' values
            are joined with +, so they are presumably lists - TODO confirm.
        careerData - dictionary
            Keyed by the 'URL' value of a season stats row. Each value is a
            table with 'Season', '3PA' and '3P%' columns.
    Outputs:
        res - dictionary
            Keyed by year, then by technique name. Each entry is
            {'MSE': ..., 'MAE': ...} computed on values multiplied by 100.
        preds - dictionary
            Keyed by year. Holds each technique's predictions, the
            baselines, the 'actual' targets and the '3PA' totals.
    """
    # NOTE(review): grid_search, scoring='mean_squared_error' and the .ix
    # indexer only exist in older scikit-learn / pandas releases - confirm
    # the pinned versions before upgrading either library.
    res = {}
    preds = {}
    for year in range(2010,2015):
        print(year)
        seasonStats = _fetchSeasonStatsWithRetry(year)

        # Training set: every season from 2000 up to the test year (exclusive).
        X = np.vstack([allData[i]['X'] for i in range(2000,year)])
        y = allData[2000]['y']
        for i in range(2001,year):
            y = y+allData[i]['y']
        train = {'X': X,'y':np.array(y)}
        test = {'X':allData[year]['X'],'y':np.array(allData[year]['y'])}

        # The scaler is fitted on the training features only and then
        # applied unchanged to the test features.
        scaler = preprocessing.StandardScaler()
        scaledTrain = {'X':scaler.fit_transform(train['X']),'y':train['y']}
        scaledTest = {'X':scaler.transform(test['X']),'y':test['y']}

        knnGrid = grid_search.GridSearchCV(neighbors.KNeighborsRegressor(),
                                           param_grid={'n_neighbors':[100],'leaf_size':[1]},
                                           scoring='mean_squared_error')
        svmGrid = grid_search.GridSearchCV(svm.SVR(),
                                           param_grid={'C':[.15],'gamma':[.015],'epsilon':[.05]},
                                           scoring='mean_squared_error')
        rfGrid = grid_search.GridSearchCV(ensemble.RandomForestRegressor(),
                                          param_grid={'n_estimators':[500],'min_samples_split':[125]},
                                          scoring='mean_squared_error')
        knnGrid.fit(scaledTrain['X'],scaledTrain['y'])
        svmGrid.fit(scaledTrain['X'],scaledTrain['y'])
        rfGrid.fit(scaledTrain['X'],scaledTrain['y'])
        print(knnGrid.best_estimator_)
        print(svmGrid.best_estimator_)
        print(rfGrid.best_estimator_)
        knnPreds = knnGrid.predict(scaledTest['X'])
        svmPreds = svmGrid.predict(scaledTest['X'])
        rfPreds = rfGrid.predict(scaledTest['X'])

        # Baseline: mean of column 0 of the unscaled training features,
        # repeated once per test sample.
        leagueAverage = np.array([np.mean(train['X'][:,0])]*len(test['y']))

        # Season label such as '2009-10' for year 2010; it is the same for
        # every player, so it is built once per year.
        season = str(year-1)+'-'+str(year)[-2:]

        career = []
        lastYear = []
        ThreePA = []
        for _, row in seasonStats.iterrows():
            playerCareer = careerData[row['URL']]
            seasonIndex = playerCareer[playerCareer['Season']==season].index[0]
            if seasonIndex <= 1:
                # Rows whose season sits at index 0 or 1 are skipped.
                continue
            rowData = getFeatures.getNonRookieFeatures(careerData,row,seasonIndex)
            prior3PA = np.sum(playerCareer.ix[:seasonIndex-1]['3PA'])
            if prior3PA/(seasonIndex-1) > 10 and not np.isnan(playerCareer.ix[seasonIndex]['3P%']):
                career.append(rowData[0])
                lastYear.append(rowData[1])
                ThreePA.append(prior3PA)
        # NOTE(review): the metrics below need career/lastYear to line up
        # one-to-one with allData[year]['y'] - confirm this filter matches
        # the one used when allData was built.
        career = np.array(career)
        lastYear = np.array(lastYear)
        ThreePA = np.array(ThreePA)

        actual = scaledTest['y']
        preds[year] = {'knn':knnPreds,'svm':svmPreds,'rf':rfPreds,
                       'career':career,'lastYear':lastYear,'leagueAverage':leagueAverage,
                       'actual':actual,'3PA':ThreePA}
        res[year] = {'knn':_errorMetrics(actual,knnPreds),
                     'svm':_errorMetrics(actual,svmPreds),
                     'rf':_errorMetrics(actual,rfPreds),
                     'career':_errorMetrics(actual,career),
                     'leagueAverage':_errorMetrics(actual,leagueAverage),
                     'lastYear':_errorMetrics(actual,lastYear)
                     }
        print(writeResToPandas({year:res[year]},'nonRookies'))
    return (res,preds)