Beispiel #1
0
def getTrainData(careerData=None,seasonStats=None):
    """
    This module returns the machine learning formatted data for each
    season from 2000 through 2014.

    Inputs:
        careerData - dictionary, optional (default = None)

            The saved career data.  When it is None (or empty) it is
            rebuilt with getData.getCareerStats from the 2015 season stats.

        seasonStats - dictionary, optional (default = None)

            Season stats keyed by year.  Any year that is needed but not
            present is downloaded with getData.seasonStatsOnline.  The
            dictionary passed in is never modified.

    Outputs:
        nonRookieData - dictionary

            All machine learning formatted data saved by year. The keys
            are the year.  The values are the first item returned by the
            getMLDataYear module for that year.

        rookieData - dictionary

            Keyed by year like nonRookieData.  The values are the second
            item returned by the getMLDataYear module for that year.

        careerData - dictionary

            The updated career data logs
    """
    nonRookieData = {}
    rookieData = {}
    # Work on a shallow copy so on-demand downloads are cached locally
    # without writing new keys into the caller's dictionary.
    seasonStats = dict(seasonStats) if seasonStats else {}

    def statsFor(year):
        # Previously a missing year raised KeyError (always the case with
        # the default seasonStats=None); fetch it on demand instead.
        if year not in seasonStats:
            seasonStats[year] = getData.seasonStatsOnline(year)
        return seasonStats[year]

    if not careerData:
        careerData = getData.getCareerStats(statsFor(2015))
    # Seasons are processed newest first (2014 down to 2000); careerData is
    # threaded through every call and replaced by the value it returns.
    for year in range(2014,1999,-1):
        # The fourth item returned by getMLDataYear is not used here.
        nonRookieData[year],rookieData[year],careerData,_ = getMLDataYear(year,careerData,statsFor(year))
    return(nonRookieData,rookieData,careerData)
def _fetchSeasonStats(year):
    """Download the season stats for `year`, retrying once after 30 seconds."""
    try:
        return getData.seasonStatsOnline(year)
    except Exception:
        print("Webpage didn't read properly, waiting 30 seconds then trying again")
        time.sleep(30)
        # A second failure is allowed to propagate: continuing without the
        # stats would reuse the previous year's table or hit a NameError.
        return getData.seasonStatsOnline(year)


def _errorMetrics(actual,predicted):
    """Return the MSE and MAE of `predicted` against `actual`, both scaled by 100."""
    return {'MSE':metrics.mean_squared_error(actual*100,predicted*100),
            'MAE':metrics.mean_absolute_error(actual*100,predicted*100)}


def getCrossVal(allData,careerData):
    """This module validates different techniques for 3P% prediction for veterans
    using 2010-2014 seasons as test sets.

    For every test year the models (k-nearest neighbours, SVR and random
    forest) are fit on all seasons from 2000 up to, but excluding, that year
    and compared against three baselines: career value, last-year value and
    the training-set mean of feature column 0.

    Inputs:
        allData - dictionary

            Keyed by year; each value is a dictionary with a feature
            matrix under 'X' and the targets under 'y'.

        careerData - dictionary

            Keyed by the 'URL' value of each season-stats row; each value
            is a DataFrame with 'Season', '3PA' and '3P%' columns.

    Outputs:
        res - dictionary

            Keyed by year, then by method name ('knn', 'svm', 'rf',
            'career', 'leagueAverage', 'lastYear'); each entry holds the
            'MSE' and 'MAE' computed on values multiplied by 100.

        preds - dictionary

            Keyed by year; the predictions of every method plus the
            'actual' targets and the prior '3PA' totals.
    """
    res = {}
    preds = {}
    for year in range(2010,2015):
        print(year)
        seasonStats = _fetchSeasonStats(year)

        # Stack every training season once instead of growing the arrays
        # inside a loop.  np.concatenate yields the same values as joining
        # the per-year 'y' lists and converting the result to an array.
        trainYears = range(2000,year)
        train = {'X':np.vstack([allData[i]['X'] for i in trainYears]),
                 'y':np.concatenate([np.asarray(allData[i]['y']) for i in trainYears])}
        test = {'X':allData[year]['X'],'y':np.array(allData[year]['y'])}

        # Only the features are standardised; the targets are left as-is.
        scaler = preprocessing.StandardScaler()
        scaledTrain = {'X':scaler.fit_transform(train['X']),'y':train['y']}
        scaledTest = {'X':scaler.transform(test['X']),'y':test['y']}

        # NOTE(review): the grid_search module and the 'mean_squared_error'
        # scoring name look like the pre-0.18 scikit-learn API - confirm the
        # pinned version before upgrading.
        grids = [
            ('knn',grid_search.GridSearchCV(neighbors.KNeighborsRegressor(),
                param_grid={'n_neighbors':[100],'leaf_size':[1]},
                scoring='mean_squared_error')),
            ('svm',grid_search.GridSearchCV(svm.SVR(),
                param_grid={'C':[.15],'gamma':[.015],'epsilon':[.05]},
                scoring='mean_squared_error')),
            ('rf',grid_search.GridSearchCV(ensemble.RandomForestRegressor(),
                param_grid={'n_estimators':[500],'min_samples_split':[125]},
                scoring='mean_squared_error')),
        ]
        # Fit, report and predict in separate passes (knn, svm, rf order).
        for _,grid in grids:
            grid.fit(scaledTrain['X'],scaledTrain['y'])
        for _,grid in grids:
            print(grid.best_estimator_)
        modelPreds = dict((name,grid.predict(scaledTest['X'])) for name,grid in grids)

        # Constant baseline: mean of the unscaled training feature column 0
        # (presumably the career 3P% feature - TODO confirm).
        leagueAverage = np.array([np.mean(train['X'][:,0])]*len(test['y']))

        # Season label such as '2009-10' for the season ending in `year`.
        season = str(year-1)+'-'+str(year)[-2:]
        career = []; lastYear = []; ThreePA = []
        for index,row in seasonStats.iterrows():
            playerCareer = careerData[row['URL']]
            seasonIndex = playerCareer[playerCareer['Season']==season].index[0]
            if seasonIndex <= 1:
                continue
            rowData = getFeatures.getNonRookieFeatures(careerData,row,seasonIndex)
            # .loc replaces the removed .ix indexer; on an integer index both
            # are label based, so the slice still includes seasonIndex-1.
            priorAttempts = np.sum(playerCareer.loc[:seasonIndex-1]['3PA'])
            # NOTE(review): this filter presumably mirrors the one applied by
            # getMLDataYear so the baselines line up row-for-row with
            # allData[year]['y'] - confirm before changing it.
            if priorAttempts/(seasonIndex-1) > 10 and not np.isnan(playerCareer.loc[seasonIndex]['3P%']):
                career.append(rowData[0])
                lastYear.append(rowData[1])
                ThreePA.append(priorAttempts)

        career = np.array(career); lastYear = np.array(lastYear); ThreePA = np.array(ThreePA)
        actual = scaledTest['y']
        preds[year] = {'knn':modelPreds['knn'],'svm':modelPreds['svm'],'rf':modelPreds['rf'],
                        'career':career,'lastYear':lastYear,'leagueAverage':leagueAverage,
                        'actual':actual,'3PA':ThreePA}
        res[year] = {'knn':_errorMetrics(actual,modelPreds['knn']),
                     'svm':_errorMetrics(actual,modelPreds['svm']),
                     'rf':_errorMetrics(actual,modelPreds['rf']),
                     'career':_errorMetrics(actual,career),
                     'leagueAverage':_errorMetrics(actual,leagueAverage),
                     'lastYear':_errorMetrics(actual,lastYear)
                    }
        print(writeResToPandas({year:res[year]},'nonRookies'))
    return(res,preds)