def recursiveFeatureElimination():
    """Plot, per point of interest, the cross-validated RFE score curve.

    For each POI, fits a recursive-feature-elimination cross-validation
    (RFECV) wrapper around a linear SVR on that POI's feature matrix,
    prints the optimal feature count, and draws one subplot of
    CV score vs. number of features.  Blocks on plt.show() at the end.
    """
    with DB() as db:
        POIs = getPointsOfInterest()
        # Roughly-square subplot grid with enough cells for every POI.
        numRows = int(math.sqrt(len(POIs)))
        numCols = numRows + 1
        plt.figure()
        plt.subplots_adjust(left=None, bottom=None, right=None, top=None,
                            wspace=0.5, hspace=0.5)
        fignum = 1
        for POI in POIs:
            x, y = loadData(db, POI['LAT'], POI['LONG'], generateAllFeatures)
            x, y = np.array(x), np.array(y)

            # Create the RFE object and compute a cross-validated score.
            svr = SVR(kernel="linear")
            # BUGFIX: SVR is a *regressor*, so the original
            # scoring='accuracy' (a classification metric) and
            # cv=StratifiedKFold(y, 2) (stratification needs discrete class
            # labels; y is a continuous pickup count) were invalid.
            # Use a plain 2-fold split and a regression scorer instead.
            # NOTE(review): 'mean_squared_error' is the pre-0.18
            # scikit-learn name; on >=0.18 use 'neg_mean_squared_error'.
            rfecv = RFECV(estimator=svr, step=1, cv=2,
                          scoring='mean_squared_error')
            rfecv.fit(x, y)
            print("Optimal number of features : %d" % rfecv.n_features_)

            # Plot number of features VS. cross-validation scores.
            plt.subplot(numRows, numCols, fignum)
            plt.title(POI['NAME'])
            plt.xlabel("Number of features selected")
            plt.ylabel("Cross validation score")
            plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
            fignum += 1
        plt.show()
def plot(generateX, xLabel='x', yLabel='Taxi Pickups', includeFunc=None): with DB() as db: POIs = getPointsOfInterest() numRows, numCols = int(math.sqrt(len(POIs))), int(math.sqrt(len(POIs))) + 1 # for hour in xrange(24): plt.figure() plt.subplots_adjust(left=None, bottom=None, right=None, top=None, wspace=0.5, hspace=0.5) fignum = 1 for POI in POIs: print 'POI', POI x, y = loadData(db, POI['LAT'], POI['LONG'], generateX, includeFunc=includeFunc) plt.subplot(numRows, numCols, fignum) plt.scatter(x, y) plt.title(POI['NAME']) plt.xlabel(xLabel) plt.ylabel(yLabel) fignum += 1 plt.show()
def featureSelection(): with DB() as db: POIs = getPointsOfInterest() numRows, numCols = int(math.sqrt(len(POIs))), int(math.sqrt(len(POIs))) + 1 # for hour in xrange(24): plt.figure() plt.subplots_adjust(left=None, bottom=None, right=None, top=None, wspace=0.5, hspace=0.5) fignum = 1 for POI in POIs: print POI x, y = loadData(db, POI['LAT'], POI['LONG'], generateAllFeaturesExceptWeather) x, y = np.array(x), np.array(y) ############################################################################### width = 0.6 x_indices = np.arange(x.shape[-1]) ############################################################################### # Univariate feature selection with F-test for feature scoring # We use the default selection function: the 10% most significant features selector = SelectPercentile(f_regression, percentile=10) selector.fit(x, y) scores = -np.log10(selector.pvalues_) # scores /= scores.max() plt.subplot(numRows, numCols, fignum) plt.bar(x_indices-(width/2), scores, width=width, color='g') plt.title(POI['NAME']) plt.xlabel('Feature number') plt.ylabel('Univariate score ($-Log(p_{value})$)') plt.xticks(x_indices) plt.axis('tight') plt.legend(loc='upper right') fignum += 1 plt.show()
elif prediction < 0: numNegatives += 1 finalPredictions.append(finalPrediction) return zip(inputVectors.keys(), finalPredictions), numNegatives if __name__ == '__main__': test_dataset_filename = TEST_DATASET_FINAL_FILENAME if FINAL else TEST_DATASET_INITIAL_FILENAME testDataset = loadTestDataset(test_dataset_filename) predictions = [] start = time.clock() numNegatives = 0 with DB() as db: for POI in getPointsOfInterest(): print 'POI', POI pipeline = fitPipeline(db, POI['LAT'], POI['LONG'], GENERATE_PIPELINE) POIPredictions, POINegatives = predict(db, pipeline, POI['LAT'], POI['LONG'], testDataset) predictions.extend(POIPredictions) numNegatives += POINegatives print 'Predicted a negative number of taxi pickups %i times' % numNegatives print 'All predictions took %s seconds' % (time.clock() - start) print 'Writing output' idList = [False] * len(testDataset) outputList = [] for locID, prediction in predictions: outputList.append((locID, '%i %i' % (locID, prediction))) idList[locID] = True