def extracredit1():
    """Train a KNN learner (k=27) on the classification data set, query it over
    a dense grid on [-1, 1) x [-1, 1), and save a 3-D scatter of the
    predictions to 3d_million_class_actual.pdf."""
    # NOTE(review): relies on module-level imports for csv, np, p (pyplot),
    # p3 (mplot3d), PdfPages, KNNLearner and train — confirm against file top.
    reader = csv.reader(open("data-classification-prob1.csv", 'rU'), delimiter=',')
    data = []
    for row in reader:
        data.append([float(i) for i in row])
    d = np.array(data)
    learner = KNNLearner(27)
    learner = train(d[0:1000, :], learner)
    # `d` is reused: from here on it holds the query grid, not the csv data.
    d = []
    step = 0.01
    for x1 in np.arange(-1, 1, step):
        for x2 in np.arange(-1, 1, step):
            d.append([x1, x2])
    d = np.array(d)
    sample = []
    for j, i in enumerate(d):
        if (j % 1000 == 0):
            print j  # progress indicator (200 x 200 = 40,000 grid points)
        sample.append(learner.query(i))
    fig = p.figure()
    ax = p3.Axes3D(fig)
    ax.scatter(d[:, 0], d[:, 1], sample, c='r', marker='o')
    ax.set_xlabel('X1')
    ax.set_ylabel('X2')
    ax.set_zlabel('Y')
    pp = PdfPages('3d_million_class_actual.pdf')
    pp.savefig()
    pp.close()
    p.show()
def test(filename):
    """Compare KNNLearner and RandomForestLearner on *filename* for k = 1..100.

    Reads train/test splits via readCsvData, computes out-of-sample RMS error
    and correlation coefficient for each learner at every k, and writes two
    comparison plots (RMSComparison.pdf, CorrComparison.pdf).
    """
    Xtrain, Ytrain, Xtest, Ytest = readCsvData(filename)
    Y = Ytest[:, 0]  # actual out-of-sample targets
    # (removed dead allocations: knnTrainTime, knnQueryTime, bestY, sampleY,
    # inSampleRmsErr, rfTrainTime and rfQueryTime were never read)
    kArray = np.zeros([100])
    knnCorrelation = np.zeros([100])
    knnRmsError = np.zeros([100])
    rfCorrelation = np.zeros([100])
    rfRmsError = np.zeros([100])
    #KNN Learner and RF Learner, k vary from 1 to 100
    for k in range(1, 101):
        kArray[k - 1] = k
        #KNN
        learner = KNNLearner(k)
        learner.addEvidence(Xtrain, Ytrain)
        knnTest = learner.query(Xtest)
        knnY = knnTest[:, -1]  # predicted Y is the last column
        #RMS Error(out-of-sample)
        knnRmsError[k - 1] = calRMS(knnY, Y)
        #Correlation Coefficient
        knnCorrelation[k - 1] = calCorrcoef(knnY, Y)
        #RF
        learner = RandomForestLearner(k)
        learner.addEvidence(Xtrain, Ytrain)
        rfTest = learner.query(Xtest)
        rfY = rfTest[:, -1]
        #RMS Error(out-of-sample)
        rfRmsError[k - 1] = calRMS(rfY, Y)
        #Correlation Coefficient
        rfCorrelation[k - 1] = calCorrcoef(rfY, Y)
    # (the duplicate `linename` assignment was removed; both plots share it)
    linename = ['KNN Learner', 'Random Forest Learner']
    createComparisonPlot('K value', 'RMS Error', kArray, knnRmsError,
                         rfRmsError, 'RMSComparison.pdf', linename)
    createComparisonPlot('K value', 'Correlation', kArray, knnCorrelation,
                         rfCorrelation, 'CorrComparison.pdf', linename)
def main():
    """Load the two sample data sets, fit a 3-NN learner on the
    classification data, and query it with the first training point."""
    #read the data
    class_data = read_data("data-classification-prob.csv")
    ripple_data = read_data("data-ripple-prob.csv")

    #convert to numpy array
    class_arr = np.array(class_data, dtype=np.float)
    ripple_arr = np.array(ripple_data, dtype=np.float)

    #split the data into x and y (first two columns are features, third is target)
    class_x, class_y = class_arr[:, :2], class_arr[:, 2]
    ripple_x, ripple_y = ripple_arr[:, :2], ripple_arr[:, 2]

    #create the knn learner, train it, and issue a single query
    learner = KNNLearner(k=3)
    learner.addEvidence(class_x, class_y)
    Y = learner.query(class_x[0, :])
def wrap_up(symbol, start_date, end_date, out=False,
            test_start='2010-01-01', test_end='2010-12-31'):
    """Train a 3-NN learner on *symbol*'s price-derived features.

    Training data covers [start_date, end_date]; test data covers
    [test_start, test_end] (previously hard-coded; now parameters with the
    same defaults, so existing callers are unaffected).

    Returns the training frame with a 'predY' prediction column when *out*
    is False, otherwise the test frame with its predictions.
    """
    dates = pd.date_range(start_date, end_date)
    data = get_data(symbol, dates, addSPY=False)
    data = data.dropna()
    vector_n = (5, 5, 5)  # window sizes consumed by create_train_data
    df_data = create_train_data(data, vector_n)
    trainX = np.array(df_data[['X_1', 'X_2', 'X_3']])
    trainY = np.array(df_data[['Y']])

    dates_test = pd.date_range(test_start, test_end)
    test_data = get_data(symbol, dates_test, addSPY=False)
    test_data = test_data.dropna()
    df_test_data = create_train_data(test_data, vector_n)
    testX = np.array(df_test_data[['X_1', 'X_2', 'X_3']])
    testY = np.array(df_test_data[['Y']])
    testY = testY[:, 0]

    learner = KNNLearner(3)
    # learner = LinRegLearner()
    learner.addEvidence(trainX, trainY)  # train it
    # evaluate in sample vs out of sample (idiomatic form of `out == False`)
    if not out:
        predY = learner.query(trainX)  # get the predictions
        df_data['predY'] = predY
        return df_data
    else:
        predY = learner.query(testX)
        df_test_data['predY'] = predY
        return df_test_data
def extracredit2(): reader = csv.reader(open("data-ripple-prob.csv", 'rU'), delimiter=',') data = [] for row in reader: data.append([float(i) for i in row]) d = np.array(data) learner = KNNLearner(27) learner = train(d[0:600, :], learner) insample = test(d[0:600, :], learner) outsample = test(d[600:1000, :], learner) print "RMS Error for KNN" print(rmserror(outsample, d[600:1000, 2])) print "RMS Error for RKNN" print(rmserror(insample, d[0:600, 2])) print i print "Error" print(rmserror(insample, d[0:600, 2])) print(rmserror(outsample, d[600:1000, 2])) print(np.corrcoef(outsample, d[600:1000, 2])[0][1]) plt.scatter( d[600:1000, 2], outsample, ) fig = p.figure() ax = p3.Axes3D(fig) ax.scatter(d[600:1000, 0], d[600:1000, 1], d[600:1000, 2], c='r', marker='o', label="Actual") ax.scatter(d[600:1000, 0], d[600:1000, 1], outsample, c='b', marker='o', label="Predicted") ax.set_xlabel('X1') ax.set_ylabel('X2') ax.set_zlabel('Y') red_patch = mpatches.Patch(color='red', label='Actual') blue_patch = mpatches.Patch(color='blue', label='Predicted') plt.legend(handles=[red_patch, blue_patch]) pp = PdfPages('3d_ripple.pdf') pp.savefig() pp.close() p.show()
def knnlearner_test(filenames):
    # Sweep k = 1..100 for a KNN learner on each csv in *filenames*:
    # first 600 rows train, remainder test; collects out-of-sample RMSE and
    # correlation per k.
    # NOTE(review): the series lists are re-initialised for every filename but
    # there is only one return, so results for at most one file survive —
    # confirm whether callers ever pass more than one filename.
    for filename in filenames:
        rmse_series=[]
        covariance_series=[]
        for i in xrange(1,101):
            knnlearner=KNNLearner(k=i)
            get_set = knnlearner.getflatcsv(filename)
            # 600/400 row split of the raw data, then a feature/target split.
            get_set_60pr,get_set_40pr = numpy.split(get_set,[600])
            (X,Y) = numpy.split(get_set,[2],axis=1)
            (XTrain,XTest) = numpy.split(X,[600])
            (Ytrain,YTest) = numpy.split(Y,[600])
            knnlearner.build_hash(get_set_60pr)
            knnlearner.addEvidence(XTrain,Ytrain)
            query_X = numpy.array(XTest)
            (XY_return,Y_return) = knnlearner.query(XTest)
            Y_Test = np.squeeze(np.asarray(YTest))
            Y_Return = numpy.array(Y_return)
            rmse_series.append(get_rmse(Y_Test,Y_Return))
            covariance_series.append(get_correlation(Y_Test,Y_Return))
    # Returns (out-of-sample RMSE per k, correlation per k).
    return (rmse_series,covariance_series)
def main(): isBagging = True file1 = "data-classification-prob.csv" file2 = "data-ripple-prob.csv" knn_rms1 = np.zeros((101, 1)) knn_corrcoef1 = np.zeros((101, 1)) knn_rms2 = np.zeros((101, 1)) knn_corrcoef2 = np.zeros((101, 1)) randomForest_rms1 = np.zeros((101, 1)) randomForest_corrcoef1 = np.zeros((101, 1)) randomForest_rms2 = np.zeros((101, 1)) randomForest_corrcoef2 = np.zeros((101, 1)) randomForestBagging_corrcoef1 = np.zeros((101, 1)) randomForestBagging_corrcoef2 = np.zeros((101, 1)) randomForestBagging_rms1 = np.zeros((101, 1)) randomForestBagging_rms2 = np.zeros((101, 1)) k = np.arange(1, 101) for i in range(1, 3): if i == 1: print 'Starting with dataset 1....' file = file1 else: print 'Starting with dataset 2....' file = file2 data = getflatcsv(file) XTrain = data[:(len(data) * 0.6), :(len(data[0]) - 1)] XTest = data[(len(data) * 0.6):, :(len(data[0]) - 1)] YTrain = data[:(len(data) * 0.6), -1] YTest = data[(len(data) * 0.6):, -1] if i == 1: YTest1 = YTest else: YTest2 = YTest for j in range(1, 3): if j == 1: print 'Calling KNNLearner for dataset %d...' % i for count in range(1, 101): knnLearner = KNNLearner(k=count) train_t = knnLearner.addEvidence(XTrain, YTrain) Y, test_t = knnLearner.query(XTest) if i == 1: knn_rms1[count, 0], knn_corrcoef1[count, 0] = getstats(Y, YTest) else: knn_rms2[count, 0], knn_corrcoef2[count, 0] = getstats(Y, YTest) elif j == 2: print 'Calling RandomForestLearner for dataset %d...' 
% i for count in range(1, 101): if isBagging: randomForestLearner = RandomForestLearner( k=count, isBagging=True) randomForestLearner.addEvidence(XTrain, YTrain) Y = randomForestLearner.query(XTest) if i == 1: randomForestBagging_rms1[ count, 0], randomForestBagging_corrcoef1[ count, 0] = getstats(Y, YTest) print count, randomForestBagging_corrcoef1[count, 0] else: randomForestBagging_rms2[ count, 0], randomForestBagging_corrcoef2[ count, 0] = getstats(Y, YTest) print count, randomForestBagging_corrcoef2[count, 0] randomForestLearner = RandomForestLearner(k=count, isBagging=False) randomForestLearner.addEvidence(XTrain, YTrain) Y = randomForestLearner.query(XTest) if i == 1: randomForest_rms1[count, 0], randomForest_corrcoef1[ count, 0] = getstats(Y, YTest) print count, randomForest_corrcoef1[count, 0] else: randomForest_rms2[count, 0], randomForest_corrcoef2[ count, 0] = getstats(Y, YTest) print count, randomForest_corrcoef2[count, 0] if isBagging: plt.ylabel('Random Forest:Corelation Coefficient - dataset 1') plt.xlabel('K') plt.legend(['Without Bagging', 'With Bagging']) plt.plot(k, randomForest_corrcoef1[1:], k, randomForestBagging_corrcoef1[1:]) plt.savefig('bagging_corr1.png') plt.close() plt.ylabel('Random Forest:Corelation Coefficient - dataset 2') plt.xlabel('K') plt.legend(['Without Bagging', 'With Bagging']) plt.plot(k, randomForest_corrcoef2[1:], k, randomForestBagging_corrcoef2[1:]) plt.savefig('bagging_corr2.png') plt.close() plt.ylabel('Corelation Coefficient - dataset 1') plt.xlabel('K') plt.legend(['KNN', 'Random Forest']) plt.plot(k, knn_corrcoef1[1:], k, randomForest_corrcoef1[1:]) plt.savefig('corr1.png') plt.close() plt.ylabel('Corelation Coefficient - dataset 2') plt.xlabel('K') plt.legend(['KNN', 'Random Forest']) plt.plot(k, knn_corrcoef2[1:], k, randomForest_corrcoef2[1:]) plt.savefig('corr2.png') plt.close() plt.ylabel('RMS - dataset 1') plt.xlabel('K') plt.legend(['KNN', 'Random Forest']) plt.plot(k, knn_rms1[1:], k, 
randomForest_rms1[1:]) plt.savefig('Compare_RMS1.png') plt.close() plt.ylabel('RMS - dataset 2') plt.xlabel('K') plt.legend(['KNN', 'Random Forest']) plt.plot(k, knn_rms2[1:], k, randomForest_rms2[1:]) plt.savefig('Compare_RMS2.png') plt.close()
# NOTE(review): top-level script fragment — rmsArr_test, rmsArr_k, Xtrain,
# Ytrain, Xtest, Ytest, d, train, test and rmserror must be defined earlier
# in the full script; confirm against the surrounding file.
number = 101
kArr = [i for i in range(1, number)]
# Sweep k = 1..100: train a RandomForest and a KNN learner at each k and
# record their out-of-sample RMS errors for the comparison plot below.
for k in kArr:
    print "Training a new learner"
    learner = RandomForest(k)
    learner.addEvidence(Xtrain, Ytrain)
    Y_ltest = []
    Y_ltrain = []
    for x in Xtest:
        Y_ltest.append(learner.query(x))
    # for x in Xtrain:
    #     Y_ltrain.append(learner.query(x))
    #rmsArr_train.append(math.sqrt(np.mean((np.array(Y_ltrain) - np.array(Ytrain)) ** 2)))
    rmsArr_test.append(
        math.sqrt(np.mean((np.array(Y_ltest) - np.array(Ytest))**2)))
    learner = KNNLearner(k)
    learner = train(d[0:600, :], learner)
    # `d[600:1000:]` selects the same rows as `d[600:1000, :]`.
    outsample = test(d[600:1000:], learner)
    rmsArr_k.append(rmserror(outsample, d[600:1000, 2]))
# Plot both RMS-vs-k curves and save them as a pdf.
pp = PdfPages('Data-Classification-K-RandomForest.pdf')
plt.clf()
plt.ylabel('RMS')
plt.xlabel('k')
isE = plt.plot(range(1, number), rmsArr_k, 'g-', label='KNN')
osE = plt.plot(range(1, number), rmsArr_test, 'r-', label='Random Forest')
red_patch = mpatches.Patch(color='red', label='Random Forest')
green_patch = mpatches.Patch(color='green', label='KNN')
plt.legend(handles=[red_patch, green_patch])
plt.title("Data-Classification-K-RandomForest")
pp.savefig()
def test(filename):
    """Sweep a KNN learner over k = 1..50 on *filename*, recording per-k
    average train/query time, correlation and in/out-of-sample RMS error;
    then benchmark a linear-regression learner on the same split and print
    its stats.  Saves several line plots plus a predicted-vs-actual scatter
    at a hand-picked best k (27 for the classification set, 3 for ripple).
    """
    Xtrain, Ytrain, Xtest, Ytest = readCsvData(filename)
    Y = Ytest[:, 0]          # actual out-of-sample targets
    sampleY = Ytrain[:, 0]   # actual in-sample targets
    bestY = np.zeros([Ytest.shape[0]])
    trainTime = np.zeros([50])
    queryTime = np.zeros([50])
    correlation = np.zeros([50])
    rmsError = np.zeros([50])
    kArray = np.zeros([50])
    inSampleRmsErr = np.zeros([50])
    #KNN Learner, k vary from 1 to 50
    for k in range(1, 51):
        kArray[k - 1] = k
        learner = KNNLearner(k)
        knnTrainStime = time.time()
        learner.addEvidence(Xtrain, Ytrain)
        knnTrainEtime = time.time()
        knnQueryStime = time.time()
        knnTest = learner.query(Xtest)
        knnQueryEtime = time.time()
        knnY = knnTest[:, -1]  # predicted Y is the last column
        #Avg Train Time per Instance
        avgKnnTrainTime = (knnTrainEtime - knnTrainStime) / Xtrain.shape[0]
        #Avg Query Time per Instance
        avgKnnQueryTime = (knnQueryEtime - knnQueryStime) / Xtest.shape[0]
        #RMS Error(out-of-sample)
        knnRMS = calRMS(knnY, Y)
        #In-sample RMS Error (query the training set itself)
        inSampleTest = learner.query(Xtrain)
        inSampleY = inSampleTest[:, -1]
        insampleRMS = calRMS(inSampleY, sampleY)
        #Correlation Coefficient
        knnCorr = calCorrcoef(knnY, Y)
        trainTime[k - 1] = avgKnnTrainTime
        queryTime[k - 1] = avgKnnQueryTime
        correlation[k - 1] = knnCorr
        rmsError[k - 1] = knnRMS
        inSampleRmsErr[k - 1] = insampleRMS
        # Remember the predictions at the hand-picked best k for the scatter.
        if ((filename == 'data-classification-prob.csv') and (k == 27)):
            print k
            bestY = knnY
        elif ((filename == 'data-ripple-prob.csv') and (k == 3)):
            print k
            bestY = knnY
    createLinePlot('K value', 'Avg Train Time/Instance', kArray, trainTime,
                   'traintime.pdf', 'Average Train Time')
    createLinePlot('K value', 'Avg Query Time/Instance', kArray, queryTime,
                   'querytime.pdf', 'Average Query Time')
    createLinePlot('K value', 'Correlation', kArray, correlation,
                   'correlation.pdf',
                   'Correlation Coefficient of Predicted Y versus Actual Y')
    createLinePlot('K value', 'RMS Error', kArray, rmsError, 'rms.pdf',
                   'RMS Error between Predicted Y versus Actual Y')
    linename = ['Out-of-Sample Data', 'In-Sample Data']
    createComparisonPlot('K value', 'RMS Error', kArray, rmsError,
                         inSampleRmsErr, 'comparison.pdf', linename)
    createScatterPlot('Predicted Y', 'Actual Y', bestY, Y, 'bestK.pdf')
    #Linear Regression Learner
    learner = LinRegLearner()
    linTrainStime = time.time()
    learner.addEvidence(Xtrain, Ytrain)
    linTrainEtime = time.time()
    linQueryStime = time.time()
    linTest = learner.query(Xtest)
    linQueryEtime = time.time()
    linY = linTest[:, -1]
    #Avg Train Time per Instance
    avgLinTrainTime = (linTrainEtime - linTrainStime) / Xtrain.shape[0]
    #Avg Query Time per Instance
    avgLinQueryTime = (linQueryEtime - linQueryStime) / Xtest.shape[0]
    print avgLinTrainTime, avgLinQueryTime
    #RMS Error
    linRMS = calRMS(linY, Y)
    print linRMS
    #Correlation Coefficient
    linCorr = calCorrcoef(linY, Y)
    print linCorr
def extracredit3(): print "Entering function" outsampleError = [] outsampleError_R = [] insampleError = [] insampleError_R = [] reader = csv.reader(open("data-classification-prob1.csv", 'rU'), delimiter=',') data = [] for row in reader: data.append([float(i) for i in row]) d = np.array(data) number = 51 for i in range(1, number): learner = KNNLearner(i) learner = train(d[0:600, :], learner) insample = test(d[0:600, :], learner) outsample = test(d[600:1000, :], learner) insampleError.append(rmserror(insample, d[0:600, 2])) outsampleError.append(rmserror(outsample, d[600:1000, 2])) learner = RKNNLearner(i) learner = train(d[0:600, :], learner) insample = test(d[0:600, :], learner) outsample = test(d[600:1000, :], learner) insampleError_R.append(rmserror(insample, d[0:600, 2])) outsampleError_R.append(rmserror(outsample, d[600:1000, 2])) k = 0 for x, y in zip(outsample, d[600:1000, 2]): if (x == y): k += 1 accuracy_plot.append(k / 400.0) #print i ,"has accuracy", k/400.0 plt.plot(range(1, number), accuracy_plot) plt.show() print len(insampleError) pp = PdfPages('RKNN_vs_CKNN.pdf') isE = plt.plot(range(1, number), insampleError, 'g-', label='Insample error for KNN') osE = plt.plot(range(1, number), outsampleError, 'r-', label='Outsample Error for KNN') red_patch = mpatches.Patch(color='red', label='outsampleError for KNN') green_patch = mpatches.Patch(color='green', label='insampleError for KNN') isE_R = plt.plot(range(1, number), insampleError_R, 'b-', label='Insample error for classsifier KNN ') osE_R = plt.plot(range(1, number), outsampleError_R, 'y-', label='Outsample Error for classsifier KNN') blue_patch = mpatches.Patch(color='blue', label='insampleError for classsifier KNN') yellow_patch = mpatches.Patch(color='yellow', label='outsampleError for classsifier KNN') plt.legend(handles=[red_patch, green_patch, blue_patch, yellow_patch]) plt.ylabel("Error") plt.xlabel("K") plt.title("Vanilla KNN vs Classifier KNN") #plt.show() pp.savefig() pp.close() Best = 
min(enumerate(outsampleError_R), key=itemgetter(1))[0] print "Best ", Best print np.min(outsampleError_R) plt.show()
def test(filename):
    """Benchmark a KNN learner over k = 1..50 on *filename*: per-k average
    train/query time, correlation, and both in- and out-of-sample RMS error.
    Also benchmarks a linear-regression learner on the same split and prints
    its timing/error stats.  Saves line plots, an RMS comparison plot, and a
    predicted-vs-actual scatter at a hand-picked best k (27 for the
    classification file, 3 for the ripple file).
    """
    Xtrain, Ytrain, Xtest, Ytest = readCsvData(filename)
    Y = Ytest[:,0]          # actual out-of-sample targets
    sampleY = Ytrain[:,0]   # actual in-sample targets
    bestY = np.zeros([Ytest.shape[0]])
    trainTime = np.zeros([50])
    queryTime = np.zeros([50])
    correlation = np.zeros([50])
    rmsError = np.zeros([50])
    kArray = np.zeros([50])
    inSampleRmsErr = np.zeros([50])
    #KNN Learner, k vary from 1 to 50
    for k in range(1, 51):
        kArray[k-1] = k
        learner = KNNLearner(k)
        knnTrainStime = time.time()
        learner.addEvidence(Xtrain, Ytrain)
        knnTrainEtime = time.time()
        knnQueryStime = time.time()
        knnTest = learner.query(Xtest)
        knnQueryEtime = time.time()
        knnY = knnTest[:,-1]  # predicted Y is the last column
        #Avg Train Time per Instance
        avgKnnTrainTime = (knnTrainEtime - knnTrainStime)/Xtrain.shape[0]
        #Avg Query Time per Instance
        avgKnnQueryTime = (knnQueryEtime - knnQueryStime)/Xtest.shape[0]
        #RMS Error(out-of-sample)
        knnRMS = calRMS(knnY, Y)
        #In-sample RMS Error (query the training set itself)
        inSampleTest = learner.query(Xtrain)
        inSampleY = inSampleTest[:,-1]
        insampleRMS = calRMS(inSampleY, sampleY)
        #Correlation Coefficient
        knnCorr = calCorrcoef(knnY, Y)
        trainTime[k-1] = avgKnnTrainTime
        queryTime[k-1] = avgKnnQueryTime
        correlation[k-1] = knnCorr
        rmsError[k-1] = knnRMS
        inSampleRmsErr[k-1] = insampleRMS
        # Remember the predictions at the hand-picked best k for the scatter.
        if((filename == 'data-classification-prob.csv') and (k == 27)):
            print k
            bestY = knnY
        elif((filename == 'data-ripple-prob.csv') and (k == 3)):
            print k
            bestY = knnY
    createLinePlot('K value', 'Avg Train Time/Instance', kArray, trainTime,
                   'traintime.pdf', 'Average Train Time')
    createLinePlot('K value', 'Avg Query Time/Instance', kArray, queryTime,
                   'querytime.pdf', 'Average Query Time')
    createLinePlot('K value', 'Correlation', kArray, correlation,
                   'correlation.pdf',
                   'Correlation Coefficient of Predicted Y versus Actual Y')
    createLinePlot('K value', 'RMS Error', kArray, rmsError, 'rms.pdf',
                   'RMS Error between Predicted Y versus Actual Y')
    linename = ['Out-of-Sample Data', 'In-Sample Data']
    createComparisonPlot('K value', 'RMS Error', kArray, rmsError,
                         inSampleRmsErr, 'comparison.pdf', linename)
    createScatterPlot('Predicted Y', 'Actual Y', bestY, Y, 'bestK.pdf')
    #Linear Regression Learner
    learner = LinRegLearner()
    linTrainStime = time.time()
    learner.addEvidence(Xtrain, Ytrain)
    linTrainEtime = time.time()
    linQueryStime = time.time()
    linTest = learner.query(Xtest)
    linQueryEtime = time.time()
    linY = linTest[:,-1]
    #Avg Train Time per Instance
    avgLinTrainTime = (linTrainEtime - linTrainStime)/Xtrain.shape[0]
    #Avg Query Time per Instance
    avgLinQueryTime = (linQueryEtime - linQueryStime)/Xtest.shape[0]
    print avgLinTrainTime, avgLinQueryTime
    #RMS Error
    linRMS = calRMS(linY, Y)
    print linRMS
    #Correlation Coefficient
    linCorr = calCorrcoef(linY, Y)
    print linCorr
def main(): isBagging = True file1 = "data-classification-prob.csv" file2 = "data-ripple-prob.csv" knn_rms1 = np.zeros((101,1)) knn_corrcoef1 = np.zeros((101,1)) knn_rms2 = np.zeros((101,1)) knn_corrcoef2 = np.zeros((101,1)) randomForest_rms1 = np.zeros((101,1)) randomForest_corrcoef1 = np.zeros((101,1)) randomForest_rms2 = np.zeros((101,1)) randomForest_corrcoef2 = np.zeros((101,1)) randomForestBagging_corrcoef1 = np.zeros((101,1)) randomForestBagging_corrcoef2 = np.zeros((101,1)) randomForestBagging_rms1 = np.zeros((101,1)) randomForestBagging_rms2 = np.zeros((101,1)) k = np.arange(1,101) for i in range(1,3): if i == 1: print 'Starting with dataset 1....' file = file1 else: print 'Starting with dataset 2....' file = file2 data = getflatcsv(file) XTrain = data[:(len(data)*0.6),:(len(data[0])-1)] XTest = data[(len(data)*0.6):,:(len(data[0])-1)] YTrain = data[:(len(data)*0.6),-1] YTest = data[(len(data)*0.6):,-1] if i == 1: YTest1 = YTest else: YTest2 = YTest for j in range(1,3): if j == 1: print 'Calling KNNLearner for dataset %d...' % i for count in range(1,101): knnLearner = KNNLearner(k=count) train_t = knnLearner.addEvidence(XTrain, YTrain) Y, test_t = knnLearner.query(XTest) if i == 1: knn_rms1[count,0], knn_corrcoef1[count,0] = getstats(Y, YTest) else: knn_rms2[count,0], knn_corrcoef2[count,0] = getstats(Y, YTest) elif j == 2: print 'Calling RandomForestLearner for dataset %d...' 
% i for count in range(1,101): if isBagging: randomForestLearner = RandomForestLearner(k=count, isBagging = True) randomForestLearner.addEvidence(XTrain, YTrain) Y = randomForestLearner.query(XTest) if i == 1: randomForestBagging_rms1[count,0], randomForestBagging_corrcoef1[count,0] = getstats(Y, YTest) print count, randomForestBagging_corrcoef1[count,0] else: randomForestBagging_rms2[count,0], randomForestBagging_corrcoef2[count,0] = getstats(Y, YTest) print count, randomForestBagging_corrcoef2[count,0] randomForestLearner = RandomForestLearner(k=count, isBagging = False) randomForestLearner.addEvidence(XTrain, YTrain) Y = randomForestLearner.query(XTest) if i == 1: randomForest_rms1[count,0], randomForest_corrcoef1[count,0] = getstats(Y, YTest) print count, randomForest_corrcoef1[count,0] else: randomForest_rms2[count,0], randomForest_corrcoef2[count,0] = getstats(Y, YTest) print count, randomForest_corrcoef2[count,0] if isBagging: plt.ylabel('Random Forest:Corelation Coefficient - dataset 1') plt.xlabel('K') plt.legend(['Without Bagging','With Bagging']) plt.plot(k, randomForest_corrcoef1[1:], k, randomForestBagging_corrcoef1[1:]); plt.savefig('bagging_corr1.png') plt.close() plt.ylabel('Random Forest:Corelation Coefficient - dataset 2') plt.xlabel('K') plt.legend(['Without Bagging','With Bagging']) plt.plot(k, randomForest_corrcoef2[1:], k, randomForestBagging_corrcoef2[1:]); plt.savefig('bagging_corr2.png') plt.close() plt.ylabel('Corelation Coefficient - dataset 1') plt.xlabel('K') plt.legend(['KNN','Random Forest']) plt.plot(k, knn_corrcoef1[1:], k, randomForest_corrcoef1[1:]); plt.savefig('corr1.png') plt.close() plt.ylabel('Corelation Coefficient - dataset 2') plt.xlabel('K') plt.legend(['KNN','Random Forest']) plt.plot(k, knn_corrcoef2[1:], k, randomForest_corrcoef2[1:]); plt.savefig('corr2.png') plt.close() plt.ylabel('RMS - dataset 1') plt.xlabel('K') plt.legend(['KNN','Random Forest']) plt.plot(k, knn_rms1[1:], k, randomForest_rms1[1:]) 
plt.savefig('Compare_RMS1.png') plt.close() plt.ylabel('RMS - dataset 2') plt.xlabel('K') plt.legend(['KNN','Random Forest']) plt.plot(k, knn_rms2[1:], k, randomForest_rms2[1:]) plt.savefig('Compare_RMS2.png') plt.close()
def knnlearner_test(filenames):
    # For each csv: sweep KNN k = 1..50 (600 train / 400 test rows), timing
    # addEvidence and query, and recording out-of-sample RMSE, in-sample RMSE
    # and correlation; then re-run at the best (lowest-RMSE) k to produce a
    # predicted-vs-actual scatter plot, and save all graphs as pdfs.
    # NOTE(review): Timer, get_rmse, get_correlation, scatter, get_graph and
    # get_graph_two_plots are defined elsewhere in this project — confirm.
    for filename in filenames:
        train_time =[]
        query_time =[]
        rmse_series=[]
        rmse_series_insample=[]
        covariance_series=[]
        for i in xrange(1,51):
            knnlearner=KNNLearner(k=i)
            get_set = knnlearner.getflatcsv(filename)
            # 600/400 row split of the raw data, then a feature/target split.
            get_set_60pr,get_set_40pr = numpy.split(get_set,[600])
            (X,Y) = numpy.split(get_set,[2],axis=1)
            (XTrain,XTest) = numpy.split(X,[600])
            (Ytrain,YTest) = numpy.split(Y,[600])
            knnlearner.build_hash(get_set_60pr)
            with Timer() as t:
                knnlearner.addEvidence(XTrain,Ytrain)
            train_time.append(t.interval)
            query_X = numpy.array(XTest)
            with Timer() as t:
                (XY_return,Y_return) = knnlearner.query(XTest)
            query_time.append(t.interval)
            Y_Test = np.squeeze(np.asarray(YTest))
            Y_Return = numpy.array(Y_return)
            rmse_series.append(get_rmse(Y_Test,Y_Return))
            # In-sample predictions for the train/test error comparison plot.
            (XY_return_insample,Y_return_insample) = knnlearner.query(XTrain)
            Y_Train = np.squeeze(np.asarray(Ytrain))
            Y_return_insample = numpy.array(Y_return_insample)
            rmse_series_insample.append(get_rmse(Y_Train,Y_return_insample))
            covariance_series.append(get_correlation(Y_Test,Y_Return))
        # Best k = 1-based position of the minimum out-of-sample RMSE.
        min_rmse = min(float(i) for i in rmse_series)
        k_index = rmse_series.index(min_rmse)
        print "best k = ",k_index+1," for ",filename
        # Re-train at the best k to generate the scatter plot data.
        knnlearner_scatter = KNNLearner(k=k_index+1)
        get_set = knnlearner_scatter.getflatcsv(filename)
        get_set_60pr,get_set_40pr = numpy.split(get_set,[600])
        (X,Y) = numpy.split(get_set,[2],axis=1)
        (XTrain,XTest) = numpy.split(X,[600])
        (Ytrain,YTest) = numpy.split(Y,[600])
        knnlearner_scatter.build_hash(get_set_60pr)
        knnlearner_scatter.addEvidence(XTrain,Ytrain)
        (XY_return,Y_return) = knnlearner_scatter.query(XTest)
        Y_Test = np.squeeze(np.asarray(YTest))
        Y_Return = numpy.array(Y_return)
        scatter(Y_Return,Y_Test,"scatterplot("+filename+")(for bestk).pdf")
        get_graph(numpy.arange(1,51),train_time,"K","Train time in seconds","KNN_Train_time("+filename+").pdf",4)
        get_graph(numpy.arange(1,51),query_time,"K","Query time in seconds","KNN_Query_time("+filename+").pdf",4)
        get_graph(numpy.arange(1,51),rmse_series,"K","RMSE Error","RMSEvsk("+filename+").pdf")
        get_graph(numpy.arange(1,51),covariance_series,"K","Covariance Coefficeint","Covariance Coeff vs K("+filename+").pdf")
        get_graph_two_plots(numpy.arange(1,51),rmse_series_insample,rmse_series,"K","RMSE","insample_error_vs_outsample_error("+filename+").pdf")