def test_knn():
    train_data = build_RL_data()
    test_data = build_RL_data(sd=dt.datetime(2010, 1, 1), ed=dt.datetime(2011, 12, 31))
    # generate order files for several values of k
    for k in (10, 20, 30, 40, 50):
        build_orders(learner=knn.KNNLearner(k=k), filename='knn_%d.csv' % k)
def chartOne():
    # Get data
    dates = pd.date_range('2007-12-31', '2009-12-31')
    symbols = ['ML4T-399']
    prices = get_data(symbols, dates)
    df = getAllData(symbols, dates)
    # Get the x train and the y train values
    xTrain = getXTrain(df)
    yTrain = getYTrain(df)
    # Create the learner
    learner = knn.KNNLearner(k=3)
    # Train the learner
    learner.addEvidence(xTrain.values, yTrain.values)
    # Query the learner
    predicted = learner.query(xTrain.values)
    predictedDf = pd.DataFrame(data=predicted, index=prices.index, columns=['PRED'])
    dfYVals = df['YVal']
    pricesToUse = prices['ML4T-399']
    trainingToUse = pricesToUse * (1 + dfYVals)
    predictedToUse = pricesToUse * (1 + predictedDf['PRED'])  # use the Series, not the whole DataFrame, so the index aligns
    allToChart = pd.DataFrame(index=prices.index, columns=['PRICES', 'TRAINING_Y', 'PREDICTED_Y'])
    allToChart['PRICES'] = pricesToUse
    allToChart['TRAINING_Y'] = trainingToUse
    allToChart['PREDICTED_Y'] = predictedToUse
    plot_chart(allToChart, "PRICES, PREDICTED, AND TRAINING DATA FOR ML4T-399")
def applyStrategy():
    stock = "IBM"
    train_features = get_Features('2008-1-1', '2009-12-31', stock)
    learner = knn.KNNLearner(3)
    learner.addEvidence(train_features[["BB", "Momentum", "Volatility"]].values,
                        train_features["Training Y"])
    test_features = get_Features('2010-1-1', '2010-12-31', stock)
    test_features["Predicted Y"] = learner.query(
        test_features[["BB", "Momentum", "Volatility"]].values)
    test_features[["Volatility"]].plot()
    plt.show()

    longbuy = False
    shortbuy = False
    count = 0
    orders = []
    datelen = len(test_features)
    for i in range(19, datelen - 1):
        date = test_features.index[i]
        # enter a long position on a positive prediction
        if test_features.ix[i, "Predicted Y"] > 0:
            if longbuy == False and shortbuy == False:
                count += 1
                plt.axvline(test_features.index[i + 1], color='g')
                orders.append([test_features.index[i + 1], stock, "BUY", 100])
                longbuy = True
        # exit a long position on a negative prediction
        if test_features.ix[i, "Predicted Y"] < 0:
            if longbuy == True:
                count += 1
                plt.axvline(test_features.index[i + 1], color='black')
                orders.append([test_features.index[i + 1], stock, "SELL", 100])
                longbuy = False
                continue
        # enter a short position on a negative prediction
        if test_features.ix[i, "Predicted Y"] < 0:
            if longbuy == False and shortbuy == False:
                count += 1
                plt.axvline(test_features.index[i + 1], color='r')
                orders.append([test_features.index[i + 1], stock, "SELL", 100])
                shortbuy = True
        # exit a short position on a positive prediction
        if test_features.ix[i, "Predicted Y"] > 0:
            if shortbuy == True:
                count += 1
                plt.axvline(test_features.index[i + 1], color='black')
                orders.append([test_features.index[i + 1], stock, "BUY", 100])
                shortbuy = False

    # now perform analysis on open/close based on date, etc.
    ordersnp = np.array(orders)
    ordersDataFrame = pd.DataFrame(ordersnp[0:, 1:], index=ordersnp[0:, 0])
    ordersDataFrame.columns = ['Symbol', 'Order', 'Shares']
    ordersDataFrame.index.name = 'Date'
    # raw string so backslashes in the Windows path are not treated as escapes
    ordersDataFrame.to_csv(r"C:\Users\Nilav\Documents\ml_mc3_p2_IBM_test.csv")
    test_features[stock].plot(color='c')
    plt.show()
def inSampleIbmTest():
    # Get the data
    dates = pd.date_range('2007-12-31', '2009-12-31')
    symbols = ['IBM']
    prices = get_data(symbols, dates)
    df = getAllData(symbols, dates)
    # Get the xTrain and yTrain values
    xTrain = getXTrain(df)
    yTrain = getYTrain(df)
    # Create the learner
    learner = knn.KNNLearner(k=3)
    # Train the learner
    learner.addEvidence(xTrain.values, yTrain.values)
    # Query the learner ON THE IN SAMPLE DATA
    predicted = learner.query(xTrain.values)
    predictedDf = pd.DataFrame(data=predicted, index=prices.index, columns=['PRED'])
    # Call the strategy. Args to pass it are:
    # - the df with the prices data for the test year, for the plot
    # - the df with the predictions for the test year, for the strategy
    # - the symbol
    myStrategy(df, predictedDf, "IBM", "inSampleIbm.csv", "IN SAMPLE IBM")
def predict_for(input_file, num_to_predict=60):
    learner = knn.KNNLearner(verbose=False)
    with open(input_file, 'r') as inf:
        data = [convert_line(line) for line in inf.readlines()]
    x = data[-1][0]
    y = data[-1][1]
    dist = get_dist(data[-2], data[-1])
    angle = get_angle(data[-2], data[-1])
    k = 5
    preds = []
    for i in range(num_to_predict):
        # feed each prediction back in as the next input
        x, y, angle, dist = learner.predict(k, x, y, angle, dist)
        preds.append((x, y))
    with open("prediction.txt", "w") as f:
        for px, py in preds:
            f.write(str(int(px)) + "," + str(int(py)) + "\n")
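# Note: convert_line is not defined in these snippets. A minimal sketch, assuming
# each input row is laid out like the sample data shown in the __main__ snippet
# further below (",x,y,angle,dist,...", i.e. a row index followed by the numeric
# fields), so the returned list has x at position 0 and y at position 1. This is
# an illustrative reconstruction, not the original helper.
def convert_line(line):
    # Drop the leading row index and parse the remaining fields as floats.
    return [float(v) for v in line.strip().split(',')[1:]]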
def outOfSampleIbmTest():
    # Get data for 2008-2009 training
    dates = pd.date_range('2007-12-31', '2009-12-31')
    symbols = ['IBM']
    prices = get_data(symbols, dates)
    df = getAllData(symbols, dates)
    # Get data for 2010 testing. Need the correct index, and the prices, in order to plot it.
    testDates = pd.date_range('2009-12-31', '2010-12-31')
    df2010Dates = get_data(symbols, testDates)
    df2010 = getAllData(symbols, testDates)
    correctIndex = df2010Dates.index
    # Get the x train and the y train values from the 2008-2009 dataframe
    xTrain = getXTrain(df)
    yTrain = getYTrain(df)
    # Create the learner
    learner = knn.KNNLearner(k=3)
    # Train the learner on the 2008-2009 values
    learner.addEvidence(xTrain.values, yTrain.values)
    # Get query data for 2010
    xTest = getXTrain(df2010)
    # Query the learner
    predicted = learner.query(xTest.values)
    predictedDf = pd.DataFrame(data=predicted, index=correctIndex, columns=['PRED'])
    # Run the strategy
    myStrategy(df2010, predictedDf, "IBM", "outOfSampleIBM.csv", "OUT OF SAMPLE IBM")
def run_test(file_num):
    learner = knn.KNNLearner(verbose=False)
    input_file = 'Inputs/test%02d.txt' % file_num
    data = [convert_line(line) for line in open(input_file, 'r').readlines()]
    num_test = 120
    x = data[0][0]
    y = data[0][1]
    dist = get_dist(data[0], data[1])
    angle = get_angle(data[0], data[1])
    k = 7
    print "starting loc:", x, ",", y
    preds = []
    for i in range(num_test):
        print "Input:", x, y
        x, y, angle, dist = learner.predict(k, x, y, angle, dist)
        print "X:", x, "Y:", y, "Angle:", angle, "dist:", dist
        preds.append((x, y))
    acts = np.array(data[0:num_test])
    targets = np.array(preds)
    plt.scatter(acts[:, 0], acts[:, 1], color='green')
    plt.scatter(targets[:, 0], targets[:, 1], color='red')
    plt.show()
def predict_outsamp(X_trn, y_trn, X_tst, y_tst, symbol, start_date, end_date, k):
    # create a linear regression learner and train it
    lrlearner = lrl.LinRegLearner()  # create a LinRegLearner
    lrlearner.addEvidence(X_trn, y_trn)  # train it
    ytst_lr = lrlearner.query(X_tst)

    # create a KNN learner and train it
    knnlearn = knn.KNNLearner(k)  # constructor
    knnlearn.addEvidence(X_trn, y_trn)  # training step
    ytst_knn = knnlearn.query(X_tst)

    # create a Bag learner and train it
    baglearn = bl.BagLearner(learner=knn.KNNLearner, kwargs={"k": k}, bags=100, boost=False)  # constructor
    baglearn.addEvidence(X_trn, y_trn)  # training step
    ytst_bag = baglearn.query(X_tst)

    # Combine all models
    combined = (ytst_lr + ytst_knn + ytst_bag) / 3

    print ""
    print "Out of sample predictions for %s data from %s to %s" % (symbol[0], start_date, end_date)
    print "KNN RMSE %0.4f; LinReg RMSE %0.4f; BagReg RMSE %0.4f; Combined RMSE %0.4f" % (
        rmse(y_tst, ytst_knn), rmse(y_tst, ytst_lr), rmse(y_tst, ytst_bag), rmse(y_tst, combined))
    print "KNN corr %0.4f; LinReg corr %0.4f; BagReg corr %0.4f" % (
        np.corrcoef(y_tst, ytst_knn)[0, 1], np.corrcoef(y_tst, ytst_lr)[0, 1],
        np.corrcoef(y_tst, ytst_bag)[0, 1])
    print "KNN mean %0.4f; LinReg mean %0.4f; BagReg mean %0.4f" % (
        abs(y_tst - ytst_knn).mean(), abs(y_tst - ytst_lr).mean(), abs(y_tst - ytst_bag).mean())
    print "Actual mean 5 day change %0.4f" % abs(y_tst).mean()
    print ""
    return ytst_lr, ytst_knn, ytst_bag
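# The rmse helper used above is not shown in this snippet. A minimal sketch that
# matches how the other tests in this collection compute root-mean-square error
# (both arguments are 1-D numpy arrays of equal length):
import math
import numpy as np

def rmse(y_actual, y_predicted):
    # Root-mean-square error between actual and predicted values.
    return math.sqrt(((y_actual - y_predicted) ** 2).sum() / y_actual.shape[0])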
def test():
    inf = open('Data/best4KNN.csv')
    data = np.array(
        [map(float, s.strip().split(',')) for s in inf.readlines()])

    # compute how much of the data is training and testing
    train_rows = int(math.floor(0.6 * data.shape[0]))
    test_rows = data.shape[0] - train_rows

    # separate out training and testing data
    trainX = data[:train_rows, 0:-1]
    trainY = data[:train_rows, -1]
    testX = data[train_rows:, 0:-1]
    testY = data[train_rows:, -1]

    # create a learner and train it
    learner = lrl.LinRegLearner()  # create a LinRegLearner
    learner.addEvidence(trainX, trainY)  # train it

    # evaluate in sample
    predYLRL = learner.query(trainX)  # get the predictions
    rmse = math.sqrt(((trainY - predYLRL) ** 2).sum() / trainY.shape[0])
    print
    print "Linear Regression Learner"
    print "In sample results"
    print "RMSE: ", rmse
    c = np.corrcoef(predYLRL, y=trainY)
    print "corr: ", c[0, 1]

    # evaluate out of sample
    predYLRL = learner.query(testX)  # get the predictions
    rmse = math.sqrt(((testY - predYLRL) ** 2).sum() / testY.shape[0])
    print
    print "Out of sample results"
    print "RMSE: ", rmse
    c = np.corrcoef(predYLRL, y=testY)
    print "corr: ", c[0, 1]

    learner = knn.KNNLearner(k=3)  # constructor
    learner.addEvidence(trainX, trainY)  # training step
    predYKNN = learner.query(trainX)  # query
    rmse = math.sqrt(((trainY - predYKNN) ** 2).sum() / trainY.shape[0])
    print
    print "KNN Learner"
    print "In sample results"
    print "RMSE: ", rmse
    c = np.corrcoef(predYKNN, y=trainY)
    print "corr: ", c[0, 1]

    predYKNN = learner.query(testX)  # query
    rmse = math.sqrt(((testY - predYKNN) ** 2).sum() / testY.shape[0])
    print
    print "Out of sample results"
    print "RMSE: ", rmse
    c = np.corrcoef(predYKNN, y=testY)
    print "corr: ", c[0, 1]
def t(self):
    k = 3  # neighbor count used for both learners (k was undefined in the original)
    sknn = neighbors.KNeighborsRegressor(k)
    learner = knn.KNNLearner(k, False)  # the learner written for class
    learner.addEvidence(self.xtrain, self.ytrain)
    sy = sknn.fit(self.xtrain, self.ytrain).predict(self.xtest)
    y = learner.query(self.xtest)
    rmse = np.linalg.norm(sy - y) / np.sqrt(len(sy))
    self.assertLess(rmse, 1e-7)
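# For reference, a minimal KNNLearner sketch consistent with the interface these
# snippets exercise: a constructor taking k (and optionally a verbose flag),
# addEvidence to store the training data, and query returning the mean Y of the
# k nearest neighbors by Euclidean distance, which is what the comparison with
# sklearn's KNeighborsRegressor above implies. Illustrative only; not
# necessarily the implementation the tests were written against.
import numpy as np

class KNNLearner(object):
    def __init__(self, k=3, verbose=False):
        self.k = k
        self.verbose = verbose
        self.dataX = None
        self.dataY = None

    def addEvidence(self, dataX, dataY):
        # Instance-based learner: training just stores the data.
        self.dataX = np.asarray(dataX, dtype=float)
        self.dataY = np.asarray(dataY, dtype=float).ravel()

    def query(self, points):
        points = np.atleast_2d(np.asarray(points, dtype=float))
        preds = np.empty(points.shape[0])
        for i, p in enumerate(points):
            # Euclidean distance from the query point to every stored point.
            dists = np.sqrt(((self.dataX - p) ** 2).sum(axis=1))
            nearest = np.argsort(dists)[:self.k]
            preds[i] = self.dataY[nearest].mean()
        return preds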
def SendtoModel(train_df, train_price, test_df, test_price, model='knn',
                symbol='IBM', k=3, bags=0, verbose=False):
    """
    Sends test and train data frames to the selected model.
    Returns predicted test Y and train Y.
    """
    # calculate training and test sets
    trainX = np.array(train_df.iloc[0:, 0:-1])
    trainY = np.array(train_df.iloc[0:, -1])
    testX = np.array(test_df.iloc[0:, 0:-1])
    testY = np.array(test_df.iloc[0:, -1])
    print 'shape testX', testX.shape
    print 'shape testY', testY.shape

    if model == 'knn':
        learner = knn.KNNLearner(k=k, verbose=True)  # create a knnLearner
        learner.addEvidence(trainX, trainY)  # train it

        # evaluate in sample
        predY_train = learner.query(trainX)  # get the predictions
        rmse_train = math.sqrt(((trainY - predY_train) ** 2).sum() / trainY.shape[0])

        # evaluate out of sample
        predY_test = learner.query(testX)  # get the predictions
        rmse_test = math.sqrt(((testY - predY_test) ** 2).sum() / testY.shape[0])

        # output graphs - normalized lines
        # plot_lines_data(price_norm=train_price, actualY=train_df.iloc[0:, -1],
        #                 predY=pd.Series(predY_train, index=train_df.index),
        #                 name='%s_in_sample_%s' % (symbol[0], model))
        # plot_lines_data(price_norm=test_price, actualY=test_df.iloc[0:, -1],
        #                 predY=pd.Series(predY_test, index=test_df.index),
        #                 name='%s_out_sample_%s' % (symbol[0], model))

        if verbose:
            # (a) in sample results
            print model, 'with arguments k=%s, bags=%s' % (k, bags)
            print "In sample results"
            print "RMSE: ", rmse_train
            c = np.corrcoef(predY_train, y=trainY)
            print "corr: ", c[0, 1]

            # (b) out of sample results
            print
            print model, 'with arguments k=%s, bags=%s' % (k, bags)
            print "Out of sample results"
            print "RMSE: ", rmse_test
            c = np.corrcoef(predY_test, y=testY)
            print "corr: ", c[0, 1]
            print 'length of predicted values: ', len(predY_test)
            #print 'print predicted Y values:', predY_test

    return predY_train, predY_test
def test(learner=knn.KNNLearner(3),
         symb='IBM',
         train_sd=dt.datetime(2007, 12, 31),
         train_ed=dt.datetime(2009, 12, 31),
         test_sd=dt.datetime(2007, 12, 31),
         test_ed=dt.datetime(2009, 12, 31)):
    # generate training dataset
    df, trainX, trainY = getData(train_sd, train_ed, symb)
    # add evidence
    learner.addEvidence(trainX, trainY)
    # generate testing dataset
    df2, testX, testY = getData(test_sd, test_ed, symb)
    # use learner to predict values
    predY = learner.query(testX)
    # plot correlations between test and predicted data, and calculate their correlation
    plot_corr(testY, predY)
    c = np.corrcoef(predY, y=testY)
    print "corr: ", c[0, 1]
    # generate plot for current price, train price and predicted price
    df2['pred'] = predY
    df2['fv'] = df2[symb] * (1 + df2['fr'])
    df2['pv'] = df2[symb] * (1 + df2['pred'])
    plot_compare(df2[symb], df2['fv'], df2['pv'])
    # use predicted data to execute orders and plot long, short, exit as vertical lines
    le, se, s = operations(df2, symb)
    plot_trade(df2[symb], symb, le, se, s)
    # compute portvals and plot backtest results
    portvals = compute_portvals("orders.csv", start_val=10000)
    norm_SPY = df2['SPY'] / df2['SPY'][0] * 10000
    plot_bt(portvals, symb, norm_SPY)
    # analysis of portfolio
    cr, adr, sddr, sr = analysis(portvals)
    print cr, adr, sddr, sr, portvals[-1]
price_mean = priceC.mean().values
price_std = priceC.std().values
temp_train = priceC.rename(columns={'ML4T-240': 'price_of_stock'})
temp_train = (priceC.rename(columns={'ML4T-240': 'price_of_stock_norm'}) - price_mean) / price_std  # position 4
temp_test = priceC.rename(columns={'ML4T-240': 'price_of_stock'})
#temp_test = (priceC.rename(columns={'ML4T-240': 'price_of_stock_norm'}) - price_mean) / price_std

train_dataX = get_trainX()[:-5]
train_dataY = get_trainY()[20:]
test_dataX = get_testX()[:-5]
test_dataY = get_testY()[20:]

# create learner
learner = knn.KNNLearner(k=3, verbose=False)
learner.addEvidence(train_dataX, train_dataY)

# train data: get df predY_train
predY_train = learner.query(train_dataX.values)
predY_train_df = train_dataY.copy(deep=True)
predY_train_df[:] = predY_train
predY_train_df = predY_train_df.to_frame()
predY_train_df = predY_train_df.rename(columns={'ML4T-240': 'PredY_train'})

# get df Y_train
train_dataY_df = pd.DataFrame(train_dataY)
train_dataY_df = train_dataY_df.rename(columns={'ML4T-240': 'Y_train'})
def assess_portfolio(start_date, end_date, symbols):
    """Simulate and assess the performance of a stock portfolio."""
    # Read in adjusted closing prices for given symbols, date range
    pd.set_option('display.expand_frame_repr', False)
    dates = pd.date_range(start_date, end_date)
    prices_all = get_data(symbols, dates)  # automatically adds SPY
    prices = prices_all[symbols]  # only portfolio symbols

    prices_all_temp = prices_all
    prices_all_temp = calculate(prices_all_temp, symbols, prices)
    prices_all_temp = shiftY(False, prices_all_temp, symbols)
    prices_all_test = prices_all_temp.ix['2009-12-31':'2010-12-31']
    prices_all_temp = prices_all_temp.ix['2007-12-31':'2008-12-31']
    """
    prices_all_test = prices_all_temp.ix['2008-12-31':'2009-12-31']
    prices_all_test = calculate(prices_all_test, symbols, prices)
    prices_all_test = shiftY(False, prices_all_test, symbols)
    prices_all_test = prices_all_test.fillna(method='bfill')
    prices_all_test = prices_all_test.fillna(method='ffill')
    prices_all_temp = prices_all_temp.ix['2007-12-31':'2008-12-31']
    prices_all_temp = calculate(prices_all_temp, symbols, prices)
    prices_all_temp = shiftY(False, prices_all_temp, symbols)
    prices_all_temp = prices_all_temp.fillna(method='bfill')
    prices_all_temp = prices_all_temp.fillna(method='ffill')
    """

    # separate out training and testing data
    #print prices_all_temp
    trainX = prices_all_temp.values[:, 1:-2]
    trainY = prices_all_temp.values[:, -2]
    testX = prices_all_test.values[:, 1:-2]
    testY = prices_all_test.values[:, -2]
    #print trainX

    #learner = bl.BagLearner(learner=knn.KNNLearner, kwargs={"k": 3}, bags=50, boost=False)
    #learner.addEvidence(trainX, trainY)
    learner = knn.KNNLearner(3)
    learner.addEvidence(trainX, trainY)  # train it
    #learner = lrl.LinRegLearner()
    #learner.addEvidence(trainX, trainY)  # train it

    predY = learner.query(trainX)  # get the predictions
    generateOrders(prices_all_temp, predY, symbols)
    rmse = math.sqrt(((trainY - predY) ** 2).sum() / trainY.shape[0])
    print "---------------- ----- ---------------"
    print "In sample results"
    print "RMSE: ", rmse
    c = np.corrcoef(predY, y=trainY)
    print "corr: ", c[0, 1]
    #exit()

    predY = learner.query(testX)  # get the predictions
    generateOrders(prices_all_test, predY, symbols)
    rmse = math.sqrt(((testY - predY) ** 2).sum() / testY.shape[0])
    print "Out of sample results"
    print "RMSE: ", rmse
    c = np.corrcoef(predY, y=testY)
    print "corr: ", c[0, 1]
# compute how much of the data is training and testing
train_rows = int(math.floor(0.6 * data.shape[0]))
test_rows = data.shape[0] - train_rows

# separate out training and testing data
trainX = data[:train_rows, 0:-1]
trainY = data[:train_rows, -1]
testX = data[train_rows:, 0:-1]
testY = data[train_rows:, -1]

# start my test
import KNNLearner as knn

knnktest = []
for r in range(1, 101):
    learner = knn.KNNLearner(k=r)  # constructor
    learner.addEvidence(trainX, trainY)  # training step

    # in sample results
    predYKNNTrain = learner.query(trainX)  # query
    rmseTrain = math.sqrt(
        ((trainY - predYKNNTrain) ** 2).sum() / trainY.shape[0])
    print
    print "KNN Learner"
    print "In sample results"
    print "RMSE: ", rmseTrain
    cTrain = np.corrcoef(predYKNNTrain, y=trainY)
    print "corr: ", cTrain[0, 1]

    # out of sample results (the same trained learner is reused for the test query)
    predYKNN = learner.query(testX)  # query
    rmse = math.sqrt(((testY - predYKNN) ** 2).sum() / testY.shape[0])

    """
    file_num = 1
    df = pd.read_csv('data/data%02d.txt' % file_num, index_col=0)
    return df


if __name__ == "__main__":
    #process_training_data()
    #normalize_training_data()
    #df = load_test_data()
    #print df
    learner = knn.KNNLearner()
    """
    ,x,y,angle,dist,drift,xp,yp,x_drift,y_drift,dest_x,dest_y
    2,278,64,0.343023940421,14.8660687473,7.21110255093,272,60,-6.0,-4.0,282,60
    3,282,60,-0.785398163397,5.65685424949,13.4536240471,292,69,10.0,9.0,296,67
    4,296,67,0.463647609001,15.6524758425,14.8660687473,286,56,-10.0,-11.0,306,59
    5,306,59,-0.674740942224,12.8062484749,15.5241746963,310,74,4.0,15.0,306,62
    6,306,62,1.57079632679,3.0,14.8660687473,316,51,10.0,-11.0,315,61
    7,315,61,-0.110657221174,9.05538513814,9.8488578018,306,65,-9.0,4.0,321,64
    """
    result = learner.predict(3, 278, 64, 0.3430239, 14.866068)
    print "Dest X:", result[0]
    print "Dest Y:", result[1]
    print "Angle :", result[2]
def testlearner():
    ''' test KNN and Linear regression learners '''
    Xdcp, Ydcp = _csv_read("data-classification-prob.csv")
    Xdrp, Ydrp = _csv_read("data-ripple-prob.csv")  # the data in the numpy array is now string instead of float

    # divide data for train and test
    dcp_row_N = Xdcp.shape[0]
    drp_row_N = Xdrp.shape[0]
    trainperct = 0.6  # data for training is 60% of total data
    dcp_trp = int(dcp_row_N * trainperct)
    drp_trp = int(drp_row_N * trainperct)
    #testperct = 1.0 - trainperct  # percentage of data for test

    # data for training
    Xdcp_train = Xdcp[0:dcp_trp, :]
    Ydcp_train = np.zeros([dcp_trp, 1])
    Ydcp_train[:, 0] = Ydcp[0:dcp_trp]
    Xdrp_train = Xdrp[0:drp_trp, :]
    Ydrp_train = np.zeros([drp_trp, 1])
    Ydrp_train[:, 0] = Ydrp[0:drp_trp]

    # data for test (query)
    Xdcp_test = Xdcp[dcp_trp:dcp_row_N, :]
    Ydcp_test = np.zeros([dcp_row_N - dcp_trp, 1])
    Ydcp_test[:, 0] = Ydcp[dcp_trp:dcp_row_N]
    Xdrp_test = Xdrp[drp_trp:drp_row_N, :]
    Ydrp_test = np.zeros([drp_row_N - drp_trp, 1])
    Ydrp_test[:, 0] = Ydrp[drp_trp:drp_row_N]

    # KNN learner
    # result matrices: rows record k, training time cost, query time cost,
    # total time cost, RMS error, correlation coefficient, and in-sample RMS error
    KNN_dcp_result = np.zeros([7, 50])  # result of data-classification-prob.csv
    KNN_drp_result = np.zeros([7, 50])  # result of data-ripple-prob.csv
    for k in range(1, 51):
        KNN_lner = KNNLearner(k)
        KNN_dcp_result[0][k - 1] = k
        KNN_drp_result[0][k - 1] = k

        # results of data-classification-prob.csv
        stime = time.time()
        KNN_lner.addEvidence(Xdcp_train, Ydcp_train)
        etime = time.time()
        KNN_dcp_result[1][k - 1] = (etime - stime) / dcp_trp  # training time cost
        stime = time.time()
        Ydcp_learn = KNN_lner.query(Xdcp_test)
        etime = time.time()
        KNN_dcp_result[2][k - 1] = (etime - stime) / (dcp_row_N - dcp_trp)  # query time cost
        KNN_dcp_result[3][k - 1] = KNN_dcp_result[1][k - 1] + KNN_dcp_result[2][k - 1]  # total time cost
        KNN_dcp_result[4][k - 1] = RMSE(Ydcp_test, Ydcp_learn)  # root-mean-square error
        KNN_dcp_result[5][k - 1] = np.corrcoef(Ydcp_learn.T, Ydcp_test.T)[0][1]  # correlation coefficient
        Ydcp_osp = KNN_lner.query(Xdcp_train)
        KNN_dcp_result[6][k - 1] = RMSE(Ydcp_train, Ydcp_osp)  # in-sample RMS error, to compare with out-of-sample

        # results of data-ripple-prob.csv
        stime = time.time()
        KNN_lner.addEvidence(Xdrp_train, Ydrp_train)
        etime = time.time()
        KNN_drp_result[1][k - 1] = (etime - stime) / drp_trp  # training time cost
        stime = time.time()
        Ydrp_learn = KNN_lner.query(Xdrp_test)
        etime = time.time()
        KNN_drp_result[2][k - 1] = (etime - stime) / (drp_row_N - drp_trp)  # query time cost
        KNN_drp_result[3][k - 1] = KNN_drp_result[1][k - 1] + KNN_drp_result[2][k - 1]  # total time cost
        KNN_drp_result[4][k - 1] = RMSE(Ydrp_test, Ydrp_learn)  # root-mean-square error
        KNN_drp_result[5][k - 1] = np.corrcoef(Ydrp_learn.T, Ydrp_test.T)[0][1]  # correlation coefficient
        Ydrp_osp = KNN_lner.query(Xdrp_train)
        KNN_drp_result[6][k - 1] = RMSE(Ydrp_train, Ydrp_osp)  # in-sample RMS error

        # plot the predicted Y versus actual Y (classification at k = 27, ripple at k = 3)
        if k == 27:
            # plot the Y data of classification data
            plt.clf()
            fig = plt.figure()
            fig.suptitle('Y of classification data')
            plt.plot(Ydcp_test, Ydcp_learn, 'o', markersize=5)
            plt.xlabel('Actual Y')
            plt.ylabel('Predicted Y')
            fig.savefig('classification_Y.pdf', format='pdf')
        if k == 3:
            # plot the Y data of ripple data
            plt.clf()
            fig = plt.figure()
            fig.suptitle('Y of ripple data')
            plt.plot(Ydrp_test, Ydrp_learn, 'o', markersize=5)
            plt.xlabel('Actual Y')
            plt.ylabel('Predicted Y')
            fig.savefig('ripple_Y.pdf', format='pdf')

    print KNN_dcp_result[:, 2]  # the result of k=3 for the classification data
    Kdcp_best_pos = np.argmax(KNN_dcp_result[5, :])  # index of the maximum correlation coefficient
    print KNN_dcp_result[:, Kdcp_best_pos]
    print KNN_drp_result[:, 2]  # the result of k=3 for the ripple data
    Kdrp_best_pos = np.argmax(KNN_drp_result[5, :])  # index of the maximum correlation
    print KNN_drp_result[:, Kdrp_best_pos]

    # plot the correlation
    plt.clf()
    fig = plt.figure()
    plt.plot(KNN_dcp_result[0, :], KNN_dcp_result[5, :], 'r', label='Classification')
    plt.plot(KNN_drp_result[0, :], KNN_drp_result[5, :], 'b', label='Ripple')
    plt.legend()
    plt.xlabel('K')
    plt.ylabel('Correlation Coefficient')
    fig.savefig('Correlation_KNN.pdf', format='pdf')

    # plot the error between in-sample and out-of-sample data
    plt.clf()
    fig = plt.figure()
    fig.suptitle('RMS error of classification data')
    plt.plot(KNN_dcp_result[0, :], KNN_dcp_result[4, :], 'or', label='out of sample')
    plt.plot(KNN_dcp_result[0, :], KNN_dcp_result[6, :], 'ob', label='in sample')
    plt.legend(loc=4)
    plt.xlabel('K')
    plt.ylabel('RMS Error')
    fig.savefig('classification-RMSE.pdf', format='pdf')

    plt.clf()
    fig = plt.figure()
    fig.suptitle('RMS error of ripple data')
    plt.plot(KNN_drp_result[0, :], KNN_drp_result[4, :], 'or', label='out of sample')
    plt.plot(KNN_drp_result[0, :], KNN_drp_result[6, :], 'ob', label='in sample')
    plt.legend(loc=4)
    plt.xlabel('K')
    plt.ylabel('RMS Error')
    plt.savefig('ripple-RMSE.pdf', format='pdf')

    # plot the train time
    plt.clf()
    fig = plt.figure()
    plt.plot(KNN_dcp_result[0, :], KNN_dcp_result[1, :], 'r', label='Classification')
    plt.plot(KNN_drp_result[0, :], KNN_drp_result[1, :], 'b', label='Ripple')
    plt.legend(loc=1)
    plt.xlabel('K')
    plt.ylabel('train time / s')
    fig.savefig('traintime.pdf', format='pdf')

    # plot the query time
    plt.clf()
    fig = plt.figure()
    plt.plot(KNN_dcp_result[0, :], KNN_dcp_result[2, :], 'r', label='Classification')
    plt.plot(KNN_drp_result[0, :], KNN_drp_result[2, :], 'b', label='Ripple')
    plt.legend(loc=4)
    plt.xlabel('K')
    plt.ylabel('query time / s')
    fig.savefig('querytime.pdf', format='pdf')

    # Linear regression
    LR_lner = LinRegLearner()
    LR_dcp_result = np.zeros(5)  # Linear regression results of data-classification-prob.csv
    LR_drp_result = np.zeros(5)  # Linear regression results of data-ripple-prob.csv

    # results of data-classification-prob.csv
    stime = time.time()
    dcp_cof = LR_lner.addEvidence(Xdcp_train, Ydcp_train)
    etime = time.time()
    LR_dcp_result[0] = (etime - stime) / dcp_trp  # train time cost
    stime = time.time()
    Ydcp_LRL = LR_lner.query(Xdcp_test, dcp_cof)
    etime = time.time()
    LR_dcp_result[1] = (etime - stime) / (dcp_row_N - dcp_trp)  # query time cost
    LR_dcp_result[2] = LR_dcp_result[0] + LR_dcp_result[1]  # total time cost
    LR_dcp_result[3] = RMSE(Ydcp_test, Ydcp_LRL)  # root-mean-square error
    LR_dcp_result[4] = np.corrcoef(Ydcp_test.T, Ydcp_LRL.T)[0][1]  # correlation coefficient
    print LR_dcp_result

    # results of data-ripple-prob.csv
    stime = time.time()
    drp_cof = LR_lner.addEvidence(Xdrp_train, Ydrp_train)
    etime = time.time()
    LR_drp_result[0] = (etime - stime) / drp_trp  # train time cost
    stime = time.time()
    Ydrp_LRL = LR_lner.query(Xdrp_test, drp_cof)
    etime = time.time()
    LR_drp_result[1] = (etime - stime) / (drp_row_N - drp_trp)  # query time cost
    LR_drp_result[2] = LR_drp_result[0] + LR_drp_result[1]  # total time cost
    LR_drp_result[3] = RMSE(Ydrp_test, Ydrp_LRL)  # root-mean-square error
    LR_drp_result[4] = np.corrcoef(Ydrp_test.T, Ydrp_LRL.T)[0][1]  # correlation coefficient
    print LR_drp_result
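# RMSE is called throughout testlearner but not defined in the snippet. A
# minimal sketch, assuming both arguments are numpy arrays of the same shape
# (here, column vectors):
def RMSE(Y_actual, Y_predicted):
    diff = np.asarray(Y_actual) - np.asarray(Y_predicted)
    return np.sqrt((diff ** 2).mean())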
def main():
    trainpercent = 60
    methods = ['mean', 'median']
    # read data from data file
    input = np.loadtxt('data-ripple-prob.csv', delimiter=',')
    trainsize = int(math.floor(input.shape[0] * trainpercent / 100))
    # split data into train and test sets
    Xtrain = input[0:trainsize, :-1]
    Ytrain = input[0:trainsize, -1]
    Xtest = input[trainsize:, :-1]
    Ytest = input[trainsize:, -1]
    MAXK = 30
    NUMCOLS = 5
    meanstats = np.zeros((MAXK, NUMCOLS), dtype=np.float)
    medianstats = np.zeros((MAXK, NUMCOLS), dtype=np.float)
    for method in methods:
        stats = np.zeros((MAXK, NUMCOLS), dtype=np.float)
        for k in range(1, MAXK + 1):
            # instantiate learner and test
            learner = KNNLearner(k, method)
            # time the training step
            trainstarttime = dt.datetime.now()
            learner.addEvidence(Xtrain, Ytrain)
            trainendtime = dt.datetime.now()
            # time the query step
            teststarttime = dt.datetime.now()
            Y = learner.query(Xtest)
            testendtime = dt.datetime.now()
            stats[k - 1, 0] = k
            stats[k - 1, 1] = gettotalseconds(trainstarttime, trainendtime) / Xtrain.shape[0]
            stats[k - 1, 2] = gettotalseconds(teststarttime, testendtime) / Xtest.shape[0]

            # repeat the timing for the kd-tree variant
            kdtlearner = kdtknn(k, method)
            trainstarttime = dt.datetime.now()
            kdtlearner.addEvidence(Xtrain, Ytrain)
            trainendtime = dt.datetime.now()
            teststarttime = dt.datetime.now()
            Y = kdtlearner.query(Xtest)
            testendtime = dt.datetime.now()
            stats[k - 1, 3] = gettotalseconds(trainstarttime, trainendtime) / Xtrain.shape[0]
            stats[k - 1, 4] = gettotalseconds(teststarttime, testendtime) / Xtest.shape[0]
        if method == 'median':
            medianstats = stats.copy()
        else:
            meanstats = stats.copy()

    # Graph for time/instance versus k
    timedelta = 0.001
    outputfilenames = ['mytraining.pdf', 'myquery.pdf', 'kdtknntraining.pdf', 'kdtknnquery.pdf']
    titles = ['mytrainingtime/instance', 'myquerytime/instance',
              'kdtknntrainingtime/instance', 'kdtknnquerytime/instance']
    for index in range(1, NUMCOLS):
        plt.cla()
        plt.clf()
        plt.plot(meanstats[:, 0], meanstats[:, index], color='r')
        plt.plot(medianstats[:, 0], medianstats[:, index], color='b')
        plt.legend(('method=mean', 'method=median'), loc='upper right')
        plt.ylabel(titles[index - 1])
        plt.xlabel('k')
        plt.ylim(min(min(meanstats[:, index]), min(medianstats[:, index])) - timedelta,
                 max(max(meanstats[:, index]), max(medianstats[:, index])) + timedelta)
        plt.savefig(outputfilenames[index - 1], format='pdf')
def main():
    trainpercent = 60
    isRandomSplit = False
    filenames = ['data-classification-prob.csv', 'data-ripple-prob.csv']
    outputfilenames = ['plot1.pdf', 'plot2.pdf']
    trainfilenames = ['traintime1.pdf', 'traintime2.pdf']
    testfilenames = ['testtime1.pdf', 'testtime2.pdf']
    methods = ['mean', 'median']
    for index in range(2):
        # read data from data file
        input = np.loadtxt(filenames[index], delimiter=',')
        trainsize = int(math.floor(input.shape[0] * trainpercent / 100))
        # split data into train and test sets
        Xtrain = input[0:trainsize, :-1]
        Ytrain = input[0:trainsize, -1]
        Xtest = input[trainsize:, :-1]
        Ytest = input[trainsize:, -1]
        MAXK = 300
        NUMCOLS = 4
        meanstats = np.zeros((MAXK, NUMCOLS), dtype=np.float)
        medianstats = np.zeros((MAXK, NUMCOLS), dtype=np.float)
        avgtraintime = -1
        avgtesttime = -1
        for method in methods:
            stats = np.zeros((MAXK, NUMCOLS), dtype=np.float)
            bestcorr = -1000
            bestK = -1
            for k in range(1, MAXK + 1):
                # instantiate learner and test
                learner = KNNLearner(k, method)
                # time the training step
                trainstarttime = dt.datetime.now()
                learner.addEvidence(Xtrain, Ytrain)
                trainendtime = dt.datetime.now()
                # time the query step
                teststarttime = dt.datetime.now()
                Y = learner.query(Xtest)
                testendtime = dt.datetime.now()
                # compute corrcoef
                corr = np.corrcoef(Ytest.T, Y.T)
                if corr[0, 1] > bestcorr:
                    bestcorr = corr[0, 1]
                    bestK = k
                stats[k - 1, 0] = k
                stats[k - 1, 1] = corr[0, 1]
                # The total_seconds() method works in python >= 2.7
                #stats[k-1, 2] = (trainendtime - trainstarttime).total_seconds()/Xtrain.shape[0]
                #stats[k-1, 3] = (testendtime - teststarttime).total_seconds()/Xtest.shape[0]
                stats[k - 1, 2] = gettotalseconds(trainstarttime, trainendtime) / Xtrain.shape[0]
                stats[k - 1, 3] = gettotalseconds(teststarttime, testendtime) / Xtest.shape[0]
                if k == 3 and method == 'mean':
                    avgtraintime = stats[k - 1, 2]
                    avgtesttime = stats[k - 1, 3]
            print 'File:%s Method:%s BestCorrelation:%f K corresponding to best correlation:%f ' \
                  'AvgTrainTimeForK3Mean:%f seconds AvgTestTimeForK3Mean:%f seconds' % (
                      filenames[index], method, bestcorr, bestK, avgtraintime, avgtesttime)
            if method == 'median':
                medianstats = stats.copy()
            else:
                meanstats = stats.copy()
        timedelta = 1
        # Graph for k versus corrcoef
        plt.cla()
        plt.clf()
        plt.plot(meanstats[:, 0], meanstats[:, 1], color='r')
        plt.plot(medianstats[:, 0], medianstats[:, 1], color='b')
        plt.legend(('method=mean', 'method=median'), loc='upper right')
        plt.ylabel('Correlation Coefficient')
        plt.xlabel('k')
        plt.savefig(outputfilenames[index], format='pdf')
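# gettotalseconds is assumed above because, as the commented-out lines note,
# timedelta.total_seconds() only exists in Python >= 2.7. A minimal sketch of
# the helper under that assumption, using the documented equivalent formula:
def gettotalseconds(starttime, endtime):
    delta = endtime - starttime
    return (delta.microseconds + (delta.seconds + delta.days * 24 * 3600) * 10**6) / 10.0**6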
corr_coef_knn_out = np.zeros((n))
rms_lr_in = 0
corr_coef_lr_in = 0
rms_lr_out = 0
corr_coef_lr_out = 0
p = 1
plt.clf()
plt.plot(Xtrain)
plt.show()
K = np.zeros((n))
Y_best_knn = []
for k in range(1, n + 1):
    K[k - 1] = k
    learner = KNNLearner.KNNLearner(k)
    learner.addEvidence(Xtrain, Ytrain)
    # out-of-sample predictions, trained on the training split only
    Y_out_knn = learner.query(Xtest)
    sum = 0
    for i in range(len(Y_out_knn)):
        sum += math.pow((Y_out_knn[i] - Ytest[i]), 2)
    rms_knn_out[k - 1] = math.sqrt(sum / len(Y_out_knn))
    corr_coef_knn_out[k - 1] = np.corrcoef(Y_out_knn, Ytest)[0, 1]
    # retrain on the full dataset for the in-sample comparison
    learner.addEvidence(X, Y)
    Y_in_knn = learner.query(Xtest)
    sum = 0
    for i in range(len(Y_in_knn)):
        sum += math.pow((Y_in_knn[i] - Ytest[i]), 2)
    print "corr: ", c[0,1]
"""

print "---------------- KNN ---------------"
inSampleError = []
outOfSampleError = []
kArr = []
for i in range(3, 4):
    #learner = bl.BagLearner(learner=knn.KNNLearner, kwargs={"k": 3}, bags=20, boost=False)
    #learner.addEvidence(trainX, trainY)
    print "BAGS:"
    print i
    learner = knn.KNNLearner(k=3)
    learner.addEvidence(trainX, trainY)  # train it
    kArr.append(i)
    #learner = knn.KNNLearner(i)
    #learner.addEvidence(trainX, trainY)  # train it
    predY = learner.query(trainX)  # get the predictions
    rmse = math.sqrt(((trainY - predY) ** 2).sum() / trainY.shape[0])
    print "In sample results"
    print "RMSE: ", rmse
    inSampleError.append(rmse)
    c = np.corrcoef(predY, y=trainY)
    print "corr: ", c[0, 1]
    predY = learner.query(testX)  # get the predictions
    rmse = math.sqrt(((testY - predY) ** 2).sum() / testY.shape[0])
    print "Out of sample results"
# separate out training and testing data
# trainX = data[:train_rows, 0:-1]
# trainY = data[:train_rows, -1]
# testX = data[train_rows:, 0:-1]
# testY = data[train_rows:, -1]
trainX = dataTrain[:, 0:-1]
trainY = dataTrain[:, -1]
testX = dataTest[:, 0:-1]
testY = dataTest[:, -1]
# print(testX.shape)
# print(testY.shape)

# create a learner and train it
start = time.time()
learner = knn.KNNLearner(k=3, verbose=True)  # create a KNNLearner
learner.add_evidence(trainX, trainY)  # train it

# evaluate in sample
Y = learner.query(trainX)  # get the predictions
rmse = math.sqrt(((trainY - Y) ** 2).sum() / trainY.shape[0])
# print(learner.model_coefs)
print("In sample results")
print("RMSE: ", rmse)
corr = np.corrcoef(Y, y=trainY)
print("corr: ", corr[0, 1])

# evaluate out of sample
Y = learner.query(testX)  # get the predictions
rmse = math.sqrt(((testY - Y) ** 2).sum() / testY.shape[0])
print()
def test_KNN(X_whole, y_whole, X, y):
    # Split the initial data
    xtrain, xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2, random_state=42)

    start = datetime.now()

    ### KNNLearner Implementation ###
    knnlearner = knn.KNNLearner(n_folds=3, verbose=True)

    # Create a validation set - do another train/test split on the training data
    xtrain_val, xtest_val, ytrain_val, ytest_val = train_test_split(X, y, test_size=0.2, random_state=42)

    ########## Initial Learning Curves for Different Neighbor Sizes ##########
    # Cross validation with 100 iterations to get smoother mean test and train
    # score curves, each time with 20% of the data randomly selected as a validation set.
    for n_neighbors in (2, 4, 6, 8, 10):
        # Initial fit
        initial_classifier = KNeighborsClassifier(n_neighbors=n_neighbors)
        initial_classifier.fit(xtrain_val, ytrain_val)
        fig, axes = plt.subplots(3, 1, figsize=(10, 15))
        title = "Initial Learning Curves (KNN - %d neighbors)" % n_neighbors
        cv = ShuffleSplit(n_splits=100, test_size=0.2, random_state=0)
        estimator = initial_classifier
        lc = plot_learning_curve(estimator, title, xtrain_val, ytrain_val, cv=cv, n_jobs=-1)
        lc.savefig('/Users/ajinkya.bagde/Desktop/AS1_Figs/KNN/knn_learningcurve_initial_%dneigh.png' % n_neighbors)

    # Get a list of possible knn's and their respective neighbor_types
    flag = 0
    clfs, neighbor_types = knnlearner.train(xtrain_val, ytrain_val, flag)
    # Get the knn that is correlated to the neighbor_type with highest accuracy
    weight_values = "NA"
    algorithm_types = "NA"
    metric_types = "NA"
    p_values = "NA"
    knn_choice_neighbor_based = knnlearner.test(xtest_val, xtrain_val, ytest_val, ytrain_val, clfs,
                                                neighbor_types, weight_values, algorithm_types,
                                                metric_types, p_values, flag)

    # Get a list of possible knns and their respective weight values
    flag = 1
    clfs, weight_values = knnlearner.train(xtrain_val, ytrain_val, flag)
    # Get the knn that is correlated to the weight with highest accuracy
    neighbor_types = "NA"
    algorithm_types = "NA"
    metric_types = "NA"
    p_values = "NA"
    knn_choice_weight_based = knnlearner.test(xtest_val, xtrain_val, ytest_val, ytrain_val, clfs,
                                              neighbor_types, weight_values, algorithm_types,
                                              metric_types, p_values, flag)

    # Get a list of possible knns and their respective algorithm_types
    flag = 2
    clfs, algorithm_types = knnlearner.train(xtrain_val, ytrain_val, flag)
    # Get the knn that is correlated to the algorithm with highest accuracy
    neighbor_types = "NA"
    weight_values = "NA"
    metric_types = "NA"
    p_values = "NA"
    knn_choice_algorithm_based = knnlearner.test(xtest_val, xtrain_val, ytest_val, ytrain_val, clfs,
                                                 neighbor_types, weight_values, algorithm_types,
                                                 metric_types, p_values, flag)

    # Get a list of possible knns and their respective metric types
    flag = 3
    clfs, metric_types = knnlearner.train(xtrain_val, ytrain_val, flag)
    # Get the knn that is correlated to the metric with highest accuracy
    neighbor_types = "NA"
    weight_values = "NA"
    algorithm_types = "NA"
    p_values = "NA"
    knn_choice_metric_based = knnlearner.test(xtest_val, xtrain_val, ytest_val, ytrain_val, clfs,
                                              neighbor_types, weight_values, algorithm_types,
                                              metric_types, p_values, flag)

    # Get a list of possible knns and their respective p values
    flag = 4
    clfs, p_values = knnlearner.train(xtrain_val, ytrain_val, flag)
    # Get the knn that is correlated to the p value with highest accuracy
    neighbor_types = "NA"
    weight_values = "NA"
    algorithm_types = "NA"
    metric_types = ['minkowski']
    knn_choice_p_based = knnlearner.test(xtest_val, xtrain_val, ytest_val, ytrain_val, clfs,
                                         neighbor_types, weight_values, algorithm_types,
                                         metric_types, p_values, flag)

    # Now that we have the knn, time for tuning hyperparameters
    # Make a new classifier for this
    clf = KNeighborsClassifier()
    clf.fit(xtrain_val, ytrain_val)
    best_params = knnlearner.tune_hyperparameters(clf, xtrain_val, ytrain_val)
    print("Best params are: ", best_params)

    # Now do one more fit based on best params above
    final_classifier = KNeighborsClassifier(n_neighbors=best_params['n_neighbors'],
                                            weights=best_params['weights'],
                                            algorithm=best_params['algorithm'],
                                            metric=best_params['metric'],
                                            p=best_params['p'])
    final_classifier.fit(xtrain_val, ytrain_val)
    fig, axes = plt.subplots(3, 1, figsize=(10, 15))
    title = "Learning Curves (KNN)"
    # Cross validation with 100 iterations to get smoother mean test and train
    # score curves, each time with 20% of the data randomly selected as a validation set.
    cv = ShuffleSplit(n_splits=100, test_size=0.2, random_state=0)
    estimator = final_classifier
    lc = plot_learning_curve(estimator, title, xtrain_val, ytrain_val, cv=cv, n_jobs=-1)
    lc.savefig('/Users/ajinkya.bagde/Desktop/AS1_Figs/KNN/knn_learningcurve.png')

    # Now time for final accuracy score for the test set
    knnlearner.final_test(final_classifier, xtest, ytest)
    print(datetime.now() - start)
# separate out training and testing data
trainX = data[:train_rows, 0:-1]
trainY = data[:train_rows, -1]
testX = data[train_rows:, 0:-1]
testY = data[train_rows:, -1]

total = 15
in_sample_error = []
out_sample_error = []
k_values = []
for i in range(total):
    # print "KNNLearner"
    k_values.append(i + 1)
    learner = knn.KNNLearner(k=i + 1)  # create a KNNLearner
    learner.addEvidence(trainX, trainY)  # train it

    # evaluate in sample
    predY = learner.query(trainX)  # get the predictions
    rmse = math.sqrt(((trainY - predY) ** 2).sum() / trainY.shape[0])
    # print
    # print "In sample results"
    # print "RMSE: ", rmse
    in_sample_error.append(rmse)
    c = np.corrcoef(predY, y=trainY)
    # print "corr: ", c[0, 1]

    # evaluate out of sample
    predY = learner.query(testX)  # get the predictions
# choose learner, stock symbol, training start date and end date, plus test start date and end date
# in sample test: use the same start date and end date
# out of sample test: use different start date and end date
test(learner=knn.KNNLearner(3),
     symb='IBM',
     train_sd=dt.datetime(2007, 12, 31),
     train_ed=dt.datetime(2009, 12, 31),
     test_sd=dt.datetime(2007, 12, 31),
     test_ed=dt.datetime(2009, 12, 31))
test_rows = data.shape[0] - train_rows

# separate out training and testing data
trainX = data[:train_rows, 0:-1]
trainY = data[:train_rows, -1]
testX = data[train_rows:, 0:-1]
testY = data[train_rows:, -1]

in_rmse_k = []
in_corr_k = []
out_rmse_k = []
out_corr_k = []
model = 'knn'  # or 'bag'
print 'shape of dataset:', data.shape
for i in range(2, 21):
    learner = knn.KNNLearner(k=i, verbose=True)  # create a knnLearner
    # learner = bl.BagLearner(learner=knn.KNNLearner,
    #                         kwargs={"k": 5}, bags=20, boost=False, verbose=False)
    learner.addEvidence(trainX, trainY)  # train it
    predY_train = learner.query(trainX)  # get the predictions

    # get in sample stats
    in_rmse_k.append(
        math.sqrt(((trainY - predY_train) ** 2).sum() / trainY.shape[0]))
    c_in = np.corrcoef(predY_train, y=trainY)
    in_corr_k.append(c_in[0, 1])

    # get out of sample stats
    predY_test = learner.query(testX)  # get the predictions
    out_rmse_k.append(
        math.sqrt(((testY - predY_test) ** 2).sum() / testY.shape[0]))
def testlearner():
    ''' test Random Forest and compare with KNN '''
    Xdcp, Ydcp = _csv_read("data-classification-prob.csv")
    Xdrp, Ydrp = _csv_read("data-ripple-prob.csv")  # the data in the numpy array is now string instead of float

    # divide data for train and test
    dcp_row_N = Xdcp.shape[0]
    drp_row_N = Xdrp.shape[0]
    trainperct = 0.6  # data for training is 60% of total data
    dcp_trp = int(dcp_row_N * trainperct)
    drp_trp = int(drp_row_N * trainperct)
    #testperct = 1.0 - trainperct  # percentage of data for test

    # data for training
    Xdcp_train = Xdcp[0:dcp_trp, :]
    Ydcp_train = np.zeros([dcp_trp, 1])
    Ydcp_train[:, 0] = Ydcp[0:dcp_trp]
    Xdrp_train = Xdrp[0:drp_trp, :]
    Ydrp_train = np.zeros([drp_trp, 1])
    Ydrp_train[:, 0] = Ydrp[0:drp_trp]

    # data for test (query)
    Xdcp_test = Xdcp[dcp_trp:dcp_row_N, :]
    Ydcp_test = np.zeros([dcp_row_N - dcp_trp, 1])
    Ydcp_test[:, 0] = Ydcp[dcp_trp:dcp_row_N]
    Xdrp_test = Xdrp[drp_trp:drp_row_N, :]
    Ydrp_test = np.zeros([drp_row_N - drp_trp, 1])
    Ydrp_test[:, 0] = Ydrp[drp_trp:drp_row_N]

    # result matrices: rows record k, training time cost, query time cost,
    # RMS error and correlation coefficient
    DT_dcp_result = np.zeros([5, 100])  # result of data-classification-prob.csv of RF
    DT_drp_result = np.zeros([5, 100])  # result of data-ripple-prob.csv of RF
    KNN_dcp_result = np.zeros([2, 100])  # results of data-classification-prob.csv of KNN
    KNN_drp_result = np.zeros([2, 100])  # results of data-ripple-prob.csv of KNN

    for k in range(1, 101):
        # Random forest learner and KNN learner for this k
        RFL = RandomForestLearner(k)
        KNN_lner = KNNLearner(k)
        DT_dcp_result[0][k - 1] = k
        DT_drp_result[0][k - 1] = k

        # result of data-classification-prob
        stime = time.time()
        RFL.addEvidence(Xdcp_train, Ydcp_train)
        etime = time.time()
        DT_dcp_result[1][k - 1] = etime - stime
        KNN_lner.addEvidence(Xdcp_train, Ydcp_train)
        stime = time.time()
        Ydcp_learn = RFL.query(Xdcp_test)
        etime = time.time()
        DT_dcp_result[2][k - 1] = etime - stime
        Ydcp_learn_KNN = KNN_lner.query(Xdcp_test)
        DT_dcp_result[3][k - 1] = RMSE(Ydcp_learn, Ydcp_test)
        KNN_dcp_result[0][k - 1] = RMSE(Ydcp_learn_KNN, Ydcp_test)
        DT_dcp_result[4][k - 1] = np.corrcoef(Ydcp_learn.T, Ydcp_test.T)[0][1]
        KNN_dcp_result[1][k - 1] = np.corrcoef(Ydcp_learn_KNN.T, Ydcp_test.T)[0][1]

        # result of data-ripple
        stime = time.time()
        RFL.addEvidence(Xdrp_train, Ydrp_train)
        etime = time.time()
        DT_drp_result[1][k - 1] = etime - stime
        KNN_lner.addEvidence(Xdrp_train, Ydrp_train)
        stime = time.time()
        Ydrp_learn = RFL.query(Xdrp_test)
        etime = time.time()
        DT_drp_result[2][k - 1] = etime - stime
        Ydrp_learn_KNN = KNN_lner.query(Xdrp_test)
        DT_drp_result[3][k - 1] = RMSE(Ydrp_learn, Ydrp_test)
        KNN_drp_result[0][k - 1] = RMSE(Ydrp_learn_KNN, Ydrp_test)
        DT_drp_result[4][k - 1] = np.corrcoef(Ydrp_learn.T, Ydrp_test.T)[0][1]
        KNN_drp_result[1][k - 1] = np.corrcoef(Ydrp_learn_KNN.T, Ydrp_test.T)[0][1]

    plt.clf()
    fig = plt.figure()
    fig.suptitle('RMS Error of Classification data test')
    plt.plot(DT_dcp_result[0, :], DT_dcp_result[3, :], 'r', label='Random Forest')
    plt.plot(DT_dcp_result[0, :], KNN_dcp_result[0, :], 'b', label='KNN')
    plt.legend(loc=1)
    plt.xlabel('K')
    plt.ylabel('RMS Error')
    fig.savefig('classification-RMSE.pdf', format='pdf')

    plt.clf()
    fig = plt.figure()
    fig.suptitle('Correlation Coefficient of Classification data test')
    plt.plot(DT_dcp_result[0, :], DT_dcp_result[4, :], 'r', label='Random Forest')
    plt.plot(DT_dcp_result[0, :], KNN_dcp_result[1, :], 'b', label='KNN')
    plt.legend(loc=4)
    plt.xlabel('K')
    plt.ylabel('Correlation Coefficient')
    fig.savefig('classification-Corr.pdf', format='pdf')

    plt.clf()
    fig = plt.figure()
    fig.suptitle('RMS Error of Ripple data test')
    plt.plot(DT_drp_result[0, :], DT_drp_result[3, :], 'r', label='Random Forest')
    plt.plot(DT_drp_result[0, :], KNN_drp_result[0, :], 'b', label='KNN')
    plt.legend(loc=2)
    plt.xlabel('K')
    plt.ylabel('RMS Error')
    fig.savefig('ripple-RMSE.pdf', format='pdf')

    plt.clf()
    fig = plt.figure()
    fig.suptitle('Correlation Coefficient of Ripple data test')
    plt.plot(DT_drp_result[0, :], DT_drp_result[4, :], 'r', label='Random Forest')
    plt.plot(DT_drp_result[0, :], KNN_drp_result[1, :], 'b', label='KNN')
    plt.legend(loc=3)
    plt.xlabel('K')
    plt.ylabel('Correlation Coefficient')
    fig.savefig('ripple-Corr.pdf', format='pdf')
def run():
    # Define default parameters
    start_date = '2008-01-01'
    end_date = '2009-12-31'
    start_test_date = '2010-01-01'
    end_test_date = '2010-12-31'
    stock = 'IBM'

    # Check for user input of a stock symbol
    if len(sys.argv) > 1:
        file_path = "data/" + sys.argv[1] + ".csv"
        # Check that a data file exists for that symbol
        if not os.path.isfile(file_path):
            print 'Data for the specified stock does not exist. Please reference stocks in the data folder, or run with no option provided (IBM is used by default).'
            return
        stock = sys.argv[1]

    dates = pd.date_range(start_date, end_date)
    test_dates = pd.date_range(start_test_date, end_test_date)

    # Read in the price data (get_data automatically adds SPY)
    prices_all = get_data([stock], dates)
    test_prices_all = get_data([stock], test_dates)

    # Set up the dataframe to train the learner over.
    # Note: shift(periods=-5) references prices five days in the FUTURE, so the
    # momentum and volatility features as written here look ahead rather than back.
    data = pd.DataFrame(index=dates)
    data['actual_prices'] = prices_all[stock]
    data['bb_value'] = prices_all[stock] - pd.rolling_mean(prices_all[stock], window=5)
    data['bb_value'] = data['bb_value'] / (pd.rolling_std(prices_all[stock], window=5) * 2)
    data['momentum'] = (prices_all[stock] / prices_all[stock].shift(periods=-5)) - 1
    data['volatility'] = pd.rolling_std(
        ((prices_all[stock] / prices_all[stock].shift(periods=-1)) - 1), window=5)
    data['y_values'] = prices_all[stock].shift(periods=-5)
    data = data.dropna(subset=['actual_prices'])
    trainX = data.iloc[4:, 0:-1]  # skip the first 4 rows, where the 5-day windows are NaN
    trainY = data.iloc[4:, -1]

    # Set up the dataframe to test the learner over
    test_data = pd.DataFrame(index=test_dates)
    test_data['actual_prices'] = test_prices_all[stock]
    test_data['bb_value'] = test_prices_all[stock] - pd.rolling_mean(
        test_prices_all[stock], window=5)
    test_data['bb_value'] = test_data['bb_value'] / (
        pd.rolling_std(test_prices_all[stock], window=5) * 2)
    test_data['momentum'] = (test_prices_all[stock] /
                             test_prices_all[stock].shift(periods=-5)) - 1
    test_data['volatility'] = pd.rolling_std(
        ((test_prices_all[stock] / test_prices_all[stock].shift(periods=-1)) - 1), window=5)
    test_data['y_values'] = test_prices_all[stock].shift(periods=-5)
    test_data = test_data.dropna(subset=['actual_prices'])
    testX = test_data.iloc[:, 0:-1]
    testY = test_data.iloc[:, -1]

    # Create a KNN learner for the data and train it
    learner = knn.KNNLearner(3)
    learner.addEvidence(trainX, trainY)

    # Run a simulation of the trading strategy, based on predicted future values,
    # over the training data
    print "\nTraining Data Results:"
    run_simulation(learner, prices_all, stock, trainX, trainY, dates,
                   "Unit3/orders/orders_trainingdata.csv")
    calculate_portfolio_value("Unit3/orders/orders_trainingdata.csv",
                              prices_all, dates, stock)

    # Run a simulation over previously unseen test data to gauge its performance
    print "\nTest Data Results:"
    run_simulation(learner, test_prices_all, stock, testX, testY, test_dates,
                   "Unit3/orders/orders_testdata.csv")
    calculate_portfolio_value("Unit3/orders/orders_testdata.csv",
                              test_prices_all, test_dates, stock)
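# The bb_value feature above is (price - 5-day rolling mean) / (2 * 5-day rolling std).
# pd.rolling_mean / pd.rolling_std were removed in later pandas releases; this is a
# small self-contained sketch of the same computation with the modern .rolling() API,
# using synthetic prices (the series values and dates are illustrative only).
import numpy as np
import pandas as pd

prices = pd.Series(np.linspace(90.0, 110.0, 30),
                   index=pd.date_range('2008-01-01', periods=30))
rolling_mean = prices.rolling(window=5).mean()
rolling_std = prices.rolling(window=5).std()
bb_value = (prices - rolling_mean) / (2 * rolling_std)  # same formula as data['bb_value']
print(bb_value.tail())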
def TestKNN(filename, k=3, draw=0):
    reader = csv.reader(open(filename, 'rU'), delimiter=',')
    learner = KNN.KNNLearner(k)

    # Read the CSV rows into a 1000x3 numpy array (two features plus Y)
    indata = None
    for row in reader:
        temp = numpy.zeros([1, 3])
        for i, element in enumerate(row):
            temp[0][i] = float(element)
        if indata is None:
            indata = temp
        else:
            indata = numpy.append(indata, temp, axis=0)

    # First 600 rows are in-sample training data, last 400 are out of sample
    # (split was undefined in the original; 600 matches the ranges used below)
    split = 600
    start = time.clock()
    learner.addEvidence(indata[0:split])
    traintime = time.clock() - start
    print "Train time is ", traintime

    start = time.clock()
    yfitted = numpy.zeros([400])
    for i in range(600, 1000):
        yfitted[i - 600] = learner.query(indata[i])
    querytime = (time.clock() - start) / 400
    print "Query time is ", querytime

    # Out-of-sample correlation and RMS error
    cormat = numpy.corrcoef(indata[600:1000, 2], yfitted)
    print "Correlation coefficient of out of sample data is \n", cormat[0][1]
    dif = yfitted - indata[600:1000, 2]
    RMS = numpy.sqrt(numpy.sum(dif * dif) / 400)
    print "RMS of out of sample data is ", RMS

    # In-sample correlation and RMS error
    ytfitted = numpy.zeros([600])
    for i in range(0, 600):
        ytfitted[i] = learner.query(indata[i])
    cormatoft = numpy.corrcoef(indata[0:600, 2], ytfitted)
    print "Correlation coefficient of in sample data is \n", cormatoft[0][1]
    dif = ytfitted - indata[0:600, 2]
    RMSt = numpy.sqrt(numpy.sum(dif * dif) / 600)
    print "RMS of in sample data is ", RMSt

    if draw == 1:
        xax = numpy.arange(600, 1000)
        plt.plot(xax, yfitted, 'ro')
        plt.plot(xax, indata[600:1000, 2], 'bo')
        plt.show()
        fig = plt.figure()
        ax = fig.add_subplot(111, projection='3d')
        ax.scatter(indata[600:1000, 0], indata[600:1000, 1], indata[600:1000, 2], c='b')
        ax.scatter(indata[600:1000, 0], indata[600:1000, 1], yfitted[:], c='r')
        plt.show()

    return k, traintime, querytime, cormat[0][1], cormatoft[0][1], RMS, RMSt
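# A possible driver for TestKNN(), sweeping k over the classification data file
# used by testlearner() and main() above. The particular k values are illustrative.
if __name__ == '__main__':
    for k in [1, 3, 5, 10, 20]:
        results = TestKNN('data-classification-prob.csv', k=k, draw=0)
        print "k=%d train=%fs query=%fs out-corr=%f in-corr=%f out-RMS=%f in-RMS=%f" % results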
bollingerBandDf = l.getBollingerBandVAlue(symbol, dates, volatilityDF)
stats = l.getStats(momentumDF, volatilityDF, bollingerBandDf)
bollingerBandDf = l.normalizeDataFrame(bollingerBandDf)
momentumDF = l.normalizeDataFrame(momentumDF)
volatilityDF = l.normalizeDataFrame(volatilityDF)
unalteredPrices = util.get_data([symbol], dates, addSPY=False).dropna()
fiveDayPriceChange, trainX, trainY, unalteredPrices = l.prepareTrainXandY(
    bollingerBandDf, fiveDayPriceChange, momentumDF, unalteredPrices,
    volatilityDF, symbol)

# Uncomment the LinRegLearner and comment out the KNNLearner to use
# linear regression instead of KNN
# learner = lrl.LinRegLearner(verbose=True)  # create a LinRegLearner
learner = knn.KNNLearner(2, verbose=True)  # create a KNN learner
learner.addEvidence(trainX, trainY)  # train it
predictedYFromTraining = learner.query(trainX)  # get in-sample predictions
# sy = sknn.fit(trainX, trainY).predict(testX)  # leftover scikit-learn cross-check;
#                                               # sknn and testX are not defined here

yPredictedDF = pd.DataFrame(predictedYFromTraining, index=fiveDayPriceChange.index)
yPredTimesPriceDF = yPredictedDF.values * unalteredPrices
fiveDayPrices = fiveDayPriceChange.values * unalteredPrices
yPredTimesPriceDF.columns = ['Predicted Y']
fiveDayPrices.columns = ['Y Train']

symbols = [symbol]
unalteredPrices = util.get_data(symbols, dates, addSPY=False)
unalteredPrices = unalteredPrices.dropna()
normalizedDailyPrices = unalteredPrices / unalteredPrices.ix[0, :]
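# l.normalizeDataFrame() is not shown in this excerpt. Given that prices are
# normalized above as unalteredPrices / unalteredPrices.ix[0, :], a consistent
# sketch (an assumption, not the original implementation) would be:
def normalizeDataFrame(df):
    # Scale every column so its first row equals 1.0
    return df / df.ix[0, :]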
def main():
    trainpercent = 60
    filenames = ['data-classification-prob.csv', 'data-ripple-prob.csv']
    outputfilenames = ['plot1.pdf', 'plot2.pdf']
    trainfilenames = ['traintime1.pdf', 'traintime2.pdf']
    testfilenames = ['testtime1.pdf', 'testtime2.pdf']
    methods = ['mean', 'median']

    for index in range(2):
        # Read the data file and split it into train and test sets
        data = np.loadtxt(filenames[index], delimiter=',')
        trainsize = int(math.floor(data.shape[0] * trainpercent / 100.0))
        Xtrain = data[0:trainsize, :-1]
        Ytrain = data[0:trainsize, -1]
        Xtest = data[trainsize:, :-1]
        Ytest = data[trainsize:, -1]

        MAXK = 300
        NUMCOLS = 4
        meanstats = np.zeros((MAXK, NUMCOLS), dtype=np.float)
        medianstats = np.zeros((MAXK, NUMCOLS), dtype=np.float)
        avgtraintime = -1
        avgtesttime = -1

        for method in methods:
            stats = np.zeros((MAXK, NUMCOLS), dtype=np.float)
            bestcorr = -1000
            bestK = -1
            for k in range(1, MAXK + 1):
                # Instantiate the learner and time training and querying
                learner = KNNLearner(k, method)
                trainstarttime = dt.datetime.now()
                learner.addEvidence(Xtrain, Ytrain)
                trainendtime = dt.datetime.now()

                teststarttime = dt.datetime.now()
                Y = learner.query(Xtest)
                testendtime = dt.datetime.now()

                # Track the k with the best correlation coefficient
                corr = np.corrcoef(Ytest.T, Y.T)
                if corr[0, 1] > bestcorr:
                    bestcorr = corr[0, 1]
                    bestK = k

                stats[k - 1, 0] = k
                stats[k - 1, 1] = corr[0, 1]
                # timedelta.total_seconds() only exists in Python >= 2.7,
                # so gettotalseconds() is used instead
                stats[k - 1, 2] = gettotalseconds(trainstarttime, trainendtime) / Xtrain.shape[0]
                stats[k - 1, 3] = gettotalseconds(teststarttime, testendtime) / Xtest.shape[0]
                if k == 3 and method == 'mean':
                    avgtraintime = stats[k - 1, 2]
                    avgtesttime = stats[k - 1, 3]

            print 'File:%s Method:%s BestCorrelation:%f K corresponding to best correlation:%f AvgTrainTimeForK3Mean:%f seconds AvgTestTimeForK3Mean:%f seconds' % (filenames[index], method, bestcorr, bestK, avgtraintime, avgtesttime)

            if method == 'median':
                medianstats = stats.copy()
            else:
                meanstats = stats.copy()

        # Graph of k versus correlation coefficient for both methods
        plt.cla()
        plt.clf()
        plt.plot(meanstats[:, 0], meanstats[:, 1], color='r')
        plt.plot(medianstats[:, 0], medianstats[:, 1], color='b')
        plt.legend(('method=mean', 'method=median'), loc='upper right')
        plt.ylabel('Correlation Coefficient')
        plt.xlabel('k')
        plt.savefig(outputfilenames[index], format='pdf')
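# gettotalseconds() is referenced above but not shown. Since the comment notes
# that timedelta.total_seconds() requires Python >= 2.7, a sketch that computes
# the same value by hand from the timedelta fields might look like this:
def gettotalseconds(starttime, endtime):
    # Equivalent of (endtime - starttime).total_seconds() for Python < 2.7
    delta = endtime - starttime
    return (delta.days * 86400.0) + delta.seconds + (delta.microseconds / 1e6)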