# Ejemplo n.º 1
# 0
def testlearner():
    '''
    Benchmark KNNLearner (k = 1..50) and LinRegLearner on two CSV data sets.

    Loads data-classification-prob.csv and data-ripple-prob.csv, splits each
    60/40 into train/test sets, records per-k timing, RMS error and
    correlation statistics for KNN, prints summary columns, saves a series
    of PDF plots, and finally runs the same measurements for linear
    regression.
    '''

    Xdcp, Ydcp = _csv_read("data-classification-prob.csv")
    Xdrp, Ydrp = _csv_read(
        "data-ripple-prob.csv"
    )  # NOTE(review): original comment claimed the arrays hold strings, not floats -- verify _csv_read

    #divide data for train and test
    dcp_row_N = Xdcp.shape[0]
    drp_row_N = Xdrp.shape[0]
    trainperct = 0.6  # data for training is 60% of total data
    dcp_trp = int(dcp_row_N * trainperct)
    drp_trp = int(drp_row_N * trainperct)
    #data for training (Y reshaped into an (n, 1) column vector)
    Xdcp_train = Xdcp[0:dcp_trp, :]
    Ydcp_train = np.zeros([dcp_trp, 1])
    Ydcp_train[:, 0] = Ydcp[0:dcp_trp]
    Xdrp_train = Xdrp[0:drp_trp, :]
    Ydrp_train = np.zeros([drp_trp, 1])
    Ydrp_train[:, 0] = Ydrp[0:drp_trp]
    #data for test (query)
    Xdcp_test = Xdcp[dcp_trp:dcp_row_N, :]
    Ydcp_test = np.zeros([dcp_row_N - dcp_trp, 1])
    Ydcp_test[:, 0] = Ydcp[dcp_trp:dcp_row_N]
    Xdrp_test = Xdrp[drp_trp:drp_row_N, :]
    Ydrp_test = np.zeros([drp_row_N - drp_trp, 1])
    Ydrp_test[:, 0] = Ydrp[drp_trp:drp_row_N]

    #KNN learner

    # Result matrix layout (one column per k = 1..50); rows are:
    # 0: k, 1: train time/sample, 2: query time/sample, 3: total time,
    # 4: out-of-sample RMSE, 5: correlation coefficient, 6: in-sample RMSE
    KNN_dcp_result = np.zeros([7,
                               50])  # result of data-classification-prob.csv
    KNN_drp_result = np.zeros([7, 50])  # result of data-ripple-prob.csv

    for k in range(1, 51):
        KNN_lner = KNNLearner(k)
        KNN_dcp_result[0][k - 1] = k
        KNN_drp_result[0][k - 1] = k

        # results of data-classification-prob.csv
        stime = time.time()
        KNN_lner.addEvidence(Xdcp_train, Ydcp_train)
        etime = time.time()
        KNN_dcp_result[1][k -
                          1] = (etime - stime) / dcp_trp  # training time cost

        stime = time.time()
        Ydcp_learn = KNN_lner.query(Xdcp_test)
        etime = time.time()
        KNN_dcp_result[2][k - 1] = (etime - stime) / (dcp_row_N - dcp_trp
                                                      )  # query time cost

        KNN_dcp_result[3][k - 1] = KNN_dcp_result[1][
            k - 1] + KNN_dcp_result[2][k - 1]  # total time cost

        KNN_dcp_result[4][k - 1] = RMSE(Ydcp_test,
                                        Ydcp_learn)  # Root-Mean-Square error

        KNN_dcp_result[5][k - 1] = np.corrcoef(
            Ydcp_learn.T, Ydcp_test.T)[0][1]  # correlation coefficient

        # query the training set itself to measure in-sample error
        Ydcp_osp = KNN_lner.query(Xdcp_train)
        KNN_dcp_result[6][k - 1] = RMSE(
            Ydcp_train,
            Ydcp_osp)  # in-sample RMS error

        # results of data-ripple-prob.csv
        stime = time.time()
        KNN_lner.addEvidence(Xdrp_train, Ydrp_train)
        etime = time.time()
        KNN_drp_result[1][k -
                          1] = (etime - stime) / drp_trp  # training time cost

        stime = time.time()
        Ydrp_learn = KNN_lner.query(Xdrp_test)
        etime = time.time()
        KNN_drp_result[2][k - 1] = (etime - stime) / (drp_row_N - drp_trp
                                                      )  # query time cost

        KNN_drp_result[3][k - 1] = KNN_drp_result[1][
            k - 1] + KNN_drp_result[2][k - 1]  # total time cost

        KNN_drp_result[4][k - 1] = RMSE(Ydrp_test,
                                        Ydrp_learn)  # Root-Mean-Square error

        KNN_drp_result[5][k - 1] = np.corrcoef(
            Ydrp_learn.T, Ydrp_test.T)[0][1]  # correlation coefficient

        # in-sample error of ripple (query on the training data itself)
        Ydrp_osp = KNN_lner.query(Xdrp_train)
        KNN_drp_result[6][k - 1] = RMSE(
            Ydrp_train,
            Ydrp_osp)  # in-sample RMS error

        # plot predicted vs. actual Y (classification at k == 27, ripple at k == 3)
        if k == 27:
            # plot the Y data of classification data
            plt.clf()
            fig = plt.figure()
            fig.suptitle('Y of classification data')
            plt.plot(Ydcp_test, Ydcp_learn, 'o', markersize=5)
            plt.xlabel('Actual Y')
            plt.ylabel('Predicted Y')
            fig.savefig('classification_Y.pdf', format='pdf')

        if k == 3:
            # plot the Y data of ripple data
            plt.clf()
            fig = plt.figure()
            fig.suptitle('Y of ripple data')
            plt.plot(Ydrp_test, Ydrp_learn, 'o', markersize=5)
            plt.xlabel('Actual Y')
            plt.ylabel('Predicted Y')
            fig.savefig('ripple_Y.pdf', format='pdf')

    print KNN_dcp_result[:, 2]  # column index 2 <=> k = 3 for dcp.csv
    Kdcp_best_pos = np.argmax(KNN_dcp_result[
        5, :])  # column index (k - 1) of the maximum correlation coefficient
    print KNN_dcp_result[:, Kdcp_best_pos]

    print KNN_drp_result[:, 2]  # column index 2 <=> k = 3 for drp.csv
    Kdrp_best_pos = np.argmax(
        KNN_drp_result[5, :])  # column index of the maximum correlation
    print KNN_drp_result[:, Kdrp_best_pos]

    #plot the correlation
    plt.clf()
    fig = plt.figure()
    plt.plot(KNN_dcp_result[0, :],
             KNN_dcp_result[5, :],
             'r',
             label='Classification')
    plt.plot(KNN_drp_result[0, :], KNN_drp_result[5, :], 'b', label='Ripple')
    plt.legend()
    plt.xlabel('K')
    plt.ylabel('Correlation Coefficient')
    fig.savefig('Correlation_KNN.pdf', format='pdf')

    #plot the error between in-sample and out-of-sample data
    plt.clf()
    fig = plt.figure()
    fig.suptitle('RMS error of classification data')
    plt.plot(KNN_dcp_result[0, :],
             KNN_dcp_result[4, :],
             'or',
             label='out of sample')
    plt.plot(KNN_dcp_result[0, :],
             KNN_dcp_result[6, :],
             'ob',
             label='in sample')
    plt.legend(loc=4)
    plt.xlabel('K')
    plt.ylabel('RMS Error')

    fig.savefig('classification-RMSE.pdf', format='pdf')

    plt.clf()
    fig = plt.figure()
    fig.suptitle('RMS error of ripple data')
    plt.plot(KNN_drp_result[0, :],
             KNN_drp_result[4, :],
             'or',
             label='out of sample')
    plt.plot(KNN_drp_result[0, :],
             KNN_drp_result[6, :],
             'ob',
             label='in sample')
    plt.legend(loc=4)
    plt.xlabel('K')
    plt.ylabel('RMS Error')
    plt.savefig('ripple-RMSE.pdf', format='pdf')

    # plot the train time
    plt.clf()
    fig = plt.figure()
    plt.plot(KNN_dcp_result[0, :],
             KNN_dcp_result[1, :],
             'r',
             label='Classification')
    plt.plot(KNN_drp_result[0, :], KNN_drp_result[1, :], 'b', label='Ripple')
    plt.legend(loc=1)
    plt.xlabel('K')
    plt.ylabel('train time / s')
    fig.savefig('traintime.pdf', format='pdf')

    # plot the query time
    plt.clf()
    fig = plt.figure()
    plt.plot(KNN_dcp_result[0, :],
             KNN_dcp_result[2, :],
             'r',
             label='Classification')
    plt.plot(KNN_drp_result[0, :], KNN_drp_result[2, :], 'b', label='Ripple')
    plt.legend(loc=4)
    plt.xlabel('K')
    plt.ylabel('query time / s')
    fig.savefig('querytime.pdf', format='pdf')

    # Linear regression: results array layout is
    # [train time/sample, query time/sample, total time, RMSE, corr coef]
    LR_lner = LinRegLearner()
    LR_dcp_result = np.zeros(
        5)  #Linear regression results of data-classification-prob.csv
    LR_drp_result = np.zeros(
        5)  #Linear regression results of data-ripple-prob.csv

    # results of data-classification-prob.csv
    stime = time.time()
    dcp_cof = LR_lner.addEvidence(Xdcp_train, Ydcp_train)
    etime = time.time()
    LR_dcp_result[0] = (etime - stime) / dcp_trp  # train time cost

    stime = time.time()
    Ydcp_LRL = LR_lner.query(Xdcp_test, dcp_cof)
    etime = time.time()
    LR_dcp_result[1] = (etime - stime) / (dcp_row_N - dcp_trp
                                          )  # query time cost

    LR_dcp_result[2] = LR_dcp_result[0] + LR_dcp_result[1]  # total time cost

    LR_dcp_result[3] = RMSE(Ydcp_test, Ydcp_LRL)  # root-mean-square error

    LR_dcp_result[4] = np.corrcoef(Ydcp_test.T,
                                   Ydcp_LRL.T)[0][1]  # correlation coefficient

    print LR_dcp_result

    # results of data-ripple-prob.csv
    stime = time.time()
    drp_cof = LR_lner.addEvidence(Xdrp_train, Ydrp_train)
    etime = time.time()
    LR_drp_result[0] = (etime - stime) / drp_trp  # train time cost

    stime = time.time()
    Ydrp_LRL = LR_lner.query(Xdrp_test, drp_cof)
    etime = time.time()
    LR_drp_result[1] = (etime - stime) / (drp_row_N - drp_trp
                                          )  # query time cost

    LR_drp_result[2] = LR_drp_result[0] + LR_drp_result[1]  # total time cost

    LR_drp_result[3] = RMSE(Ydrp_test, Ydrp_LRL)  # root-mean-square error

    LR_drp_result[4] = np.corrcoef(Ydrp_test.T,
                                   Ydrp_LRL.T)[0][1]  # correlation coefficient

    print LR_drp_result
def main():
    """Sweep KNNLearner over k = 1..300 (mean and median variants) on two
    CSV data sets; print the best correlation per file/method and plot
    k vs. correlation coefficient to one PDF per data set."""
    trainpercent = 60
    isRandomSplit = False  # NOTE(review): unused in this function

    filenames = ['data-classification-prob.csv', 'data-ripple-prob.csv']
    outputfilenames = ['plot1.pdf', 'plot2.pdf']
    trainfilenames = ['traintime1.pdf', 'traintime2.pdf']  # NOTE(review): unused here
    testfilenames = ['testtime1.pdf', 'testtime2.pdf']  # NOTE(review): unused here
    methods = ['mean', 'median']

    for index in range(2):
        #read data from data file ('input' shadows the builtin)
        input = np.loadtxt(filenames[index], delimiter=',')
        # NOTE(review): math.floor returns a float in Python 2; it is used
        # as a slice index below, which old numpy tolerated.
        trainsize = math.floor(input.shape[0] * trainpercent / 100)

        #split data into train and test sets
        Xtrain = input[0:trainsize, :-1]
        Ytrain = input[0:trainsize, -1]
        Xtest = input[trainsize:, :-1]
        Ytest = input[trainsize:, -1]

        MAXK = 300
        NUMCOLS = 4  # columns: k, corrcoef, train time/instance, test time/instance

        meanstats = np.zeros((MAXK, NUMCOLS), dtype=np.float)
        medianstats = np.zeros((MAXK, NUMCOLS), dtype=np.float)

        avgtraintime = -1
        avgtesttime = -1

        for method in methods:
            stats = np.zeros((MAXK, NUMCOLS), dtype=np.float)
            bestcorr = -1000
            bestK = -1

            for k in range(1, MAXK + 1):
                #instantiate learner and test
                learner = KNNLearner(k, method)

                #get start time
                trainstarttime = dt.datetime.now()
                learner.addEvidence(Xtrain, Ytrain)
                #get end time for adding evidence
                trainendtime = dt.datetime.now()

                #get start time
                teststarttime = dt.datetime.now()
                Y = learner.query(Xtest)
                #get end time for testing
                testendtime = dt.datetime.now()

                #compute corrcoef and track the best k
                corr = np.corrcoef(Ytest.T, Y.T)
                if corr[0, 1] > bestcorr:
                    bestcorr = corr[0, 1]
                    bestK = k

                stats[k - 1, 0] = k
                stats[k - 1, 1] = corr[0, 1]
                # datetime.timedelta.total_seconds() needs python >= 2.7,
                # hence the gettotalseconds() helper.
                stats[k - 1, 2] = gettotalseconds(
                    trainstarttime, trainendtime) / Xtrain.shape[0]
                stats[k - 1, 3] = gettotalseconds(teststarttime,
                                                  testendtime) / Xtest.shape[0]

                # remember the k=3/mean timings for the summary line below
                if k == 3 and method == 'mean':
                    avgtraintime = stats[k - 1, 2]
                    avgtesttime = stats[k - 1, 3]

            print 'File:%s Method:%s BestCorrelation:%f K corresponding to best correlation:%f AvgTrainTimeForK3Mean :%f seconds AvgTestTimeForK3Mean:%f seconds' % (
                filenames[index], method, bestcorr, bestK, avgtraintime,
                avgtesttime)

            if method == 'median':
                medianstats = stats.copy()
            else:
                meanstats = stats.copy()

        timedelta = 1  # NOTE(review): unused in this function

        #Graph for k versus corrcoef
        plt.cla()
        plt.clf()
        plt.plot(meanstats[:, 0], meanstats[:, 1], color='r')
        plt.plot(medianstats[:, 0], medianstats[:, 1], color='b')
        plt.legend(('method=mean', 'method=median'), loc='upper right')
        plt.ylabel('Correlation Coefficient')
        plt.xlabel('k')
        plt.savefig(outputfilenames[index], format='pdf')
# Ejemplo n.º 3
# 0
def main():
  """Compare the brute-force KNNLearner against the kd-tree kdtknn learner
  on data-ripple-prob.csv (k = 1..30, mean and median variants) and plot
  per-instance train/query times to four PDFs."""
  trainpercent = 60
  methods = ['mean','median']

  #read data from data file ('input' shadows the builtin)
  input = np.loadtxt('data-ripple-prob.csv', delimiter=',')
  # NOTE(review): math.floor returns a float in Python 2; used as slice index below
  trainsize = math.floor(input.shape[0]*trainpercent/100)

  #split data into train and test sets
  Xtrain = input[0:trainsize,:-1]
  Ytrain = input[0:trainsize,-1]
  Xtest = input[trainsize:,:-1]
  Ytest = input[trainsize:,-1]

  MAXK = 30
  NUMCOLS = 5  # columns: k, knn train t, knn query t, kdt train t, kdt query t

  meanstats = np.zeros((MAXK, NUMCOLS), dtype=np.float)
  medianstats = np.zeros((MAXK, NUMCOLS), dtype=np.float)

  for method in methods:
    stats = np.zeros((MAXK, NUMCOLS), dtype=np.float)

    for k in range(1, MAXK+1):
      #instantiate learner and test
      learner = KNNLearner(k, method)

      #get start time
      trainstarttime = dt.datetime.now()
      learner.addEvidence(Xtrain, Ytrain)
      #get end time for adding evidence
      trainendtime = dt.datetime.now()

      #get start time
      teststarttime = dt.datetime.now()
      Y = learner.query(Xtest)
      #get end time for testing
      testendtime = dt.datetime.now()

      stats[k-1, 0] = k
      stats[k-1, 1] = gettotalseconds(trainstarttime, trainendtime)/Xtrain.shape[0]
      stats[k-1, 2] = gettotalseconds(teststarttime, testendtime)/Xtest.shape[0]

      # same measurements for the kd-tree implementation
      kdtlearner = kdtknn(k, method)
      #get start time
      trainstarttime = dt.datetime.now()
      kdtlearner.addEvidence(Xtrain, Ytrain)
      #get end time for adding evidence
      trainendtime = dt.datetime.now()

      #get start time
      teststarttime = dt.datetime.now()
      Y = kdtlearner.query(Xtest)
      #get end time for testing
      testendtime = dt.datetime.now()

      stats[k-1, 3] = gettotalseconds(trainstarttime, trainendtime)/Xtrain.shape[0]
      stats[k-1, 4] = gettotalseconds(teststarttime, testendtime)/Xtest.shape[0]

    if method == 'median':
      medianstats = stats.copy()
    else:
      meanstats = stats.copy()

  #Graph of time/instance versus k, one PDF per stats column
  timedelta = 0.001  # y-axis padding
  outputfilenames = ['mytraining.pdf', 'myquery.pdf', 'kdtknntraining.pdf', 'kdtknnquery.pdf']
  titles = ['mytrainingtime/instance', 'myquerytime/instance', 'kdtknntrainingtime/instance', 'kdtknnquerytime/instance']
  for index in range(1, NUMCOLS):
    plt.cla()
    plt.clf()
    plt.plot(meanstats[:,0], meanstats[:,index], color='r')
    plt.plot(medianstats[:,0], medianstats[:,index], color='b')
    plt.legend(('method=mean', 'method=median'), loc='upper right')
    plt.ylabel(titles[index-1])
    plt.xlabel('k')
    plt.ylim(min(min(meanstats[:,index]), min(medianstats[:,index]))-timedelta, max(max(meanstats[:,index]), max(medianstats[:,index]))+timedelta)
    plt.savefig(outputfilenames[index-1],format='pdf')
def main():
  """Sweep KNNLearner over k = 1..300 (mean and median variants) on two CSV
  data sets; print the best correlation per file/method and plot k vs.
  correlation coefficient to one PDF per data set."""
  trainpercent = 60
  isRandomSplit = False  # NOTE(review): unused in this function

  filenames = ['data-classification-prob.csv', 'data-ripple-prob.csv']
  outputfilenames = ['plot1.pdf', 'plot2.pdf']
  trainfilenames = ['traintime1.pdf', 'traintime2.pdf']  # NOTE(review): unused here
  testfilenames = ['testtime1.pdf', 'testtime2.pdf']  # NOTE(review): unused here
  methods = ['mean','median']

  for index in range(2):
    #read data from data file ('input' shadows the builtin)
    input = np.loadtxt(filenames[index], delimiter=',')
    # NOTE(review): math.floor returns a float in Python 2; used as slice index below
    trainsize = math.floor(input.shape[0]*trainpercent/100)

    #split data into train and test sets
    Xtrain = input[0:trainsize,:-1]
    Ytrain = input[0:trainsize,-1]
    Xtest = input[trainsize:,:-1]
    Ytest = input[trainsize:,-1]

    MAXK = 300
    NUMCOLS = 4  # columns: k, corrcoef, train time/instance, test time/instance

    meanstats = np.zeros((MAXK, NUMCOLS), dtype=np.float)
    medianstats = np.zeros((MAXK, NUMCOLS), dtype=np.float)

    avgtraintime = -1
    avgtesttime = -1

    for method in methods:
      stats = np.zeros((MAXK, NUMCOLS), dtype=np.float)
      bestcorr = -1000
      bestK = -1

      for k in range(1, MAXK+1):
        #instantiate learner and test
        learner = KNNLearner(k, method)

        #get start time
        trainstarttime = dt.datetime.now()
        learner.addEvidence(Xtrain, Ytrain)
        #get end time for adding evidence
        trainendtime = dt.datetime.now()

        #get start time
        teststarttime = dt.datetime.now()
        Y = learner.query(Xtest)
        #get end time for testing
        testendtime = dt.datetime.now()

        #compute corrcoef and track the best k
        corr = np.corrcoef(Ytest.T, Y.T)
        if corr[0,1] > bestcorr:
          bestcorr = corr[0,1]
          bestK = k

        stats[k-1, 0] = k
        stats[k-1, 1] = corr[0,1]
        # datetime.timedelta.total_seconds() needs python >= 2.7,
        # hence the gettotalseconds() helper.
        stats[k-1, 2] = gettotalseconds(trainstarttime, trainendtime)/Xtrain.shape[0]
        stats[k-1, 3] = gettotalseconds(teststarttime, testendtime)/Xtest.shape[0]

        # remember the k=3/mean timings for the summary line below
        if k == 3 and method == 'mean':
          avgtraintime = stats[k-1,2]
          avgtesttime = stats[k-1,3]

      print 'File:%s Method:%s BestCorrelation:%f K corresponding to best correlation:%f AvgTrainTimeForK3Mean :%f seconds AvgTestTimeForK3Mean:%f seconds'%(filenames[index], method, bestcorr, bestK, avgtraintime, avgtesttime)

      if method == 'median':
        medianstats = stats.copy()
      else:
        meanstats = stats.copy()

    timedelta = 1  # NOTE(review): unused in this function

    #Graph for k versus corrcoef
    plt.cla()
    plt.clf()
    plt.plot(meanstats[:,0], meanstats[:,1], color='r')
    plt.plot(medianstats[:,0], medianstats[:,1], color='b')
    plt.legend(('method=mean', 'method=median'), loc='upper right')
    plt.ylabel('Correlation Coefficient')
    plt.xlabel('k')
    plt.savefig(outputfilenames[index],format='pdf')
# Ejemplo n.º 5
# 0
def testlearner():
	'''
	Benchmark KNNLearner (k = 1..50) and LinRegLearner on two CSV data sets:
	per-k timing, RMS error and correlation statistics, plus PDF plots.
	'''

	Xdcp, Ydcp = _csv_read("data-classification-prob.csv")
	Xdrp, Ydrp = _csv_read("data-ripple-prob.csv") # NOTE(review): original comment claimed the arrays hold strings, not floats -- verify _csv_read

	#divide data for train and test
	dcp_row_N = Xdcp.shape[0]
	drp_row_N = Xdrp.shape[0]
	trainperct = 0.6 # data for training is 60% of total data
	dcp_trp = int(dcp_row_N * trainperct)
	drp_trp = int(drp_row_N * trainperct)
	#data for training (Y reshaped into an (n, 1) column vector)
	Xdcp_train = Xdcp[0:dcp_trp, :]
	Ydcp_train = np.zeros([dcp_trp, 1])
	Ydcp_train[:, 0] = Ydcp[0:dcp_trp]
	Xdrp_train = Xdrp[0:drp_trp, :]
	Ydrp_train = np.zeros([drp_trp, 1])
	Ydrp_train[:, 0] = Ydrp[0:drp_trp]
	#data for test (query)
	Xdcp_test = Xdcp[dcp_trp:dcp_row_N, :]
	Ydcp_test = np.zeros([dcp_row_N - dcp_trp, 1])
	Ydcp_test[:, 0] = Ydcp[dcp_trp:dcp_row_N]
	Xdrp_test = Xdrp[drp_trp:drp_row_N, :]
	Ydrp_test = np.zeros([drp_row_N - drp_trp, 1])
	Ydrp_test[:, 0] = Ydrp[drp_trp:drp_row_N]

	#KNN learner

	# Result matrix layout (one column per k = 1..50); rows are:
	# 0: k, 1: train time/sample, 2: query time/sample, 3: total time,
	# 4: out-of-sample RMSE, 5: correlation coefficient, 6: in-sample RMSE
	KNN_dcp_result = np.zeros([7, 50]) # result of data-classification-prob.csv
	KNN_drp_result = np.zeros([7, 50]) # result of data-ripple-prob.csv

	for k in range(1, 51):
		KNN_lner = KNNLearner(k)
		KNN_dcp_result[0][k-1] = k
		KNN_drp_result[0][k-1] = k

		# results of data-classification-prob.csv
		stime = time.time()
		KNN_lner.addEvidence(Xdcp_train, Ydcp_train)
		etime = time.time()
		KNN_dcp_result[1][k-1] = (etime - stime) / dcp_trp # training time cost

		stime = time.time()
		Ydcp_learn = KNN_lner.query(Xdcp_test)
		etime = time.time()
		KNN_dcp_result[2][k-1] = (etime - stime) / (dcp_row_N - dcp_trp) # query time cost

		KNN_dcp_result[3][k-1] = KNN_dcp_result[1][k-1] + KNN_dcp_result[2][k-1] # total time cost

		KNN_dcp_result[4][k-1] = RMSE(Ydcp_test, Ydcp_learn) # Root-Mean-Square error

		KNN_dcp_result[5][k-1] = np.corrcoef(Ydcp_learn.T, Ydcp_test.T)[0][1] # correlation coefficient

		# query the training set itself to measure in-sample error
		Ydcp_osp = KNN_lner.query(Xdcp_train)
		KNN_dcp_result[6][k-1] = RMSE(Ydcp_train, Ydcp_osp) # in-sample RMS error

		# results of data-ripple-prob.csv
		stime = time.time()
		KNN_lner.addEvidence(Xdrp_train, Ydrp_train)
		etime = time.time()
		KNN_drp_result[1][k-1] = (etime - stime) / drp_trp # training time cost

		stime = time.time()
		Ydrp_learn = KNN_lner.query(Xdrp_test)
		etime = time.time()
		KNN_drp_result[2][k-1] = (etime - stime) / (drp_row_N - drp_trp) # query time cost

		KNN_drp_result[3][k-1] = KNN_drp_result[1][k-1] + KNN_drp_result[2][k-1] # total time cost

		KNN_drp_result[4][k-1] = RMSE(Ydrp_test, Ydrp_learn) # Root-Mean-Square error

		KNN_drp_result[5][k-1] = np.corrcoef(Ydrp_learn.T, Ydrp_test.T)[0][1] # correlation coefficient

		# in-sample error of ripple (query on the training data itself)
		Ydrp_osp = KNN_lner.query(Xdrp_train)
		KNN_drp_result[6][k-1] = RMSE(Ydrp_train, Ydrp_osp) # in-sample RMS error

		# plot predicted vs. actual Y (classification at k == 27, ripple at k == 3)
		if k == 27:
			# plot the Y data of classification data
			plt.clf()
			fig = plt.figure()
			fig.suptitle('Y of classification data')
			plt.plot(Ydcp_test, Ydcp_learn, 'o', markersize = 5)
			plt.xlabel('Actual Y')
			plt.ylabel('Predicted Y')
			fig.savefig('classification_Y.pdf', format = 'pdf')

		if k == 3:
			# plot the Y data of ripple data
			plt.clf()
			fig = plt.figure()
			fig.suptitle('Y of ripple data')
			plt.plot(Ydrp_test, Ydrp_learn, 'o', markersize = 5)
			plt.xlabel('Actual Y')
			plt.ylabel('Predicted Y')
			fig.savefig('ripple_Y.pdf', format = 'pdf')

	print KNN_dcp_result[:, 2] # column index 2 <=> k = 3 for dcp.csv
	Kdcp_best_pos = np.argmax(KNN_dcp_result[5, :])	# column index (k - 1) of the maximum correlation coefficient
	print KNN_dcp_result[:, Kdcp_best_pos]

	print KNN_drp_result[:, 2] # column index 2 <=> k = 3 for drp.csv
	Kdrp_best_pos = np.argmax(KNN_drp_result[5, :]) # column index of the maximum correlation
	print KNN_drp_result[:, Kdrp_best_pos]

	#plot the correlation
	plt.clf()
	fig = plt.figure()
	plt.plot(KNN_dcp_result[0, :], KNN_dcp_result[5, :], 'r', label = 'Classification')
	plt.plot(KNN_drp_result[0, :], KNN_drp_result[5, :], 'b', label = 'Ripple')
	plt.legend()
	plt.xlabel('K')
	plt.ylabel('Correlation Coefficient')
	fig.savefig('Correlation_KNN.pdf', format = 'pdf')

	#plot the error between in-sample and out-of-sample data
	plt.clf()
	fig = plt.figure()
	fig.suptitle('RMS error of classification data')
	plt.plot(KNN_dcp_result[0, :], KNN_dcp_result[4, :], 'or', label = 'out of sample')
	plt.plot(KNN_dcp_result[0, :], KNN_dcp_result[6, :], 'ob', label = 'in sample')
	plt.legend(loc = 4)
	plt.xlabel('K')
	plt.ylabel('RMS Error')

	fig.savefig('classification-RMSE.pdf', format = 'pdf')

	plt.clf()
	fig = plt.figure()
	fig.suptitle('RMS error of ripple data')
	plt.plot(KNN_drp_result[0, :], KNN_drp_result[4, :], 'or', label = 'out of sample')
	plt.plot(KNN_drp_result[0, :], KNN_drp_result[6, :], 'ob', label = 'in sample')
	plt.legend(loc = 4)
	plt.xlabel('K')
	plt.ylabel('RMS Error')
	plt.savefig('ripple-RMSE.pdf', format = 'pdf')

	# plot the train time
	plt.clf()
	fig = plt.figure()
	plt.plot(KNN_dcp_result[0, :], KNN_dcp_result[1, :], 'r', label = 'Classification')
	plt.plot(KNN_drp_result[0, :], KNN_drp_result[1, :], 'b', label = 'Ripple')
	plt.legend(loc=1)
	plt.xlabel('K')
	plt.ylabel('train time / s')
	fig.savefig('traintime.pdf', format = 'pdf')

	# plot the query time
	plt.clf()
	fig = plt.figure()
	plt.plot(KNN_dcp_result[0, :], KNN_dcp_result[2, :], 'r', label = 'Classification')
	plt.plot(KNN_drp_result[0, :], KNN_drp_result[2, :], 'b', label = 'Ripple')
	plt.legend(loc=4)
	plt.xlabel('K')
	plt.ylabel('query time / s')
	fig.savefig('querytime.pdf', format = 'pdf')

	# Linear regression: results array layout is
	# [train time/sample, query time/sample, total time, RMSE, corr coef]
	LR_lner = LinRegLearner()
	LR_dcp_result = np.zeros(5)	#Linear regression results of data-classification-prob.csv
	LR_drp_result = np.zeros(5) #Linear regression results of data-ripple-prob.csv

	# results of data-classification-prob.csv
	stime = time.time()
	dcp_cof = LR_lner.addEvidence(Xdcp_train, Ydcp_train)
	etime = time.time()
	LR_dcp_result[0] = (etime - stime) / dcp_trp  # train time cost

	stime = time.time()
	Ydcp_LRL = LR_lner.query(Xdcp_test, dcp_cof)
	etime = time.time()
	LR_dcp_result[1] = (etime - stime) / (dcp_row_N - dcp_trp) # query time cost

	LR_dcp_result[2] = LR_dcp_result[0] + LR_dcp_result[1] # total time cost

	LR_dcp_result[3] = RMSE(Ydcp_test, Ydcp_LRL) # root-mean-square error

	LR_dcp_result[4] = np.corrcoef(Ydcp_test.T, Ydcp_LRL.T)[0][1] # correlation coefficient

	print LR_dcp_result

	# results of data-ripple-prob.csv
	stime = time.time()
	drp_cof = LR_lner.addEvidence(Xdrp_train, Ydrp_train)
	etime = time.time()
	LR_drp_result[0] = (etime - stime) / drp_trp # train time cost

	stime = time.time()
	Ydrp_LRL = LR_lner.query(Xdrp_test, drp_cof)
	etime = time.time()
	LR_drp_result[1] = (etime - stime) / (drp_row_N - drp_trp) # query time cost

	LR_drp_result[2] = LR_drp_result[0] + LR_drp_result[1] # total time cost

	LR_drp_result[3] = RMSE(Ydrp_test, Ydrp_LRL) # root-mean-square error

	LR_drp_result[4] = np.corrcoef(Ydrp_test.T, Ydrp_LRL.T)[0][1] # correlation coefficient

	print LR_drp_result
def testlearner():
	'''
	Benchmark RandomForestLearner against KNNLearner for k = 1..100 on two
	CSV data sets, then plot RMS error and correlation comparisons to PDFs.
	'''

	Xdcp, Ydcp = _csv_read("data-classification-prob.csv")
	Xdrp, Ydrp = _csv_read("data-ripple-prob.csv") # NOTE(review): original comment claimed the arrays hold strings, not floats -- verify _csv_read

	#divide data for train and test
	dcp_row_N = Xdcp.shape[0]
	drp_row_N = Xdrp.shape[0]
	trainperct = 0.6 # data for training is 60% of total data
	dcp_trp = int(dcp_row_N * trainperct)
	drp_trp = int(drp_row_N * trainperct)
	#data for training (Y reshaped into an (n, 1) column vector)
	Xdcp_train = Xdcp[0:dcp_trp, :]
	Ydcp_train = np.zeros([dcp_trp, 1])
	Ydcp_train[:, 0] = Ydcp[0:dcp_trp]
	Xdrp_train = Xdrp[0:drp_trp, :]
	Ydrp_train = np.zeros([drp_trp, 1])
	Ydrp_train[:, 0] = Ydrp[0:drp_trp]
	#data for test (query)
	Xdcp_test = Xdcp[dcp_trp:dcp_row_N, :]
	Ydcp_test = np.zeros([dcp_row_N - dcp_trp, 1])
	Ydcp_test[:, 0] = Ydcp[dcp_trp:dcp_row_N]
	Xdrp_test = Xdrp[drp_trp:drp_row_N, :]
	Ydrp_test = np.zeros([drp_row_N - drp_trp, 1])
	Ydrp_test[:, 0] = Ydrp[drp_trp:drp_row_N]

	# DT_* rows: 0: k, 1: train time, 2: query time, 3: RMSE, 4: corr coef
	# KNN_* rows: 0: RMSE, 1: corr coef
	DT_dcp_result = np.zeros([5, 100]) # result of data-classification-prob.csv of RF
	DT_drp_result = np.zeros([5, 100]) # result of data-ripple-prob.csv of RF
	KNN_dcp_result = np.zeros([2, 100]) # results of data-classification-prob.csv of KNN
	KNN_drp_result = np.zeros([2, 100]) # results of data-ripple-prob.csv of KNN

	for k in range(1, 101):
		# build both learners for this k
		# NOTE(review): k's meaning differs per learner (presumably forest
		# size vs. neighbour count) -- verify against the learner APIs
		RFL = RandomForestLearner(k)
		KNN_lner = KNNLearner(k)

		DT_dcp_result[0][k-1] = k
		DT_drp_result[0][k-1] = k
		# result of data-classification-prob
		stime = time.time()
		RFL.addEvidence(Xdcp_train, Ydcp_train)
		etime = time.time()
		DT_dcp_result[1][k-1] = etime - stime

		KNN_lner.addEvidence(Xdcp_train, Ydcp_train)

		stime = time.time()
		Ydcp_learn = RFL.query(Xdcp_test)
		etime = time.time()
		DT_dcp_result[2][k-1] = etime - stime;

		Ydcp_learn_KNN = KNN_lner.query(Xdcp_test)

		DT_dcp_result[3][k-1] = RMSE(Ydcp_learn, Ydcp_test)
		KNN_dcp_result[0][k-1] = RMSE(Ydcp_learn_KNN, Ydcp_test)

		DT_dcp_result[4][k-1] = np.corrcoef(Ydcp_learn.T, Ydcp_test.T)[0][1]
		KNN_dcp_result[1][k-1] = np.corrcoef(Ydcp_learn_KNN.T, Ydcp_test.T)[0][1]

		# result of data-ripple (same learner objects retrained on ripple data)
		stime = time.time()
		RFL.addEvidence(Xdrp_train, Ydrp_train)
		etime = time.time()
		DT_drp_result[1][k-1] = etime - stime

		KNN_lner.addEvidence(Xdrp_train, Ydrp_train)

		stime = time.time()
		Ydrp_learn = RFL.query(Xdrp_test)
		etime = time.time()
		DT_drp_result[2][k-1] = etime - stime;

		Ydrp_learn_KNN = KNN_lner.query(Xdrp_test)

		DT_drp_result[3][k-1] = RMSE(Ydrp_learn, Ydrp_test)
		KNN_drp_result[0][k-1] = RMSE(Ydrp_learn_KNN, Ydrp_test)

		DT_drp_result[4][k-1] = np.corrcoef(Ydrp_learn.T, Ydrp_test.T)[0][1]
		KNN_drp_result[1][k-1] = np.corrcoef(Ydrp_learn_KNN.T, Ydrp_test.T)[0][1]

	# RMSE comparison, classification data
	plt.clf()
	fig = plt.figure()
	fig.suptitle('RMS Error of Classification data test')
	plt.plot(DT_dcp_result[0, :], DT_dcp_result[3, :], 'r', label = 'Random Forest')
	plt.plot(DT_dcp_result[0, :], KNN_dcp_result[0, :], 'b', label = 'KNN')
	plt.legend(loc = 1)
	plt.xlabel('K')
	plt.ylabel('RMS Error')
	fig.savefig('classification-RMSE.pdf', format = 'pdf')

	# correlation comparison, classification data
	plt.clf()
	fig = plt.figure()
	fig.suptitle('Correlation Coefficient of Classification data test')
	plt.plot(DT_dcp_result[0, :], DT_dcp_result[4, :], 'r', label = 'Random Forest')
	plt.plot(DT_dcp_result[0, :], KNN_dcp_result[1, :], 'b', label = 'KNN')
	plt.legend(loc = 4)
	plt.xlabel('K')
	plt.ylabel('Correlation Coefficient')
	fig.savefig('classification-Corr.pdf', format = 'pdf')

	# RMSE comparison, ripple data
	plt.clf()
	fig = plt.figure()
	fig.suptitle('RMS Error of Ripple data test')
	plt.plot(DT_drp_result[0, :], DT_drp_result[3, :], 'r', label = 'Random Forest')
	plt.plot(DT_drp_result[0, :], KNN_drp_result[0, :], 'b', label = 'KNN')
	plt.legend(loc = 2)
	plt.xlabel('K')
	plt.ylabel('RMS Error')
	fig.savefig('ripple-RMSE.pdf', format = 'pdf')

	# correlation comparison, ripple data
	plt.clf()
	fig = plt.figure()
	fig.suptitle('Correlation Coefficient of Ripple data test')
	plt.plot(DT_drp_result[0, :], DT_drp_result[4, :], 'r', label = 'Random Forest')
	plt.plot(DT_drp_result[0, :], KNN_drp_result[1, :], 'b', label = 'KNN')
	plt.legend(loc = 3)
	plt.xlabel('K')
	plt.ylabel('Correlation Coefficient')
	fig.savefig('ripple-Corr.pdf', format = 'pdf')
# Ejemplo n.º 7
# 0
def main():
    """Compare the brute-force KNNLearner against the kd-tree kdtknn learner.

    Loads data-ripple-prob.csv, splits it 60/40 into train/test rows, and
    for k = 1..30 times addEvidence/query for both learners under the
    'mean' and 'median' variants.  Per-instance train/query times are then
    plotted to four PDF files, one per measured column.
    """
    trainpercent = 60
    methods = ['mean', 'median']

    # Load the ripple data set and split 60/40 into train and test rows.
    data = np.loadtxt('data-ripple-prob.csv', delimiter=',')
    trainsize = math.floor(data.shape[0] * trainpercent / 100)
    Xtrain, Ytrain = data[0:trainsize, :-1], data[0:trainsize, -1]
    Xtest, Ytest = data[trainsize:, :-1], data[trainsize:, -1]

    MAXK = 30
    # stats columns: k, knn train t, knn query t, kdt train t, kdt query t
    NUMCOLS = 5

    meanstats = np.zeros((MAXK, NUMCOLS), dtype=np.float)
    medianstats = np.zeros((MAXK, NUMCOLS), dtype=np.float)

    for method in methods:
        stats = np.zeros((MAXK, NUMCOLS), dtype=np.float)

        for k in range(1, MAXK + 1):
            row = k - 1
            stats[row, 0] = k

            # Time the brute-force learner: one addEvidence, one query.
            knn = KNNLearner(k, method)
            t0 = dt.datetime.now()
            knn.addEvidence(Xtrain, Ytrain)
            t1 = dt.datetime.now()

            t2 = dt.datetime.now()
            Y = knn.query(Xtest)
            t3 = dt.datetime.now()

            stats[row, 1] = gettotalseconds(t0, t1) / Xtrain.shape[0]
            stats[row, 2] = gettotalseconds(t2, t3) / Xtest.shape[0]

            # Identical measurements for the kd-tree implementation.
            kdt = kdtknn(k, method)
            t0 = dt.datetime.now()
            kdt.addEvidence(Xtrain, Ytrain)
            t1 = dt.datetime.now()

            t2 = dt.datetime.now()
            Y = kdt.query(Xtest)
            t3 = dt.datetime.now()

            stats[row, 3] = gettotalseconds(t0, t1) / Xtrain.shape[0]
            stats[row, 4] = gettotalseconds(t2, t3) / Xtest.shape[0]

        if method == 'median':
            medianstats = stats.copy()
        else:
            meanstats = stats.copy()

    # One PDF per measured column: train/query time per instance for each
    # learner, mean (red) vs. median (blue).
    timedelta = 0.001  # y-axis padding
    outputfilenames = [
        'mytraining.pdf', 'myquery.pdf', 'kdtknntraining.pdf',
        'kdtknnquery.pdf'
    ]
    titles = [
        'mytrainingtime/instance', 'myquerytime/instance',
        'kdtknntrainingtime/instance', 'kdtknnquerytime/instance'
    ]
    for col, (fname, title) in enumerate(zip(outputfilenames, titles), 1):
        plt.cla()
        plt.clf()
        plt.plot(meanstats[:, 0], meanstats[:, col], color='r')
        plt.plot(medianstats[:, 0], medianstats[:, col], color='b')
        plt.legend(('method=mean', 'method=median'), loc='upper right')
        plt.ylabel(title)
        plt.xlabel('k')
        lo = min(meanstats[:, col].min(), medianstats[:, col].min()) - timedelta
        hi = max(meanstats[:, col].max(), medianstats[:, col].max()) + timedelta
        plt.ylim(lo, hi)
        plt.savefig(fname, format='pdf')