Example n. 1
0
def _read_data(filename):
    """Load a learner CSV data file as a float ndarray.

    For 'Istanbul.csv' the header row and the leading date column are
    stripped before the array is returned.
    """
    with util.get_learner_data_file(filename) as f:
        raw = np.genfromtxt(f, delimiter=',')
    if filename == 'Istanbul.csv':
        # Drop the header row (row 0) and the date column (column 0).
        raw = raw[1:, 1:]
    return raw
def get_rmse(n_leaves):
    """Measure RTLearner RMSE over growing training-set fractions.

    For each fraction in [0.05, 0.80) (step 0.05) a fresh random
    train/test split of the Istanbul data is drawn, an RTLearner with
    ``leaf_size=n_leaves`` is trained, and the in-sample and
    out-of-sample RMSE are recorded.

    :param n_leaves: leaf size passed to each RTLearner.
        (Bug fix: this parameter was previously ignored in favor of a
        hard-coded leaf size of 10.)
    :return: (rmse_train, rmse_test, percents) — two lists of RMSE
        values parallel to the ``percents`` array of fractions.
    """
    # Read the data once via the shared helper; the original re-read
    # and re-parsed the CSV on every loop iteration.
    alldata = _read_data('Istanbul.csv')
    datasize = alldata.shape[0]

    rmse_train, rmse_test = [], []
    percents = np.arange(0.05, 0.8, 0.05)
    for frac in percents:
        cutoff = int(datasize * frac)
        # Fresh random row permutation (split) and feature-column
        # shuffle for every training fraction, as before.
        permutation = np.random.permutation(datasize)
        col_permutation = np.random.permutation(alldata.shape[1] - 1)
        train_data = alldata[permutation[:cutoff], :]
        trainX = train_data[:, col_permutation]
        trainY = train_data[:, -1]
        test_data = alldata[permutation[cutoff:], :]
        testX = test_data[:, col_permutation]
        testY = test_data[:, -1]

        learner = rtl.RTLearner(leaf_size=n_leaves)
        learner.addEvidence(trainX, trainY)

        # Evaluate in sample.
        predY = learner.query(trainX)
        rmse_train.append(
            math.sqrt(((trainY - predY) ** 2).sum() / trainY.shape[0]))

        # Evaluate out of sample.
        predY = learner.query(testX)
        rmse_test.append(
            math.sqrt(((testY - predY) ** 2).sum() / testY.shape[0]))

    return rmse_train, rmse_test, percents
Example n. 3
0
def get_data_dict(dataName, portion = False, shuffle = False):
    """Load a learner data file and split it 60/40 into train/test.

    :param dataName: CSV file name understood by util.get_learner_data_file.
    :param portion: if truthy, keep only the leading ``portion`` fraction
        of rows (applied after the optional shuffle).
    :param shuffle: if True, shuffle the rows in place before splitting.
    :return: dict with keys 'trainX', 'trainY', 'testX', 'testY'.
    """
    with util.get_learner_data_file(dataName) as f:
        data = np.genfromtxt(f, delimiter=',')
        # Skip the date column and header row if we're working on Istanbul data
        if dataName == 'Istanbul.csv':
            data = data[1:, 1:]

    if shuffle:
        np.random.shuffle(data)
    if portion:
        data = data[:int(portion * data.shape[0])]

    # First 60% of rows train, remainder test.  (The original computed
    # train_rows twice; the first value was dead and has been removed.)
    train_rows = int(0.6 * data.shape[0])
    myData = {}
    myData['trainX'] = data[:train_rows, 0:-1]
    myData['trainY'] = data[:train_rows, -1]
    myData['testX'] = data[train_rows:, 0:-1]
    myData['testY'] = data[train_rows:, -1]
    return myData
def test_learners(description, group, datafile, seed, outputs, grader):
    """Test ML models returns correct predictions.

    Requires test description, test case group, inputs, expected outputs, and a grader fixture.

    Points accumulate per passing criterion; on failure an
    IncorrectOutput is raised and the (possibly negative) score plus a
    traceback filtered to the student's files is reported to ``grader``.
    """

    points_earned = 0.0  # initialize points for this test case
    # Bug fix: initialized up front so the final ``if incorrect`` check
    # cannot raise NameError when ``group`` matches none of the known
    # branches (previously assigned only inside the branches).
    incorrect = False
    try:
        # (BPH) Copied from grade_strategy_qlearning.py
        # Set fixed seed for repeatability, then remove the student
        # code's ability to re-seed either np.random or python random.
        np.random.seed(seed)
        random.seed(seed)
        tmp_numpy_seed = np.random.seed
        tmp_random_seed = random.seed
        np.random.seed = fake_seed
        random.seed = fake_rseed

        # Import the learners under test (only once per process).
        if not 'RTLearner' in globals():
            from RTLearner import RTLearner
        if not 'DTLearner' in globals():
            from DTLearner import DTLearner
        # Bug fixes: string comparisons use ``==`` rather than the
        # identity operator ``is`` (identity of equal string literals is
        # implementation-dependent), and the original precedence bug
        # ``a or b or c and d`` — which grouped as ``a or b or (c and
        # d)`` and so skipped the already-imported guard for the first
        # two groups — is corrected with explicit parentheses.
        if ((group == 'BagLearner') or (group == 'InsaneLearner') or
                (group == 'RandomName')) and (not 'BagLearner' in globals()):
            from BagLearner import BagLearner
        # Put the real seed functions back for the moment.
        np.random.seed = tmp_numpy_seed
        random.seed = tmp_random_seed

        # Read the data file and build a randomized 60/40 train/test
        # split with permuted feature columns.
        testX, testY, trainX, trainY = None, None, None, None
        permutation = None
        author = None
        with util.get_learner_data_file(datafile) as f:
            alldata = np.genfromtxt(f, delimiter=',')
            # Skip the date column and header row if we're working on Istanbul data
            if datafile == 'Istanbul.csv':
                alldata = alldata[1:, 1:]
            datasize = alldata.shape[0]
            cutoff = int(datasize * 0.6)
            permutation = np.random.permutation(alldata.shape[0])
            col_permutation = np.random.permutation(alldata.shape[1] - 1)
            train_data = alldata[permutation[:cutoff], :]
            trainX = train_data[:, col_permutation]
            trainY = train_data[:, -1]
            test_data = alldata[permutation[cutoff:], :]
            testX = test_data[:, col_permutation]
            testY = test_data[:, -1]
        msgs = []

        if (group == "RTLearner") or (group == "DTLearner"):
            clss_name = RTLearner if group == "RTLearner" else DTLearner
            tree_sptc = 3 if group == "RTLearner" else 10
            corr_in, corr_out, corr_in_50 = None, None, None

            def oneleaf():
                # Train/query with leaf_size=1 under the fixed seed.
                np.random.seed(seed)
                random.seed(seed)
                np.random.seed = fake_seed
                random.seed = fake_rseed
                learner = clss_name(leaf_size=1, verbose=False)
                learner.addEvidence(trainX, trainY)
                insample = learner.query(trainX)
                outsample = learner.query(testX)
                np.random.seed = tmp_numpy_seed
                random.seed = tmp_random_seed
                author_rv = None
                try:
                    author_rv = learner.author()
                except:
                    # Missing author() is handled by the check below.
                    pass
                return insample, outsample, author_rv

            def fiftyleaves():
                # In-sample predictions with leaf_size=50; expected to
                # underfit relative to leaf_size=1.
                np.random.seed(seed)
                random.seed(seed)
                np.random.seed = fake_seed
                random.seed = fake_rseed
                learner = clss_name(leaf_size=50, verbose=False)
                learner.addEvidence(trainX, trainY)
                np.random.seed = tmp_numpy_seed
                random.seed = tmp_random_seed
                return learner.query(trainX)

            predY_in, predY_out, author = run_with_timeout(
                oneleaf, tree_sptc, (), {})
            predY_in_50 = run_with_timeout(fiftyleaves, tree_sptc, (), {})
            corr_in = np.corrcoef(predY_in, y=trainY)[0, 1]
            corr_out = np.corrcoef(predY_out, y=testY)[0, 1]
            corr_in_50 = np.corrcoef(predY_in_50, y=trainY)[0, 1]

            if corr_in < outputs['insample_corr_min'] or np.isnan(corr_in):
                incorrect = True
                msgs.append(
                    "    In-sample with leaf_size=1 correlation less than allowed: got {} expected {}"
                    .format(corr_in, outputs['insample_corr_min']))
            else:
                points_earned += 1.0
            if corr_out < outputs['outsample_corr_min'] or np.isnan(corr_out):
                incorrect = True
                msgs.append(
                    "    Out-of-sample correlation less than allowed: got {} expected {}"
                    .format(corr_out, outputs['outsample_corr_min']))
            else:
                points_earned += 1.0
            if corr_in_50 > outputs['insample_corr_max'] or np.isnan(
                    corr_in_50):
                incorrect = True
                msgs.append(
                    "    In-sample correlation with leaf_size=50 greater than allowed: got {} expected {}"
                    .format(corr_in_50, outputs['insample_corr_max']))
            else:
                points_earned += 1.0
            # Check author string ('tb34' is the template author).
            if (author is None) or (author == 'tb34'):
                incorrect = True
                msgs.append("    Invalid author: {}".format(author))
                points_earned += -2.0

        elif group == "BagLearner":
            corr1, corr20 = None, None
            bag_sptc = 10

            def onebag():
                # One-bag baseline; also retrieves the author string.
                np.random.seed(seed)
                random.seed(seed)
                np.random.seed = fake_seed
                random.seed = fake_rseed
                learner1 = BagLearner(learner=RTLearner,
                                      kwargs={"leaf_size": 1},
                                      bags=1,
                                      boost=False,
                                      verbose=False)
                learner1.addEvidence(trainX, trainY)
                q_rv = learner1.query(testX)
                a_rv = learner1.author()
                np.random.seed = tmp_numpy_seed
                random.seed = tmp_random_seed
                return q_rv, a_rv

            def twentybags():
                # Twenty bags should beat the single-bag baseline.
                np.random.seed(seed)
                random.seed(seed)
                np.random.seed = fake_seed
                random.seed = fake_rseed
                learner20 = BagLearner(learner=RTLearner,
                                       kwargs={"leaf_size": 1},
                                       bags=20,
                                       boost=False,
                                       verbose=False)
                learner20.addEvidence(trainX, trainY)
                q_rv = learner20.query(testX)
                np.random.seed = tmp_numpy_seed
                random.seed = tmp_random_seed
                return q_rv

            predY1, author = run_with_timeout(onebag,
                                              bag_sptc,
                                              pos_args=(),
                                              keyword_args={})
            predY20 = run_with_timeout(twentybags, bag_sptc, (), {})

            corr1 = np.corrcoef(predY1, testY)[0, 1]
            corr20 = np.corrcoef(predY20, testY)[0, 1]
            if corr20 <= corr1:
                incorrect = True
                msgs.append(
                    "    Out-of-sample correlation for 20 bags is not greater than for 1 bag. 20 bags:{}, 1 bag:{}"
                    .format(corr20, corr1))
            else:
                points_earned += 2.0
            # Check author string ('tb34' is the template author).
            if (author is None) or (author == 'tb34'):
                incorrect = True
                msgs.append("    Invalid author: {}".format(author))
                points_earned += -1.0
        elif group == "InsaneLearner":
            try:

                def insane():
                    # Only checks that InsaneLearner runs to completion.
                    import InsaneLearner as it
                    learner = it.InsaneLearner(verbose=False)
                    learner.addEvidence(trainX, trainY)
                    Y = learner.query(testX)

                run_with_timeout(insane, 10, pos_args=(), keyword_args={})
            except Exception as e:
                incorrect = True
                msgs.append(
                    "    Exception calling InsaneLearner: {}".format(e))
                points_earned = -10
        elif group == "RandomName":
            try:
                # Generate a learner class with a random name and count
                # how often BagLearner calls into it.
                il_name, il_code = gen_class()
                exec(il_code) in globals(), locals()
                il_cobj = eval(il_name)

                def rnd_name():
                    np.random.seed(seed)
                    random.seed(seed)
                    np.random.seed = fake_seed
                    random.seed = fake_rseed
                    learner = BagLearner(learner=il_cobj,
                                         kwargs={'verbose': False},
                                         bags=20,
                                         boost=False,
                                         verbose=False)
                    learner.addEvidence(trainX, trainY)
                    Y = learner.query(testX)
                    np.random.seed = tmp_numpy_seed
                    random.seed = tmp_random_seed
                    return il_cobj.init_callcount_dict, il_cobj.add_callcount_dict, il_cobj.query_callcount_dict

                iccd, accd, qccd = run_with_timeout(rnd_name,
                                                    10,
                                                    pos_args=(),
                                                    keyword_args={})
                # Each of __init__/addEvidence/query must be called
                # exactly once per bag (20 bags).
                if (len(iccd) != 20) or (any([v != 1 for v in iccd.values()])):
                    incorrect = True
                    msgs.append(
                        "    Unexpected number of calls to __init__, sum={} (should be 20), max={} (should be 1), min={} (should be 1)"
                        .format(len(iccd), max(iccd.values()),
                                min(iccd.values())))
                    points_earned = -10
                if (len(accd) != 20) or (any([v != 1 for v in accd.values()])):
                    incorrect = True
                    msgs.append(
                        "    Unexpected number of calls to addEvidence sum={} (should be 20), max={} (should be 1), min={} (should be 1)"
                        .format(len(accd), max(accd.values()),
                                min(accd.values())))
                    points_earned = -10
                if (len(qccd) != 20) or (any([v != 1 for v in qccd.values()])):
                    incorrect = True
                    msgs.append(
                        "    Unexpected number of calls to query, sum={} (should be 20), max={} (should be 1), min={} (should be 1)"
                        .format(len(qccd), max(qccd.values()),
                                min(qccd.values())))
                    points_earned = -10
            except Exception as e:
                incorrect = True
                msgs.append("   Exception calling BagLearner: {}".format(e))
                points_earned = -10
        if incorrect:
            inputs_str = "    data file: {}\n" \
                         "    permutation: {}".format(datafile, permutation)
            # Call-form raise (valid in both Python 2 and 3) replaces
            # the Python-2-only ``raise Exc, msg`` spelling.
            raise IncorrectOutput(
                "Test failed on one or more output criteria.\n  Inputs:\n{}\n  Failures:\n{}".format(
                    inputs_str, "\n".join(msgs)))
    except Exception as e:
        # Test result: failed
        msg = "Description: {} (group: {})\n".format(description, group)

        # Generate a filtered stacktrace, only showing erroneous lines in student file(s)
        tb_list = tb.extract_tb(sys.exc_info()[2])
        for i in xrange(len(tb_list)):
            row = tb_list[i]
            tb_list[i] = (os.path.basename(row[0]), row[1], row[2], row[3]
                          )  # show only filename instead of long absolute path
        tb_list = [
            row for row in tb_list
            if (row[0] == 'RTLearner.py') or (row[0] == 'BagLearner.py')
        ]
        if tb_list:
            msg += "Traceback:\n"
            msg += ''.join(tb.format_list(tb_list))  # contains newlines
        msg += "{}: {}".format(e.__class__.__name__, e.message)

        # Report failure result to grader, with stacktrace
        grader.add_result(
            GradeResult(outcome='failed', points=points_earned, msg=msg))
        raise
    else:
        # Test result: passed (no exceptions)
        grader.add_result(
            GradeResult(outcome='passed', points=points_earned, msg=None))
def generate_plots(datafile='Istanbul.csv'):
    np.random.seed(10)
    print " Plotting graphs for " + str(datafile)
    with util.get_learner_data_file(datafile) as f:
        alldata = np.genfromtxt(f, delimiter=',')
        # Skip the date column and header row if we're working on Istanbul data
        if datafile == 'Istanbul.csv':
            alldata = alldata[1:, 1:]
    data = alldata
    # print data.shape[0]
    data = np.random.permutation(data)
    # compute how much of the data is training and testing
    np.random.shuffle(data)
    train_rows = int(0.6 * data.shape[0])
    test_rows = data.shape[0] - train_rows
    # separate out training and testing data
    trainX = data[:train_rows, 0:-1]
    trainY = data[:train_rows, -1]
    testX = data[train_rows:, 0:-1]
    testY = data[train_rows:, -1]

    rmses_in = np.empty(100)
    rmses_in.fill(-1)
    rmses_out = np.empty(100)
    rmses_out.fill(-1)

    dt_learner_tree_size = np.empty(100)
    dt_learner_tree_size.fill(-1)
    dt_learner_tree_time = np.empty(100)
    dt_learner_tree_time.fill(-1)

    leaf_size = range(100)
    leaf_plot = range(1, 101)
    for i in leaf_size:
        learner = dt.DTLearner(leaf_size=i + 1, verbose=False)  # constructor
        rmses_in[i], rmses_out[i], dt_learner_tree_size[
            i], dt_learner_tree_time[i] = run_learner(learner, trainX, trainY,
                                                      testX,
                                                      testY)  # training step
    plt.plot(leaf_plot, rmses_in, label="In Sample")
    plt.plot(leaf_plot, rmses_out, label="Out Sample")
    plt.legend(loc="best")
    plt.grid(True)
    plt.title('DTLearner overfitting with increasing leaf size.', fontsize=10)
    plt.ylabel('RMSE')
    plt.xlabel('Leaf Size')
    plt.savefig('DTLearner overfitting with increasing leaf size.')
    plt.clf()
    plt.cla()
    plt.close()

    rmses_in_2 = np.empty(100)
    rmses_in_2.fill(-1)
    rmses_out_2 = np.empty(100)
    rmses_out_2.fill(-1)
    for i in leaf_size:
        kwargs = {'leaf_size': i + 1}
        learner = bl.BagLearner(learner=dt.DTLearner,
                                kwargs=kwargs,
                                verbose=False,
                                bags=20)  # constructor
        rmses_in_2[i], rmses_out_2[i], ignore, ignore_2 = run_learner(
            learner, trainX, trainY, testX, testY)  # training step
    plt.plot(leaf_plot, rmses_in_2, label="In Sample")
    plt.plot(leaf_plot, rmses_out_2, label="Out Sample")
    plt.legend(loc="best")
    plt.grid(True)
    plt.title('Baglearner at 20 bags overfitting with increasing leaf size.',
              fontsize=10)
    plt.ylabel('RMSE')
    plt.xlabel('Leaf Size')
    plt.savefig('Baglearner at 20 bags overfitting with increasing leaf size.')
    plt.clf()
    plt.cla()
    plt.close()

    rmses_in_3 = np.empty(100)
    rmses_in_3.fill(-1)
    rmses_out_3 = np.empty(100)
    rmses_out_3.fill(-1)

    rt_learner_tree_size = np.empty(100)
    rt_learner_tree_size.fill(-1)
    rt_learner_tree_time = np.empty(100)
    rt_learner_tree_time.fill(-1)

    for i in leaf_size:
        learner = rt.RTLearner(leaf_size=i + 1, verbose=False)  # constructor
        rmses_in_3[i], rmses_out_3[i], rt_learner_tree_size[
            i], rt_learner_tree_time[i] = run_learner(learner, trainX, trainY,
                                                      testX,
                                                      testY)  # training step
    plt.plot(leaf_plot, rmses_in_3, label="In Sample")
    plt.plot(leaf_plot, rmses_out_3, label="Out Sample")
    plt.legend(loc="best")
    plt.grid(True)
    plt.title('RTLearner overfitting with increasing leaf size.', fontsize=10)
    plt.ylabel('RMSE')
    plt.xlabel('Leaf Size')
    plt.savefig('RTLearner overfitting with increasing leaf size.')
    plt.clf()
    plt.cla()
    plt.close()

    plt.plot(leaf_plot, rt_learner_tree_size, label="RTLearner")
    plt.plot(leaf_plot, dt_learner_tree_size, label="DTLearner")
    plt.legend(loc="best")
    plt.grid(True)
    plt.title('RTLearner vs DTLearner Tree Size.', fontsize=10)
    plt.ylabel('Tree Size')
    plt.xlabel('Leaf Size')
    plt.savefig('RTLearner vs DTLearner Tree Size.')
    plt.clf()
    plt.cla()
    plt.close()

    plt.plot(leaf_plot, rt_learner_tree_time, label="RTLearner")
    plt.plot(leaf_plot, dt_learner_tree_time, label="DTLearner")
    plt.legend(loc="best")
    plt.grid(True)
    plt.title('RTLearner vs DTLearner Tree Building Time.', fontsize=10)
    plt.ylabel('Time (Seconds)')
    plt.xlabel('Leaf Size')
    plt.savefig('RTLearner vs DTLearner Tree Building Time.')
    plt.clf()
    plt.cla()
    plt.close()

    rmses_in_5 = np.empty(100)
    rmses_in_5.fill(-1)
    rmses_out_5 = np.empty(100)
    rmses_out_5.fill(-1)
    for i in leaf_size:
        kwargs = {'leaf_size': i + 1}
        learner = bl.BagLearner(learner=dt.DTLearner,
                                kwargs=kwargs,
                                verbose=False,
                                bags=10)  # constructor
        rmses_in_5[i], rmses_out_5[i], ignore, ignore_2 = run_learner(
            learner, trainX, trainY, testX, testY)  # training step
    plt.plot(leaf_plot, rmses_in_5, label="In Sample")
    plt.plot(leaf_plot, rmses_out_5, label="Out Sample")
    plt.legend(loc="best")
    plt.grid(True)
    plt.title('Baglearner at 10 bags overfitting with increasing leaf size.',
              fontsize=10)
    plt.ylabel('RMSE')
    plt.xlabel('Leaf Size')
    plt.savefig('Baglearner at 10 bags overfitting with increasing leaf size.')
    plt.clf()
    plt.cla()
    plt.close()
Example n. 6
0
import DTLearner as dt
import RTLearner as rt
import BagLearner as bl
import sys
import matplotlib.pyplot as plt
import time
import scipy.stats as stats

if __name__ == "__main__":
    Path = './'
    if len(sys.argv) != 2:
        print "Usage: python testlearner.py <filename>"
        sys.exit(1)

    datafile = sys.argv[1]
    data = np.genfromtxt(util.get_learner_data_file(datafile), delimiter=',')

    if datafile == 'Istanbul.csv':
        data = data[1:, 1:]

    datasize = data.shape[0]
    cutoff = int(datasize * 0.6)
    leaf_sizes = range(1, 51)

    #Problem 1

    rmse_in = np.zeros((len(leaf_sizes), 5))
    rmse_out = np.zeros((len(leaf_sizes), 5))
    ind = 0

    for leaf_size in leaf_sizes:
Example n. 7
0
import DTLearner as dt
import RTLearner as rt
import BagLearner as bl
import util
import matplotlib.pyplot as plt
import sys


def author():
    """Return the author's Georgia Tech user ID."""
    return 'akarthik3'


if __name__ == "__main__":

    # Get istanbul.csv data and remove unwanted parts
    data = np.genfromtxt(util.get_learner_data_file('Istanbul.csv'),
                         delimiter=',')
    data = data[1:, 1:]

    # Separate data into training and testing (want every data point, split into 60-40 ratio between train/test)
    dataSize = int(0.6 * data.shape[0])
    xTraining = data[:dataSize, 0:-1]
    yTraining = data[:dataSize, -1]
    xTesting = data[dataSize:, 0:-1]
    yTesting = data[dataSize:, -1]

    # Test 1
    trainingRMSEs = np.zeros((100, 1))
    testingRMSEs = np.zeros((100, 1))
    for size in range(1, 101):
        learner = dt.DTLearner(size)
Example n. 8
0
from sklearn.datasets import load_digits
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit

from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt



if __name__=="__main__":
    datafile='Istanbul.csv'

    testX, testY, trainX, trainY = None, None, None, None
    permutation = None
    author = None
    with util.get_learner_data_file(datafile) as f:
        alldata = np.genfromtxt(f, delimiter=',')
        # Skip the date column and header row if we're working on Istanbul data
        if datafile == 'Istanbul.csv':
            alldata = alldata[1:, 1:]
        datasize = alldata.shape[0]
        cutoff = int(datasize * 0.6)
        permutation = np.random.permutation(alldata.shape[0])
        col_permutation = np.random.permutation(alldata.shape[1] - 1)
        train_data = alldata[permutation[:cutoff], :]
        # trainX = train_data[:,:-1]
        trainX = train_data[:, col_permutation]
        trainY = train_data[:, -1]
        test_data = alldata[permutation[cutoff:], :]
        # testX = test_data[:,:-1]
        testX = test_data[:, col_permutation]
Example n. 9
0
        data = removedHeader

    # remove non-numerical (e.g. date) on the first column
    if np.isnan(col).all():
        removedFirstCol = data[:, 1:]
        data = removedFirstCol
    return data


if __name__ == "__main__":
    if len(sys.argv) != 2:
        print "Usage: python testlearner.py <filename>"
        sys.exit(1)

    fileParam = sys.argv[1]
    data = np.genfromtxt(util.get_learner_data_file(fileParam), delimiter=',')

    data = remove_header_col(data)

    # compute how much of the data is training and testing
    train_rows = int(0.6 * data.shape[0])
    test_rows = data.shape[0] - train_rows

    # separate out training and testing data
    trainX = data[:train_rows, 0:-1]
    trainY = data[:train_rows, -1]
    testX = data[train_rows:, 0:-1]
    testY = data[train_rows:, -1]

    # Question 1
    in_rmse_result_Q1, out_rmse_result_Q1, in_corr_result_Q1, out_corr_result_Q1 = question_one(
Example n. 10
0
import numpy as np
import math
import LinRegLearner as lrl
import sys
import util
import DTLearner as dt
import RTLearner as rt
import BagLearner as bl

if __name__ == "__main__":
    if len(sys.argv) != 2:
        print "Usage: python testlearner.py <filename>"
        sys.exit(1)
    inf = sys.argv[1]
    with util.get_learner_data_file(inf) as f:
        data = np.genfromtxt(f, delimiter=',')
        # Skip the date column and header row if we're working on Istanbul data
        if inf == 'Istanbul.csv':
            data = data[1:, 1:]
    # compute how much of the data is training and testing
    train_rows = int(0.6 * data.shape[0])
    test_rows = data.shape[0] - train_rows

    # separate out training and testing data
    trainX = data[:train_rows, 0:-1]
    trainY = data[:train_rows, -1]
    testX = data[train_rows:, 0:-1]
    testY = data[train_rows:, -1]

    print testX.shape
import util
import numpy as np
import math  		   	  			    		  		  		    	 		 		   		 		  
import LinRegLearner as lrl
import DTLearner as dt
import BagLearner as bl
import InsaneLearner as it
import RTLearner as rt
import sys
import time
  		   	  			    		  		  		    	 		 		   		 		  
if __name__=="__main__":  		   	  			    		  		  		    	 		 		   		 		  
    if len(sys.argv) != 2:  		   	  			    		  		  		    	 		 		   		 		  
        print "Usage: python testlearner.py <filename>"  		   	  			    		  		  		    	 		 		   		 		  
        sys.exit(1)  		   	  			    		  		  		    	 		 		   		 		  
    with util.get_learner_data_file(sys.argv[1]) as f:
        data = np.genfromtxt(f,delimiter=',')
        # Skip the date column and header row if we're working on Istanbul data
        if sys.argv[1] == 'Istanbul.csv':
            data = data[1:,1:]
  		   	  			    		  		  		    	 		 		   		 		  
    # compute how much of the data is training and testing  		   	  			    		  		  		    	 		 		   		 		  
    train_rows = int(0.6* data.shape[0])  		   	  			    		  		  		    	 		 		   		 		  
    test_rows = data.shape[0] - train_rows  		   	  			    		  		  		    	 		 		   		 		  
  		   	  			    		  		  		    	 		 		   		 		  
    # separate out training and testing data  		   	  			    		  		  		    	 		 		   		 		  
    trainX = data[:train_rows,0:-1]  		   	  			    		  		  		    	 		 		   		 		  
    trainY = data[:train_rows,-1]  		   	  			    		  		  		    	 		 		   		 		  
    testX = data[train_rows:,0:-1]  		   	  			    		  		  		    	 		 		   		 		  
    testY = data[train_rows:,-1]  		   	  			    		  		  		    	 		 		   		 		  
  		   	  			    		  		  		    	 		 		   		 		  
Example n. 12
0
import DTLearner as dt
import RTLearner as rt
import BagLearner as bl
import InsaneLearner as it
import sys
from util import get_learner_data_file
import matplotlib.pyplot as plt
import pandas as pd
import time

if __name__ == "__main__":
    if len(sys.argv) != 2:
        print "Usage: python testlearner.py <filename>"
        sys.exit(1)
    datafile = sys.argv[1]
    with get_learner_data_file(datafile) as f:
        data = np.genfromtxt(f, delimiter=',')
        # Skip the date column and header row if we're working on Istanbul data
        if datafile == 'Istanbul.csv':
            data = data[1:, 1:]

    # compute how much of the data is training and testing
    train_rows = int(0.6 * data.shape[0])
    test_rows = data.shape[0] - train_rows

    # separate out training and testing data
    trainX = data[:train_rows, 0:-1]
    trainY = data[:train_rows, -1]
    testX = data[train_rows:, 0:-1]
    testY = data[train_rows:, -1]
Example n. 13
0
def test_learners(description, group, datafile, seed, outputs, grader):
    """Test ML models returns correct predictions.

    Requires test description, test case group, inputs, expected outputs, and a grader fixture.

    Parameters
    ----------
    description : str   Human-readable name of this test case.
    group : str         Either "RTLearner" or "BagLearner"; selects which checks run.
    datafile : str      CSV file name passed to util.get_learner_data_file.
    seed : int          Seed fixed for numpy/random so runs are repeatable.
    outputs : dict      Expected correlation thresholds for this test case.
    grader : fixture    Collects a GradeResult (passed/failed + points).
    """

    points_earned = 0.0  # initialize points for this test case
    try:
        learner_class = None
        kwargs = {'verbose': False}

        # (BPH) Copied from grade_strategy_qlearning.py
        # Set fixed seed for repeatability
        np.random.seed(seed)
        random.seed(seed)
        # These lines will be uncommented in the batch grader to
        # prevent accidentally fixing the seed within student
        # code
        # tmp_numpy_seed = np.random.seed
        # tmp_random_seed = random.seed
        # np.random.seed = fake_seed
        # random.seed = fake_rseed

        # Import student learners lazily.
        # BUGFIX: the original compared strings with `is`, which only works
        # by CPython string-interning accident; use == for equality.
        if 'RTLearner' not in globals():
            from RTLearner import RTLearner
        if group == 'BagLearner' and ('BagLearner' not in globals()):
            from BagLearner import BagLearner

        # Tweak kwargs
        # kwargs.update(inputs.get('kwargs', {}))

        # Read the data once; rows and feature columns are randomly permuted
        # so results cannot depend on the file's original ordering.
        testX, testY, trainX, trainY = None, None, None, None
        permutation = None
        author = None
        with util.get_learner_data_file(datafile) as f:
            alldata = np.genfromtxt(f, delimiter=',')
            # Skip the date column and header row if we're working on Istanbul data
            if datafile == 'Istanbul.csv':
                alldata = alldata[1:, 1:]
            datasize = alldata.shape[0]
            cutoff = int(datasize * 0.6)  # 60/40 train/test split
            permutation = np.random.permutation(alldata.shape[0])
            col_permutation = np.random.permutation(alldata.shape[1] - 1)
            train_data = alldata[permutation[:cutoff], :]
            trainX = train_data[:, col_permutation]
            trainY = train_data[:, -1]
            test_data = alldata[permutation[cutoff:], :]
            testX = test_data[:, col_permutation]
            testY = test_data[:, -1]

        # BUGFIX: initialize these before branching so an unrecognized
        # `group` cannot raise NameError at the final `if incorrect:` check.
        incorrect = False
        msgs = []

        if group == "RTLearner":
            corr_in, corr_out, corr_in_50 = None, None, None

            def oneleaf():
                # leaf_size=1: tree should (nearly) memorize the training set
                learner = RTLearner(leaf_size=1, verbose=False)
                learner.addEvidence(trainX, trainY)
                insample = learner.query(trainX)
                outsample = learner.query(testX)
                return insample, outsample, learner.author()

            def fiftyleaves():
                # leaf_size=50: tree should underfit relative to leaf_size=1
                learner = RTLearner(leaf_size=50, verbose=False)
                learner.addEvidence(trainX, trainY)
                return learner.query(trainX)

            # Student code runs under a timeout so a hung learner fails fast.
            predY_in, predY_out, author = run_with_timeout(
                oneleaf, seconds_per_test_case, (), {})
            predY_in_50 = run_with_timeout(fiftyleaves, seconds_per_test_case,
                                           (), {})
            corr_in = np.corrcoef(predY_in, y=trainY)[0, 1]
            corr_out = np.corrcoef(predY_out, y=testY)[0, 1]
            corr_in_50 = np.corrcoef(predY_in_50, y=trainY)[0, 1]

            if corr_in < outputs['insample_corr_min']:
                incorrect = True
                msgs.append(
                    "    In-sample with leaf_size=1 correlation less than allowed: got {} expected {}"
                    .format(corr_in, outputs['insample_corr_min']))
            else:
                points_earned += 1.5
            if corr_out < outputs['outsample_corr_min']:
                incorrect = True
                msgs.append(
                    "    Out-of-sample correlation less than allowed: got {} expected {}"
                    .format(corr_out, outputs['outsample_corr_min']))
            else:
                points_earned += 1.5
            if corr_in_50 > outputs['insample_corr_max']:
                incorrect = True
                msgs.append(
                    "    In-sample correlation with leaf_size=50 greater than allowed: got {} expected {}"
                    .format(corr_in_50, outputs['insample_corr_max']))
            else:
                points_earned += 1.0
            # Check author string ('tb34' is the instructor template value)
            if (author is None) or (author == 'tb34'):
                incorrect = True
                msgs.append("    Invalid author: {}".format(author))
                points_earned += -1.0

        elif group == "BagLearner":
            corr1, corr20 = None, None

            def onebag():
                learner1 = BagLearner(learner=RTLearner,
                                      kwargs={"leaf_size": 1},
                                      bags=1,
                                      boost=False,
                                      verbose=False)
                learner1.addEvidence(trainX, trainY)
                return learner1.query(testX), learner1.author()

            def twentybags():
                learner20 = BagLearner(learner=RTLearner,
                                       kwargs={"leaf_size": 1},
                                       bags=20,
                                       boost=False,
                                       verbose=False)
                learner20.addEvidence(trainX, trainY)
                return learner20.query(testX)

            predY1, author = run_with_timeout(onebag,
                                              seconds_per_test_case,
                                              pos_args=(),
                                              keyword_args={})
            predY20 = run_with_timeout(twentybags, seconds_per_test_case, (),
                                       {})

            # More bags must improve out-of-sample correlation.
            corr1 = np.corrcoef(predY1, testY)[0, 1]
            corr20 = np.corrcoef(predY20, testY)[0, 1]
            if corr20 <= corr1:
                incorrect = True
                msgs.append(
                    "    Out-of-sample correlation for 20 bags is not greater than for 1 bag. 20 bags:{}, 1 bag:{}"
                    .format(corr20, corr1))
            else:
                points_earned += 2.0
            # Check author string ('tb34' is the instructor template value)
            if (author is None) or (author == 'tb34'):
                incorrect = True
                msgs.append("    Invalid author: {}".format(author))
                points_earned += -1.0

        if incorrect:
            inputs_str = "    data file: {}\n" \
                         "    permutation: {}".format(datafile, permutation)
            # BUGFIX: call-form raise works in Python 2 and 3; the original
            # `raise IncorrectOutput, "..."` is Python-2-only syntax.
            raise IncorrectOutput(
                "Test failed on one or more output criteria.\n  Inputs:\n{}\n  Failures:\n{}"
                .format(inputs_str, "\n".join(msgs)))
    except Exception as e:
        # Test result: failed
        msg = "Description: {} (group: {})\n".format(description, group)

        # Generate a filtered stacktrace, only showing erroneous lines in student file(s)
        tb_list = tb.extract_tb(sys.exc_info()[2])
        # Show only the filename instead of the long absolute path
        # (BUGFIX: comprehension replaces the Python-2-only xrange index loop).
        tb_list = [(os.path.basename(row[0]), row[1], row[2], row[3])
                   for row in tb_list]
        tb_list = [
            row for row in tb_list
            if (row[0] == 'RTLearner.py') or (row[0] == 'BagLearner.py')
        ]
        if tb_list:
            msg += "Traceback:\n"
            msg += ''.join(tb.format_list(tb_list))  # contains newlines
        # BUGFIX: str(e) replaces the Python-2-only e.message attribute.
        msg += "{}: {}".format(e.__class__.__name__, str(e))

        # Report failure result to grader, with stacktrace
        grader.add_result(
            GradeResult(outcome='failed', points=points_earned, msg=msg))
        raise
    else:
        # Test result: passed (no exceptions)
        grader.add_result(
            GradeResult(outcome='passed', points=points_earned, msg=None))