def test_GLM2_score_same(self): # gunzip it and cat it to create 2x and 4x replications in SYNDATASETS_DIR bucket = 'home-0xdiag-datasets' csvFilename = "1mx10_hastie_10_2.data.gz" csvPathname = 'standard' + '/' + csvFilename y = "10" kwargs = {'response': y, 'alpha': 0, 'family': 'gaussian'} (modelKey, validation1, parseResult) = glm_doit(self, csvFilename, bucket, csvPathname, timeoutSecs=60, pollTimeoutSecs=60, **kwargs) print "Use", modelKey, "model on 2x and 4x replications and compare results to 1x" filename1x = "hastie_1x.data" pathname1x = SYNDATASETS_DIR + '/' + filename1x fullPathname = h2i.find_folder_and_filename(bucket, csvPathname, returnFullPath=True) h2o_util.file_gunzip(fullPathname, pathname1x) filename2x = "hastie_2x.data" pathname2x = SYNDATASETS_DIR + '/' + filename2x modelPathname = SYNDATASETS_DIR + '/model_' + filename2x bucket = None h2o_util.file_cat(pathname1x,pathname1x,pathname2x) glm_score(self,filename2x, bucket, pathname2x, modelKey, modelPathname, timeoutSecs=60, pollTimeoutSecs=60) filename4x = "hastie_4x.data" pathname4x = SYNDATASETS_DIR + '/' + filename4x modelPathname = SYNDATASETS_DIR + '/model_' + filename4x h2o_util.file_cat(pathname2x, pathname2x, pathname4x) print "Iterating 3 times on this last one" for i in range(3): print "\nTrial #", i, "of", filename4x glm_score(self,filename4x, bucket, pathname4x, modelKey, modelPathname, timeoutSecs=60, pollTimeoutSecs=60)
def test_1mx10_hastie_10_2_cat_and_shuffle(self):
    # KMeans on hastie at 1x, 2x and 4x sizes, shuffling files between
    # replications to verify row order doesn't matter.
    # FIX! eventually we'll compare the 1x, 2x and 4x results like we do
    # in other tests. (catdata?)
    src_gz_name = "1mx10_hastie_10_2.data.gz"
    src_gz_path = h2o.find_dataset('logreg' + '/' + src_gz_name)
    kmeans_doit(self, src_gz_name, src_gz_path, num_rows=1000000, timeoutSecs=60)

    # decompress the source into SYNDATASETS_DIR as the 1x copy
    one_x_name = "hastie_1x.data"
    one_x_path = SYNDATASETS_DIR + '/' + one_x_name
    h2o_util.file_gunzip(src_gz_path, one_x_path)

    one_x_shuf_name = "hastie_1x.data_shuf"
    one_x_shuf_path = SYNDATASETS_DIR + '/' + one_x_shuf_name
    h2o_util.file_shuffle(one_x_path, one_x_shuf_path)

    # 2x: concatenate the shuffled 1x with itself, then reshuffle
    two_x_name = "hastie_2x.data"
    two_x_path = SYNDATASETS_DIR + '/' + two_x_name
    h2o_util.file_cat(one_x_shuf_path, one_x_shuf_path, two_x_path)
    two_x_shuf_name = "hastie_2x.data_shuf"
    two_x_shuf_path = SYNDATASETS_DIR + '/' + two_x_shuf_name
    h2o_util.file_shuffle(two_x_path, two_x_shuf_path)
    kmeans_doit(self, two_x_shuf_name, two_x_shuf_path, num_rows=2000000, timeoutSecs=90)

    # too big to shuffle?
    four_x_name = "hastie_4x.data"
    four_x_path = SYNDATASETS_DIR + '/' + four_x_name
    h2o_util.file_cat(two_x_shuf_path, two_x_shuf_path, four_x_path)
    kmeans_doit(self, four_x_name, four_x_path, num_rows=4000000, timeoutSecs=120)
def test_KMeans_hastie_shuffle_fvec(self):
    # KMeans (fvec) on hastie at 1x, 2x and 4x sizes, with a shuffle step
    # between replications so row order can be shown not to matter.
    # FIX! eventually we'll compare the 1x, 2x and 4x results like we do
    # in other tests. (catdata?)
    gz_name = "1mx10_hastie_10_2.data.gz"
    gz_pathname = 'standard/' + gz_name
    bucket = 'home-0xdiag-datasets'
    kmeans_doit(self, gz_name, bucket, gz_pathname, numRows=1000000, timeoutSecs=60)
    full_pathname = h2i.find_folder_and_filename(bucket, gz_pathname, returnFullPath=True)

    # decompress the bucket copy locally as the 1x dataset
    name_1x = "hastie_1x.data"
    path_1x = SYNDATASETS_DIR + '/' + name_1x
    h2o_util.file_gunzip(full_pathname, path_1x)

    shuf_name_1x = "hastie_1x.data_shuf"
    shuf_path_1x = SYNDATASETS_DIR + '/' + shuf_name_1x
    h2o_util.file_shuffle(path_1x, shuf_path_1x)

    # 2x: self-concatenate the shuffled 1x, reshuffle, run with bucket=None (local path)
    name_2x = "hastie_2x.data"
    path_2x = SYNDATASETS_DIR + '/' + name_2x
    h2o_util.file_cat(shuf_path_1x, shuf_path_1x, path_2x)
    shuf_name_2x = "hastie_2x.data_shuf"
    shuf_path_2x = SYNDATASETS_DIR + '/' + shuf_name_2x
    h2o_util.file_shuffle(path_2x, shuf_path_2x)
    kmeans_doit(self, shuf_name_2x, None, shuf_path_2x, numRows=2000000, timeoutSecs=90)

    # too big to shuffle?
    name_4x = "hastie_4x.data"
    path_4x = SYNDATASETS_DIR + '/' + name_4x
    h2o_util.file_cat(shuf_path_2x, shuf_path_2x, path_4x)
    kmeans_doit(self, name_4x, None, path_4x, numRows=4000000, timeoutSecs=120)
def test_A_1mx10_hastie_10_2(self): # gunzip it and cat it to create 2x and 4x replications in SYNDATASETS_DIR csvFilename = "1mx10_hastie_10_2.data.gz" csvPathname = h2o.find_dataset('logreg' + '/' + csvFilename) y = "10" x = "" kwargs = {'x': x, 'y': y, 'case': -1, 'thresholds': 0.5} (modelKey, validations1) = glm_doit(self, csvFilename, csvPathname, timeoutSecs=60, pollTimeoutSecs=60, **kwargs) print "Use", modelKey, "model on 2x and 4x replications and compare results to 1x" filename1x = "hastie_1x.data" pathname1x = SYNDATASETS_DIR + '/' + filename1x h2o_util.file_gunzip(csvPathname, pathname1x) filename2x = "hastie_2x.data" pathname2x = SYNDATASETS_DIR + '/' + filename2x h2o_util.file_cat(pathname1x,pathname1x,pathname2x) glm_score(self,filename2x, pathname2x, modelKey, thresholds="0.5", timeoutSecs=60, pollTimeoutSecs=60) filename4x = "hastie_4x.data" pathname4x = SYNDATASETS_DIR + '/' + filename4x h2o_util.file_cat(pathname2x,pathname2x,pathname4x) print "Iterating 3 times on this last one" for i in range(3): print "\nTrial #", i, "of", filename4x glm_score(self,filename4x, pathname4x, modelKey, thresholds="0.5", timeoutSecs=60, pollTimeoutSecs=60)
def test_GLM_hastie(self): # gunzip it and cat it to create 2x and 4x replications in SYNDATASETS_DIR # FIX! eventually we'll compare the 1x, 2x and 4x results like we do # in other tests. (catdata?) bucket = 'home-0xdiag-datasets' csvFilename = "1mx10_hastie_10_2.data.gz" csvPathname = 'standard' + '/' + csvFilename glm_doit(self, csvFilename, bucket, csvPathname, timeoutSecs=75) fullPathname = h2i.find_folder_and_filename(bucket, csvPathname, returnFullPath=True) filename1x = "hastie_1x.data" pathname1x = SYNDATASETS_DIR + '/' + filename1x h2o_util.file_gunzip(fullPathname, pathname1x) filename2x = "hastie_2x.data" pathname2x = SYNDATASETS_DIR + '/' + filename2x h2o_util.file_cat(pathname1x,pathname1x,pathname2x) glm_doit(self,filename2x, None, pathname2x, timeoutSecs=75) filename4x = "hastie_4x.data" pathname4x = SYNDATASETS_DIR + '/' + filename4x h2o_util.file_cat(pathname2x,pathname2x,pathname4x) print "Iterating 3 times on this last one for perf compare" for i in range(3): print "\nTrial #", i, "of", filename4x glm_doit(self, filename4x, None, pathname4x, timeoutSecs=150)
def test_1mx10_hastie_10_2_cat_and_shuffle(self):
    # GLM on hastie at 1x, 2x and 4x, shuffling the files between
    # replications to check row order doesn't affect results.
    # FIX! eventually we'll compare the 1x, 2x and 4x results like we do
    # in other tests. (catdata?)
    gz_name = "1mx10_hastie_10_2.data.gz"
    gz_path = h2o.find_dataset('logreg' + '/' + gz_name)
    glm_doit(self, gz_name, gz_path, timeoutSecs=30)

    # 1x copy, decompressed locally
    name_1x = "hastie_1x.data"
    path_1x = SYNDATASETS_DIR + '/' + name_1x
    h2o_util.file_gunzip(gz_path, path_1x)

    shuf_name_1x = "hastie_1x.data_shuf"
    shuf_path_1x = SYNDATASETS_DIR + '/' + shuf_name_1x
    h2o_util.file_shuffle(path_1x, shuf_path_1x)

    # 2x: self-concatenate the shuffled 1x, then reshuffle before the run
    name_2x = "hastie_2x.data"
    path_2x = SYNDATASETS_DIR + '/' + name_2x
    h2o_util.file_cat(shuf_path_1x, shuf_path_1x, path_2x)
    shuf_name_2x = "hastie_2x.data_shuf"
    shuf_path_2x = SYNDATASETS_DIR + '/' + shuf_name_2x
    h2o_util.file_shuffle(path_2x, shuf_path_2x)
    glm_doit(self, shuf_name_2x, shuf_path_2x, timeoutSecs=45)

    # too big to shuffle?
    name_4x = "hastie_4x.data"
    path_4x = SYNDATASETS_DIR + '/' + name_4x
    h2o_util.file_cat(shuf_path_2x, shuf_path_2x, path_4x)
    glm_doit(self, name_4x, path_4x, timeoutSecs=120)
def test_GLM_hastie_shuffle(self):
    # GLM on hastie at 1x, 2x and 4x from a bucket source, shuffling files
    # between replications so row order can be shown not to matter.
    # FIX! eventually we'll compare the 1x, 2x and 4x results like we do
    # in other tests. (catdata?)
    gz_name = "1mx10_hastie_10_2.data.gz"
    bucket = 'home-0xdiag-datasets'
    gz_pathname = 'standard' + '/' + gz_name
    full_pathname = h2i.find_folder_and_filename(bucket, gz_pathname, returnFullPath=True)
    glm_doit(self, gz_name, bucket, gz_pathname, timeoutSecs=30)

    # decompress the bucket file locally as the 1x copy
    name_1x = "hastie_1x.data"
    path_1x = SYNDATASETS_DIR + '/' + name_1x
    h2o_util.file_gunzip(full_pathname, path_1x)

    shuf_name_1x = "hastie_1x.data_shuf"
    shuf_path_1x = SYNDATASETS_DIR + '/' + shuf_name_1x
    h2o_util.file_shuffle(path_1x, shuf_path_1x)

    # 2x: self-concatenate the shuffled 1x, reshuffle, run with local path
    name_2x = "hastie_2x.data"
    path_2x = SYNDATASETS_DIR + '/' + name_2x
    h2o_util.file_cat(shuf_path_1x, shuf_path_1x, path_2x)
    shuf_name_2x = "hastie_2x.data_shuf"
    shuf_path_2x = SYNDATASETS_DIR + '/' + shuf_name_2x
    h2o_util.file_shuffle(path_2x, shuf_path_2x)
    glm_doit(self, shuf_name_2x, None, shuf_path_2x, timeoutSecs=45)

    # too big to shuffle?
    name_4x = "hastie_4x.data"
    path_4x = SYNDATASETS_DIR + '/' + name_4x
    h2o_util.file_cat(shuf_path_2x, shuf_path_2x, path_4x)
    glm_doit(self, name_4x, None, path_4x, timeoutSecs=120)
def test_A_1mx10_hastie_10_2(self): # gunzip it and cat it to create 2x and 4x replications in SYNDATASETS_DIR # FIX! eventually we'll compare the 1x, 2x and 4x results like we do # in other tests. (catdata?) csvFilename = "1mx10_hastie_10_2.data.gz" csvPathname = h2o.find_dataset('logreg' + '/' + csvFilename) glm_doit(self,csvFilename, csvPathname, timeoutSecs=30) filename1x = "hastie_1x.data" pathname1x = SYNDATASETS_DIR + '/' + filename1x h2o_util.file_gunzip(csvPathname, pathname1x) filename2x = "hastie_2x.data" pathname2x = SYNDATASETS_DIR + '/' + filename2x h2o_util.file_cat(pathname1x,pathname1x,pathname2x) glm_doit(self,filename2x, pathname2x, timeoutSecs=45) filename4x = "hastie_4x.data" pathname4x = SYNDATASETS_DIR + '/' + filename4x h2o_util.file_cat(pathname2x,pathname2x,pathname4x) print "Iterating 3 times on this last one for perf compare" for i in range(3): print "\nTrial #", i, "of", filename4x glm_doit(self,filename4x, pathname4x, timeoutSecs=60)
def test_GLM_hastie(self): # gunzip it and cat it to create 2x and 4x replications in SYNDATASETS_DIR # FIX! eventually we'll compare the 1x, 2x and 4x results like we do # in other tests. (catdata?) bucket = 'home-0xdiag-datasets' csvFilename = "1mx10_hastie_10_2.data.gz" csvPathname = 'standard' + '/' + csvFilename glm_doit(self, csvFilename, bucket, csvPathname, timeoutSecs=75) fullPathname = h2i.find_folder_and_filename(bucket, csvPathname, returnFullPath=True) filename1x = "hastie_1x.data" pathname1x = SYNDATASETS_DIR + '/' + filename1x h2o_util.file_gunzip(fullPathname, pathname1x) filename2x = "hastie_2x.data" pathname2x = SYNDATASETS_DIR + '/' + filename2x h2o_util.file_cat(pathname1x, pathname1x, pathname2x) glm_doit(self, filename2x, None, pathname2x, timeoutSecs=75) filename4x = "hastie_4x.data" pathname4x = SYNDATASETS_DIR + '/' + filename4x h2o_util.file_cat(pathname2x, pathname2x, pathname4x) print "Iterating 3 times on this last one for perf compare" for i in range(3): print "\nTrial #", i, "of", filename4x glm_doit(self, filename4x, None, pathname4x, timeoutSecs=150)
def test_1mx10_hastie_10_2_cat_and_shuffle(self):
    # GLM on hastie at 1x, 2x and 4x from the 'datasets' bucket, shuffling
    # between replications so row order can be shown not to matter.
    # FIX! eventually we'll compare the 1x, 2x and 4x results like we do
    # in other tests. (catdata?)
    gz_name = "1mx10_hastie_10_2.data.gz"
    bucket = 'datasets'
    gz_pathname = 'logreg' + '/' + gz_name
    full_pathname = h2i.find_folder_and_filename(bucket, gz_pathname, returnFullPath=True)
    glm_doit(self, gz_name, bucket, gz_pathname, timeoutSecs=30)

    # gunzip the bucket copy into SYNDATASETS_DIR as the 1x dataset
    name_1x = "hastie_1x.data"
    path_1x = SYNDATASETS_DIR + '/' + name_1x
    h2o_util.file_gunzip(full_pathname, path_1x)

    shuf_name_1x = "hastie_1x.data_shuf"
    shuf_path_1x = SYNDATASETS_DIR + '/' + shuf_name_1x
    h2o_util.file_shuffle(path_1x, shuf_path_1x)

    # 2x: self-concatenate the shuffled 1x, reshuffle, run from local path
    name_2x = "hastie_2x.data"
    path_2x = SYNDATASETS_DIR + '/' + name_2x
    h2o_util.file_cat(shuf_path_1x, shuf_path_1x, path_2x)
    shuf_name_2x = "hastie_2x.data_shuf"
    shuf_path_2x = SYNDATASETS_DIR + '/' + shuf_name_2x
    h2o_util.file_shuffle(path_2x, shuf_path_2x)
    glm_doit(self, shuf_name_2x, None, shuf_path_2x, timeoutSecs=45)

    # too big to shuffle?
    name_4x = "hastie_4x.data"
    path_4x = SYNDATASETS_DIR + '/' + name_4x
    h2o_util.file_cat(shuf_path_2x, shuf_path_2x, path_4x)
    glm_doit(self, name_4x, None, path_4x, timeoutSecs=120)
def test_A_1mx10_hastie_10_2(self): # gunzip it and cat it to create 2x and 4x replications in SYNDATASETS_DIR # FIX! eventually we'll compare the 1x, 2x and 4x results like we do # in other tests. (catdata?) csvFilename = "1mx10_hastie_10_2.data.gz" csvPathname = h2o.find_dataset('logreg' + '/' + csvFilename) glm_doit(self,csvFilename, csvPathname, timeoutSecs=75) filename1x = "hastie_1x.data" pathname1x = SYNDATASETS_DIR + '/' + filename1x h2o_util.file_gunzip(csvPathname, pathname1x) filename2x = "hastie_2x.data" pathname2x = SYNDATASETS_DIR + '/' + filename2x h2o_util.file_cat(pathname1x,pathname1x,pathname2x) glm_doit(self,filename2x, pathname2x, timeoutSecs=75) filename4x = "hastie_4x.data" pathname4x = SYNDATASETS_DIR + '/' + filename4x h2o_util.file_cat(pathname2x,pathname2x,pathname4x) print "Iterating 3 times on this last one for perf compare" for i in range(3): print "\nTrial #", i, "of", filename4x glm_doit(self,filename4x, pathname4x, timeoutSecs=150)
def test_GLM2_score_same(self): h2o.beta_features = True # gunzip it and cat it to create 2x and 4x replications in SYNDATASETS_DIR bucket = 'home-0xdiag-datasets' csvFilename = "1mx10_hastie_10_2.data.gz" csvPathname = 'standard' + '/' + csvFilename y = "10" kwargs = {'response': y, 'alpha': 0, 'family': 'gaussian'} (modelKey, validation1, parseResult) = glm_doit(self, csvFilename, bucket, csvPathname, timeoutSecs=60, pollTimeoutSecs=60, **kwargs) print "Use", modelKey, "model on 2x and 4x replications and compare results to 1x" filename1x = "hastie_1x.data" pathname1x = SYNDATASETS_DIR + '/' + filename1x fullPathname = h2i.find_folder_and_filename(bucket, csvPathname, returnFullPath=True) h2o_util.file_gunzip(fullPathname, pathname1x) filename2x = "hastie_2x.data" pathname2x = SYNDATASETS_DIR + '/' + filename2x modelPathname = SYNDATASETS_DIR + '/model_' + filename2x bucket = None h2o_util.file_cat(pathname1x,pathname1x,pathname2x) glm_score(self,filename2x, bucket, pathname2x, modelKey, modelPathname, timeoutSecs=60, pollTimeoutSecs=60) filename4x = "hastie_4x.data" pathname4x = SYNDATASETS_DIR + '/' + filename4x modelPathname = SYNDATASETS_DIR + '/model_' + filename4x h2o_util.file_cat(pathname2x, pathname2x, pathname4x) print "Iterating 3 times on this last one" for i in range(3): print "\nTrial #", i, "of", filename4x glm_score(self,filename4x, bucket, pathname4x, modelKey, modelPathname, timeoutSecs=60, pollTimeoutSecs=60)
def test_A_1mx10_hastie_10_2(self):
    # GLM on the 1x hastie dataset, then on a 2x replication built locally.
    # FIX! eventually we'll compare the 1x, 2x and 4x results like we do
    # in other tests. (catdata?)
    gz_name = "1mx10_hastie_10_2.data.gz"
    gz_path = h2o.find_dataset('logreg' + '/' + gz_name)
    glm_doit(self, gz_name, gz_path, timeoutSecs=300)

    # gunzip the source into SYNDATASETS_DIR as the 1x copy
    name_1x = "hastie_1x.data"
    path_1x = SYNDATASETS_DIR + '/' + name_1x
    h2o_util.file_gunzip(gz_path, path_1x)

    # 2x copy via self-concatenation
    name_2x = "hastie_2x.data"
    path_2x = SYNDATASETS_DIR + '/' + name_2x
    h2o_util.file_cat(path_1x, path_1x, path_2x)
    glm_doit(self, name_2x, path_2x, timeoutSecs=300)
def test_A_1mx10_hastie_10_2(self):
    # GLM on 1x hastie, then again on a doubled copy built in SYNDATASETS_DIR.
    # FIX! eventually we'll compare the 1x, 2x and 4x results like we do
    # in other tests. (catdata?)
    src_name = "1mx10_hastie_10_2.data.gz"
    src_path = h2o.find_dataset('logreg' + '/' + src_name)
    glm_doit(self, src_name, src_path, timeoutSecs=300)

    # 1x copy: decompress the source locally
    fn_1x = "hastie_1x.data"
    p_1x = SYNDATASETS_DIR + '/' + fn_1x
    h2o_util.file_gunzip(src_path, p_1x)

    # 2x copy: concatenate the 1x file with itself and rerun
    fn_2x = "hastie_2x.data"
    p_2x = SYNDATASETS_DIR + '/' + fn_2x
    h2o_util.file_cat(p_1x, p_1x, p_2x)
    glm_doit(self, fn_2x, p_2x, timeoutSecs=300)
def test_A_1mx10_hastie_10_2(self):
    """GLM on the 1x hastie dataset from a bucket, then on a locally built
    2x replication.

    FIX! eventually we'll compare the 1x, 2x and 4x results like we do
    in other tests. (catdata?)
    """
    bucket = 'home-0xdiag-datasets'
    csvFilename = "1mx10_hastie_10_2.data.gz"
    csvPathname = 'standard' + '/' + csvFilename
    glm_doit(self, csvFilename, bucket, csvPathname, timeoutSecs=300)
    # use the bucket variable rather than repeating the bucket-name literal,
    # so the bucket only needs changing in one place
    fullPathname = h2i.find_folder_and_filename(bucket, csvPathname, returnFullPath=True)

    # gunzip the bucket file into SYNDATASETS_DIR as the 1x copy
    filename1x = "hastie_1x.data"
    pathname1x = SYNDATASETS_DIR + '/' + filename1x
    h2o_util.file_gunzip(fullPathname, pathname1x)

    # 2x copy via self-concatenation; bucket=None => import from local path
    filename2x = "hastie_2x.data"
    pathname2x = SYNDATASETS_DIR + '/' + filename2x
    h2o_util.file_cat(pathname1x, pathname1x, pathname2x)
    glm_doit(self, filename2x, None, pathname2x, timeoutSecs=300)
def test_A_1mx10_hastie_10_2(self):
    """GLM on the 1x hastie dataset from the 'datasets' bucket, then on a
    locally built 2x replication.

    FIX! eventually we'll compare the 1x, 2x and 4x results like we do
    in other tests. (catdata?)
    """
    bucket = 'datasets'
    csvFilename = "1mx10_hastie_10_2.data.gz"
    csvPathname = 'logreg' + '/' + csvFilename
    glm_doit(self, csvFilename, bucket, csvPathname, timeoutSecs=300)
    # use the bucket variable rather than repeating the bucket-name literal,
    # so the bucket only needs changing in one place
    fullPathname = h2i.find_folder_and_filename(bucket, csvPathname, returnFullPath=True)

    # gunzip the bucket file into SYNDATASETS_DIR as the 1x copy
    filename1x = "hastie_1x.data"
    pathname1x = SYNDATASETS_DIR + '/' + filename1x
    h2o_util.file_gunzip(fullPathname, pathname1x)

    # 2x copy via self-concatenation; bucket=None => import from local path
    filename2x = "hastie_2x.data"
    pathname2x = SYNDATASETS_DIR + '/' + filename2x
    h2o_util.file_cat(pathname1x, pathname1x, pathname2x)
    glm_doit(self, filename2x, None, pathname2x, timeoutSecs=300)
def test_GLM_score_same(self): # gunzip it and cat it to create 2x and 4x replications in SYNDATASETS_DIR bucket = 'datasets' csvFilename = "1mx10_hastie_10_2.data.gz" csvPathname = 'logreg' + '/' + csvFilename y = "10" x = "" kwargs = {'x': x, 'y': y, 'case': -1, 'thresholds': 0.5} (modelKey, validations1, parseResult) = glm_doit(self, csvFilename, bucket, csvPathname, timeoutSecs=60, pollTimeoutSecs=60, **kwargs) print "Use", modelKey, "model on 2x and 4x replications and compare results to 1x" filename1x = "hastie_1x.data" pathname1x = SYNDATASETS_DIR + '/' + filename1x fullPathname = h2i.find_folder_and_filename(bucket, csvPathname, returnFullPath=True) h2o_util.file_gunzip(fullPathname, pathname1x) filename2x = "hastie_2x.data" pathname2x = SYNDATASETS_DIR + '/' + filename2x bucket = None h2o_util.file_cat(pathname1x,pathname1x,pathname2x) glm_score(self,filename2x, bucket, pathname2x, modelKey, thresholds="0.5", timeoutSecs=60, pollTimeoutSecs=60) filename4x = "hastie_4x.data" pathname4x = SYNDATASETS_DIR + '/' + filename4x h2o_util.file_cat(pathname2x, pathname2x, pathname4x) print "Iterating 3 times on this last one" for i in range(3): print "\nTrial #", i, "of", filename4x glm_score(self,filename4x, bucket, pathname4x, modelKey, thresholds="0.5", timeoutSecs=60, pollTimeoutSecs=60)