def test_GLM_gaussian_rand2(self): csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data') parseKey = h2o_cmd.parseFile(csvPathname=csvPathname) # for determinism, I guess we should spit out the seed? # random.seed(SEED) SEED = random.randint(0, sys.maxint) # if you have to force to redo a test # SEED = random.seed(SEED) print "\nUsing random seed:", SEED paramDict = define_params() for trial in range(20): # params is mutable. This is default. params = { 'y': 54, 'n_folds': 3, 'family': "gamma", 'alpha': 0.5, 'lambda': 1e-4, 'max_iter': 10 } colX = h2o_glm.pickRandGlmParams(paramDict, params) kwargs = params.copy() start = time.time() glm = h2o_cmd.runGLMOnly(timeoutSecs=120, parseKey=parseKey, **kwargs) print "glm end on ", csvPathname, 'took', time.time( ) - start, 'seconds' h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) print "Trial #", trial, "completed\n"
def test_GLM_params_rand2_newargs(self): # csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data') csvPathname = h2o.find_file('smalldata/covtype/covtype.20k.data') key = 'covtype.20k' parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key=key) paramDict = define_params() for trial in range(20): # params is mutable. This is default. params = { 'y': 54, 'case': 1, 'lambda': 0, 'alpha': 0, 'n_folds': 1 } colX = h2o_glm.pickRandGlmParams(paramDict, params) kwargs = params.copy() start = time.time() glm = h2o_cmd.runGLMOnly(timeoutSecs=70, parseKey=parseKey, **kwargs) # pass the kwargs with all the params, so we know what we asked for! h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) h2o.check_sandbox_for_errors() print "glm end on ", csvPathname, 'took', time.time( ) - start, 'seconds' print "Trial #", trial, "completed\n"
def test_C_hhp_107_01(self): csvPathname = h2o.find_file("smalldata/hhp_107_01.data.gz") print "\n" + csvPathname y = "106" x = "" parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, timeoutSecs=15) for trial in xrange(3): sys.stdout.write('.') sys.stdout.flush() print "\nx:", x print "y:", y start = time.time() kwargs = {'x': x, 'y': y, 'n_folds': 6} glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=300, **kwargs) # pass the kwargs with all the params, so we know what we asked for! h2o_glm.simpleCheckGLM(self, glm, 57, **kwargs) print "glm end on ", csvPathname, 'took', time.time( ) - start, 'seconds' print "\nTrial #", trial
def glm_score(self, csvFilename, csvPathname, modelKey, thresholds="0.5", timeoutSecs=30, pollTimeoutSecs=30): print "\nStarting GLM score of", csvFilename key2 = csvFilename + ".hex" parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=key2, timeoutSecs=timeoutSecs, pollTimeoutSecs=pollTimeoutSecs) y = "10" x = "" kwargs = {'x': x, 'y': y, 'case': -1, 'thresholds': 0.5} start = time.time() glmScore = h2o_cmd.runGLMScore(key=key2, model_key=modelKey, thresholds="0.5", timeoutSecs=timeoutSecs) print "GLMScore in", (time.time() - start), "secs (python)" h2o.verboseprint(h2o.dump_json(glmScore)) ### h2o_glm.simpleCheckGLM(self, glm, 7, **kwargs) # compare this glm to the first one. since the files are replications, # the results # should be similar? # UPDATE: format for returning results is slightly different than normal GLM validation = glmScore['validation'] if self.validations1: h2o_glm.compareToFirstGlm(self, 'err', validation, self.validations1) else: self.validations1 = copy.deepcopy(validation)
def test_loop_random_param_covtype(self): csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data') parseKey = h2o_cmd.parseFile(csvPathname=csvPathname) paramDict = define_params() for trial in range(20): # params is mutable. This is default. # FIX! does it never end if we don't have alpha specified? params = { 'y': 54, 'n_folds': 3, 'family': "poisson", 'alpha': 0.5, 'lambda': 1e-4, 'beta_epsilon': 0.001, 'max_iter': 15, } colX = h2o_glm.pickRandGlmParams(paramDict, params) kwargs = params.copy() # make timeout bigger with xvals timeoutSecs = 60 + (kwargs['n_folds']*20) # or double the 4 seconds per iteration (max_iter+1 worst case?) timeoutSecs = max(timeoutSecs, (8 * (kwargs['max_iter']+1))) start = time.time() glm = h2o_cmd.runGLMOnly(timeoutSecs=timeoutSecs, parseKey=parseKey, **kwargs) print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds' h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) print "Trial #", trial, "completed\n"
def test_many_cols_and_values_with_syn(self): SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ (100000, 10, 'cA', 30), (100, 1000, 'cB', 30), # (100, 900, 'cC', 30), # (100, 500, 'cD', 30), # (100, 100, 'cE', 30), ] for (rowCount, colCount, key2, timeoutSecs) in tryList: for sel in range(48): # len(caseList) SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = "syn_%s_%s_%s_%s.csv" % (SEEDPERFILE, sel, rowCount, colCount) csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, sel) selKey2 = key2 + "_" + str(sel) parseKey = h2o_cmd.parseFile(None, csvPathname, key2=selKey2, timeoutSecs=timeoutSecs) print csvFilename, 'parse time:', parseKey['response']['time'] print "Parse result['destination_key']:", parseKey['destination_key'] inspect = h2o_cmd.runInspect(None, parseKey['destination_key']) print "\n" + csvFilename
def test_many_cols_with_syn(self): SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ (100, 10000, 'cI', 5), (100, 5000, 'cA', 5), (100, 6000, 'cB', 5), (100, 7000, 'cC', 5), (100, 8000, 'cD', 5), (100, 8200, 'cE', 5), (100, 8500, 'cF', 5), (100, 9000, 'cG', 5), (100, 11000, 'cH', 5), ] ### h2b.browseTheCloud() for (rowCount, colCount, key2, timeoutSecs) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "\nCreating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE) parseKey = h2o_cmd.parseFile(None, csvPathname, key2=key2, timeoutSecs=10) print csvFilename, 'parse time:', parseKey['response']['time'] print "Parse result['destination_key']:", parseKey['destination_key'] inspect = h2o_cmd.runInspect(None, parseKey['destination_key']) print "\n" + csvFilename if not h2o.browse_disable: h2b.browseJsonHistoryAsUrlLastMatch("Inspect") time.sleep(5)
def test_GLM_big1_nopoll(self):
    # Dispatch 40 GLM jobs with noPoll=True (no blocking per job), then wait
    # for all of them via pollWaitJobs and verify each completed model.
    csvPathname = h2o.find_file("smalldata/hhp_107_01.data.gz")
    print "\n" + csvPathname
    y = "106"
    x = ""
    parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, timeoutSecs=15)
    glmInitial = []
    # dispatch multiple jobs back to back
    start = time.time()
    for jobDispatch in range(40):
        kwargs = {'x': x, 'y': y, 'n_folds': 1}
        # FIX! what model keys do these get?
        glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=300, noPoll=True, **kwargs)
        glmInitial.append(glm)
        # NOTE(review): 'start' is not reset per job, so this prints the
        # cumulative time since the first dispatch, not per-job time
        print "glm job dispatch end on ", csvPathname, 'took', time.time() - start, 'seconds'
        print "\njobDispatch #", jobDispatch
    timeoutSecs = 200
    h2o_jobs.pollWaitJobs(pattern='GLMModel', timeoutSecs=timeoutSecs, retryDelaySecs=10)
    elapsed = time.time() - start
    print "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
    # we saved the initial response?
    # if we do another poll they should be done now, and better to get it that
    # way rather than the inspect (to match what simpleCheckGLM is expected
    for glm in glmInitial:
        print "Checking completed job, with no polling:", glm
        # re-poll each saved response once (noPoll=True: single fetch)
        a = h2o.nodes[0].poll_url(glm['response'], noPoll=True)
        h2o_glm.simpleCheckGLM(self, a, 57, **kwargs)
def test_parse_many_cols(self): SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ (100, 5000, 'cA', 10), (100, 6000, 'cB', 10), (100, 7000, 'cC', 10), (100, 8000, 'cD', 10), (100, 8200, 'cE', 10), (100, 8500, 'cF', 10), (100, 9000, 'cG', 10), (100, 10000, 'cI', 10), (100, 11000, 'cH', 10), ] ### h2b.browseTheCloud() for (rowCount, colCount, key2, timeoutSecs) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "\nCreating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE) parseKey = h2o_cmd.parseFile(None, csvPathname, key2=key2, timeoutSecs=30) print csvFilename, 'parse time:', parseKey['response']['time'] print "Parse result['destination_key']:", parseKey['destination_key'] inspect = h2o_cmd.runInspect(None, parseKey['destination_key'], timeoutSecs=60) print "\n" + csvFilename if not h2o.browse_disable: h2b.browseJsonHistoryAsUrlLastMatch("Inspect") time.sleep(5)
def test_C_prostate(self): print "\nStarting prostate.csv" # columns start at 0 y = "1" csvFilename = "prostate.csv" csvPathname = h2o.find_file('smalldata/logreg' + '/' + csvFilename) parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex") for maxx in range(2,9): x = range(maxx) x.remove(0) # 0 is member ID. not used x.remove(1) # 1 is output x = ",".join(map(str,x)) print "\nx:", x print "y:", y # solver can be ADMM. standardize normalizes the data. kwargs = {'x': x, 'y': y, 'n_folds': 5,\ 'expert': 1, 'lsm_solver': 'GenGradient', 'standardize':1} glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=30, **kwargs) # ID,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON h2o_glm.simpleCheckGLM(self, glm, 'AGE', **kwargs) h2o.check_sandbox_for_errors() sys.stdout.write('.') sys.stdout.flush()
def test_GLM_syn_2659x1049x2enum(self):
    """GLM on the 2659x1049 two-enum synthetic logreg dataset."""
    csvFilename = "syn_2659x1049x2enum.csv"
    csvPathname = h2o.find_file('smalldata/logreg' + '/' + csvFilename)
    parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex")
    # 'params' is presumably defined at module scope -- verify
    kwargs = params
    glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=240, **kwargs)
    h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
def test_B_benign(self): print "\nStarting benign.csv" csvFilename = "benign.csv" csvPathname = h2o.find_file('smalldata/logreg' + '/' + csvFilename) parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex") # columns start at 0 y = "3" # cols 0-13. 3 is output # no member id in this one for maxx in range(4,14): x = range(maxx) x.remove(3) # 3 is output x = ",".join(map(str,x)) print "\nx:", x print "y:", y # solver can be ADMM kwargs = {'x': x, 'y': y,\ 'expert': 1, 'lsm_solver': 'GenGradient', 'standardize': 1, 'n_folds': 1} # fails with n_folds print "Not doing n_folds with benign. Fails with 'unable to solve?'" glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=30, **kwargs) # no longer look at STR? h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) h2o.check_sandbox_for_errors() sys.stdout.write('.') sys.stdout.flush()
def test_B_randomdata2_1_lineend(self): print "Using smalldata/datagen1.csv to create", SYNDATASETS_DIR, "/datagen1.csv with different line ending" # change lineend, case 1 csvPathname1 = h2o.find_file('smalldata/datagen1.csv') csvPathname2 = SYNDATASETS_DIR + '/datagen1_crlf.csv' infile = open(csvPathname1, 'r') outfile = open(csvPathname2, 'w') # existing file gets erased # assume all the test files are unix lineend. # I guess there shouldn't be any "in-between" ones # okay if they change I guess. for line in infile.readlines(): outfile.write(line.strip("\n") + "\r") infile.close() outfile.close() parseKey = h2o_cmd.parseFile(csvPathname=csvPathname2, timeoutSecs=10, header=1, separator=44) h2o_cmd.runRFOnly(parseKey=parseKey, trees=1, response_variable=2, timeoutSecs=10, csvPathname=csvPathname2)
def test_sort_of_prostate_with_row_schmoo(self):
    # Build a prostate-format synthetic csv, then repeatedly append one row
    # and re-parse under fresh key/key2 names, inspecting after each parse.
    SYNDATASETS_DIR = h2o.make_syn_dir()
    csvFilename = "syn_prostate.csv"
    csvPathname = SYNDATASETS_DIR + '/' + csvFilename
    headerData = "ID,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON"
    rowData = "1,0,65,1,2,1,1.4,0,6"
    totalRows = 99860
    write_syn_dataset(csvPathname, totalRows, headerData, rowData)
    print "This is the same format/data file used by test_same_parse, but the non-gzed version"
    print "\nSchmoo the # of rows"
    print "Updating the key and key2 names for each trial"
    for trial in range(200):
        append_syn_dataset(csvPathname, rowData)
        totalRows += 1
        ### start = time.time()
        # this was useful to cause failures early on. Not needed eventually
        ### key = h2o_cmd.parseFile(csvPathname=h2o.find_file("smalldata/logreg/prostate.csv"))
        ### print "Trial #", trial, "parse end on ", "prostate.csv" , 'took', time.time() - start, 'seconds'
        start = time.time()
        # fresh key/key2 per trial so each parse is a new destination
        key = csvFilename + "_" + str(trial)
        key2 = csvFilename + "_" + str(trial) + ".hex"
        key = h2o_cmd.parseFile(csvPathname=csvPathname, key=key, key2=key2)
        print "trial #", trial, "totalRows:", totalRows, "parse end on ", \
            csvFilename, 'took', time.time() - start, 'seconds'
        h2o_cmd.runInspect(key=key2)
        # only used this for debug to look at parse (red last row) on failure
        ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
        h2o.check_sandbox_for_errors()
def test_exec_filter_slice2(self): timeoutSecs = 10 csvFilename = "covtype.data" csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data') key2 = "c" parseKey = h2o_cmd.parseFile(None, csvPathname, 'covtype.data', 'c', 10) print csvFilename, 'parse time:', parseKey['response']['time'] print "Parse result['desination_key']:", parseKey['destination_key'] inspect = h2o_cmd.runInspect(None, parseKey['destination_key']) for trial in range(10): print "Doing the execs in order, to feed filters into slices" nodeX = 0 for exprTemplate in exprList: execExpr = h2e.fill_in_expr_template(exprTemplate, colX=0, n=0, row=1, key2=key2, m=2) time.sleep(2) h2o.check_sandbox_for_errors() execResultInspect, min_value = h2e.exec_expr( h2o.nodes[nodeX], execExpr, resultKey="Result.hex", timeoutSecs=4) print "min_value:", min_value, "execExpr:", execExpr h2o.verboseprint("min: ", min_value, "trial:", trial)
def test_many_cols_with_syn(self): SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ (100, 11, 'cA', 5), (100, 10, 'cB', 5), (100, 9, 'cC', 5), (100, 8, 'cD', 5), (100, 7, 'cE', 5), (100, 6, 'cF', 5), (100, 5, 'cG', 5), ] ### h2b.browseTheCloud() lenNodes = len(h2o.nodes) cnum = 0 for (rowCount, colCount, key2, timeoutSecs) in tryList: cnum += 1 csvFilename = 'syn_' + str(SEED) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEED) parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex") print "Parse result['destination_key']:", parseKey['destination_key'] kwargs = {'k': 2, 'initialization': 'Furthest', 'cols': None, 'destination_key': 'benign_k.hex'} kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, timeoutSecs=5, **kwargs) h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseKey, 'd', **kwargs)
def glm_doit(self, csvFilename, csvPathname, timeoutSecs=30): print "\nStarting GLM of", csvFilename parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex", timeoutSecs=10) y = "10" x = "" # Took n_folds out, because GLM doesn't include n_folds time and it's slow # wanted to compare GLM time to my measured time # hastie has two values, 1 and -1. need to use case for one of them kwargs = {'x': x, 'y': y, 'case': -1} start = time.time() glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs) print "GLM in", (time.time() - start), "secs (python measured)" h2o_glm.simpleCheckGLM(self, glm, 7, **kwargs) # compare this glm to the first one. since the files are replications, the results # should be similar? GLMModel = glm['GLMModel'] validationsList = glm['GLMModel']['validations'] validations = validationsList[0] # validations['err'] if self.validations1: h2o_glm.compareToFirstGlm(self, 'err', validations, self.validations1) else: self.validations1 = copy.deepcopy(validations)
def test_C_hhp_107_01(self): csvPathname = h2o.find_file("smalldata/hhp_107_01.data.gz") print "\n" + csvPathname parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, timeoutSecs=15) # pop open a browser on the cloud h2b.browseTheCloud() # build up the parameter string in X y = "106" x = "" # go right to the big X and iterate on that case ### for trial in range(2): for trial in range(2): print "\nTrial #", trial, "start" print "\nx:", x print "y:", y start = time.time() kwargs = {'y': y} glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=200, **kwargs) h2o_glm.simpleCheckGLM(self, glm, 57, **kwargs) h2o.check_sandbox_for_errors() ### h2b.browseJsonHistoryAsUrlLastMatch("GLM") print "\nTrial #", trial
def test_prostate_then_prostate_long_parse(self): print "\nput and parse of same file, but both key and key2 are the h2o defaults..always different" for trial in range(10): start = time.time() key = h2o_cmd.parseFile(csvPathname=h2o.find_file("smalldata/logreg/prostate_long.csv.gz")) print "trial #", trial, "parse end on ", "prostate_long.csv.gz", "took", time.time() - start, "seconds" h2o.check_sandbox_for_errors()
def test_GenParity1(self):
    """Generate parity data with parity.pl, then run RF on it nine times."""
    SYNDATASETS_DIR = h2o.make_syn_dir()
    # always match the run below!
    for x in [10000]:
        # Have to split the string out to list for pipe
        shCmdString = "perl " + h2o.find_file("syn_scripts/parity.pl") + " 128 4 " + str(x) + " quad"
        h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(), 4)
        # the algorithm for creating the path and filename is hardwired in parity.pl..i.e
        csvFilename = "parity_128_4_" + str(x) + "_quad.data"
    # always match the gen above!
    # use the loop variable directly instead of a separately-tracked counter
    for trial in xrange(1, 10):
        sys.stdout.write('.')
        sys.stdout.flush()
        # just use one file for now
        csvFilename = "parity_128_4_" + str(10000) + "_quad.data"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        # broke out the put separately so we can iterate a test just on the RF
        parseKey = h2o_cmd.parseFile(None, csvPathname)
        h2o.verboseprint("Trial", trial)
        h2o_cmd.runRFOnly(parseKey=parseKey, trees=237, depth=45, timeoutSecs=120)
        # don't change tree count yet
        ## trees += 10
        ### timeoutSecs += 2
def test_GLM_poisson_1(self):
    # Benchmark-style poisson GLM on covtype: one run each for L2, Elastic
    # Net, and L1 regularization, checking coefficient 13 after each.
    csvFilename = 'covtype.data'
    csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/' + csvFilename)
    parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, timeoutSecs=10)
    inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
    print "\n" + csvPathname, \
        " num_rows:", "{:,}".format(inspect['num_rows']), \
        " num_cols:", "{:,}".format(inspect['num_cols'])
    if (1 == 0):
        # deliberately disabled branch: restrict x to the first 33 features
        print "WARNING: just doing the first 33 features, for comparison to ??? numbers"
        # pythonic!
        x = ",".join(map(str, range(33)))
    else:
        x = ""
    print "WARNING: max_iter set to 8 for benchmark comparisons"
    max_iter = 8
    y = "54"
    # shared kwargs; each phase below updates only alpha/lambda
    kwargs = {
        'x': x,
        'y': y,
        'family': 'poisson',
        'link': 'log',
        'n_folds': 0,
        'max_iter': max_iter,
        'beta_epsilon': 1e-3}
    timeoutSecs = 120
    # L2
    start = time.time()
    kwargs.update({'alpha': 0, 'lambda': 0})
    glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs)
    print "glm (L2) end on ", csvPathname, 'took', time.time() - start, 'seconds'
    h2o_glm.simpleCheckGLM(self, glm, 13, **kwargs)
    # Elastic
    kwargs.update({'alpha': 0.5, 'lambda': 1e-4})
    start = time.time()
    glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs)
    print "glm (Elastic) end on ", csvPathname, 'took', time.time() - start, 'seconds'
    h2o_glm.simpleCheckGLM(self, glm, 13, **kwargs)
    # L1
    kwargs.update({'alpha': 1, 'lambda': 1e-4})
    start = time.time()
    glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs)
    print "glm (L1) end on ", csvPathname, 'took', time.time() - start, 'seconds'
    h2o_glm.simpleCheckGLM(self, glm, 13, **kwargs)
def test_loop_random_param_covtype(self): csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data') parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key='covtype') # for determinism, I guess we should spit out the seed? # random.seed(SEED) SEED = random.randint(0, sys.maxint) # if you have to force to redo a test # SEED = random.seed(SEED) print "\nUsing random seed:", SEED paramDict = define_params() for trial in range(40): # params is mutable. This is default. params = { 'y': 54, 'num_cross_validation_folds' : 3, 'family' : 'binomial', 'max_iter' : 5, 'case': 1, 'alpha': 0, 'lambda': 0 } colX = h2o_glm.pickRandGlmParams(paramDict, params) kwargs = params.copy() start = time.time() glm = h2o_cmd.runGLMOnly(timeoutSecs=150, parseKey=parseKey, **kwargs) h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) # FIX! I suppose we have the problem of stdout/stderr not having flushed? # should hook in some way of flushing the remote node stdout/stderr h2o.check_sandbox_for_errors() print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds' print "Trial #", trial, "completed\n"
def test_rf_big1_nopoll(self): csvPathname = h2o.find_file("smalldata/hhp_107_01.data.gz") print "\n" + csvPathname parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, timeoutSecs=15) rfViewInitial = [] # dispatch multiple jobs back to back for jobDispatch in range(1): start = time.time() kwargs = {} # FIX! what model keys do these get? rfView = h2o_cmd.runRFOnly(parseKey=parseKey, model_key="RF_model"+str(jobDispatch),\ timeoutSecs=300, noPoll=True, **kwargs) rfViewInitial.append(rfView) print "rf job dispatch end on ", csvPathname, 'took', time.time( ) - start, 'seconds' print "\njobDispatch #", jobDispatch h2o_jobs.pollWaitJobs(pattern='GLMModel', timeoutSecs=30, pollTimeoutSecs=120, retryDelaySecs=5) # we saved the initial response? # if we do another poll they should be done now, and better to get it that # way rather than the inspect (to match what simpleCheckGLM is expected for rfView in rfViewInitial: print "Checking completed job, with no polling:", rfView a = h2o.nodes[0].poll_url(rf['response'], noPoll=True) h2o_rf.simpleCheckRFView(None, a)
def test_GLM_gamma_rand2(self): csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data') parseKey = h2o_cmd.parseFile(csvPathname=csvPathname) paramDict = define_params() for trial in range(20): # params is mutable. This is default. params = { 'y': 54, 'n_folds': 3, 'family': "gamma", 'alpha': 0.5, 'lambda': 1e-4, 'max_iter': 24 } colX = h2o_glm.pickRandGlmParams(paramDict, params) kwargs = params.copy() start = time.time() glm = h2o_cmd.runGLMOnly(timeoutSecs=120, parseKey=parseKey, **kwargs) print "glm end on ", csvPathname, 'took', time.time( ) - start, 'seconds' h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) print "Trial #", trial, "completed\n"
def test_B_benign(self): h2o.nodes[0].log_view() namelist = h2o.nodes[0].log_download() print "\nStarting benign.csv" csvFilename = "benign.csv" csvPathname = h2o.find_file('smalldata/logreg' + '/' + csvFilename) parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex") # columns start at 0 y = "3" # cols 0-13. 3 is output # no member id in this one for maxx in range(11,14): x = range(maxx) x.remove(3) # 3 is output x = ",".join(map(str,x)) print "\nx:", x print "y:", y kwargs = {'x': x, 'y': y} # fails with n_folds print "Not doing n_folds with benign. Fails with 'unable to solve?'" glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=15, **kwargs) # no longer look at STR? h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) sys.stdout.write('.') sys.stdout.flush()
def test_E_ParseManyCols(self):
    # Parse a known-tricky 100x11000 gz file, then inspect it.
    csvPathname = h2o.find_file('smalldata/fail1_100x11000.csv.gz')
    parseKey = h2o_cmd.parseFile(None, csvPathname, timeoutSecs=10)
    # NOTE(review): offset=-1 presumably views from the end of the frame --
    # confirm against h2o_cmd.runInspect
    inspect = h2o_cmd.runInspect(None, parseKey['destination_key'], offset=-1, view=5)
def test_GLM_params_rand2_8977501266014959103(self): # csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data') csvPathname = h2o.find_file('smalldata/covtype/covtype.20k.data') parseKey = h2o_cmd.parseFile(csvPathname=csvPathname) # for determinism, I guess we should spit out the seed? # random.seed(SEED) # SEED = random.randint(0, sys.maxint) SEED = 8977501266014959103 # if you have to force to redo a test # SEED = random.seed(SEED) print "\nUsing random seed:", SEED paramDict = define_params() for trial in range(20): # params is mutable. This is default. params = { 'y': 54, 'alpha': 0, 'lambda': 0, 'case': 1, 'n_folds': 1 } colX = h2o_glm.pickRandGlmParams(paramDict, params) kwargs = params.copy() start = time.time() glm = h2o_cmd.runGLMOnly(timeoutSecs=70, parseKey=parseKey, **kwargs) # pass the kwargs with all the params, so we know what we asked for! h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) h2o.check_sandbox_for_errors() print "glm end on ", csvPathname, 'took', time.time( ) - start, 'seconds' print "Trial #", trial, "completed\n"
def test_B_benign_w_predict(self): h2o.nodes[0].log_view() namelist = h2o.nodes[0].log_download() print "\nStarting benign.csv" csvFilename = "benign.csv" csvPathname = h2o.find_file('smalldata/logreg' + '/' + csvFilename) parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex") # columns start at 0 y = "3" # cols 0-13. 3 is output # no member id in this one for maxx in range(11,14): x = range(maxx) x.remove(3) # 3 is output x = ",".join(map(str,x)) print "\nx:", x print "y:", y kwargs = {'x': x, 'y': y} # fails with n_folds glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=15, **kwargs) h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) GLMModel = glm['GLMModel'] modelKey = GLMModel['model_key'] print "Doing predict with same dataset, and the GLM model" h2o.nodes[0].generate_predictions(model_key=modelKey, data_key=parseKey['destination_key'])
def inspect_columns(self, filename, rows=1, cols=26, columnNames=crange('A', 'Z'), columnTypes=None):
    """Parse filename and assert the inspected dims, column names and types.

    Returns the inspect dict.

    FIX: renamed the 'cvsfile' typo local and replaced the dated
    'not x is None' comparisons with 'x is not None'.
    NOTE(review): the columnNames default is evaluated once at definition
    time (mutable-default pattern); it is only read here so it's benign,
    but it must never be mutated.
    """
    csvfile = h2o.find_file(filename)
    node = h2o.nodes[0]
    res = h2o_cmd.parseFile(node=node, csvPathname=csvfile)
    ary = node.inspect(res['destination_key'])
    self.assertEqual(rows, ary['num_rows'])
    self.assertEqual(cols, ary['num_cols'])
    # check column names
    if columnNames is not None:
        for (col, expName) in zip(ary['cols'], columnNames):
            self.assertEqual(expName, col['name'])
    # check column types
    if columnTypes is not None:
        for (col, expType) in zip(ary['cols'], columnTypes):
            self.assertEqual(expType, col['type'])
    return ary
def test_C_prostate_w_predict(self):
    # GLM over prostate.csv with a growing x-column list, predicting back
    # onto the training frame after each fit; node logs pulled at start/end.
    h2o.nodes[0].log_view()
    namelist = h2o.nodes[0].log_download()
    print "\nStarting prostate.csv"
    # columns start at 0
    y = "1"
    x = ""
    csvFilename = "prostate.csv"
    csvPathname = h2o.find_file('smalldata/logreg' + '/' + csvFilename)
    parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex")
    for maxx in range(2, 6):
        x = range(maxx)
        x.remove(0)  # 0 is member ID. not used
        x.remove(1)  # 1 is output
        x = ",".join(map(str, x))
        print "\nx:", x
        print "y:", y
        kwargs = {'x': x, 'y': y, 'n_folds': 5}
        glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=15, **kwargs)
        # ID,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON
        h2o_glm.simpleCheckGLM(self, glm, 'AGE', **kwargs)
        GLMModel = glm['GLMModel']
        modelKey = GLMModel['model_key']
        print "Doing predict with same dataset, and the GLM model"
        h2o.nodes[0].generate_predictions(model_key=modelKey, data_key=parseKey['destination_key'])
    # NOTE(review): final log pull assumed to be post-loop (mirrors the
    # pull at the top) -- the collapsed source is ambiguous; confirm.
    h2o.nodes[0].log_view()
    namelist = h2o.nodes[0].log_download()
def test_sort_of_prostate_with_row_schmoo(self):
    # Variant without totalRows tracking: build a prostate-format csv,
    # append one row per trial, and re-parse under fresh key/key2 names.
    SYNDATASETS_DIR = h2o.make_syn_dir()
    csvFilename = "syn_prostate.csv"
    csvPathname = SYNDATASETS_DIR + '/' + csvFilename
    headerData = "ID,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON"
    rowData = "1,0,65,1,2,1,1.4,0,6"
    write_syn_dataset(csvPathname, 99860, headerData, rowData)
    print "This is the same format/data file used by test_same_parse, but the non-gzed version"
    print "\nSchmoo the # of rows"
    print "Updating the key and key2 names for each trial"
    for trial in range(200):
        append_syn_dataset(csvPathname, rowData)
        ### start = time.time()
        # this was useful to cause failures early on. Not needed eventually
        ### key = h2o_cmd.parseFile(csvPathname=h2o.find_file("smalldata/logreg/prostate.csv"))
        ### print "Trial #", trial, "parse end on ", "prostate.csv" , 'took', time.time() - start, 'seconds'
        start = time.time()
        # fresh key/key2 per trial so each parse is a new destination
        key = csvFilename + "_" + str(trial)
        key2 = csvFilename + "_" + str(trial) + ".hex"
        key = h2o_cmd.parseFile(csvPathname=csvPathname, key=key, key2=key2)
        print "trial #", trial, "parse end on ", csvFilename, 'took', time.time() - start, 'seconds'
        h2o_cmd.runInspect(key=key2)
        # only used this for debug to look at parse (red last row) on failure
        ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
        h2o.check_sandbox_for_errors()
def test_glm_covtype_single_cols(self): timeoutSecs = 10 csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data') print "\n" + csvPathname # columns start at 0 y = "54" x = "" parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, timeoutSecs=15) print "GLM binomial wth 1 X column at a time" print "Result check: abs. value of coefficient and intercept returned are bigger than zero" for colX in xrange(54): if x == "": x = str(colX) else: # x = x + "," + str(colX) x = str(colX) sys.stdout.write('.') sys.stdout.flush() print "\nx:", x print "y:", y start = time.time() kwargs = {'x': x, 'y': y, 'n_folds': 6, 'case': 2} glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs) h2o_glm.simpleCheckGLM(self, glm, colX, **kwargs) print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds'
def test_GLM_gaussian_rand2(self): csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data') parseKey = h2o_cmd.parseFile(csvPathname=csvPathname) # for determinism, I guess we should spit out the seed? # random.seed(SEED) SEED = random.randint(0, sys.maxint) # if you have to force to redo a test # SEED = random.seed(SEED) print "\nUsing random seed:", SEED paramDict = define_params() for trial in range(20): # params is mutable. This is default. params = {'y': 54, 'num_cross_validation_folds': 3, 'family': "gaussian", 'alpha': 0.5, 'lambda': 1e-4, 'max_iter': 30} colX = h2o_glm.pickRandGlmParams(paramDict, params) kwargs = params.copy() start = time.time() glm = h2o_cmd.runGLMOnly(timeoutSecs=120, parseKey=parseKey, **kwargs) print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds' start = time.time() h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) print "simpleCheckGLM end on ", csvPathname, 'took', time.time() - start, 'seconds' print "Trial #", trial, "completed\n"
def test_putfile_a5m(self):
    # Put+parse a5m.csv three times under different keys and inspect each;
    # RF only runs when the tuple's trees value is not None (all None here).
    timeoutSecs = 500
    csvFilenameList = [
        # use different names for each parse
        # doesn't fail if gzipped?
        ("a5m.csv", 'A', None),
        ("a5m.csv", 'B', None),
        ("a5m.csv", 'C', None),
    ]
    # pop open a browser on the cloud
    h2b.browseTheCloud()
    for (csvFilename, key, trees) in csvFilenameList:
        csvPathname = h2o.find_dataset(csvFilename)
        # creates csvFilename and csvFilename.hex keys
        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key=key, timeoutSecs=500)
        print csvFilename, 'parse time:', parseKey['response']['time']
        print "Parse result['destination_key']:", parseKey['destination_key']
        inspect = h2o_cmd.runInspect(key=parseKey['destination_key'])
        print "\n" + csvFilename
        start = time.time()
        # constrain depth to 25
        if trees is not None:
            # NOTE(review): the browse/sleep placement is inferred from the
            # collapsed source; assumed to follow the RF run -- confirm
            RFview = h2o_cmd.runRFOnly(trees=trees, depth=25, parseKey=parseKey, timeoutSecs=timeoutSecs)
            h2b.browseJsonHistoryAsUrlLastMatch("RFView")
            # wait in case it recomputes it
            time.sleep(10)
        sys.stdout.write('.')
        sys.stdout.flush()
def test_many_cols_with_syn(self): ### h2b.browseTheCloud() csvFilename = "logreg_trisum_int_cat_10000x10.csv" csvPathname = "smalldata/logreg/" + csvFilename key2 = csvFilename + ".hex" parseKey = h2o_cmd.parseFile(None, h2o.find_file(csvPathname), key2=key2, timeoutSecs=10) print csvFilename, 'parse time:', parseKey['response']['time'] print "Parse result['destination_key']:", parseKey['destination_key'] # We should be able to see the parse result? inspect = h2o_cmd.runInspect(None, parseKey['destination_key']) print "\n" + csvFilename paramDict = define_params() paramDict2 = {} for k in paramDict: # sometimes we have a list to pick from in the value. now it's just list of 1. paramDict2[k] = paramDict[k][0] y = 10 # FIX! what should we have for case? 1 should be okay because we have 1's in output col kwargs = {'y': y, 'max_iter': 50} kwargs.update(paramDict2) start = time.time() glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=20, **kwargs) print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds' h2o_glm.simpleCheckGLM(self, glm, 8, **kwargs) if not h2o.browse_disable: h2b.browseJsonHistoryAsUrlLastMatch("Inspect") time.sleep(5)
def test_factor_with_syn(self): SYNDATASETS_DIR = h2o.make_syn_dir() # use SEED so the file isn't cached? csvFilenameAll = [ ('syn_1mx8_' + str(SEED) + '.csv', 'cA', 5), ] ### csvFilenameList = random.sample(csvFilenameAll,1) csvFilenameList = csvFilenameAll ### h2b.browseTheCloud() lenNodes = len(h2o.nodes) for (csvFilename, key2, timeoutSecs) in csvFilenameList: SEEDPERFILE = random.randint(0, sys.maxint) csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random 1mx8 csv" write_syn_dataset(csvPathname, 1000000, SEEDPERFILE) # creates csvFilename.hex from file in importFolder dir parseKey = h2o_cmd.parseFile(None, csvPathname, key2=key2, timeoutSecs=2000) print csvFilename, 'parse time:', parseKey['response']['time'] print "Parse result['destination_key']:", parseKey['destination_key'] # We should be able to see the parse result? inspect = h2o_cmd.runInspect(None, parseKey['destination_key']) print "\n" + csvFilename h2e.exec_zero_list(zeroList) # does n+1 so use maxCol 6 h2e.exec_expr_list_rand(lenNodes, exprList, key2, maxCol=6, maxRow=400000, maxTrials=200, timeoutSecs=timeoutSecs)
def test_GLM_params_rand2_4082088627997819015(self): csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data') parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key='covtype') paramDict = define_params() for trial in range(40): # params is mutable. This is default. params = { 'y': 54, 'n_folds' : 3, 'family' : 'binomial', 'max_iter' : 5, 'case': 1, 'alpha': 0, 'lambda': 0 } colX = h2o_glm.pickRandGlmParams(paramDict, params) kwargs = params.copy() start = time.time() timeoutSecs = max(150, params['n_folds']*10 + params['max_iter']*10) glm = h2o_cmd.runGLMOnly(timeoutSecs=timeoutSecs, parseKey=parseKey, **kwargs) elapsed = time.time() - start h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) # FIX! I suppose we have the problem of stdout/stderr not having flushed? # should hook in some way of flushing the remote node stdout/stderr h2o.check_sandbox_for_errors() print "glm end on ", csvPathname, 'took', elapsed, 'seconds.',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "Trial #", trial, "completed\n"
def test_GenParity1(self):
    """Generate a 10000-row parity dataset with parity.pl, then run RF on it nine times."""
    SYNDATASETS_DIR = h2o.make_syn_dir()
    # always match the run below!
    for rows in [10000]:
        # Have to split the string out to list for pipe
        shCmdString = "perl " + h2o.find_file("syn_scripts/parity.pl") + " 128 4 " + str(rows) + " quad"
        h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(), 4)
        # the algorithm for creating the path and filename is hardwired in parity.pl..i.e
        csvFilename = "parity_128_4_" + str(rows) + "_quad.data"

    # always match the gen above!
    trial = 1
    for _ in xrange(1, 10, 1):
        sys.stdout.write('.')
        sys.stdout.flush()
        # just use one file for now
        csvFilename = "parity_128_4_" + str(10000) + "_quad.data"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        # broke out the put separately so we can iterate a test just on the RF
        parseKey = h2o_cmd.parseFile(None, csvPathname)
        h2o.verboseprint("Trial", trial)
        h2o_cmd.runRFOnly(parseKey=parseKey, trees=237, depth=45, timeoutSecs=120)
        # don't change tree count yet
        ## trees += 10
        ### timeoutSecs += 2
        trial += 1
def kmeans_doit(self, csvFilename, csvPathname, timeoutSecs=30): print "\nStarting KMeans of", csvFilename parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex", timeoutSecs=10) # hastie has two values, 1 and -1. # we could not specify cols, but this is more fun cols = ",".join(map(str,range(11))) kwargs = { 'k': 1, 'epsilon': 1e-6, 'cols': cols, 'destination_key': 'KMeansModel.hex' } start = time.time() kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, \ timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs) elapsed = time.time() - start print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \ "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100) h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseKey, 'd', **kwargs) # compare this kmeans to the first one. since the files are replications, the results # should be similar? inspect = h2o_cmd.runInspect(None, key=kmeans['destination_key']) KMeansModel = inspect['KMeansModel'] clusters = KMeansModel['clusters'][0] print "clusters:", h2o.dump_json(clusters) if self.clusters1: h2o_kmeans.compareToFirstKMeans(self, clusters, self.clusters1) else: self.clusters1 = copy.deepcopy(clusters)
def test_GenParity1(self): SYNDATASETS_DIR = h2o.make_syn_dir() # always match the run below! # just using one file for now for x in [1000]: shCmdString = "perl " + h2o.find_file("syn_scripts/parity.pl") + " 128 4 "+ str(x) + " quad" h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(),4) csvFilename = "parity_128_4_" + str(x) + "_quad.data" # always match the gen above! for trial in range (1,5): sys.stdout.write('.') sys.stdout.flush() csvFilename = "parity_128_4_" + str(1000) + "_quad.data" csvPathname = SYNDATASETS_DIR + '/' + csvFilename key2 = csvFilename + "_" + str(trial) + ".hex" parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=key2, timeoutSecs=30) h2o.verboseprint("Trial", trial) start = time.time() h2o_cmd.runRFOnly(parseKey=parseKey, trees=1000, depth=2, timeoutSecs=600, retryDelaySecs=3) print "RF #", trial, "end on ", csvFilename, 'took', time.time() - start, 'seconds' print "Waiting 60 secs for TIME_WAIT sockets to go away" time.sleep(60)
def glm_doit(self, csvFilename, csvPathname, timeoutSecs=30): print "\nStarting GLM of", csvFilename parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex", timeoutSecs=10) y = "10" x = "" # Took num_cross_validation_folds out, because GLM doesn't include num_cross_validation_folds time and it's slow # wanted to compare GLM time to my measured time # hastie has two values, 1 and -1. need to use case for one of them kwargs = {'x': x, 'y': y, 'case': -1} start = time.time() glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs) print "GLM in", (time.time() - start), "secs (python measured)" h2o_glm.simpleCheckGLM(self, glm, 7, **kwargs) # compare this glm to the first one. since the files are replications, the results # should be similar? GLMModel = glm['GLMModel'] validationsList = glm['GLMModel']['validations'] validations = validationsList[0] # validations['err'] if self.validations1: h2o_glm.compareToFirstGlm(self, 'err', validations, self.validations1) else: self.validations1 = copy.deepcopy(validations)
def test_rf_covtype_train_full(self): csvFilename = 'train.csv' csvPathname = h2o.find_dataset('bench/covtype/h2o/' + csvFilename) print "\nUsing header=1 even though I shouldn't have to. Otherwise I get NA in first row and RF bad\n" parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex", header=1, timeoutSecs=180) for trial in range(1): # params is mutable. This is default. kwargs = paramDict # adjust timeoutSecs with the number of trees # seems ec2 can be really slow timeoutSecs = 30 + kwargs['ntree'] * 20 start = time.time() rfView = h2o_cmd.runRF(csvPathname=csvPathname, timeoutSecs=timeoutSecs, **kwargs) elapsed = time.time() - start print "RF end on ", csvPathname, 'took', elapsed, 'seconds.', \ "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100) classification_error = rfView['confusion_matrix'][ 'classification_error'] self.assertLess( classification_error, 0.02, "train.csv should have full classification error <0.02") print "Trial #", trial, "completed"
def test_sort_of_prostate_with_row_schmoo(self): SEED = random.randint(0, sys.maxint) # if you have to force to redo a test # SEED = random.seed(SEED) print "\nUsing random seed:", SEED SYNDATASETS_DIR = h2o.make_syn_dir() csvFilename = "syn_prostate.csv" csvPathname = SYNDATASETS_DIR + '/' + csvFilename headerData = "ID,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON" rowData = rand_rowData() write_syn_dataset(csvPathname, 1, headerData, rowData) print "This is the same format/data file used by test_same_parse, but the non-gzed version" print "\nSchmoo the # of rows" for trial in range (100): rowData = rand_rowData() num = random.randint(1, 10096) append_syn_dataset(csvPathname, rowData, num) start = time.time() # make sure all key names are unique, when we re-put and re-parse (h2o caching issues) key = csvFilename + "_" + str(trial) key2 = csvFilename + "_" + str(trial) + ".hex" key = h2o_cmd.parseFile(csvPathname=csvPathname, key=key, key2=key2, timeoutSecs=70, pollTimeoutSecs=60) print "trial #", trial, "with num rows:", num, "parse end on ", csvFilename, \ 'took', time.time() - start, 'seconds' ### h2o_cmd.runInspect(key=key2) ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect") h2o.check_sandbox_for_errors()
def glm_doit(self, csvFilename, csvPathname, timeoutSecs=30): print "\nStarting parse of", csvFilename parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex", timeoutSecs=10) y = "10" x = "" # NOTE: hastie has two values, -1 and 1. To make H2O work if two valued and not 0,1 have kwargs = { 'x': x, 'y': y, 'case': '1', 'destination_key': 'gg', # better classifier it flipped? (better AUC?) 'max_iter': 10, 'case': -1, 'case_mode': '=', 'num_cross_validation_folds': 0, 'lambda': '1e-8,1e-4,1e-3', 'alpha': '0,0.25,0.8', # hardwire threshold to 0.5 because the dataset is so senstive right around threshold # otherwise, GLMGrid will pick a model with zero coefficients, if it has the best AUC # to avoid my checker complaining about all zero coefficients, force the threshold to 0.5 'thresholds': '0.5', # 'thresholds': '0.2:0.8:0.1' } start = time.time() print "\nStarting GLMGrid of", csvFilename glmGridResult = h2o_cmd.runGLMGridOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs) print "GLMGrid in", (time.time() - start), "secs (python)" # still get zero coeffs..best model is AUC = 0.5 with intercept only. h2o_glm.simpleCheckGLMGrid(self,glmGridResult, allowZeroCoeff=True,**kwargs)
def test_kmeans_sphere3(self): SYNDATASETS_DIR = h2o.make_syn_dir() csvFilename = 'syn_spheres3_' + str(SEED) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename write_syn_dataset(csvPathname, 1000000, SEED) print "\nStarting", csvFilename parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex") kwargs = {'k': 3, 'epsilon': 1e-6, 'cols': None, 'destination_key': 'spheres3.hex'} timeoutSecs = 30 start = time.time() kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs) elapsed = time.time() - start print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100) centers = h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseKey, 'd', **kwargs) # cluster centers can return in any order centersSorted = sorted(centers, key=itemgetter(0)) self.assertAlmostEqual(centersSorted[0][0],100,delta=.2) self.assertAlmostEqual(centersSorted[1][0],200,delta=.2) self.assertAlmostEqual(centersSorted[2][0],300,delta=.2) self.assertAlmostEqual(centersSorted[0][1],100,delta=.2) self.assertAlmostEqual(centersSorted[1][1],200,delta=.2) self.assertAlmostEqual(centersSorted[2][1],300,delta=.2) self.assertAlmostEqual(centersSorted[0][2],100,delta=.2) self.assertAlmostEqual(centersSorted[1][2],200,delta=.2) self.assertAlmostEqual(centersSorted[2][2],300,delta=.2) show_results(csvPathname, parseKey, model_key, centers, 'd')
def test_GLM_params_rand2(self): # csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data') csvPathname = h2o.find_file('smalldata/covtype/covtype.20k.data') parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key="covtype.20k") # for determinism, I guess we should spit out the seed? # random.seed(SEED) SEED = random.randint(0, sys.maxint) # if you have to force to redo a test # SEED = random.seed(SEED) print "\nUsing random seed:", SEED paramDict = define_params() for trial in range(20): # params is mutable. This is default. params = {'y': 54, 'case': 1, 'alpha': 0, 'lambda': 0, 'n_folds': 1} colX = h2o_glm.pickRandGlmParams(paramDict, params) kwargs = params.copy() start = time.time() glm = h2o_cmd.runGLMOnly(timeoutSecs=70, parseKey=parseKey, **kwargs) # pass the kwargs with all the params, so we know what we asked for! h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) h2o.check_sandbox_for_errors() print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds' print "Trial #", trial, "completed\n"
def test_many_cols_and_types(self): SEED = random.randint(0, sys.maxint) print "\nUsing random seed:", SEED # SEED = random.seed(SEED) SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ (100, 5, "cA", 5), (1000, 59, "cB", 5), (5000, 128, "cC", 5), (6000, 507, "cD", 5), (9000, 663, "cE", 5), ] for (rowCount, colCount, key2, timeoutSecs) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = "syn_%s_%s_%s.csv" % (SEEDPERFILE, rowCount, colCount) csvPathname = SYNDATASETS_DIR + "/" + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE) parseKey = h2o_cmd.parseFile(None, csvPathname, key2=key2, timeoutSecs=30) print csvFilename, "parse time:", parseKey["response"]["time"] print "Parse result['destination_key']:", parseKey["destination_key"] inspect = h2o_cmd.runInspect(None, parseKey["destination_key"]) h2o_cmd.infoFromInspect(inspect, csvPathname) print "\n" + csvFilename
def test_C_prostate(self): h2o.nodes[0].log_view() namelist = h2o.nodes[0].log_download() print "\nStarting prostate.csv" # columns start at 0 y = "1" x = "" csvFilename = "prostate.csv" csvPathname = h2o.find_file('smalldata/logreg' + '/' + csvFilename) parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex") for maxx in range(2,6): x = range(maxx) x.remove(0) # 0 is member ID. not used x.remove(1) # 1 is output x = ",".join(map(str,x)) print "\nx:", x print "y:", y kwargs = {'x': x, 'y': y, 'n_folds': 5} glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=15, **kwargs) # ID,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON h2o_glm.simpleCheckGLM(self, glm, 'AGE', **kwargs) sys.stdout.write('.') sys.stdout.flush() h2o.nodes[0].log_view() namelist = h2o.nodes[0].log_download()
def test_many_cols_with_syn(self): SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ (100, 11, 'cA', 5), (100, 10, 'cB', 5), (100, 9, 'cC', 5), (100, 8, 'cD', 5), (100, 7, 'cE', 5), (100, 6, 'cF', 5), (100, 5, 'cG', 5), ] ### h2b.browseTheCloud() lenNodes = len(h2o.nodes) cnum = 0 for (rowCount, colCount, key2, timeoutSecs) in tryList: cnum += 1 csvFilename = 'syn_' + str(SEED) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEED) parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex") print "Parse result['destination_key']:", parseKey['destination_key'] kwargs = {'k': 2, 'epsilon': 1e-6, 'cols': None, 'destination_key': 'benign_k.hex'} kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, timeoutSecs=5, **kwargs) h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseKey, 'd', **kwargs)
def test_many_cols_and_types(self): SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ (100, 5, 'cA', 5), (1000, 59, 'cB', 5), (5000, 128, 'cC', 5), (6000, 507, 'cD', 5), (9000, 663, 'cE', 5), ] for (rowCount, colCount, key2, timeoutSecs) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = "syn_%s_%s_%s.csv" % (SEEDPERFILE, rowCount, colCount) csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE) parseKey = h2o_cmd.parseFile(None, csvPathname, key2=key2, timeoutSecs=30) print csvFilename, 'parse time:', parseKey['response']['time'] print "Parse result['destination_key']:", parseKey[ 'destination_key'] inspect = h2o_cmd.runInspect(None, parseKey['destination_key']) h2o_cmd.infoFromInspect(inspect, csvPathname) print "\n" + csvFilename