def test_C_prostate(self): print "\nStarting prostate.csv" # columns start at 0 y = "1" csvFilename = "prostate.csv" csvPathname = 'logreg/' + csvFilename parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=csvFilename + ".hex", schema='put') for maxx in range(2,9): x = range(maxx) x.remove(0) # 0 is member ID. not used x.remove(1) # 1 is output x = ",".join(map(str,x)) print "\nx:", x print "y:", y # solver can be ADMM. standardize normalizes the data. kwargs = {'x': x, 'y': y, 'n_folds': 5,\ 'expert_settings': 1, 'lsm_solver': 'GenGradient', 'standardize':1} glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=30, **kwargs) # ID,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON h2o_glm.simpleCheckGLM(self, glm, 'AGE', **kwargs) h2o.check_sandbox_for_errors() sys.stdout.write('.') sys.stdout.flush()
def test_C_hhp_107_01(self): csvPathname = h2o.find_file("smalldata/hhp_107_01.data.gz") print "\n" + csvPathname parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, timeoutSecs=15) # pop open a browser on the cloud h2b.browseTheCloud() # build up the parameter string in X y = "106" x = "" # go right to the big X and iterate on that case ### for trial in range(2): for trial in range(2): print "\nTrial #", trial, "start" print "\nx:", x print "y:", y start = time.time() kwargs = {'y': y} glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=200, **kwargs) h2o_glm.simpleCheckGLM(self, glm, 57, **kwargs) h2o.check_sandbox_for_errors() ### h2b.browseJsonHistoryAsUrlLastMatch("GLM") print "\nTrial #", trial
def test_B_benign(self): print "\nStarting benign.csv" csvFilename = "benign.csv" csvPathname = h2o.find_file('smalldata/logreg' + '/' + csvFilename) parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex") # columns start at 0 y = "3" # cols 0-13. 3 is output # no member id in this one for maxx in range(4,14): x = range(maxx) x.remove(3) # 3 is output x = ",".join(map(str,x)) print "\nx:", x print "y:", y # solver can be ADMM kwargs = {'x': x, 'y': y,\ 'expert': 1, 'lsm_solver': 'GenGradient', 'standardize': 1, 'n_folds': 1} # fails with n_folds print "Not doing n_folds with benign. Fails with 'unable to solve?'" glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=30, **kwargs) # no longer look at STR? h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) h2o.check_sandbox_for_errors() sys.stdout.write('.') sys.stdout.flush()
def test_GLM_params_rand2_4082088627997819015(self): csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data') parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key='covtype') paramDict = define_params() for trial in range(40): # params is mutable. This is default. params = { 'y': 54, 'n_folds' : 3, 'family' : 'binomial', 'max_iter' : 5, 'case': 1, 'alpha': 0, 'lambda': 0 } colX = h2o_glm.pickRandGlmParams(paramDict, params) kwargs = params.copy() start = time.time() timeoutSecs = max(150, params['n_folds']*10 + params['max_iter']*10) glm = h2o_cmd.runGLMOnly(timeoutSecs=timeoutSecs, parseKey=parseKey, **kwargs) elapsed = time.time() - start h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) # FIX! I suppose we have the problem of stdout/stderr not having flushed? # should hook in some way of flushing the remote node stdout/stderr h2o.check_sandbox_for_errors() print "glm end on ", csvPathname, 'took', elapsed, 'seconds.',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "Trial #", trial, "completed\n"
def test_GLM_poisson_rand2(self): csvPathname = 'standard/covtype.data' parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='put') paramDict = define_params() for trial in range(20): # params is mutable. This is default. # FIX! does it never end if we don't have alpha specified? params = { 'y': 54, 'n_folds': 3, 'family': "poisson", 'alpha': 0.5, 'lambda': 1e-4, 'beta_epsilon': 0.001, 'max_iter': 15, } colX = h2o_glm.pickRandGlmParams(paramDict, params) kwargs = params.copy() # make timeout bigger with xvals timeoutSecs = 60 + (kwargs['n_folds']*40) # or double the 4 seconds per iteration (max_iter+1 worst case?) timeoutSecs = max(timeoutSecs, (8 * (kwargs['max_iter']+1))) start = time.time() glm = h2o_cmd.runGLM(timeoutSecs=timeoutSecs, parseResult=parseResult, **kwargs) print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds' h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) print "Trial #", trial, "completed\n"
def glm_doit(self, csvFilename, csvPathname, timeoutSecs=30): print "\nStarting GLM of", csvFilename parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex", timeoutSecs=10) y = "10" x = "" # Took num_cross_validation_folds out, because GLM doesn't include num_cross_validation_folds time and it's slow # wanted to compare GLM time to my measured time # hastie has two values, 1 and -1. need to use case for one of them kwargs = {'x': x, 'y': y, 'case': -1} start = time.time() glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs) print "GLM in", (time.time() - start), "secs (python measured)" h2o_glm.simpleCheckGLM(self, glm, 7, **kwargs) # compare this glm to the first one. since the files are replications, the results # should be similar? GLMModel = glm['GLMModel'] validationsList = glm['GLMModel']['validations'] validations = validationsList[0] # validations['err'] if self.validations1: h2o_glm.compareToFirstGlm(self, 'err', validations, self.validations1) else: self.validations1 = copy.deepcopy(validations)
def test_NOPASS_GLM2_weight_nan_fail(self): h2o.beta_features = True csvPathname = 'covtype/covtype.20k.data' hex_key = 'covtype.20k.hex' parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=hex_key, schema='put') kwargs = { 'destination_key': 'GLM_model_python_0_default_0', 'family': 'tweedie', 'tweedie_variance_power': 1.9999999, 'max_iter': 10, 'alpha': 0, 'lambda': 0, 'response': 54, } for trial in range(3): # params is mutable. This is default. start = time.time() glm = h2o_cmd.runGLM(timeoutSecs=70, parseResult=parseResult, **kwargs) h2o.check_sandbox_for_errors() # pass the kwargs with all the params, so we know what we asked for! h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds' print "Trial #", trial, "completed\n"
def test_GLM2_params_rand2(self): csvPathname = 'covtype/covtype.20k.data' parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put', hex_key="covtype.20k") CLASS = 1 # make a binomial version execExpr="B.hex=%s; B.hex[,%s]=(B.hex[,%s]==%s)" % ('covtype.20k', 54+1, 54+1, CLASS) h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) paramDict = define_params() for trial in range(20): # params is mutable. This is default. params = { 'response': 54, 'alpha': 0.1, # 'lambda': 1e-4, 'lambda': 0, 'n_folds': 1, } colX = h2o_glm.pickRandGlmParams(paramDict, params) kwargs = params.copy() if 'family' not in kwargs or kwargs['family']=='binomial': bHack = {'destination_key': 'B.hex'} else: bHack = parseResult start = time.time() glm = h2o_cmd.runGLM(timeoutSecs=300, parseResult=bHack, **kwargs) # pass the kwargs with all the params, so we know what we asked for! h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) h2o.check_sandbox_for_errors() print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds' print "Trial #", trial, "completed\n"
def test_GLM_big1_nopoll(self): csvPathname = 'hhp_107_01.data.gz' print "\n" + csvPathname y = "106" x = "" parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put', timeoutSecs=15) glmInitial = [] # dispatch multiple jobs back to back start = time.time() for jobDispatch in range(10): kwargs = {'x': x, 'y': y, 'n_folds': 1} # FIX! what model keys do these get? glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=300, noPoll=True, **kwargs) glmInitial.append(glm) print "glm job dispatch end on ", csvPathname, 'took', time.time() - start, 'seconds' print "\njobDispatch #", jobDispatch timeoutSecs = 200 h2o_jobs.pollWaitJobs(pattern='GLM', timeoutSecs=timeoutSecs, retryDelaySecs=10) elapsed = time.time() - start print "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100) # we saved the initial response? # if we do another poll they should be done now, and better to get it that # way rather than the inspect (to match what simpleCheckGLM is expected for glm in glmInitial: print "Checking completed job, with no polling using initial response:", h2o.dump_json(glm) a = h2o.nodes[0].poll_url(glm, noPoll=True) h2o_glm.simpleCheckGLM(self, a, 57, **kwargs)
def test_GLM2_syn_2659x1049x2enum(self):
    # Parse syn_2659x1049x2enum.csv from the smalldata bucket, run one GLM
    # on it, and check the result with simpleCheckGLM.
    csvFilename = "syn_2659x1049x2enum.csv"
    csvPathname = "logreg" + "/" + csvFilename
    parseResult = h2i.import_parse(bucket="smalldata", path=csvPathname, hex_key=csvFilename + ".hex", schema="put")
    # NOTE(review): `params` is not defined in this function; presumably a
    # module-level dict of GLM arguments -- confirm. kwargs aliases it (no copy).
    kwargs = params
    glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=240, **kwargs)
    h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
def test_C_prostate(self): print "\nStarting prostate.csv" # columns start at 0 y = "1" csvFilename = "prostate.csv" csvPathname = h2o.find_file('smalldata/logreg' + '/' + csvFilename) parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex") for maxx in range(2,9): x = range(maxx) x.remove(0) # 0 is member ID. not used x.remove(1) # 1 is output x = ",".join(map(str,x)) print "\nx:", x print "y:", y # solver can be ADMM. standardize normalizes the data. kwargs = {'x': x, 'y': y, 'n_folds': 5,\ 'expert': 1, 'lsm_solver': 'GenGradient', 'standardize':1} glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=30, **kwargs) # ID,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON h2o_glm.simpleCheckGLM(self, glm, 'AGE', **kwargs) h2o.check_sandbox_for_errors() sys.stdout.write('.') sys.stdout.flush()
def test_glm_covtype_single_cols(self): timeoutSecs = 10 csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data') print "\n" + csvPathname # columns start at 0 y = "54" x = "" parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, timeoutSecs=15) print "GLM binomial wth 1 X column at a time" print "Result check: abs. value of coefficient and intercept returned are bigger than zero" for colX in xrange(54): if x == "": x = str(colX) else: # x = x + "," + str(colX) x = str(colX) sys.stdout.write('.') sys.stdout.flush() print "\nx:", x print "y:", y start = time.time() kwargs = {'x': x, 'y': y, 'n_folds': 6, 'case': 2} glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs) h2o_glm.simpleCheckGLM(self, glm, colX, **kwargs) print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds'
def test_B_benign_w_predict(self): h2o.nodes[0].log_view() namelist = h2o.nodes[0].log_download() print "\nStarting benign.csv" csvFilename = "benign.csv" csvPathname = h2o.find_file('smalldata/logreg' + '/' + csvFilename) parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex") # columns start at 0 y = "3" # cols 0-13. 3 is output # no member id in this one for maxx in range(11,14): x = range(maxx) x.remove(3) # 3 is output x = ",".join(map(str,x)) print "\nx:", x print "y:", y kwargs = {'x': x, 'y': y} # fails with n_folds glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=15, **kwargs) h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) GLMModel = glm['GLMModel'] modelKey = GLMModel['model_key'] print "Doing predict with same dataset, and the GLM model" h2o.nodes[0].generate_predictions(model_key=modelKey, data_key=parseKey['destination_key'])
def test_C_prostate_w_predict(self): h2o.nodes[0].log_view() namelist = h2o.nodes[0].log_download() print "\nStarting prostate.csv" # columns start at 0 y = "1" x = "" csvFilename = "prostate.csv" csvPathname = h2o.find_file('smalldata/logreg' + '/' + csvFilename) parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex") for maxx in range(2,6): x = range(maxx) x.remove(0) # 0 is member ID. not used x.remove(1) # 1 is output x = ",".join(map(str,x)) print "\nx:", x print "y:", y kwargs = {'x': x, 'y': y, 'n_folds': 5} glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=15, **kwargs) # ID,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON h2o_glm.simpleCheckGLM(self, glm, 'AGE', **kwargs) GLMModel = glm['GLMModel'] modelKey = GLMModel['model_key'] print "Doing predict with same dataset, and the GLM model" h2o.nodes[0].generate_predictions(model_key=modelKey, data_key=parseKey['destination_key']) h2o.nodes[0].log_view() namelist = h2o.nodes[0].log_download()
def test_C_prostate(self): h2o.nodes[0].log_view() namelist = h2o.nodes[0].log_download() print "\nStarting prostate.csv" # columns start at 0 y = "1" x = "" csvFilename = "prostate.csv" csvPathname = "logreg" + "/" + csvFilename parseResult = h2i.import_parse(bucket="smalldata", path=csvPathname, hex_key=csvFilename + ".hex", schema="put") for maxx in range(2, 6): x = range(maxx) x.remove(0) # 0 is member ID. not used x.remove(1) # 1 is output x = ",".join(map(str, x)) print "\nx:", x print "y:", y kwargs = {"x": x, "y": y, "n_folds": 5} glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=15, **kwargs) # ID,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON h2o_glm.simpleCheckGLM(self, glm, "AGE", **kwargs) sys.stdout.write(".") sys.stdout.flush() # now redo it all thru the browser # three times! for i in range(3): h2b.browseJsonHistoryAsUrl() h2o.nodes[0].log_view() namelist = h2o.nodes[0].log_download()
def test_B_benign(self): h2o.nodes[0].log_view() namelist = h2o.nodes[0].log_download() print "\nStarting benign.csv" csvFilename = "benign.csv" csvPathname = "logreg" + "/" + csvFilename parseResult = h2i.import_parse(bucket="smalldata", path=csvPathname, hex_key=csvFilename + ".hex", schema="put") # columns start at 0 y = "3" # cols 0-13. 3 is output # no member id in this one for maxx in range(11, 14): x = range(maxx) x.remove(3) # 3 is output x = ",".join(map(str, x)) print "\nx:", x print "y:", y kwargs = {"x": x, "y": y} glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=15, **kwargs) # no longer look at STR? h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) sys.stdout.write(".") sys.stdout.flush() # now redo it all thru the browser h2b.browseJsonHistoryAsUrl()
def test_loop_random_param_covtype(self): csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data') parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key='covtype') # for determinism, I guess we should spit out the seed? # random.seed(SEED) SEED = random.randint(0, sys.maxint) # if you have to force to redo a test # SEED = random.seed(SEED) print "\nUsing random seed:", SEED paramDict = define_params() for trial in range(40): # params is mutable. This is default. params = { 'y': 54, 'num_cross_validation_folds' : 3, 'family' : 'binomial', 'max_iter' : 5, 'case': 1, 'alpha': 0, 'lambda': 0 } colX = h2o_glm.pickRandGlmParams(paramDict, params) kwargs = params.copy() start = time.time() glm = h2o_cmd.runGLMOnly(timeoutSecs=150, parseKey=parseKey, **kwargs) h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) # FIX! I suppose we have the problem of stdout/stderr not having flushed? # should hook in some way of flushing the remote node stdout/stderr h2o.check_sandbox_for_errors() print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds' print "Trial #", trial, "completed\n"
def test_B_benign(self): h2o.nodes[0].log_view() namelist = h2o.nodes[0].log_download() print "\nStarting benign.csv" csvFilename = "benign.csv" csvPathname = 'logreg' + '/' + csvFilename parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=csvFilename + ".hex", schema='put') # columns start at 0 y = "3" # cols 0-13. 3 is output # no member id in this one for maxx in range(11,14): x = range(maxx) x.remove(3) # 3 is output x = ",".join(map(str,x)) print "\nx:", x print "y:", y kwargs = {'x': x, 'y': y} glm = h2o_cmd.runGLMOnly(parseResult=parseResult, timeoutSecs=15, **kwargs) # no longer look at STR? h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) sys.stdout.write('.') sys.stdout.flush()
def test_C_prostate(self): h2o.nodes[0].log_view() namelist = h2o.nodes[0].log_download() print "\nStarting prostate.csv" # columns start at 0 y = "1" x = "" csvFilename = "prostate.csv" csvPathname = h2o.find_file('smalldata/logreg' + '/' + csvFilename) parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex") for maxx in range(2,6): x = range(maxx) x.remove(0) # 0 is member ID. not used x.remove(1) # 1 is output x = ",".join(map(str,x)) print "\nx:", x print "y:", y kwargs = {'x': x, 'y': y, 'n_folds': 5} glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=15, **kwargs) # ID,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON h2o_glm.simpleCheckGLM(self, glm, 'AGE', **kwargs) sys.stdout.write('.') sys.stdout.flush() h2o.nodes[0].log_view() namelist = h2o.nodes[0].log_download()
def glm_doit(self, csvFilename, bucket, csvPathname, timeoutSecs=30): print "\nStarting GLM of", csvFilename parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, hex_key=csvFilename + ".hex", schema='put', timeoutSecs=30) y = "10" x = "" # Took n_folds out, because GLM doesn't include n_folds time and it's slow # wanted to compare GLM time to my measured time # hastie has two values 1,-1. need to specify case kwargs = {'x': x, 'y': y, 'case': -1, 'thresholds': 0.5} start = time.time() glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) print "GLM in", (time.time() - start), "secs (python)" h2o_glm.simpleCheckGLM(self, glm, "C8", **kwargs) # compare this glm to the first one. since the files are replications, the results # should be similar? GLMModel = glm['GLMModel'] validationsList = glm['GLMModel']['validations'] validations = validationsList[0] # validations['err'] if self.validations1: h2o_glm.compareToFirstGlm(self, 'err', validations, self.validations1) else: self.validations1 = copy.deepcopy(validations)
def test_GLM_params_rand2(self): # csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data') csvPathname = h2o.find_file('smalldata/covtype/covtype.20k.data') parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key="covtype.20k") # for determinism, I guess we should spit out the seed? # random.seed(SEED) SEED = random.randint(0, sys.maxint) # if you have to force to redo a test # SEED = random.seed(SEED) print "\nUsing random seed:", SEED paramDict = define_params() for trial in range(20): # params is mutable. This is default. params = {'y': 54, 'case': 1, 'alpha': 0, 'lambda': 0, 'n_folds': 1} colX = h2o_glm.pickRandGlmParams(paramDict, params) kwargs = params.copy() start = time.time() glm = h2o_cmd.runGLMOnly(timeoutSecs=70, parseKey=parseKey, **kwargs) # pass the kwargs with all the params, so we know what we asked for! h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) h2o.check_sandbox_for_errors() print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds' print "Trial #", trial, "completed\n"
def test_GLM2_princeton(self): # filename, y, timeoutSecs # these are all counts? using gaussian? csvFilenameList = [ ('cuse.dat', 'gaussian', 3, 10), # notUsing ('cuse.dat', 'gaussian', 4, 10), # using ('copen.dat', 'gaussian', 4, 10), ('housing.raw', 'gaussian', 4, 10), ] trial = 0 for (csvFilename, family, y, timeoutSecs) in csvFilenameList: csvPathname1 = 'logreg/princeton/' + csvFilename fullPathname1 = h2i.find_folder_and_filename('smalldata', csvPathname1, returnFullPath=True) csvPathname2 = SYNDATASETS_DIR + '/' + csvFilename + '_stripped.csv' h2o_util.file_strip_trailing_spaces(fullPathname1, csvPathname2) parseResult = h2i.import_parse(path=csvPathname2, schema='put', timeoutSecs=timeoutSecs) start = time.time() kwargs = {'n_folds': 0, 'family': family, 'response': y} glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) print "glm end (w/check) on ", csvPathname2, 'took', time.time() - start, 'seconds' trial += 1 print "\nTrial #", trial
def test_B_benign(self): h2o.nodes[0].log_view() namelist = h2o.nodes[0].log_download() print "\nStarting benign.csv" csvFilename = "benign.csv" csvPathname = h2o.find_file('smalldata/logreg' + '/' + csvFilename) parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex") # columns start at 0 y = "3" # cols 0-13. 3 is output # no member id in this one for maxx in range(11,14): x = range(maxx) x.remove(3) # 3 is output x = ",".join(map(str,x)) print "\nx:", x print "y:", y kwargs = {'x': x, 'y': y} # fails with n_folds print "Not doing n_folds with benign. Fails with 'unable to solve?'" glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=15, **kwargs) # no longer look at STR? h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) sys.stdout.write('.') sys.stdout.flush()
def test_many_cols_with_syn(self): ### h2b.browseTheCloud() csvFilename = "logreg_trisum_int_cat_10000x10.csv" csvPathname = "smalldata/logreg/" + csvFilename key2 = csvFilename + ".hex" parseKey = h2o_cmd.parseFile(None, h2o.find_file(csvPathname), key2=key2, timeoutSecs=10) print csvFilename, 'parse time:', parseKey['response']['time'] print "Parse result['destination_key']:", parseKey['destination_key'] # We should be able to see the parse result? inspect = h2o_cmd.runInspect(None, parseKey['destination_key']) print "\n" + csvFilename paramDict = define_params() paramDict2 = {} for k in paramDict: # sometimes we have a list to pick from in the value. now it's just list of 1. paramDict2[k] = paramDict[k][0] y = 10 # FIX! what should we have for case? 1 should be okay because we have 1's in output col kwargs = {'y': y, 'max_iter': 50} kwargs.update(paramDict2) start = time.time() glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=20, **kwargs) print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds' h2o_glm.simpleCheckGLM(self, glm, 8, **kwargs) if not h2o.browse_disable: h2b.browseJsonHistoryAsUrlLastMatch("Inspect") time.sleep(5)
def test_B_benign(self): print "\nStarting benign.csv" csvFilename = "benign.csv" csvPathname = 'logreg/' + csvFilename parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=csvFilename + ".hex", schema='put') # columns start at 0 y = "3" # cols 0-13. 3 is output # no member id in this one for maxx in range(4,14): x = range(maxx) x.remove(3) # 3 is output x = ",".join(map(str,x)) print "\nx:", x print "y:", y # solver can be ADMM kwargs = {'x': x, 'y': y,\ 'expert_settings': 1, 'lsm_solver': 'GenGradient', 'standardize': 1, 'n_folds': 1} # fails with n_folds print "Not doing n_folds with benign. Fails with 'unable to solve?'" glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=30, **kwargs) # no longer look at STR? h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) h2o.check_sandbox_for_errors() sys.stdout.write('.') sys.stdout.flush()
def test_GLM2_dest_key(self): h2o.beta_features = True print "\nStarting prostate.csv" # columns start at 0 y = "1" csvFilename = "prostate.csv" csvPathname = 'logreg' + '/' + csvFilename parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=csvFilename + ".hex", schema='put') for maxx in [6]: destination_key='GLM_model_python_0_default_0' # illegal to have output col in the ignored_cols! kwargs = { 'ignored_cols': '0', 'response': y, 'n_folds': 5, 'destination_key': destination_key, } glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=15, **kwargs) h2o_destination_key = glm['glm_model']['_key'] print 'h2o_destination_key:', h2o_destination_key self.assertEqual(h2o_destination_key, destination_key, msg='I said to name the key %s, h2o used %s' % (destination_key, h2o_destination_key)) # ID,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON h2o_glm.simpleCheckGLM(self, glm, 'AGE', **kwargs)
def test_GLM_gaussian_rand2(self): csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data') parseKey = h2o_cmd.parseFile(csvPathname=csvPathname) # for determinism, I guess we should spit out the seed? # random.seed(SEED) SEED = random.randint(0, sys.maxint) # if you have to force to redo a test # SEED = random.seed(SEED) print "\nUsing random seed:", SEED paramDict = define_params() for trial in range(20): # params is mutable. This is default. params = {'y': 54, 'num_cross_validation_folds': 3, 'family': "gaussian", 'alpha': 0.5, 'lambda': 1e-4, 'max_iter': 30} colX = h2o_glm.pickRandGlmParams(paramDict, params) kwargs = params.copy() start = time.time() glm = h2o_cmd.runGLMOnly(timeoutSecs=120, parseKey=parseKey, **kwargs) print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds' start = time.time() h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) print "simpleCheckGLM end on ", csvPathname, 'took', time.time() - start, 'seconds' print "Trial #", trial, "completed\n"
def glm_doit(self, csvFilename, bucket, csvPathname, timeoutSecs=30): print "\nStarting GLM of", csvFilename parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, hex_key=csvFilename + ".hex", schema='put', timeoutSecs=10) y = 10 # Took n_folds out, because GLM doesn't include n_folds time and it's slow # wanted to compare GLM time to my measured time # hastie has two values, 1 and -1. need to use case for one of them kwargs = {'response': y, 'alpha': 0, 'family': 'binomial'} h2o.nodes[0].to_enum(src_key=parseResult['destination_key'], column_index=y+1) start = time.time() glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) print "GLM in", (time.time() - start), "secs (python measured)" h2o_glm.simpleCheckGLM(self, glm, "C8", **kwargs) # compare this glm to the first one. since the files are replications, the results # should be similar? glm_model = glm['glm_model'] validation = glm_model['submodels'][0]['validation'] if self.validation1: h2o_glm.compareToFirstGlm(self, 'auc', validation, self.validation1) else: self.validation1 = copy.deepcopy(validation)
def test_GLM2_airline(self):
    # Parse the airlines train file and then the test file, fitting and
    # checking a binomial GLM on each one independently.
    # Both passes run from one loop: the previous duplicated code stored the
    # train model in `glmtest` and the test model in `glmtrain`.
    datasets = (
        ('TRAIN', 'AirlinesTrain.csv.zip'),
        ('TEST', 'AirlinesTest.csv.zip'),
    )
    for label, csvFilename in datasets:
        csvPathname = 'airlines' + '/' + csvFilename
        parseResult = h2i.import_parse(
            bucket='smalldata', path=csvPathname, schema='put', timeoutSecs=15)

        kwargs = {
            'response': 'IsDepDelayed',
            'ignored_cols': 'IsDepDelayed_REC',
            'family': 'binomial',
        }
        starttime = time.time()
        glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=15, **kwargs)
        elapsedtime = time.time() - starttime
        # NOTE: with the Python 2 print statement used throughout this file,
        # the parenthesised form prints a 2-tuple; kept so output is unchanged.
        print("ELAPSED TIME %s DATA " % label, elapsedtime)
        h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
def test_loop_random_param_covtype(self): csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data') parseKey = h2o_cmd.parseFile(csvPathname=csvPathname) paramDict = define_params() for trial in range(20): # params is mutable. This is default. # FIX! does it never end if we don't have alpha specified? params = { 'y': 54, 'n_folds': 3, 'family': "poisson", 'alpha': 0.5, 'lambda': 1e-4, 'beta_epsilon': 0.001, 'max_iter': 15, } colX = h2o_glm.pickRandGlmParams(paramDict, params) kwargs = params.copy() # make timeout bigger with xvals timeoutSecs = 60 + (kwargs['n_folds']*20) # or double the 4 seconds per iteration (max_iter+1 worst case?) timeoutSecs = max(timeoutSecs, (8 * (kwargs['max_iter']+1))) start = time.time() glm = h2o_cmd.runGLMOnly(timeoutSecs=timeoutSecs, parseKey=parseKey, **kwargs) print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds' h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) print "Trial #", trial, "completed\n"
def test_GLM2_big2(self): csvPathname = "hhp_107_01.data.gz" print "\n" + csvPathname parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put', timeoutSecs=15) ## h2b.browseTheCloud() y = "106" # go right to the big X and iterate on that case ### for trial in range(2): for trial in range(2): print "\nTrial #", trial, "start" start = time.time() kwargs = {'response': y, 'alpha': 0.0} glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=200, **kwargs) h2o_glm.simpleCheckGLM(self, glm, 'C58', **kwargs) h2o.check_sandbox_for_errors() ### h2b.browseJsonHistoryAsUrlLastMatch("GLM") print "\nTrial #", trial
def test_GLM_params_rand2_4082088627997819015(self): csvPathname = 'standard/covtype.data' parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='put', hex_key='covtype.hex') paramDict = define_params() for trial in range(40): # params is mutable. This is default. params = { 'y': 54, 'n_folds': 3, 'family': 'binomial', 'max_iter': 5, 'case': 1, 'alpha': 0, 'lambda': 0 } colX = h2o_glm.pickRandGlmParams(paramDict, params) kwargs = params.copy() start = time.time() timeoutSecs = max(150, params['n_folds'] * 10 + params['max_iter'] * 10) glm = h2o_cmd.runGLM(timeoutSecs=timeoutSecs, parseResult=parseResult, **kwargs) elapsed = time.time() - start h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) # FIX! I suppose we have the problem of stdout/stderr not having flushed? # should hook in some way of flushing the remote node stdout/stderr h2o.check_sandbox_for_errors() print "glm end on ", csvPathname, 'took', elapsed, 'seconds.',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "Trial #", trial, "completed\n"
def test_GLM_poisson_rand2(self): csvPathname = 'standard/covtype.data' parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='put') paramDict = define_params() for trial in range(20): # params is mutable. This is default. # FIX! does it never end if we don't have alpha specified? params = { 'y': 54, 'n_folds': 3, 'family': "poisson", 'alpha': 0.5, 'lambda': 1e-4, 'beta_epsilon': 0.001, 'max_iter': 15, } colX = h2o_glm.pickRandGlmParams(paramDict, params) kwargs = params.copy() # make timeout bigger with xvals timeoutSecs = 60 + (kwargs['n_folds'] * 40) # or double the 4 seconds per iteration (max_iter+1 worst case?) timeoutSecs = max(timeoutSecs, (8 * (kwargs['max_iter'] + 1))) start = time.time() glm = h2o_cmd.runGLM(timeoutSecs=timeoutSecs, parseResult=parseResult, **kwargs) print "glm end on ", csvPathname, 'took', time.time( ) - start, 'seconds' h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) print "Trial #", trial, "completed\n"
def test_GLM_params_rand2_newargs(self): # csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data') csvPathname = h2o.find_file('smalldata/covtype/covtype.20k.data') key = 'covtype.20k' parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key=key) # for determinism, I guess we should spit out the seed? # random.seed(SEED) SEED = random.randint(0, sys.maxint) # if you have to force to redo a test # SEED = random.seed(SEED) print "\nUsing random seed:", SEED paramDict = define_params() for trial in range(50): # params is mutable. This is default. params = { 'y': 54, 'case': 1, 'lambda': 0, 'alpha': 0, 'n_folds': 1 } colX = h2o_glm.pickRandGlmParams(paramDict, params) kwargs = params.copy() start = time.time() glm = h2o_cmd.runGLMOnly(timeoutSecs=70, parseKey=parseKey, **kwargs) # pass the kwargs with all the params, so we know what we asked for! h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) h2o.check_sandbox_for_errors() print "glm end on ", csvPathname, 'took', time.time( ) - start, 'seconds' print "Trial #", trial, "completed\n"
def test_GLM2_gaussian_rand2(self): h2o.beta_features = True csvPathname = 'standard/covtype.data' parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='put') paramDict = define_params() for trial in range(20): # params is mutable. This is default. params = { 'response': 54, 'n_folds': 3, 'family': "gaussian", 'alpha': 0.5, 'lambda': 1e-4, 'max_iter': 30 } colX = h2o_glm.pickRandGlmParams(paramDict, params) kwargs = params.copy() start = time.time() glm = h2o_cmd.runGLM(timeoutSecs=300, parseResult=parseResult, **kwargs) print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds' h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) print "Trial #", trial, "completed\n"
def test_GLM2_params_rand2(self): h2o.beta_features = True csvPathname = 'covtype/covtype.20k.data' parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put', hex_key="covtype.20k") CLASS = 1 # make a binomial version execExpr="B.hex=%s; B.hex[,%s]=(B.hex[,%s]==%s)" % ('covtype.20k', 54+1, 54+1, CLASS) h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) paramDict = define_params() for trial in range(20): # params is mutable. This is default. params = { 'response': 54, 'alpha': 0.1, # 'lambda': 1e-4, 'lambda': 0, 'n_folds': 1, } colX = h2o_glm.pickRandGlmParams(paramDict, params) kwargs = params.copy() if 'family' not in kwargs or kwargs['family']=='binomial': bHack = {'destination_key': 'B.hex'} else: bHack = parseResult start = time.time() glm = h2o_cmd.runGLM(timeoutSecs=300, parseResult=bHack, **kwargs) # pass the kwargs with all the params, so we know what we asked for! h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) h2o.check_sandbox_for_errors() print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds' print "Trial #", trial, "completed\n"
def test_GLM2_params_rand2_newargs(self): h2o.beta_features = True csvPathname = 'covtype/covtype.20k.data' hex_key = 'covtype.20k.hex' parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=hex_key, schema='put') paramDict = define_params() for trial in range(20): # params is mutable. This is default. params = {'response': 54, 'lambda': 0, 'alpha': 0, 'n_folds': 1} colX = h2o_glm.pickRandGlmParams(paramDict, params) kwargs = params.copy() start = time.time() glm = h2o_cmd.runGLM(timeoutSecs=70, parseResult=parseResult, **kwargs) # pass the kwargs with all the params, so we know what we asked for! h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) h2o.check_sandbox_for_errors() print "glm end on ", csvPathname, 'took', time.time( ) - start, 'seconds' print "Trial #", trial, "completed\n"
def test_GLM_tweedie(self): csvFilename = "AutoClaim.csv" csvPathname = 'standard/' + csvFilename print "\nStarting", csvPathname parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='put') # columns start at 0 # regress: glm(CLM_AMT ~ CAR_USE + REVOLKED + GENDER + AREA + MARRIED + CAR_TYPE, data=AutoClaim, family=tweedie(1.34)) # y = "4" coefs = [7, 13, 20, 27, 21, 11] # sapply(c('CLM_AMT', 'CAR_USE', 'REVOLKED', 'GENDER', 'AREA', 'MARRIED', 'CAR_TYPE'), function(x) which(x==colnames(AutoClaim)) - 1) x = ','.join([str(x) for x in coefs]) kwargs = { 'family': 'tweedie', 'tweedie_power': 1.36, 'y': y, 'x': x, 'max_iter': 10, 'lambda': 0, 'alpha': 0, 'weight': 1.0, 'n_folds': 0, 'beta_epsilon': 1e-4, } glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=15, **kwargs) (warnings, coefficients, intercept) = h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) coefficients.append(intercept) print 'coefficients: %s' % (str(coefficients)) coefTruth = [ -0.017, -0.009, -0.004, -0.054, 0.013, -0.006, 0.006, -0.017, -0.013, -0.004, 0.144 ] deltaCoeff = deltaIntcpt = 0.05 for i, c in enumerate(coefficients): g = coefTruth[i] print "coefficient[%d]: %8.4f, truth: %8.4f, delta: %8.4f" % ( i, c, g, abs(g - c)) self.assertAlmostEqual( c, g, delta=deltaCoeff, msg="not close enough. coefficient[%d]: %s, generated %s" % (i, c, g))
def test_GLM2_poisson_1(self): h2o.beta_features = True csvFilename = 'covtype.data' csvPathname = 'standard/' + csvFilename parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='put', timeoutSecs=10) inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print "\n" + csvPathname, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) print "WARNING: max_iter set to 8 for benchmark comparisons" max_iter = 8 y = "54" kwargs = { 'response': y, 'family': 'poisson', 'n_folds': 0, 'max_iter': max_iter, 'beta_epsilon': 1e-3 } timeoutSecs = 120 # L2 start = time.time() kwargs.update({'alpha': 0, 'lambda': 0}) glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) print "glm (L2) end on ", csvPathname, 'took', time.time( ) - start, 'seconds' h2o_glm.simpleCheckGLM(self, glm, "C14", **kwargs) # Elastic kwargs.update({'alpha': 0.5, 'lambda': 1e-4}) start = time.time() glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) print "glm (Elastic) end on ", csvPathname, 'took', time.time( ) - start, 'seconds' h2o_glm.simpleCheckGLM(self, glm, "C14", **kwargs) # L1 kwargs.update({'alpha': 0.75, 'lambda': 1e-4}) start = time.time() glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) print "glm (L1) end on ", csvPathname, 'took', time.time( ) - start, 'seconds' h2o_glm.simpleCheckGLM(self, glm, "C14", **kwargs)
def test_GLM_covtype(self): csvFilename = 'covtype.data' csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/' + csvFilename) parseKey = h2o_cmd.parseFile(csvPathname=csvPathname,timeoutSecs=10) inspect = h2o_cmd.runInspect(None, parseKey['destination_key']) print "\n" + csvPathname, \ " num_rows:", "{:,}".format(inspect['num_rows']), \ " num_cols:", "{:,}".format(inspect['num_cols']) if (1==0): print "WARNING: just doing the first 33 features, for comparison to ??? numbers" # pythonic! x = ",".join(map(str,range(33))) else: x = "" print "WARNING: max_iter set to 8 for benchmark comparisons" max_iter = 8 y = "54" kwargs = { 'x': x, 'y': y, 'family': 'binomial', 'link': 'logit', 'n_folds': 2, 'case_mode': '=', 'case': 1, 'max_iter': max_iter, 'beta_eps': 1e-3} timeoutSecs = 120 # L2 start = time.time() kwargs.update({'alpha': 0, 'lambda': 0}) glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs) print "glm (L2) end on ", csvPathname, 'took', time.time() - start, 'seconds' h2o_glm.simpleCheckGLM(self, glm, 13, **kwargs) # Elastic kwargs.update({'alpha': 0.5, 'lambda': 1e-4}) start = time.time() glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs) print "glm (Elastic) end on ", csvPathname, 'took', time.time() - start, 'seconds' h2o_glm.simpleCheckGLM(self, glm, 13, **kwargs) # L1 kwargs.update({'alpha': 1, 'lambda': 1e-4}) start = time.time() glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs) print "glm (L1) end on ", csvPathname, 'took', time.time() - start, 'seconds' h2o_glm.simpleCheckGLM(self, glm, 13, **kwargs)
def process_dataset(self, parseResult, Y, e_coefs, e_ndev, e_rdev, e_aic, **kwargs):
    # Run an unregularized GLM with response 'CAPSULE' on parseResult, print
    # the coefficients and intercept, and log the null/residual deviances next
    # to the expected values. Returns the (currently always empty) error list.
    # NOTE(review): Y, e_coefs and e_aic are accepted but not used here.

    # no regularization
    kwargs.update({'alpha': 0, 'lambda': 0, 'response': 'CAPSULE'})

    glmResult = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=20, **kwargs)
    (warnings, clist, intercept) = h2o_glm.simpleCheckGLM(self, glmResult, None, **kwargs)

    formatted = ["%.5e " % c for c in clist]
    h2p.green_print("h2o coefficient list:", "".join(formatted))
    h2p.green_print("h2o intercept", "%.5e " % intercept)

    # other stuff in the json response
    # the first submodel is the right one, if only one lambda is provided as a parameter above
    validation = glmResult['glm_model']['submodels'][0]['validation']
    null_deviance = validation['null_deviance']
    residual_deviance = validation['residual_deviance']

    errors = []

    # FIX! our null deviance doesn't seem to match
    h2o.verboseprint("Comparing:", null_deviance, e_ndev)
    # if abs(float(nullDev) - e_ndev) > (0.001 * e_ndev):
    #     errors.append('NullDeviance: %f != %s' % (e_ndev,nullDev))

    # FIX! our res deviance doesn't seem to match
    h2o.verboseprint("Comparing:", residual_deviance, e_rdev)
    # if abs(float(resDev) - e_rdev) > (0.001 * e_rdev):
    #     errors.append('ResDeviance: %f != %s' % (e_rdev,resDev))

    # FIX! we don't have an AIC to compare?
    return errors
def test_GLM_from_import_hosts(self): if localhost: csvFilenameList = [ 'covtype.data', ] else: csvFilenameList = [ 'covtype200x.data', 'covtype200x.data', 'covtype.data', 'covtype.data', 'covtype20x.data', 'covtype20x.data', ] # a browser window too, just because we can ## h2b.browseTheCloud() importFolderPath = "standard" validations1 = {} coefficients1 = {} for csvFilename in csvFilenameList: # have to re-import each iteration now, since the source key # is removed and if we re-parse it, it's not there csvPathname = importFolderPath + "/" + csvFilename parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, timeoutSecs=2000) print csvFilename, 'parse time:', parseResult['response']['time'] print "Parse result['destination_key']:", parseResult[ 'destination_key'] # We should be able to see the parse result? inspect = h2o_cmd.runInspect(key=parseResult['destination_key']) print "\n" + csvFilename start = time.time() # can't pass lamba as kwarg because it's a python reserved word # FIX! just look at X=0:1 for speed, for now kwargs = {'y': 54, 'n_folds': 2, 'family': "binomial", 'case': 1} glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=2000, **kwargs) h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) h2o.verboseprint("\nglm:", glm) GLMModel = glm['GLMModel'] coefficients = GLMModel['coefficients'] validationsList = GLMModel['validations'] validations = validationsList.pop() # validations['err'] if validations1: h2o_glm.compareToFirstGlm(self, 'err', validations, validations1) else: validations1 = copy.deepcopy(validations) if coefficients1: h2o_glm.compareToFirstGlm(self, 'C1', coefficients, coefficients1) else: coefficients1 = copy.deepcopy(coefficients) sys.stdout.write('.') sys.stdout.flush()
def test_billion_rows(self): # just do the import folder once timeoutSecs = 1500 csvFilenameAll = [ # quick test first # "covtype.data", # then the real thing "billion_rows.csv.gz", ] # csvFilenameList = random.sample(csvFilenameAll,1) csvFilenameList = csvFilenameAll # pop open a browser on the cloud ### h2b.browseTheCloud() for csvFilename in csvFilenameList: # creates csvFilename.hex from file in importFolder dir start = time.time() parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path='standard/' + csvFilename, timeoutSecs=timeoutSecs, pollTimeoutSecs=60) elapsed = time.time() - start print csvFilename, "completed in", elapsed, "seconds.", "%d pct. of timeout" % ( (elapsed * 100) / timeoutSecs) pA = h2o_cmd.ParseObj(parseResult) iA = h2o_cmd.InspectObj(pA.parse_key) parse_key = pA.parse_key numRows = iA.numRows numCols = iA.numCols labelList = iA.labelList parameters = { 'response_column': 1, 'n_folds': 0, 'alpha': 0, 'lambda': 0, } model_key = 'B.hex' bmResult = h2o.n0.build_model(algo='glm', destination_key=model_key, training_frame=parse_key, parameters=parameters, timeoutSecs=10) bm = OutputObj(bmResult, 'bm') modelResult = h2o.n0.models(key=model_key) model = OutputObj(modelResult['models'][0]['output'], 'model') h2o_glm.simpleCheckGLM(self, model, parameters, labelList, labelListUsed) cmmResult = h2o.n0.compute_model_metrics(model=model_key, frame=parse_key, timeoutSecs=60) cmm = OutputObj(cmmResult, 'cmm') mmResult = h2o.n0.model_metrics(model=model_key, frame=parse_key, timeoutSecs=60) mm = OutputObj(mmResult, 'mm') prResult = h2o.n0.predict(model=model_key, frame=parse_key, timeoutSecs=60) pr = OutputObj(prResult['model_metrics'][0]['predictions'], 'pr') h2o_cmd.runStoreView() labelListUsed = labelList h2o_glm.simpleCheckGLM(self, model, parameters, labelList, labelListUsed)
def sub_c2_nongz_fvec_long(self): # a kludge h2o.setup_benchmark_log() avgMichalSize = 237270000 bucket = 'home-0xdiag-datasets' ### importFolderPath = 'more1_1200_link' importFolderPath = 'manyfiles-nflx' print "Using non-gz'ed files in", importFolderPath csvFilenameList = [ ("*[1][0-4][0-9].dat", "file_50_A.dat", 50 * avgMichalSize, 1800), # ("*[1][0-9][0-9].dat", "file_100_A.dat", 100 * avgMichalSize, 3600), ] if LOG_MACHINE_STATS: benchmarkLogging = ['cpu', 'disk', 'network'] else: benchmarkLogging = [] pollTimeoutSecs = 120 retryDelaySecs = 10 for trial, (csvFilepattern, csvFilename, totalBytes, timeoutSecs) in enumerate(csvFilenameList): csvPathname = importFolderPath + "/" + csvFilepattern # double import still causing problems? # (importResult, importPattern) = h2i.import_only(bucket=bucket, path=csvPathname, schema='local') # importFullList = importResult['files'] # importFailList = importResult['fails'] # print "\n Problem if this is not empty: importFailList:", h2o.dump_json(importFailList) # this accumulates performance stats into a benchmark log over multiple runs # good for tracking whether we're getting slower or faster h2o.cloudPerfH2O.change_logfile(csvFilename) h2o.cloudPerfH2O.message("") h2o.cloudPerfH2O.message("Parse " + csvFilename + " Start--------------------------------") start = time.time() parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local', hex_key=csvFilename + ".hex", timeoutSecs=timeoutSecs, retryDelaySecs=retryDelaySecs, pollTimeoutSecs=pollTimeoutSecs, benchmarkLogging=benchmarkLogging) elapsed = time.time() - start print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \ "%d pct. 
of timeout" % ((elapsed*100)/timeoutSecs) print "Parse result['destination_key']:", parseResult[ 'destination_key'] h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=False) if totalBytes is not None: fileMBS = (totalBytes / 1e6) / elapsed msg = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern, csvFilename, fileMBS, elapsed) print msg h2o.cloudPerfH2O.message(msg) if DO_GLM: # these are all the columns that are enums in the dataset...too many for GLM! x = range(542) # don't include the output column # remove the output too! (378) ignore_x = [] for i in [ 3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 16, 17, 18, 19, 20, 424, 425, 426, 540, 541 ]: x.remove(i) ignore_x.append(i) # plus 1 because we are no longer 0 offset x = ",".join(map(lambda x: "C" + str(x + 1), x)) ignore_x = ",".join(map(lambda x: "C" + str(x + 1), ignore_x)) GLMkwargs = { 'ignored_cols': ignore_x, 'family': 'binomial', 'response': 'C379', 'max_iter': 4, 'n_folds': 1, 'family': 'binomial', 'alpha': 0.2, 'lambda': 1e-5 } # are the unparsed keys slowing down exec? h2i.delete_keys_at_all_nodes(pattern="manyfile") # convert to binomial execExpr = "A.hex=%s" % parseResult['destination_key'] h2e.exec_expr(execExpr=execExpr, timeoutSecs=180) execExpr = "A.hex[,%s]=(A.hex[,%s]>%s)" % ('379', '379', 15) h2e.exec_expr(execExpr=execExpr, timeoutSecs=180) aHack = {'destination_key': "A.hex"} start = time.time() glm = h2o_cmd.runGLM(parseResult=aHack, timeoutSecs=timeoutSecs, **GLMkwargs) elapsed = time.time() - start h2o.check_sandbox_for_errors() h2o_glm.simpleCheckGLM(self, glm, None, **GLMkwargs) msg = '{:d} jvms, {:d}GB heap, {:s} {:s} GLM: {:6.2f} secs'.format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern, csvFilename, elapsed) print msg h2o.cloudPerfH2O.message(msg) h2o_cmd.checkKeyDistribution()
def test_GLM_twovalues(self): SYNDATASETS_DIR = h2o.make_syn_dir() csvFilename = "syn_twovalues.csv" csvPathname = SYNDATASETS_DIR + '/' + csvFilename # H2O might not do whitespace stripping on numbers correctly, when , is {SEP} # GLM will auto expand categoricals..so if we have more coefficients than expected # that means it didn't parse right # mix in space/tab combos # just done like this for readability rowDataTrueRaw = \ "<sp>1,\ 0<sp>,\ <tab>65,\ 1<tab>,\ <sp><tab>2,\ 1<sp><tab>,\ <tab><sp>1,\ 4<tab><sp>,\ <tab><tab>1,\ 4<tab><tab>,\ <sp><sp>1,\ 4<sp><sp>" rowDataTrue = re.sub("<sp>"," ", rowDataTrueRaw) rowDataTrue = re.sub("<tab>"," ", rowDataTrue) rowDataFalse = \ "0,\ 1,\ 0,\ -1,\ -2,\ -1,\ -1,\ -4,\ -1,\ -4,\ -1,\ -4" twoValueList = [ ('A','B',0, 14), ('A','B',1, 14), (0,1,0, 12), (0,1,1, 12), (0,1,'NaN', 12), (1,0,'NaN', 12), (-1,1,0, 12), (-1,1,1, 12), (-1e1,1e1,1e1, 12), (-1e1,1e1,-1e1, 12), ] trial = 0 for (outputTrue, outputFalse, case, coeffNum) in twoValueList: write_syn_dataset(csvPathname, 20, rowDataTrue, rowDataFalse, str(outputTrue), str(outputFalse)) start = time.time() key = csvFilename + "_" + str(trial) kwargs = {'case': case, 'y': 10, 'family': 'binomial', 'alpha': 0, 'beta_eps': 0.0002} # default takes 39 iterations? play with alpha/beta glm = h2o_cmd.runGLM(csvPathname=csvPathname, key=key) h2o_glm.simpleCheckGLM(self, glm, 0, **kwargs) # check that the number of entries in coefficients is right (12 with intercept) coeffNum = len(glm['GLMModel']['coefficients']) if (coeffNum!=coeffNum): raise Exception("Should be " + coeffNum + " coefficients in result. %s" % coeffNum) print "trial #", trial, "glm end on ", csvFilename, 'took', time.time() - start, 'seconds' h2b.browseJsonHistoryAsUrlLastMatch("GLM") h2o.check_sandbox_for_errors() trial += 1
def test_GLM_many_cols_int2cat(self): SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ (10000, 10, 'cA.hex', 100), (10000, 20, 'cB.hex', 200), (10000, 30, 'cC.hex', 300), (10000, 40, 'cD.hex', 400), (10000, 50, 'cE.hex', 500), ] ### h2b.browseTheCloud() # we're going to do a special exec across all the columns to turn them into enums # including the duplicate of the output! exprList = [ '<keyX>= colSwap(<keyX>,<col1>,factor(<keyX>[<col1>]))', ### '<keyX>= colSwap(<keyX>,<col1>,<keyX>[<col1>])', ] for (rowCount, colCount, key2, timeoutSecs) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str( rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "\nCreating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE) parseKey = h2o_cmd.parseFile(None, csvPathname, key2=key2, timeoutSecs=90) print csvFilename, 'parse time:', parseKey['response']['time'] print "Parse result['destination_key']:", parseKey[ 'destination_key'] inspect = h2o_cmd.runInspect(None, parseKey['destination_key']) print "\n" + csvFilename print "\nNow running the int 2 enum exec command across all input cols" colResultList = h2e.exec_expr_list_across_cols( None, exprList, key2, maxCol=colCount, timeoutSecs=90, incrementingResult=False) print "\nexec colResultList", colResultList paramDict2 = {} for k in paramDict: paramDict2[k] = paramDict[k][0] # since we add the output twice, it's no longer colCount-1 y = colCount kwargs = {'y': y, 'max_iter': 50, 'case': 1} kwargs.update(paramDict2) start = time.time() glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs) print "glm end on ", csvPathname, 'took', time.time( ) - start, 'seconds' # only col y-1 (next to last)doesn't get renamed in coefficients # due to enum/categorical expansion print "y:", y h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) if not h2o.browse_disable: 
h2b.browseJsonHistoryAsUrlLastMatch("GLM") time.sleep(3) h2b.browseJsonHistoryAsUrlLastMatch("Inspect") time.sleep(3)
def test_GLM_enums_score_subset(self): SYNDATASETS_DIR = h2o.make_syn_dir() n = 200 tryList = [ (n, 1, 'cD', 300), (n, 2, 'cE', 300), (n, 3, 'cF', 300), (n, 4, 'cG', 300), (n, 5, 'cH', 300), (n, 6, 'cI', 300), ] for (rowCount, colCount, key2, timeoutSecs) in tryList: # using the comma is nice to ensure no craziness colSepHexString = '2c' # comma colSepChar = colSepHexString.decode('hex') colSepInt = int(colSepHexString, base=16) print "colSepChar:", colSepChar rowSepHexString = '0a' # newline rowSepChar = rowSepHexString.decode('hex') print "rowSepChar:", rowSepChar SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str( colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename csvScoreFilename = 'syn_enums_score_' + str(rowCount) + 'x' + str( colCount) + '.csv' csvScorePathname = SYNDATASETS_DIR + '/' + csvScoreFilename enumList = create_enum_list(listSize=10) # use half of the enums for creating the scoring dataset enumListForScore = random.sample(enumList, 5) print "Creating random", csvPathname, "for glm model building" write_syn_dataset(csvPathname, enumList, rowCount, colCount, SEEDPERFILE, colSepChar=colSepChar, rowSepChar=rowSepChar) print "Creating random", csvScorePathname, "for glm scoring with prior model (using enum subset)" write_syn_dataset(csvScorePathname, enumListForScore, rowCount, colCount, SEEDPERFILE, colSepChar=colSepChar, rowSepChar=rowSepChar) parseKey = h2o_cmd.parseFile(None, csvPathname, key2=key2, timeoutSecs=30, separator=colSepInt) print csvFilename, 'parse time:', parseKey['response']['time'] print "Parse result['destination_key']:", parseKey[ 'destination_key'] print "\n" + csvFilename (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \ h2o_cmd.columnInfoFromInspect(parseKey['destination_key'], exceptionOnMissingValues=True) y = colCount kwargs = { 'y': y, 'max_iter': 1, 'n_folds': 1, 'alpha': 0.2, 'lambda': 1e-5, 'case_mode': '=', 'case': 0 } 
start = time.time() glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, pollTimeoutSecs=180, **kwargs) print "glm end on ", parseKey[ 'destination_key'], 'took', time.time() - start, 'seconds' h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) GLMModel = glm['GLMModel'] modelKey = GLMModel['model_key'] parseKey = h2o_cmd.parseFile(None, csvScorePathname, key2="score_" + key2, timeoutSecs=30, separator=colSepInt) start = time.time() # score with same dataset (will change to recreated dataset with one less enum glmScore = h2o_cmd.runGLMScore(key=parseKey['destination_key'], model_key=modelKey, thresholds="0.5", timeoutSecs=timeoutSecs) print "glm end on ", parseKey[ 'destination_key'], 'took', time.time() - start, 'seconds' ### print h2o.dump_json(glmScore) classErr = glmScore['validation']['classErr'] auc = glmScore['validation']['auc'] err = glmScore['validation']['err'] print "classErr:", classErr print "err:", err print "auc:", auc
def test_GLM_both(self): h2o.beta_features = True if (1==1): csvFilenameList = [ ('logreg', 'benign.csv', 'binomial', 3, 10), # col is zero based # FIX! what's wrong here? index error ## ('uis.dat', 'binomial', 8, 5, False), ## ('pros.dat', 'binomial', 1, 10, False), ## ('chdage.dat', 'binomial', 2, 5, True), ## ('icu.dat', 'binomial', 1, 10, False), # how to ignore 6? '1,2,3,4,5', False), ## ('clslowbwt.dat', 'binomial', 7, 10, False), # ('cgd.dat', 'gaussian', 12, 5, False), # ('meexp.dat', 'gaussian', 3, 10, None), ] else: csvFilenameList = [ # leave out ID and birth weight ('logreg', 'benign.csv', 'gaussian', 3, 10), (None, 'icu.dat', 'binomial', 1, 10), # need to exclude col 0 (ID) and col 10 (bwt) # but -x doesn't work..so do 2:9...range doesn't work? FIX! (None, 'nhanes3.dat', 'binomial', 15, 10), (None, 'lowbwt.dat', 'binomial', 1, 10), (None, 'lowbwtm11.dat', 'binomial', 1, 10), (None, 'meexp.dat', 'gaussian', 3, 10), # FIX! does this one hang in R? (None, 'nhanes3.dat', 'binomial', 15, 10), (None, 'pbc.dat', 'gaussian', 1, 10), (None, 'pharynx.dat', 'gaussian', 12, 10), (None, 'uis.dat', 'binomial', 8, 10), ] trial = 0 for (offset, csvFilename, family, y, timeoutSecs) in csvFilenameList: # FIX! 
do something about this file munging if offset: csvPathname1 = offset + "/" + csvFilename else: csvPathname1 = 'logreg/umass_statdata/' + csvFilename fullPathname = h2i.find_folder_and_filename('smalldata', csvPathname1, returnFullPath=True) csvPathname2 = SYNDATASETS_DIR + '/' + csvFilename + '_2.csv' h2o_util.file_clean_for_R(fullPathname, csvPathname2) # we can inspect this to get the number of cols in the dataset (trust H2O here) parseResult = h2i.import_parse(path=csvPathname2, schema='put', hex_key=csvFilename, timeoutSecs=10) # we could specify key2 above but this is fine destination_key = parseResult['destination_key'] inspect = h2o_cmd.runInspect(None, destination_key) if h2o.beta_features: num_cols = inspect['numCols'] num_rows = inspect['numRows'] else: num_cols = inspect['num_cols'] num_rows = inspect['num_rows'] print "num_cols", num_cols, "num_rows", num_rows ## print h2o.dump_json(inspect) # create formula and the x for H2O GLM formula = "V" + str(y+1) + " ~ " x = None col_names = "" for c in range(0,num_cols): if csvFilename=='clslowbwt.dat' and c==6: print "Not including col 6 for this dataset from x" if csvFilename=='benign.csv' and (c==0 or c==1): print "Not including col 0,1 for this dataset from x" else: # don't add the output col to the RHS of formula if x is None: col_names += "V" + str(c+1) else: col_names += ",V" + str(c+1) if c!=y: if x is None: x = str(c) formula += "V" + str(c+1) else: x += "," + str(c) formula += "+V" + str(c+1) print 'formula:', formula print 'col_names:', col_names print 'x:', x if h2o.beta_features: kwargs = { 'n_folds': 0, 'response': y, # what about x? 
'family': family, 'alpha': 0, 'lambda': 0, 'beta_epsilon': 1.0E-4, 'max_iter': 50 } else: kwargs = { 'n_folds': 0, 'y': y, 'x': x, 'family': family, 'alpha': 0, 'lambda': 1e-4, 'beta_epsilon': 1.0E-4, 'max_iter': 50 } if csvFilename=='benign.csv': kwargs['ignored_cols'] = '0,1' if csvFilename=='clslowbwt.dat': kwargs['ignored_cols'] = '6' start = time.time() glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) print "glm end (w/check) on ", csvPathname2, 'took', time.time()-start, 'seconds' h2oResults = h2o_glm.simpleCheckGLM(self, glm, None, prettyPrint=True, **kwargs) # now do it thru R and compare (warningsR, cListR, interceptR) = glm_R_and_compare(self, csvPathname2, family, formula, y, h2oResults=h2oResults) trial += 1 print "\nTrial #", trial
def sub_c2_rel_long(self): # a kludge h2o.setup_benchmark_log() avgMichalSize = 116561140 bucket = 'home-0xdiag-datasets' ### importFolderPath = 'more1_1200_link' importFolderPath = 'manyfiles-nflx-gz' print "Using .gz'ed files in", importFolderPath if len(h2o.nodes)==1: csvFilenameList= [ ("*[1][0][0-9].dat.gz", "file_10_A.dat.gz", 10 * avgMichalSize, 600), ] else: csvFilenameList= [ ("*[1][0-4][0-9].dat.gz", "file_50_A.dat.gz", 50 * avgMichalSize, 1800), # ("*[1][0-9][0-9].dat.gz", "file_100_A.dat.gz", 100 * avgMichalSize, 3600), ] if LOG_MACHINE_STATS: benchmarkLogging = ['cpu', 'disk', 'network'] else: benchmarkLogging = [] pollTimeoutSecs = 120 retryDelaySecs = 10 for trial, (csvFilepattern, csvFilename, totalBytes, timeoutSecs) in enumerate(csvFilenameList): csvPathname = importFolderPath + "/" + csvFilepattern (importResult, importPattern) = h2i.import_only(bucket=bucket, path=csvPathname, schema='local') # this accumulates performance stats into a benchmark log over multiple runs # good for tracking whether we're getting slower or faster h2o.cloudPerfH2O.change_logfile(csvFilename) h2o.cloudPerfH2O.message("") h2o.cloudPerfH2O.message("Parse " + csvFilename + " Start--------------------------------") start = time.time() parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local', hex_key=csvFilename + ".hex", timeoutSecs=timeoutSecs, retryDelaySecs=retryDelaySecs, pollTimeoutSecs=pollTimeoutSecs, benchmarkLogging=benchmarkLogging) elapsed = time.time() - start print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \ "%d pct. 
of timeout" % ((elapsed*100)/timeoutSecs) print "Parse result['destination_key']:", parseResult['destination_key'] h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=False) if totalBytes is not None: fileMBS = (totalBytes/1e6)/elapsed msg = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern, csvFilename, fileMBS, elapsed) print msg h2o.cloudPerfH2O.message(msg) if DO_GLM: # these are all the columns that are enums in the dataset...too many for GLM! x = range(542) # don't include the output column # remove the output too! (378) ignore_x = [] # for i in [3,4,5,6,7,8,9,10,11,14,16,17,18,19,20,424,425,426,540,541]: for i in [3,4,5,6,7,8,9,10,11,14,16,17,18,19,20,424,425,426,540,541,378]: x.remove(i) ignore_x.append(i) # increment by one, because we are no long zero offset! x = ",".join(map(lambda x: "C" + str(x+1), x)) ignore_x = ",".join(map(lambda x: "C" + str(x+1), ignore_x)) GLMkwargs = { 'family': 'binomial', 'x': x, 'y': 'C379', 'case': 15, 'case_mode': '>', 'max_iter': 4, 'n_folds': 1, 'family': 'binomial', 'alpha': 0.2, 'lambda': 1e-5 } start = time.time() glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **GLMkwargs) elapsed = time.time() - start h2o.check_sandbox_for_errors() h2o_glm.simpleCheckGLM(self, glm, None, **GLMkwargs) msg = '{:d} jvms, {:d}GB heap, {:s} {:s} GLM: {:6.2f} secs'.format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern, csvFilename, elapsed) print msg h2o.cloudPerfH2O.message(msg) h2o_cmd.checkKeyDistribution()
def test_GLM2_ints_unbalanced(self): ### h2b.browseTheCloud() SYNDATASETS_DIR = h2o.make_syn_dir() n = 2000 tryList = [ (n, 1, 'cD', 300), (n, 2, 'cE', 300), (n, 4, 'cF', 300), (n, 8, 'cG', 300), (n, 16, 'cH', 300), (n, 32, 'cI', 300), ] for (rowCount, colCount, hex_key, timeoutSecs) in tryList: # using the comma is nice to ensure no craziness colSepHexString = '2c' # comma colSepChar = colSepHexString.decode('hex') colSepInt = int(colSepHexString, base=16) print "colSepChar:", colSepChar rowSepHexString = '0a' # newline rowSepChar = rowSepHexString.decode('hex') print "rowSepChar:", rowSepChar SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str( colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename csvScoreFilename = 'syn_enums_score_' + str(rowCount) + 'x' + str( colCount) + '.csv' csvScorePathname = SYNDATASETS_DIR + '/' + csvScoreFilename enumList = create_enum_list() # use half of the enums for creating the scoring dataset enumListForScore = random.sample(enumList, 5) print "Creating random", csvPathname, "for glm model building" write_syn_dataset(csvPathname, enumList, rowCount, colCount, SEEDPERFILE, colSepChar=colSepChar, rowSepChar=rowSepChar) print "Creating random", csvScorePathname, "for glm scoring with prior model (using enum subset)" write_syn_dataset(csvScorePathname, enumListForScore, rowCount, colCount, SEEDPERFILE, colSepChar=colSepChar, rowSepChar=rowSepChar) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30, separator=colSepInt) print "Parse result['destination_key']:", parseResult[ 'destination_key'] print "\n" + csvFilename (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \ h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=True) y = colCount modelKey = 'xyz' kwargs = { 'n_folds': 0, 'destination_key': modelKey, 'response': y, 'max_iter': 200, 'family': 'binomial', 
'alpha': 0, 'lambda': 0, } start = time.time() updateList = [ { 'alpha': 0.5, 'lambda': 1e-5 }, # {'alpha': 0.25, 'lambda': 1e-4}, ] # Try each one for updateDict in updateList: print "\n#################################################################" print updateDict kwargs.update(updateDict) glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=180, **kwargs) print "glm end on ", parseResult[ 'destination_key'], 'took', time.time() - start, 'seconds' h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) parseResult = h2i.import_parse(path=csvScorePathname, schema='put', hex_key="B.hex", timeoutSecs=30, separator=colSepInt) h2o_cmd.runScore(dataKey="B.hex", modelKey=modelKey, vactual='C' + str(y + 1), vpredict=1, expectedAuc=0.45)
def test_poisson_covtype20x(self): h2o.beta_features = True if localhost: csvFilenameList = [ ('covtype20x.data', 480), ] else: csvFilenameList = [ # ('covtype200x.data', 1000), ('covtype20x.data', 480), ] # a browser window too, just because we can ## h2b.browseTheCloud() importFolderPath = "standard" for csvFilename, timeoutSecs in csvFilenameList: csvPathname = importFolderPath + "/" + csvFilename parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, timeoutSecs=2000) inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print "\n" + csvPathname, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) if (1 == 0): print "WARNING: just doing the first 33 features, for comparison to allstate numbers" # pythonic! x = ",".join(map(str, range(33))) else: x = "" print "WARNING: max_iter set to 8 for benchmark comparisons" max_iter = 8 y = "54" kwargs = { 'response': y, 'family': 'poisson', 'n_folds': 0, # 'case_mode': '=', # 'case': 1, 'max_iter': max_iter, 'beta_epsilon': 1e-3 } # L2 kwargs.update({'alpha': 0, 'lambda': 0}) start = time.time() glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) print "glm (L2) end on ", csvPathname, 'took', time.time( ) - start, 'seconds' h2o_glm.simpleCheckGLM(self, glm, 'C14', **kwargs) # Elastic kwargs.update({'alpha': 0.5, 'lambda': 1e-4}) start = time.time() glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) print "glm (Elastic) end on ", csvPathname, 'took', time.time( ) - start, 'seconds' h2o_glm.simpleCheckGLM(self, glm, 'C14', **kwargs) # L1 kwargs.update({'alpha': 1.0, 'lambda': 1e-4}) start = time.time() glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) print "glm (L1) end on ", csvPathname, 'took', time.time( ) - start, 'seconds' h2o_glm.simpleCheckGLM(self, glm, 'C14', **kwargs)
def test_benchmark_import(self): covtype200xSize = 15033863400 csvFilenameList = [ ("covtype200x.data", "covtype200x.data", covtype200xSize, 700), ] trialMax = 1 base_port = 54321 tryHeap = 28 # can fire a parse off and go wait on the jobs queue (inspect afterwards is enough?) DO_GLM = False noPoll = False benchmarkLogging = ['cpu', 'disk' 'network'] pollTimeoutSecs = 120 retryDelaySecs = 10 for i,(csvFilepattern, csvFilename, totalBytes, timeoutSecs) in enumerate(csvFilenameList): localhost = h2o.decide_if_localhost() if (localhost): h2o.build_cloud(2,java_heap_GB=tryHeap, base_port=base_port, enable_benchmark_log=True) else: h2o_hosts.build_cloud_with_hosts(1, java_heap_GB=tryHeap/2, base_port=base_port, enable_benchmark_log=True) for trial in range(trialMax): csvPathname = "/home/0xdiag/datasets/standard/" + csvFilepattern h2o.cloudPerfH2O.change_logfile(csvFilename) h2o.cloudPerfH2O.message("") h2o.cloudPerfH2O.message("Parse " + csvFilename + " Start--------------------------------") start = time.time() parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex", timeoutSecs=timeoutSecs, retryDelaySecs=retryDelaySecs, pollTimeoutSecs=pollTimeoutSecs, noPoll=noPoll, benchmarkLogging=benchmarkLogging) elapsed = time.time() - start print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) if noPoll: # does it take a little while to show up in Jobs, from where we issued the parse? time.sleep(2) # FIX! use the last (biggest?) timeoutSecs? 
maybe should increase since parallel h2o_jobs.pollWaitJobs(pattern=csvFilename, timeoutSecs=timeoutSecs, benchmarkLogging=benchmarkLogging) # for getting the MB/sec closer to 'right' totalBytes += totalBytes2 + totalBytes3 elapsed = time.time() - start h2o.check_sandbox_for_errors() if totalBytes is not None: fileMBS = (totalBytes/1e6)/elapsed l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format( len(h2o.nodes), tryHeap, csvFilepattern, csvFilename, fileMBS, elapsed) print l h2o.cloudPerfH2O.message(l) print csvFilepattern, 'parse time:', parseKey['response']['time'] print "Parse result['destination_key']:", parseKey['destination_key'] # BUG here? if not noPoll: # We should be able to see the parse result? h2o_cmd.check_enums_from_inspect(parseKey) # use exec to randomFilter out 200 rows for a quick RF. that should work for everyone? origKey = parseKey['destination_key'] # execExpr = 'a = randomFilter('+origKey+',200,12345678)' execExpr = 'a = slice('+origKey+',1,200)' h2e.exec_expr(h2o.nodes[0], execExpr, "a", timeoutSecs=30) # runRFOnly takes the parseKey directly newParseKey = {'destination_key': 'a'} print "\n" + csvFilepattern #********************************************************************************** if DO_GLM: # these are all the columns that are enums in the dataset...too many for GLM! 
x = range(54) # don't include the output column x = ",".join(map(str,x)) GLMkwargs = {'x': x, 'y': 54, 'case': 1, 'case_mode': '>', 'max_iter': 10, 'n_folds': 1, 'alpha': 0.2, 'lambda': 1e-5} start = time.time() glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **GLMkwargs) h2o_glm.simpleCheckGLM(self, glm, None, **GLMkwargs) elapsed = time.time() - start h2o.check_sandbox_for_errors() l = '{:d} jvms, {:d}GB heap, {:s} {:s} GLM: {:6.2f} secs'.format( len(h2o.nodes), tryHeap, csvFilepattern, csvFilename, elapsed) print l h2o.cloudPerfH2O.message(l) #********************************************************************************** h2o_cmd.checkKeyDistribution() h2o.tear_down_cloud() sys.stdout.write('.') sys.stdout.flush()
def do_h2o_glm(self, bucket, csvPathname, L, family='binomial'): h2p.red_print("\nNow doing h2o") parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='local', timeoutSecs=180) # save the resolved pathname for use in the sklearn csv read below inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print inspect print "\n" + csvPathname, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) x = 'ID' y = 'CAPSULE' family = family alpha = '0' lambda_ = L nfolds = '0' f = 'prostate' modelKey = 'GLM_' + f kwargs = { 'response' : y, 'ignored_cols' : x, 'family' : family, 'lambda' : lambda_, 'alpha' : alpha, 'n_folds' : nfolds, # passes if 0, fails otherwise 'destination_key' : modelKey, } timeoutSecs = 60 start = time.time() glmResult = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) # this stuff was left over from when we got the result after polling the jobs list # okay to do it again # GLM2: when it redirects to the model view, we no longer have the job_key! 
(unlike the first response and polling) (warnings, clist, intercept) = h2o_glm.simpleCheckGLM(self, glmResult, None, **kwargs) cstring = "".join([("%.5e " % c) for c in clist]) h2p.green_print("h2o alpha ", alpha) h2p.green_print("h2o lambda ", lambda_) h2p.green_print("h2o coefficient list:", cstring) h2p.green_print("h2o intercept", "%.5e " % intercept) # other stuff in the json response glm_model = glmResult['glm_model'] _names = glm_model['_names'] coefficients_names = glm_model['coefficients_names'] # the first submodel is the right one, if onely one lambda is provided as a parameter above submodels = glm_model['submodels'][0] beta = submodels['beta'] h2p.red_print("beta:", beta) norm_beta = submodels['norm_beta'] iteration = submodels['iteration'] validation = submodels['validation'] auc = validation['auc'] aic = validation['aic'] null_deviance = validation['null_deviance'] residual_deviance = validation['residual_deviance'] print '_names', _names print 'coefficients_names', coefficients_names # did beta get shortened? the simple check confirms names/beta/norm_beta are same length print 'beta', beta print 'iteration', iteration print 'auc', auc
def test_GLM2_mnist(self): if not SCIPY_INSTALLED: pass else: SYNDATASETS_DIR = h2o.make_syn_dir() csvFilelist = [ (10000, 500, 'cA', 60), ] trial = 0 for (rowCount, colCount, hex_key, timeoutSecs) in csvFilelist: trialStart = time.time() # PARSE test**************************************** csvFilename = 'syn_' + "binary" + "_" + str( rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + "/" + csvFilename write_syn_dataset(csvPathname, rowCount, colCount) start = time.time() parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", csvFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) # GLM**************************************** modelKey = 'GLM_model' y = colCount kwargs = { 'response': 'C' + str(y + 1), 'family': 'binomial', 'lambda': 1e-4, 'alpha': 0, 'max_iter': 15, 'n_folds': 1, 'beta_epsilon': 1.0E-4, 'destination_key': modelKey, } # GLM wants the output col to be strictly 0,1 integer execExpr = "aHack=%s; aHack[,%s] = aHack[,%s]==1" % ( hex_key, y + 1, y + 1) h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) aHack = {'destination_key': 'aHack'} timeoutSecs = 1800 start = time.time() glm = h2o_cmd.runGLM(parseResult=aHack, timeoutSecs=timeoutSecs, pollTimeoutSecs=60, **kwargs) elapsed = time.time() - start print "GLM completed in", elapsed, "seconds.", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) h2o_glm.simpleCheckGLM(self, glm, None, noPrint=True, **kwargs) modelKey = glm['glm_model']['_key'] # This seems wrong..what's the format of the cm? lambdaMax = glm['glm_model']['lambda_max'] print "lambdaMax:", lambdaMax best_threshold = glm['glm_model']['submodels'][0][ 'validation']['best_threshold'] print "best_threshold", best_threshold # pick the middle one? 
cm = glm['glm_model']['submodels'][0]['validation']['_cms'][5][ '_arr'] print "cm:", cm pctWrong = h2o_gbm.pp_cm_summary(cm) # self.assertLess(pctWrong, 9,"Should see less than 9% error (class = 4)") print "\nTrain\n==========\n" print h2o_gbm.pp_cm(cm) # Score ******************************* # this messes up if you use case_mode/case_vale above print "\nPredict\n==========\n" predictKey = 'Predict.hex' start = time.time() predictResult = h2o_cmd.runPredict(data_key='aHack', model_key=modelKey, destination_key=predictKey, timeoutSecs=timeoutSecs) predictCMResult = h2o.nodes[0].predict_confusion_matrix( actual='aHack', vactual='C' + str(y + 1), predict=predictKey, vpredict='predict', ) cm = predictCMResult['cm'] # These will move into the h2o_gbm.py pctWrong = h2o_gbm.pp_cm_summary(cm) self.assertLess(pctWrong, 50, "Should see less than 50% error") print "\nTest\n==========\n" print h2o_gbm.pp_cm(cm)
def test_poisson_covtype20x(self): if localhost: csvFilenameList = [ ('covtype20x.data', 400), ] else: csvFilenameList = [ ('covtype20x.data', 400), ('covtype200x.data', 2000), ] # a browser window too, just because we can h2b.browseTheCloud() importFolderPath = 'standard' for csvFilename, timeoutSecs in csvFilenameList: csvPathname = importFolderPath + "/" + csvFilename parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='put', timeoutSecs=2000, pollTimeoutSecs=60) inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print "\n" + csvPathname, \ " num_rows:", "{:,}".format(inspect['num_rows']), \ " num_cols:", "{:,}".format(inspect['num_cols']) if (1 == 0): print "WARNING: just doing the first 33 features, for comparison to ??? numbers" # pythonic! x = ",".join(map(str, range(33))) else: x = "" print "WARNING: max_iter set to 8 for benchmark comparisons" max_iter = 8 y = "54" kwargs = { 'x': x, 'y': y, 'family': 'binomial', 'link': 'logit', 'n_folds': 0, 'case_mode': '=', 'case': 1, 'max_iter': max_iter, 'beta_epsilon': 1e-3 } # L2 kwargs.update({'alpha': 0, 'lambda': 0}) start = time.time() glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) elapsed = time.time() - start print "glm (L2) end on ", csvPathname, 'took', elapsed, 'seconds.', "%d pct. of timeout" % ( (elapsed / timeoutSecs) * 100) h2o_glm.simpleCheckGLM(self, glm, "C14", **kwargs) # Elastic kwargs.update({'alpha': 0.5, 'lambda': 1e-4}) start = time.time() glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) elapsed = time.time() - start print "glm (Elastic) end on ", csvPathname, 'took', elapsed, 'seconds.', "%d pct. 
of timeout" % ( (elapsed / timeoutSecs) * 100) h2o_glm.simpleCheckGLM(self, glm, "C14", **kwargs) # L1 kwargs.update({'alpha': 1.0, 'lambda': 1e-4}) start = time.time() glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) elapsed = time.time() - start print "glm (L1) end on ", csvPathname, 'took', elapsed, 'seconds.', "%d pct. of timeout" % ( (elapsed / timeoutSecs) * 100) h2o_glm.simpleCheckGLM(self, glm, "C14", **kwargs)
def test_GLM_from_import_hosts(self): if localhost: csvFilenameList = [ 'covtype.data', ] else: csvFilenameList = [ 'covtype200x.data', 'covtype200x.data', 'covtype.data', 'covtype.data', 'covtype20x.data', 'covtype20x.data', ] # a browser window too, just because we can h2b.browseTheCloud() importFolderPath = '/home/0xdiag/datasets' h2i.setupImportFolder(None, importFolderPath) validations1 = {} coefficients1 = {} for csvFilename in csvFilenameList: # creates csvFilename.hex from file in importFolder dir parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, timeoutSecs=2000) print csvFilename, 'parse time:', parseKey['response']['time'] print "Parse result['destination_key']:", parseKey[ 'destination_key'] # We should be able to see the parse result? inspect = h2o_cmd.runInspect(key=parseKey['destination_key']) print "\n" + csvFilename start = time.time() # can't pass lamba as kwarg because it's a python reserved word # FIX! just look at X=0:1 for speed, for now kwargs = {'y': 54, 'n_folds': 2, 'family': "binomial", 'case': 1} glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=2000, **kwargs) h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) h2o.verboseprint("\nglm:", glm) h2b.browseJsonHistoryAsUrlLastMatch("GLM") GLMModel = glm['GLMModel'] print "GLM time", GLMModel['time'] coefficients = GLMModel['coefficients'] validationsList = GLMModel['validations'] validations = validationsList.pop() # validations['err'] if validations1: h2o_glm.compareToFirstGlm(self, 'err', validations, validations1) else: validations1 = copy.deepcopy(validations) if coefficients1: h2o_glm.compareToFirstGlm(self, '0', coefficients, coefficients1) else: coefficients1 = copy.deepcopy(coefficients) sys.stdout.write('.') sys.stdout.flush()
def test_benchmark_import(self): # typical size of the michal files avgMichalSizeUncompressed = 237270000 avgMichalSize = 116561140 avgSynSize = 4020000 covtype200xSize = 15033863400 synSize = 183 if 1==0: importFolderPath = '/home/0xdiag/datasets/more1_1200_link' print "Using .gz'ed files in", importFolderPath csvFilenameAll = [ # this should hit the "more" files too? # ("*.dat.gz", "file_200.dat.gz", 1200 * avgMichalSize, 1800), # ("*.dat.gz", "file_200.dat.gz", 1200 * avgMichalSize, 1800), # ("*[1][0-2][0-9].dat.gz", "file_30.dat.gz", 50 * avgMichalSize, 1800), ("*file_[0-9][0-9].dat.gz", "file_100.dat.gz", 100 * avgMichalSize, 1800), ("*file_[12][0-9][0-9].dat.gz", "file_200_A.dat.gz", 200 * avgMichalSize, 1800), ("*file_[34][0-9][0-9].dat.gz", "file_200_B.dat.gz", 200 * avgMichalSize, 1800), ("*file_[56][0-9][0-9].dat.gz", "file_200_C.dat.gz", 200 * avgMichalSize, 1800), ("*file_[78][0-9][0-9].dat.gz", "file_200_D.dat.gz", 200 * avgMichalSize, 1800), # ("*.dat.gz", "file_1200.dat.gz", 1200 * avgMichalSize, 3600), ] if 1==1: importFolderPath = '/home/0xdiag/datasets/more1_1200_link' print "Using .gz'ed files in", importFolderPath csvFilenameAll = [ # this should hit the "more" files too? 
# ("*10[0-9].dat.gz", "file_10.dat.gz", 10 * avgMichalSize, 3600), # ("*1[0-4][0-9].dat.gz", "file_50.dat.gz", 50 * avgMichalSize, 3600), # ("*[1][0-9][0-9].dat.gz", "file_100.dat.gz", 100 * avgMichalSize, 3600), # ("*3[0-9][0-9].dat.gz", "file_100.dat.gz", 100 * avgMichalSize, 3600), # ("*1[0-9][0-9].dat.gz", "file_100.dat.gz", 100 * avgMichalSize, 1800), #("*[1-2][0-9][0-9].dat.gz", "file_200.dat.gz", 200 * avgMichalSize, 3600), # ("*[3-4][0-9][0-9].dat.gz", "file_200.dat.gz", 200 * avgMichalSize, 3600), ("*[3-4][0-4][0-9].dat.gz", "file_100_A.dat.gz", 100 * avgMichalSize, 3600), ("*[3-4][0-4][0-9].dat.gz", "file_100_B.dat.gz", 100 * avgMichalSize, 3600), ("*[3-4][0-5][0-9].dat.gz", "file_120_A.dat.gz", 120 * avgMichalSize, 3600), ("*[3-4][0-5][0-9].dat.gz", "file_120_B.dat.gz", 120 * avgMichalSize, 3600), ("*[3-4][0-6][0-9].dat.gz", "file_140_A.dat.gz", 140 * avgMichalSize, 3600), ("*[3-4][0-6][0-9].dat.gz", "file_140_B.dat.gz", 140 * avgMichalSize, 3600), ("*[3-4][0-7][0-9].dat.gz", "file_160_A.dat.gz", 160 * avgMichalSize, 3600), ("*[3-4][0-7][0-9].dat.gz", "file_160_B.dat.gz", 160 * avgMichalSize, 3600), ("*[3-4][0-8][0-9].dat.gz", "file_180_A.dat.gz", 180 * avgMichalSize, 3600), ("*[3-4][0-8][0-9].dat.gz", "file_180_B.dat.gz", 180 * avgMichalSize, 3600), ("*[3-4][0-9][0-9].dat.gz", "file_200_A.dat.gz", 200 * avgMichalSize, 3600), ("*[3-4][0-9][0-9].dat.gz", "file_200_B.dat.gz", 200 * avgMichalSize, 3600), ("*[3-5][0-9][0-9].dat.gz", "file_300.dat.gz", 300 * avgMichalSize, 3600), ("*[3-5][0-9][0-9].dat.gz", "file_300.dat.gz", 300 * avgMichalSize, 3600), # for now, take too long on 2x100GB heap on 164 # ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600), # ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600), # ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600), # ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600), # ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * 
avgMichalSize, 3600), # ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600), # ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600), # ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600), ] if 1==0: importFolderPath = '/home/0xdiag/datasets/manyfiles-nflx-gz' print "Using .gz'ed files in", importFolderPath csvFilenameAll = [ # this should hit the "more" files too? ("*_[123][0-9][0-9]*.dat.gz", "file_300.dat.gz", 300 * avgMichalSize, 3600), ("*_[1][5-9][0-9]*.dat.gz", "file_100.dat.gz", 50 * avgMichalSize, 3600), ] if 1==0: importFolderPath = '/home2/0xdiag/datasets' print "Using non-.gz'ed files in", importFolderPath csvFilenameAll = [ # I use different files to avoid OS caching effects ("manyfiles-nflx/file_[0-9][0-9]*.dat", "file_100.dat", 100 * avgMichalSizeUncompressed, 700), ("manyfiles-nflx/file_[0-9][0-9]*.dat", "file_100.dat", 100 * avgMichalSizeUncompressed, 700), ("manyfiles-nflx/file_[0-9][0-9]*.dat", "file_100.dat", 100 * avgMichalSizeUncompressed, 700), # ("onefile-nflx/file_1_to_100.dat", "file_single.dat", 100 * avgMichalSizeUncompressed, 1200), # ("manyfiles-nflx/file_1.dat", "file_1.dat", 1 * avgMichalSizeUncompressed, 700), # ("manyfiles-nflx/file_[2][0-9].dat", "file_10.dat", 10 * avgMichalSizeUncompressed, 700), # ("manyfiles-nflx/file_[34][0-9].dat", "file_20.dat", 20 * avgMichalSizeUncompressed, 700), # ("manyfiles-nflx/file_[5-9][0-9].dat", "file_50.dat", 50 * avgMichalSizeUncompressed, 700), ] if 1==0: importFolderPath = '/home/0xdiag/datasets/standard' print "Using .gz'ed files in", importFolderPath # all exactly the same prior to gzip! # could use this, but remember import folder -> import folder s3 for jenkins? # how would it get it right? # os.path.getsize(f) csvFilenameAll = [ # ("manyfiles-nflx-gz/file_1[0-9].dat.gz", "file_10.dat.gz", 700), # 100 files takes too long on two machines? 
# ("covtype200x.data", "covtype200x.data", 15033863400, 700), # I use different files to avoid OS caching effects # ("syn_datasets/syn_7350063254201195578_10000x200.csv_000[0-9][0-9]", "syn_100.csv", 100 * avgSynSize, 700), # ("syn_datasets/syn_7350063254201195578_10000x200.csv_00000", "syn_1.csv", avgSynSize, 700), # ("syn_datasets/syn_7350063254201195578_10000x200.csv_0001[0-9]", "syn_10.csv", 10 * avgSynSize, 700), # ("syn_datasets/syn_7350063254201195578_10000x200.csv_000[23][0-9]", "syn_20.csv", 20 * avgSynSize, 700), # ("syn_datasets/syn_7350063254201195578_10000x200.csv_000[45678][0-9]", "syn_50.csv", 50 * avgSynSize, 700), # ("manyfiles-nflx-gz/file_10.dat.gz", "file_10_1.dat.gz", 1 * avgMichalSize, 700), # ("manyfiles-nflx-gz/file_1[0-9].dat.gz", "file_10.dat.gz", 10 * avgMichalSize, 700), ("manyfiles-nflx-gz/file_1.dat.gz", "file_1.dat.gz", 1 * avgMichalSize, 700), ("manyfiles-nflx-gz/file_[2][0-9].dat.gz", "file_10.dat.gz", 10 * avgMichalSize, 700), ("manyfiles-nflx-gz/file_[34][0-9].dat.gz", "file_20.dat.gz", 20 * avgMichalSize, 700), ("manyfiles-nflx-gz/file_[5-9][0-9].dat.gz", "file_50.dat.gz", 50 * avgMichalSize, 700), ("manyfiles-nflx-gz/file_1[0-9][0-9].dat.gz", "file_100.dat.gz", 50 * avgMichalSize, 700), ("manyfiles-nflx-gz/file_[12][0-9][0-9].dat.gz", "file_200.dat.gz", 50 * avgMichalSize, 700), ("manyfiles-nflx-gz/file_[12]?[0-9][0-9].dat.gz", "file_300.dat.gz", 50 * avgMichalSize, 700), ("manyfiles-nflx-gz/file_*.dat.gz", "file_384.dat.gz", 100 * avgMichalSize, 1200), ("covtype200x.data", "covtype200x.data", covtype200xSize, 700), # do it twice # ("covtype.data", "covtype.data"), # ("covtype20x.data", "covtype20x.data"), # "covtype200x.data", # "100million_rows.csv", # "200million_rows.csv", # "a5m.csv", # "a10m.csv", # "a100m.csv", # "a200m.csv", # "a400m.csv", # "a600m.csv", # "billion_rows.csv.gz", # "new-poker-hand.full.311M.txt.gz", ] # csvFilenameList = random.sample(csvFilenameAll,1) csvFilenameList = csvFilenameAll # split out the 
pattern match and the filename used for the hex trialMax = 1 # rebuild the cloud for each file base_port = 54321 tryHeap = 28 # can fire a parse off and go wait on the jobs queue (inspect afterwards is enough?) DO_GLM = False noPoll = False # benchmarkLogging = ['cpu','disk', 'iostats', 'jstack'] # benchmarkLogging = None benchmarkLogging = ['cpu','disk', 'network', 'iostats', 'jstack'] benchmarkLogging = ['cpu','disk', 'network', 'iostats'] # IOStatus can hang? benchmarkLogging = ['cpu', 'disk', 'network'] pollTimeoutSecs = 120 retryDelaySecs = 10 jea = '-XX:MaxDirectMemorySize=512m -XX:+PrintGCDetails' + ' -Dh2o.find-ByteBuffer-leaks' jea = '-XX:MaxDirectMemorySize=512m -XX:+PrintGCDetails' jea = "-XX:+UseParNewGC -XX:+UseConcMarkSweepGC" jea = ' -Dcom.sun.management.jmxremote.port=54330' + \ ' -Dcom.sun.management.jmxremote.authenticate=false' + \ ' -Dcom.sun.management.jmxremote.ssl=false' + \ ' -Dcom.sun.management.jmxremote' + \ ' -Dcom.sun.management.jmxremote.local.only=false' jea = ' -Dlog.printAll=true' for i,(csvFilepattern, csvFilename, totalBytes, timeoutSecs) in enumerate(csvFilenameList): localhost = h2o.decide_if_localhost() if (localhost): h2o.build_cloud(2,java_heap_GB=tryHeap, base_port=base_port, # java_extra_args=jea, enable_benchmark_log=True) else: h2o_hosts.build_cloud_with_hosts(base_port=base_port, # java_extra_args=jea, enable_benchmark_log=True) # pop open a browser on the cloud ### h2b.browseTheCloud() # to avoid sticky ports? 
### base_port += 2 for trial in range(trialMax): importFolderResult = h2i.setupImportFolder(None, importFolderPath) importFullList = importFolderResult['files'] importFailList = importFolderResult['fails'] print "\n Problem if this is not empty: importFailList:", h2o.dump_json(importFailList) # creates csvFilename.hex from file in importFolder dir h2o.cloudPerfH2O.change_logfile(csvFilename) h2o.cloudPerfH2O.message("") h2o.cloudPerfH2O.message("Parse " + csvFilename + " Start--------------------------------") start = time.time() parseKey = h2i.parseImportFolderFile(None, csvFilepattern, importFolderPath, key2=csvFilename + ".hex", timeoutSecs=timeoutSecs, retryDelaySecs=retryDelaySecs, pollTimeoutSecs=pollTimeoutSecs, noPoll=noPoll, benchmarkLogging=benchmarkLogging) if noPoll: if (i+1) < len(csvFilenameList): time.sleep(1) h2o.check_sandbox_for_errors() (csvFilepattern, csvFilename, totalBytes2, timeoutSecs) = csvFilenameList[i+1] parseKey = h2i.parseImportFolderFile(None, csvFilepattern, importFolderPath, key2=csvFilename + ".hex", timeoutSecs=timeoutSecs, retryDelaySecs=retryDelaySecs, pollTimeoutSecs=pollTimeoutSecs, noPoll=noPoll, benchmarkLogging=benchmarkLogging) if (i+2) < len(csvFilenameList): time.sleep(1) h2o.check_sandbox_for_errors() (csvFilepattern, csvFilename, totalBytes3, timeoutSecs) = csvFilenameList[i+2] parseKey = h2i.parseImportFolderFile(None, csvFilepattern, importFolderPath, key2=csvFilename + ".hex", timeoutSecs=timeoutSecs, retryDelaySecs=retryDelaySecs, pollTimeoutSecs=pollTimeoutSecs, noPoll=noPoll, benchmarkLogging=benchmarkLogging) elapsed = time.time() - start print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) # print stats on all three if noPoll if noPoll: # does it take a little while to show up in Jobs, from where we issued the parse? time.sleep(2) # FIX! use the last (biggest?) timeoutSecs? 
maybe should increase since parallel h2o_jobs.pollWaitJobs(pattern=csvFilename, timeoutSecs=timeoutSecs, benchmarkLogging=benchmarkLogging) # for getting the MB/sec closer to 'right' totalBytes += totalBytes2 + totalBytes3 elapsed = time.time() - start h2o.check_sandbox_for_errors() if totalBytes is not None: fileMBS = (totalBytes/1e6)/elapsed l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern, csvFilename, fileMBS, elapsed) print l h2o.cloudPerfH2O.message(l) print csvFilepattern, 'parse time:', parseKey['response']['time'] print "Parse result['destination_key']:", parseKey['destination_key'] # BUG here? if not noPoll: # We should be able to see the parse result? h2o_cmd.columnInfoFromInspect(parseKey['destination_key'], exceptionOnMissingValues=False) # the nflx data doesn't have a small enough # of classes in any col # use exec to randomFilter out 200 rows for a quick RF. that should work for everyone? origKey = parseKey['destination_key'] # execExpr = 'a = randomFilter('+origKey+',200,12345678)' execExpr = 'a = slice('+origKey+',1,200)' h2e.exec_expr(h2o.nodes[0], execExpr, "a", timeoutSecs=30) # runRFOnly takes the parseKey directly newParseKey = {'destination_key': 'a'} print "\n" + csvFilepattern # poker and the water.UDP.set3(UDP.java) fail issue.. # constrain depth to 25 print "Temporarily hacking to do nothing instead of RF on the parsed file" ### RFview = h2o_cmd.runRFOnly(trees=1,depth=25,parseKey=newParseKey, timeoutSecs=timeoutSecs) ### h2b.browseJsonHistoryAsUrlLastMatch("RFView") #********************************************************************************** # Do GLM too # Argument case error: Value 0.0 is not between 12.0 and 9987.0 (inclusive) if DO_GLM: # these are all the columns that are enums in the dataset...too many for GLM! x = range(542) # don't include the output column # remove the output too! 
(378) for i in [3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 16, 17, 18, 19, 20, 424, 425, 426, 540, 541, 378]: x.remove(i) x = ",".join(map(str,x)) GLMkwargs = {'x': x, 'y': 378, 'case': 15, 'case_mode': '>', 'max_iter': 10, 'n_folds': 1, 'alpha': 0.2, 'lambda': 1e-5} start = time.time() glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **GLMkwargs) h2o_glm.simpleCheckGLM(self, glm, None, **GLMkwargs) elapsed = time.time() - start h2o.check_sandbox_for_errors() l = '{:d} jvms, {:d}GB heap, {:s} {:s} GLM: {:6.2f} secs'.format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern, csvFilename, elapsed) print l h2o.cloudPerfH2O.message(l) #********************************************************************************** h2o_cmd.checkKeyDistribution() h2o_cmd.deleteCsvKey(csvFilename, importFolderResult) ### time.sleep(3600) h2o.tear_down_cloud() if not localhost: print "Waiting 30 secs before building cloud again (sticky ports?)" ### time.sleep(30) sys.stdout.write('.') sys.stdout.flush()
def test_parse_nflx_loop_s3n_hdfs(self): DO_GLM = True DO_GLMGRID = False USE_S3 = False noPoll = False benchmarkLogging = ['jstack','iostats'] benchmarkLogging = ['iostats'] benchmarkLogging = [] # typical size of the michal files avgMichalSize = 116561140 avgSynSize = 4020000 synSize = 183 csvFilenameList = [ (["manyfiles-nflx-gz"], "*file_1[0-9][0-9].dat.gz", "file_100_A.dat.gz", 100 * avgMichalSize, 3600), (["manyfiles-nflx-gz"], "*file_[1-2][0-5][0-9].dat.gz", "file_120_A.dat.gz", 120 * avgMichalSize, 3600), (["manyfiles-nflx-gz"], "*file_[1-2][0-6][0-9].dat.gz", "file_140_A.dat.gz", 140 * avgMichalSize, 3600), (["manyfiles-nflx-gz"], "*file_[1-2][0-7][0-9].dat.gz", "file_160_A.dat.gz", 160 * avgMichalSize, 3600), (["manyfiles-nflx-gz"], "*file_[1-2][0-8][0-9].dat.gz", "file_180_A.dat.gz", 180 * avgMichalSize, 3600), (["manyfiles-nflx-gz"], "*file_[12][0-9][0-9].dat.gz", "file_200_A.dat.gz", 200 * avgMichalSize, 3600), (["manyfiles-nflx-gz"], "*file_[123][0-9][0-9].dat.gz", "file_300_A.dat.gz", 300 * avgMichalSize, 3600), (["manyfiles-nflx-gz"], "*file_[123][0-9][0-9].dat.gz", "file_300_B.dat.gz", 300 * avgMichalSize, 3600), (["manyfiles-nflx-gz"], "*file_[123][0-9][0-9].dat.gz", "file_300_C.dat.gz", 300 * avgMichalSize, 3600), (["manyfiles-nflx-gz"], "*file_1.dat.gz", "file_1.dat.gz", 1 * avgMichalSize, 300), (["manyfiles-nflx-gz"], "*file_[2][0-9].dat.gz", "file_10.dat.gz", 10 * avgMichalSize, 700), (["manyfiles-nflx-gz"], "*file_[34][0-9].dat.gz", "file_20.dat.gz", 20 * avgMichalSize, 900), (["manyfiles-nflx-gz"], "*file_[5-9][0-9].dat.gz", "file_50_A.dat.gz", 50 * avgMichalSize, 3600), (["manyfiles-nflx-gz"], "*file_1[0-4][0-9].dat.gz", "file_50_B.dat.gz", 50 * avgMichalSize, 3600), (["manyfiles-nflx-gz"], "*file_1[0-9][0-9].dat.gz", "file_100_A.dat.gz", 100 * avgMichalSize, 3600), (["manyfiles-nflx-gz"], "*file_2[0-9][0-9].dat.gz", "file_100_B.dat.gz", 100 * avgMichalSize, 3600), # beware: the files should be non-overlapping sequentially if noPoll is 
used, to avoid deleting keys in use (["A-800-manyfiles-nflx-gz"], "*file_[0-9]*.dat.gz", "file_A_200_x55.dat.gz", 200 * (avgMichalSize/2), 7200), (["A-800-manyfiles-nflx-gz", "B-800-manyfiles-nflx-gz"], "*file_[0-9]*.dat.gz", "file_A_400_x55.dat.gz", 400 * (avgMichalSize/2), 7200), (["A-800-manyfiles-nflx-gz", "B-800-manyfiles-nflx-gz", "C-800-manyfiles-nflx-gz", "D-800-manyfiles-nflx-gz"], "*file_[0-9]*.dat.gz", "file_A_800_x55.dat.gz", 800 * (avgMichalSize/2), 7200), ] print "Using the -.gz files from s3" # want just s3n://home-0xdiag-datasets/manyfiles-nflx-gz/file_1.dat.gz # split out the pattern match and the filename used for the hex trialMax = 1 pollTimeoutSecs = 180 retryDelaySecs = 10 # use i to forward reference in the list, so we can do multiple outstanding parses below for i, (csvFolderList, csvFilepattern, csvFilename, totalBytes, timeoutSecs) in enumerate(csvFilenameList): bucket = "home-0xdiag-datasets" ## for tryHeap in [54, 28]: h2oPerNode = 1 # h1.4xlarge 60.5GB dram for tryHeap in [28]: if USE_S3: protocol = "s3" else: protocol = "s3n" print "\n", tryHeap,"GB heap,", h2oPerNode, "jvm per host, import", protocol, "then parse" # jea = "-XX:+UseParNewGC -XX:+UseConcMarkSweepGC" # jea = "-Dh2o.find-ByteBuffer-leaks=true" h2o.init(h2oPerNode, java_heap_GB=tryHeap, enable_benchmark_log=True, timeoutSecs=120, retryDelaySecs=10) # java_extra_args=jea, # don't raise exception if we find something bad in h2o stdout/stderr? h2o.nodes[0].sandboxIgnoreErrors = True for trial in range(trialMax): # import a list of folders, one at a time (hdfs import can't take pattern match # want to be able to parse 800 files, but only 200 per folder. Don't want to import the full bucket # too slow for csvFolder in csvFolderList: # since we delete the key, we have to re-import every iteration, to get it again # s3n URI thru HDFS is not typical. 
if USE_S3: (importResult, importPattern) = h2i.import_only( bucket=bucket, path=csvFolder + "/" + csvFilepattern, schema='s3') else: (importResult, importPattern) = h2i.import_only( bucket=bucket, path=csvFolder + "/" + csvFilepattern, schema='hdfs') foundKeys = 0 for s in importResult['succeeded']: # just print the first tile # if 'nflx' in key and 'file_1.dat.gz' in key: if csvFilepattern in s['key']: # should be s3n://home-0xdiag-datasets/manyfiles-nflx-gz/file_1.dat.gz print "example file we'll use:", s['key'] break else: pass foundKeys += 1 ### print "s3nFullList:", h2o.dump_json(s3nFullList) # error if none? self.assertGreater(foundKeys,8,"Didn't see more than 8 files in s3n?") src_key = csvFilepattern hex_key = csvFilename + "_" + str(trial) + ".hex" print "Loading", protocol, "key:", src_key, "to", hex_key start = time.time() parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvFolder + "/" + csvFilepattern, timeoutSecs=timeoutSecs, retryDelaySecs=retryDelaySecs, pollTimeoutSecs=pollTimeoutSecs, noPoll=noPoll, benchmarkLogging=benchmarkLogging) if noPoll: if (i+1) < len(csvFilenameList): time.sleep(1) h2o.check_sandbox_for_errors() (csvFilepattern, csvFilename, totalBytes2, timeoutSecs) = csvFilenameList[i+1] src_key = csvFilepattern hex_key = csvFilename + "_" + str(trial) + ".hex" print "Loading", protocol, "key:", src_key, "to", hex_key parse2Result = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvFolder + "/" + csvFilepattern, timeoutSecs=timeoutSecs, retryDelaySecs=retryDelaySecs, pollTimeoutSecs=pollTimeoutSecs, noPoll=noPoll, benchmarkLogging=benchmarkLogging) if (i+2) < len(csvFilenameList): time.sleep(1) h2o.check_sandbox_for_errors() (csvFilepattern, csvFilename, totalBytes3, timeoutSecs) = csvFilenameList[i+2] src_key = URI + csvFilepattern hex_key = csvFilename + "_" + str(trial) + ".hex" print "Loading", protocol, "key:", src_key, "to", hex_key parse3Result = h2i.import_parse(bucket='home-0xdiag-datasets', 
path=importFolderPath + "/" + csvFilepattern, timeoutSecs=timeoutSecs, retryDelaySecs=retryDelaySecs, pollTimeoutSecs=pollTimeoutSecs, noPoll=noPoll, benchmarkLogging=benchmarkLogging) elapsed = time.time() - start print "parse result:", parseResult['destination_key'] print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) # print stats on all three if noPoll if noPoll: # does it take a little while to show up in Jobs, from where we issued the parse? time.sleep(2) # FIX! use the last (biggest?) timeoutSecs? maybe should increase since parallel h2o_jobs.pollWaitJobs(pattern=csvFilename, timeoutSecs=timeoutSecs, benchmarkLogging=benchmarkLogging) # for getting the MB/sec closer to 'right' totalBytes += totalBytes2 + totalBytes3 elapsed = time.time() - start h2o.check_sandbox_for_errors() if totalBytes is not None: fileMBS = (totalBytes/1e6)/elapsed l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} MB/sec for {:6.2f} secs'.format( len(h2o.nodes), tryHeap, csvFilepattern, csvFilename, fileMBS, elapsed) print l h2o.cloudPerfH2O.message(l) y = 378 if not noPoll: x = h2o_glm.goodXFromColumnInfo(y, key=parseResult['destination_key'], timeoutSecs=300) #********************************************************************************** # Do GLM too # Argument case error: Value 0.0 is not between 12.0 and 9987.0 (inclusive) if DO_GLM or DO_GLMGRID: # these are all the columns that are enums in the dataset...too many for GLM! x = range(542) # don't include the output column # remove the output too! 
(378) for i in [3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 16, 17, 18, 19, 20, 424, 425, 426, 540, 541, y]: x.remove(i) x = ",".join(map(str,x)) if DO_GLM: algo = 'GLM' GLMkwargs = {'x': x, 'y': y, 'case': 15, 'case_mode': '>', 'family': 'binomial', 'max_iter': 10, 'n_folds': 1, 'alpha': 0.2, 'lambda': 1e-5} start = time.time() glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, retryDelaySecs=retryDelaySecs, pollTimeoutSecs=pollTimeoutSecs, benchmarkLogging=benchmarkLogging, **GLMkwargs) elapsed = time.time() - start h2o_glm.simpleCheckGLM(self, glm, None, **GLMkwargs) else: algo = 'GLMGrid' GLMkwargs = {'x': x, 'y': y, 'case': 15, 'case_mode': '>', 'family': 'binomial', 'max_iter': 10, 'n_folds': 1, 'beta_epsilon': 1e-4, 'lambda': '1e-4', 'alpha': '0,0.5', 'thresholds': '0.5' } start = time.time() glm = h2o_cmd.runGLMGrid(parseResult=parseResult, timeoutSecs=timeoutSecs, retryDelaySecs=retryDelaySecs, pollTimeoutSecs=pollTimeoutSecs, benchmarkLogging=benchmarkLogging, **GLMkwargs) elapsed = time.time() - start h2o_glm.simpleCheckGLMGrid(self, glm, None, **GLMkwargs) h2o.check_sandbox_for_errors() l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:s} {:6.2f} secs'.format( len(h2o.nodes), tryHeap, algo, csvFilepattern, csvFilename, elapsed) print l h2o.cloudPerfH2O.message(l) #********************************************************************************** print "Deleting key in H2O so we get it from S3 (if ec2) or nfs again.", \ "Otherwise it would just parse the cached key." ### storeView = h2o.nodes[0].store_view() ### print "storeView:", h2o.dump_json(storeView) # "key": "s3n://home-0xdiag-datasets/manyfiles-nflx-gz/file_84.dat.gz" # have to do the pattern match ourself, to figure out what keys to delete # we're deleting the keys in the initial import. We leave the keys we created # by the parse. We use unique dest keys for those, so no worries. # Leaving them is good because things fill up! 
(spill) h2o_cmd.checkKeyDistribution() h2i.delete_keys_from_import_result(pattern=csvFilename, importResult=importResult) h2o.tear_down_cloud() # sticky ports? wait a bit. print "Waiting 30 secs before building cloud again (sticky ports?)" time.sleep(30)
def test_GLM_100Mx70_hosts(self): # enable this if you need to re-create the file if 1 == 0: SYNDATASETS_DIR = h2o.make_syn_dir() createList = [ (100000000, 70, 'cA', 10000), ] for (rowCount, colCount, hex_key, timeoutSecs) in createList: SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str( rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE) # Have to copy it to /home/0xdiag/datasets! # None is okay for hex_key csvFilenameList = [ # ('rand_logreg_500Kx70.csv.gz', 500, 'rand_500Kx70'), # ('rand_logreg_1Mx70.csv.gz', 500, 'rand_1Mx70'), ('rand_logreg_100000000x70.csv', 500, 'rand_100Mx70.hex'), ] ### h2b.browseTheCloud() lenNodes = len(h2o.nodes) for csvFilename, timeoutSecs, hex_key in csvFilenameList: csvPathname = SYNDATASETS_DIR + '/' + csvFilename parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=2000, retryDelaySecs=5, initialDelaySecs=10, pollTimeoutSecs=60) inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) csvPathname = importFolderPath + "/" + csvFilename numRows = inspect['numRows'] numCols = inspect['numCols'] print "\n" + csvPathname, \ " numRows:", "{:,}".format(numRows), \ " numCols:", "{:,}".format(numCols) y = numCols - 1 kwargs = { 'family': 'binomial', 'link': 'logit', 'y': y, 'max_iter': 8, 'n_folds': 0, 'beta_epsilon': 1e-4, 'alpha': 0, 'lambda': 0 } for trial in range(3): start = time.time() glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) elapsed = time.time() - start print "glm", trial, "end on ", csvPathname, 'took', elapsed, 'seconds.', print "%d pct. of timeout" % ((elapsed / timeoutSecs) * 100) h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
def test_GLM_many_cols_tridist(self): SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ (10000, 10, 'cA', 300), (10000, 20, 'cB', 300), (10000, 30, 'cC', 300), (10000, 40, 'cD', 300), (10000, 50, 'cE', 300), (10000, 60, 'cF', 300), (10000, 70, 'cG', 300), (10000, 80, 'cH', 300), (10000, 90, 'cI', 300), (10000, 100, 'cJ', 300), (10000, 200, 'cK', 300), (10000, 300, 'cL', 300), (10000, 400, 'cM', 300), (10000, 500, 'cN', 300), (10000, 600, 'cO', 300), ] ### h2b.browseTheCloud() for (rowCount, colCount, key2, timeoutSecs) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str( rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "\nCreating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE) parseKey = h2o_cmd.parseFile(None, csvPathname, key2=key2, timeoutSecs=30) print csvFilename, 'parse time:', parseKey['response']['time'] print "\nParse result['destination_key']:", parseKey[ 'destination_key'] # We should be able to see the parse result? inspect = h2o_cmd.runInspect(None, parseKey['destination_key']) print "\n" + csvFilename paramDict2 = {} for k in paramDict: paramDict2[k] = paramDict[k][0] y = colCount kwargs = {'y': y} kwargs.update(paramDict2) start = time.time() glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs) print "glm end on ", csvPathname, 'took', time.time( ) - start, 'seconds' h2o_glm.simpleCheckGLM(self, glm, 8, **kwargs) if not h2o.browse_disable: h2b.browseJsonHistoryAsUrlLastMatch("Inspect") time.sleep(5)