def test_GLM2_airline(self): #############Train############################### csvFilename = 'AirlinesTrain.csv.zip' csvPathname = 'airlines'+'/' + csvFilename parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put', timeoutSecs=15) params = {'response': 'IsDepDelayed', 'ignored_cols': 'IsDepDelayed_REC', 'family': 'binomial'} kwargs = params.copy() starttime = time.time() glmtest = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=15, **kwargs) elapsedtime = time.time() - starttime print("ELAPSED TIME TRAIN DATA ",elapsedtime) h2o_glm.simpleCheckGLM(self, glmtest, None, **kwargs) ######### Test ###################################### csvFilename = 'AirlinesTest.csv.zip' csvPathname = 'airlines'+'/' + csvFilename parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put', timeoutSecs=15) params = {'response': 'IsDepDelayed', 'ignored_cols': 'IsDepDelayed_REC', 'family': 'binomial'} kwargs = params.copy() starttime = time.time() glmtrain = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=15, **kwargs) elapsedtime = time.time() - starttime print("ELAPSED TIME TEST DATA ",elapsedtime) h2o_glm.simpleCheckGLM(self, glmtrain, None, **kwargs)
def test_GLM2_tnc3_10(self): h2o.beta_features = True csvFilename = 'tnc3_10.csv' print "\n" + csvFilename hex_key = "tnc3.hex" h2b.browseTheCloud() parseResult = h2i.import_parse(bucket='smalldata', path=csvFilename, schema='put', hex_key=hex_key, timeoutSecs=10) print "Parse result['Key']:", parseResult['destination_key'] inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) h2b.browseJsonHistoryAsUrlLastMatch("Inspect") ### time.sleep(10) if (1==0): lenNodes = len(h2o.nodes) colResultList = h2e.exec_expr_list_across_cols(lenNodes, numExprList, hex_key, maxCol=10, incrementingResult=False, timeoutSecs=10) print "\ncolResultList after num swap", colResultList if (1==1): start = time.time() kwargs = {'response': 13, 'n_folds': 6} # hmm. maybe we should update to use key as input # in case exec is used to change the parseResult # in any case, the destination_key in parseResult was what was updated # so if we Exec, it's correct. glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=300, **kwargs) h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) print "glm end on ", csvFilename, 'took', time.time() - start, 'seconds' inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect") ### time.sleep(3600) h2b.browseJsonHistoryAsUrlLastMatch("RFView") #****************** if (1==0): colResultList = h2e.exec_expr_list_across_cols(lenNodes, charExprList, hex_key, maxCol=10, incrementingResult=False, timeoutSecs=10) print "\ncolResultList after char swap", colResultList if (1==1): start = time.time() kwargs = {'response': 13, 'n_folds': 6} glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=300, **kwargs) h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) print "glm end on ", csvFilename, 'took', time.time() - start, 'seconds' inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect") ### time.sleep(3600) 
h2b.browseJsonHistoryAsUrlLastMatch("RFView") if not h2o.browse_disable: ### print "\n <ctrl-C> to quit sleeping here" ### time.sleep(1500) pass
def test_GLM_covtype(self): csvFilename = 'covtype.data' csvPathname = 'standard/' + csvFilename hex_key = 'covtype.hex' parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=10) inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print "\n" + csvPathname, \ " num_rows:", "{:,}".format(inspect['num_rows']), \ " num_cols:", "{:,}".format(inspect['num_cols']) print "WARNING: max_iter set to 8 for benchmark comparisons" max_iter = 8 y = "54" x = "" print "Touching it with exec to trigger va to fvec (covtype.hex) , and then fvec to va (covtype2.hex)" h2o_cmd.runExec(str='%s=%s' % ('covtype2.hex', hex_key)) # hack to use the new one parseResult['destination_key'] = 'covtype2.hex' # L2 kwargs = { 'x': x, 'y': y, 'family': 'binomial', 'link': 'logit', 'n_folds': 0, 'case_mode': '=', 'case': 1, 'max_iter': max_iter, 'beta_epsilon': 1e-3} timeoutSecs = 120 start = time.time() kwargs.update({'alpha': 0, 'lambda': 0}) glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) print "glm (L2) end on ", csvPathname, 'took', time.time() - start, 'seconds' h2o_glm.simpleCheckGLM(self, glm, 'C14', **kwargs) # Elastic kwargs.update({'alpha': 0.5, 'lambda': 1e-4}) start = time.time() glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) print "glm (Elastic) end on ", csvPathname, 'took', time.time() - start, 'seconds' h2o_glm.simpleCheckGLM(self, glm, 'C14', **kwargs) # L1 kwargs.update({'alpha': 1, 'lambda': 1e-4}) start = time.time() glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) print "glm (L1) end on ", csvPathname, 'took', time.time() - start, 'seconds' h2o_glm.simpleCheckGLM(self, glm, 'C14', **kwargs)
def test_GLM_covtype(self): csvFilename = 'covtype.data' csvPathname = 'UCI/UCI-large/covtype/' + csvFilename parseResult = h2i.import_parse(bucket='datasets', path=csvPathname, schema='put',timeoutSecs=10) inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print "\n" + csvPathname, \ " num_rows:", "{:,}".format(inspect['num_rows']), \ " num_cols:", "{:,}".format(inspect['num_cols']) if (1==0): print "WARNING: just doing the first 33 features, for comparison to allstate numbers" # pythonic! x = ",".join(map(str,range(33))) else: x = "" print "WARNING: max_iter set to 8 for benchmark comparisons" max_iter = 8 y = "54" # L2 kwargs = { 'x': x, 'y': y, 'family': 'binomial', 'link': 'logit', 'n_folds': 0, 'case_mode': '=', 'case': 1, 'max_iter': max_iter, 'beta_epsilon': 1e-3} timeoutSecs = 120 start = time.time() kwargs.update({'alpha': 0, 'lambda': 0}) glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) print "glm (L2) end on ", csvPathname, 'took', time.time() - start, 'seconds' h2o_glm.simpleCheckGLM(self, glm, 13, **kwargs) # Elastic kwargs.update({'alpha': 0.5, 'lambda': 1e-4}) start = time.time() glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) print "glm (Elastic) end on ", csvPathname, 'took', time.time() - start, 'seconds' h2o_glm.simpleCheckGLM(self, glm, 13, **kwargs) # L1 kwargs.update({'alpha': 1, 'lambda': 1e-4}) start = time.time() glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) print "glm (L1) end on ", csvPathname, 'took', time.time() - start, 'seconds' h2o_glm.simpleCheckGLM(self, glm, 13, **kwargs)
def test_GLM2_covtype_exec(self): h2o.beta_features = True csvFilename = 'covtype.data' csvPathname = 'standard/' + csvFilename hex_key = 'covtype.hex' parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30) inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print "\n" + csvPathname, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) print "WARNING: max_iter set to 8 for benchmark comparisons" max_iter = 8 y = "54" h2o_cmd.runExec(str='%s[,55] = %s[,55]==1' % (hex_key, hex_key)) # L2 kwargs = { 'response': y, 'family': 'binomial', 'n_folds': 0, 'max_iter': max_iter, 'beta_epsilon': 1e-3} timeoutSecs = 120 start = time.time() kwargs.update({'alpha': 0, 'lambda': 0}) glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) print "glm (L2) end on ", csvPathname, 'took', time.time() - start, 'seconds' h2o_glm.simpleCheckGLM(self, glm, 'C14', **kwargs) # Elastic kwargs.update({'alpha': 0.5, 'lambda': 1e-4}) start = time.time() glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) print "glm (Elastic) end on ", csvPathname, 'took', time.time() - start, 'seconds' h2o_glm.simpleCheckGLM(self, glm, 'C14', **kwargs) # L1 kwargs.update({'alpha': 1, 'lambda': 1e-4}) start = time.time() glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) print "glm (L1) end on ", csvPathname, 'took', time.time() - start, 'seconds' h2o_glm.simpleCheckGLM(self, glm, 'C14', **kwargs)
def test_GLM2_tnc3_10(self): h2o.beta_features = True csvFilename = "tnc3_10.csv" print "\n" + csvFilename hex_key = "tnc3.hex" parseResult = h2i.import_parse( bucket="smalldata", path=csvFilename, schema="put", hex_key=hex_key, timeoutSecs=10 ) print "Parse result['Key']:", parseResult["destination_key"] inspect = h2o_cmd.runInspect(None, parseResult["destination_key"]) ### time.sleep(10) if 1 == 0: lenNodes = len(h2o.nodes) colResultList = h2e.exec_expr_list_across_cols( lenNodes, numExprList, hex_key, maxCol=10, incrementingResult=False, timeoutSecs=10 ) print "\ncolResultList after num swap", colResultList if 1 == 1: start = time.time() kwargs = {"response": 13, "n_folds": 6} # hmm. maybe we should update to use key as input # in case exec is used to change the parseResult # in any case, the destination_key in parseResult was what was updated # so if we Exec, it's correct. glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=300, **kwargs) h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) print "glm end on ", csvFilename, "took", time.time() - start, "seconds" inspect = h2o_cmd.runInspect(None, parseResult["destination_key"]) # ****************** if 1 == 0: colResultList = h2e.exec_expr_list_across_cols( lenNodes, charExprList, hex_key, maxCol=10, incrementingResult=False, timeoutSecs=10 ) print "\ncolResultList after char swap", colResultList if 1 == 1: start = time.time() kwargs = {"response": 13, "n_folds": 6} glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=300, **kwargs) h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) print "glm end on ", csvFilename, "took", time.time() - start, "seconds" inspect = h2o_cmd.runInspect(None, parseResult["destination_key"])
def test_GLM_covtype_single_cols(self): timeoutSecs = 10 csvPathname = 'UCI/UCI-large/covtype/covtype.data' print "\n" + csvPathname # columns start at 0 y = "54" x = "" parseResult = h2i.import_parse(bucket='datasets', path=csvPathname, schema='put', timeoutSecs=15) print "GLM binomial wth 1 X column at a time" print "Result check: abs. value of coefficient and intercept returned are bigger than zero" for colX in xrange(54): if x == "": x = str(colX) else: # x = x + "," + str(colX) x = str(colX) sys.stdout.write('.') sys.stdout.flush() print "\nx:", x print "y:", y start = time.time() kwargs = {'x': x, 'y': y, 'n_folds': 6, 'case': 2} glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) h2o_glm.simpleCheckGLM(self, glm, colX, **kwargs) print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds'
def test_C_prostate(self): h2o.nodes[0].log_view() namelist = h2o.nodes[0].log_download() print "\nStarting prostate.csv" # columns start at 0 y = "1" x = "" csvFilename = "prostate.csv" csvPathname = "logreg" + "/" + csvFilename parseResult = h2i.import_parse(bucket="smalldata", path=csvPathname, hex_key=csvFilename + ".hex", schema="put") for maxx in range(2, 6): x = range(maxx) x.remove(0) # 0 is member ID. not used x.remove(1) # 1 is output x = ",".join(map(str, x)) print "\nx:", x print "y:", y kwargs = {"x": x, "y": y, "n_folds": 5} glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=15, **kwargs) # ID,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON h2o_glm.simpleCheckGLM(self, glm, "AGE", **kwargs) sys.stdout.write(".") sys.stdout.flush() # now redo it all thru the browser # three times! for i in range(3): h2b.browseJsonHistoryAsUrl() h2o.nodes[0].log_view() namelist = h2o.nodes[0].log_download()
def test_B_benign(self): h2o.nodes[0].log_view() namelist = h2o.nodes[0].log_download() print "\nStarting benign.csv" csvFilename = "benign.csv" csvPathname = "logreg" + "/" + csvFilename parseResult = h2i.import_parse(bucket="smalldata", path=csvPathname, hex_key=csvFilename + ".hex", schema="put") # columns start at 0 y = "3" # cols 0-13. 3 is output # no member id in this one for maxx in range(11, 14): x = range(maxx) x.remove(3) # 3 is output x = ",".join(map(str, x)) print "\nx:", x print "y:", y kwargs = {"x": x, "y": y} glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=15, **kwargs) # no longer look at STR? h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) sys.stdout.write(".") sys.stdout.flush() # now redo it all thru the browser h2b.browseJsonHistoryAsUrl()
def test_GLM2_params_rand2(self): csvPathname = 'covtype/covtype.20k.data' parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put', hex_key="covtype.20k") CLASS = 1 # make a binomial version execExpr="B.hex=%s; B.hex[,%s]=(B.hex[,%s]==%s)" % ('covtype.20k', 54+1, 54+1, CLASS) h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) paramDict = define_params() for trial in range(20): # params is mutable. This is default. params = { 'response': 54, 'alpha': 0.1, # 'lambda': 1e-4, 'lambda': 0, 'n_folds': 1, } colX = h2o_glm.pickRandGlmParams(paramDict, params) kwargs = params.copy() if 'family' not in kwargs or kwargs['family']=='binomial': bHack = {'destination_key': 'B.hex'} else: bHack = parseResult start = time.time() glm = h2o_cmd.runGLM(timeoutSecs=300, parseResult=bHack, **kwargs) # pass the kwargs with all the params, so we know what we asked for! h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) h2o.check_sandbox_for_errors() print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds' print "Trial #", trial, "completed\n"
def test_GLM_big1_nopoll(self): csvPathname = 'hhp_107_01.data.gz' print "\n" + csvPathname y = "106" x = "" parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put', timeoutSecs=15) glmInitial = [] # dispatch multiple jobs back to back start = time.time() for jobDispatch in range(10): kwargs = {'x': x, 'y': y, 'n_folds': 1} # FIX! what model keys do these get? glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=300, noPoll=True, **kwargs) glmInitial.append(glm) print "glm job dispatch end on ", csvPathname, 'took', time.time() - start, 'seconds' print "\njobDispatch #", jobDispatch timeoutSecs = 200 h2o_jobs.pollWaitJobs(pattern='GLM', timeoutSecs=timeoutSecs, retryDelaySecs=10) elapsed = time.time() - start print "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100) # we saved the initial response? # if we do another poll they should be done now, and better to get it that # way rather than the inspect (to match what simpleCheckGLM is expected for glm in glmInitial: print "Checking completed job, with no polling using initial response:", h2o.dump_json(glm) a = h2o.nodes[0].poll_url(glm, noPoll=True) h2o_glm.simpleCheckGLM(self, a, 57, **kwargs)
def test_GLM_params_rand2_newargs(self): csvPathname = 'covtype/covtype.20k.data' hex_key = 'covtype.20k.hex' parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=hex_key, schema='put') paramDict = define_params() y = 54 print "Want to see if there are constant columns" goodX = h2o_glm.goodXFromColumnInfo(y, key=parseResult['destination_key'], timeoutSecs=300) print "goodX:", goodX # intermittent fail on the forced params? for trial in range(10 if DO_FAIL_ONLY else 20): if DO_FAIL_ONLY: params = define_params_fail() else: # params is mutable. This is default. params = {'y': y, 'case': 1, 'lambda': 0, 'alpha': 0, 'n_folds': 1} h2o_glm.pickRandGlmParams(paramDict, params) kwargs = params.copy() start = time.time() glm = h2o_cmd.runGLM(timeoutSecs=70, parseResult=parseResult, **kwargs) # pass the kwargs with all the params, so we know what we asked for! h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) h2o.check_sandbox_for_errors() print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds' print "Trial #", trial, "completed\n"
def test_GLM_100Mx70_hosts(self): # enable this if you need to re-create the file if 1==0: SYNDATASETS_DIR = h2o.make_syn_dir() createList = [ (100000000, 70, 'cA', 10000), ] for (rowCount, colCount, hex_key, timeoutSecs) in createList: SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE) # Have to copy it to /home/0xdiag/datasets! # None is okay for hex_key csvFilenameList = [ # ('rand_logreg_500Kx70.csv.gz', 500, 'rand_500Kx70'), # ('rand_logreg_1Mx70.csv.gz', 500, 'rand_1Mx70'), ('rand_logreg_100000000x70.csv', 500, 'rand_100Mx70.hex'), ] ### h2b.browseTheCloud() lenNodes = len(h2o.nodes) for csvFilename, timeoutSecs, hex_key in csvFilenameList: csvPathname = SYNDATASETS_DIR + '/' + csvFilename parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=2000, retryDelaySecs=5, initialDelaySecs=10, pollTimeoutSecs=60) inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) csvPathname = importFolderPath + "/" + csvFilename numRows = inspect['numRows'] numCols = inspect['numCols'] print "\n" + csvPathname, \ " numRows:", "{:,}".format(numRows), \ " numCols:", "{:,}".format(numCols) y = numCols - 1 kwargs = { 'family': 'binomial', 'link': 'logit', 'y': y, 'max_iter': 8, 'n_folds': 0, 'beta_epsilon': 1e-4, 'alpha': 0, 'lambda': 0 } for trial in range(3): start = time.time() glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) elapsed = time.time() - start print "glm", trial, "end on ", csvPathname, 'took', elapsed, 'seconds.', print "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100) h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
def test_GLM_poisson_rand2(self): csvPathname = 'standard/covtype.data' parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='put') paramDict = define_params() for trial in range(20): # params is mutable. This is default. # FIX! does it never end if we don't have alpha specified? params = { 'y': 54, 'n_folds': 3, 'family': "poisson", 'alpha': 0.5, 'lambda': 1e-4, 'beta_epsilon': 0.001, 'max_iter': 15, } colX = h2o_glm.pickRandGlmParams(paramDict, params) kwargs = params.copy() # make timeout bigger with xvals timeoutSecs = 60 + (kwargs['n_folds']*40) # or double the 4 seconds per iteration (max_iter+1 worst case?) timeoutSecs = max(timeoutSecs, (8 * (kwargs['max_iter']+1))) start = time.time() glm = h2o_cmd.runGLM(timeoutSecs=timeoutSecs, parseResult=parseResult, **kwargs) print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds' h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) print "Trial #", trial, "completed\n"
def test_GLM2_dest_key(self): h2o.beta_features = True print "\nStarting prostate.csv" # columns start at 0 y = "1" csvFilename = "prostate.csv" csvPathname = 'logreg' + '/' + csvFilename parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=csvFilename + ".hex", schema='put') for maxx in [6]: destination_key='GLM_model_python_0_default_0' # illegal to have output col in the ignored_cols! kwargs = { 'ignored_cols': '0', 'response': y, 'n_folds': 5, 'destination_key': destination_key, } glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=15, **kwargs) h2o_destination_key = glm['glm_model']['_key'] print 'h2o_destination_key:', h2o_destination_key self.assertEqual(h2o_destination_key, destination_key, msg='I said to name the key %s, h2o used %s' % (destination_key, h2o_destination_key)) # ID,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON h2o_glm.simpleCheckGLM(self, glm, 'AGE', **kwargs)
def test_B_benign(self): h2o.nodes[0].log_view() namelist = h2o.nodes[0].log_download() print "\nStarting benign.csv" csvFilename = "benign.csv" csvPathname = 'logreg' + '/' + csvFilename parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=csvFilename + ".hex", schema='put') # columns start at 0 y = "3" # cols 0-13. 3 is output # no member id in this one for maxx in range(11,14): x = range(maxx) x.remove(3) # 3 is output x = ",".join(map(str,x)) print "\nx:", x print "y:", y kwargs = {'x': x, 'y': y} glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=15, **kwargs) # no longer look at STR? h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) sys.stdout.write('.') sys.stdout.flush()
def glm_doit(self, csvFilename, bucket, csvPathname, timeoutSecs=30): print "\nStarting GLM of", csvFilename parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, hex_key=csvFilename + ".hex", schema='put', timeoutSecs=10) y = 10 # Took n_folds out, because GLM doesn't include n_folds time and it's slow # wanted to compare GLM time to my measured time # hastie has two values, 1 and -1. need to use case for one of them kwargs = {'response': y, 'alpha': 0, 'family': 'binomial'} h2o.nodes[0].to_enum(src_key=parseResult['destination_key'], column_index=y+1) start = time.time() glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) print "GLM in", (time.time() - start), "secs (python measured)" h2o_glm.simpleCheckGLM(self, glm, "C8", **kwargs) # compare this glm to the first one. since the files are replications, the results # should be similar? glm_model = glm['glm_model'] validation = glm_model['submodels'][0]['validation'] if self.validation1: h2o_glm.compareToFirstGlm(self, 'auc', validation, self.validation1) else: self.validation1 = copy.deepcopy(validation)
def test_NOPASS_GLM2_weight_nan_fail(self): h2o.beta_features = True csvPathname = 'covtype/covtype.20k.data' hex_key = 'covtype.20k.hex' parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=hex_key, schema='put') kwargs = { 'destination_key': 'GLM_model_python_0_default_0', 'family': 'tweedie', 'tweedie_variance_power': 1.9999999, 'max_iter': 10, 'alpha': 0, 'lambda': 0, 'response': 54, } for trial in range(3): # params is mutable. This is default. start = time.time() glm = h2o_cmd.runGLM(timeoutSecs=70, parseResult=parseResult, **kwargs) h2o.check_sandbox_for_errors() # pass the kwargs with all the params, so we know what we asked for! h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds' print "Trial #", trial, "completed\n"
def test_GLM2grid_covtype_many(self): h2o.beta_features = True csvFilename = 'covtype.data' csvPathname = 'standard/' + csvFilename parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='put', timeoutSecs=10) inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print "\n" + csvPathname, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) print "WARNING: max_iter set to 8 for benchmark comparisons" max_iter = 8 y = "54" kwargs = { 'response': y, 'family': 'gaussian', 'n_folds': 2, 'max_iter': max_iter, 'beta_epsilon': 1e-3, 'lambda': '0,0.5,0.8', 'alpha': '0,1e-8,1e-4', } start = time.time() jobs = [] totalGLMGridJobs = 0 for i in range(3): glmResult = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=300, noPoll=True, **kwargs) # print "glmResult:", h2o.dump_json(glmResult) # assuming it doesn't complete right away, this is the first response # it differs for the last response job_key = glmResult['job_key'] grid_key = glmResult['destination_key'] jobs.append( (job_key, grid_key) ) totalGLMGridJobs += 1 # do some parse work in parallel. Don't poll for parse completion # don't bother checking the parses when they are completed (pollWaitJobs looks at all) for i in range(4): time.sleep(3) hex_key = str(i) + ".hex" src_key = str(i) + ".src" parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='put', src_key=src_key, hex_key=hex_key, timeoutSecs=10, noPoll=True, doSummary=False) h2o_jobs.pollWaitJobs(timeoutSecs=300) elapsed = time.time() - start # 2/GLMGridView.html?grid_key=asd # 2/GLMModelView.html?_modelKey=asd_0&lambda=NaN # 2/SaveModel.html?model=GLMGridResults__9a29646b78dd988aacd4f88e4d864ccd_1&path=adfs&force=1 for job_key, grid_key in jobs: gridResult = h2o.nodes[0].glm_grid_view(grid_key=grid_key) h2o_glm.simpleCheckGLMGrid(self, gridResult, **kwargs) print "All GLMGrid jobs completed in", elapsed, "seconds." 
print "totalGLMGridJobs:", totalGLMGridJobs
def test_B_benign(self): print "\nStarting benign.csv" csvFilename = "benign.csv" csvPathname = 'logreg/' + csvFilename parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=csvFilename + ".hex", schema='put') # columns start at 0 y = "3" # cols 0-13. 3 is output # no member id in this one for maxx in range(4,14): x = range(maxx) x.remove(3) # 3 is output x = ",".join(map(str,x)) print "\nx:", x print "y:", y # solver can be ADMM kwargs = {'x': x, 'y': y,\ 'expert_settings': 1, 'lsm_solver': 'GenGradient', 'standardize': 1, 'n_folds': 1} # fails with n_folds print "Not doing n_folds with benign. Fails with 'unable to solve?'" glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=30, **kwargs) # no longer look at STR? h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) h2o.check_sandbox_for_errors() sys.stdout.write('.') sys.stdout.flush()
def test_C_prostate(self): print "\nStarting prostate.csv" # columns start at 0 y = "1" csvFilename = "prostate.csv" csvPathname = 'logreg/' + csvFilename parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=csvFilename + ".hex", schema='put') for maxx in range(2,9): x = range(maxx) x.remove(0) # 0 is member ID. not used x.remove(1) # 1 is output x = ",".join(map(str,x)) print "\nx:", x print "y:", y # solver can be ADMM. standardize normalizes the data. kwargs = {'x': x, 'y': y, 'n_folds': 5,\ 'expert_settings': 1, 'lsm_solver': 'GenGradient', 'standardize':1} glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=30, **kwargs) # ID,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON h2o_glm.simpleCheckGLM(self, glm, 'AGE', **kwargs) h2o.check_sandbox_for_errors() sys.stdout.write('.') sys.stdout.flush()
def glm_doit(self, csvFilename, bucket, csvPathname, timeoutSecs=30): print "\nStarting GLM of", csvFilename parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, hex_key=csvFilename + ".hex", schema='put', timeoutSecs=30) y = "10" x = "" # Took n_folds out, because GLM doesn't include n_folds time and it's slow # wanted to compare GLM time to my measured time # hastie has two values 1,-1. need to specify case kwargs = {'x': x, 'y': y, 'case': -1, 'thresholds': 0.5} start = time.time() glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) print "GLM in", (time.time() - start), "secs (python)" h2o_glm.simpleCheckGLM(self, glm, "C8", **kwargs) # compare this glm to the first one. since the files are replications, the results # should be similar? GLMModel = glm['GLMModel'] validationsList = glm['GLMModel']['validations'] validations = validationsList[0] # validations['err'] if self.validations1: h2o_glm.compareToFirstGlm(self, 'err', validations, self.validations1) else: self.validations1 = copy.deepcopy(validations)
def test_GLM2_model_key_unique(self): h2o.beta_features = True modelKeyDict = {} for trial in range (1,5): csvPathname = 'iris/iris2.csv' start = time.time() # make sure each parse is unique dest key (not in use hex_key = "iris2_" + str(trial) + ".hex" parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=10) y = 4 execExpr="%s[,%s]=(%s[,%s]==%s)" % (hex_key, y+1, hex_key, y+1, 1) h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) # h2o.py now sets destination_key for a fixed default model name, # we want h2o to create model names for this test, so use none here kwargs = {'destination_key': None, 'response':4, 'family': 'gaussian'} glmResult = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=10, noPoll=True, **kwargs ) print "GLM #%d" % trial, "started on ", csvPathname, 'took', time.time() - start, 'seconds' model_key = glmResult['destination_key'] print "GLM model_key:", model_key if model_key in modelKeyDict: raise Exception("same model_key used in GLM #%d that matches prior GLM #%d" % (trial, modelKeyDict[model_key])) modelKeyDict[model_key] = trial # just show the jobs still going, if any. maybe none, because short (iris) a = h2o.nodes[0].jobs_admin() h2o.verboseprint("jobs_admin():", h2o.dump_json(a))
def test_GLM2_syn_2659x1049x2enum(self):
    """Parse the synthetic 2659x1049 two-enum dataset and run one GLM on it.

    NOTE(review): 'params' is not defined in this method — presumably a
    module-level dict shared by these tests; verify before refactoring.
    """
    csvFilename = "syn_2659x1049x2enum.csv"
    csvPathname = "logreg" + "/" + csvFilename
    parseResult = h2i.import_parse(bucket="smalldata", path=csvPathname,
                                   hex_key=csvFilename + ".hex", schema="put")
    kwargs = params
    glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=240, **kwargs)
    h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
def test_GLM2_princeton(self): # filename, y, timeoutSecs # these are all counts? using gaussian? csvFilenameList = [ ('cuse.dat', 'gaussian', 3, 10), # notUsing ('cuse.dat', 'gaussian', 4, 10), # using ('copen.dat', 'gaussian', 4, 10), ('housing.raw', 'gaussian', 4, 10), ] trial = 0 for (csvFilename, family, y, timeoutSecs) in csvFilenameList: csvPathname1 = 'logreg/princeton/' + csvFilename fullPathname1 = h2i.find_folder_and_filename('smalldata', csvPathname1, returnFullPath=True) csvPathname2 = SYNDATASETS_DIR + '/' + csvFilename + '_stripped.csv' h2o_util.file_strip_trailing_spaces(fullPathname1, csvPathname2) parseResult = h2i.import_parse(path=csvPathname2, schema='put', timeoutSecs=timeoutSecs) start = time.time() kwargs = {'n_folds': 0, 'family': family, 'response': y} glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) print "glm end (w/check) on ", csvPathname2, 'took', time.time() - start, 'seconds' trial += 1 print "\nTrial #", trial
def test_C_prostate(self): h2o.nodes[0].log_view() namelist = h2o.nodes[0].log_download() print "\nStarting prostate.csv" # columns start at 0 y = "1" x = "" csvFilename = "prostate.csv" csvPathname = 'logreg' + '/' + csvFilename parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=csvFilename + ".hex", schema='put') for maxx in range(2,6): x = range(maxx) x.remove(0) # 0 is member ID. not used x.remove(1) # 1 is output x = ",".join(map(str,x)) print "\nx:", x print "y:", y kwargs = {'x': x, 'y': y, 'n_folds': 5} glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=15, **kwargs) # ID,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON h2o_glm.simpleCheckGLM(self, glm, 'AGE', **kwargs) sys.stdout.write('.') sys.stdout.flush() h2o.nodes[0].log_view() namelist = h2o.nodes[0].log_download()
def test_GLM2_tweedie(self): csvFilename = "AutoClaim.csv" csvPathname = 'standard/' + csvFilename print "\nStarting", csvPathname parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='put') # columns start at 0 # regress: glm(CLM_AMT ~ CAR_USE + REVOLKED + GENDER + AREA + MARRIED + CAR_TYPE, data=AutoClaim, family=tweedie(1.34)) coefs = [7, 13, 20, 27, 21, 11] y = 4 ignored_cols = h2o_cmd.createIgnoredCols(key=parseResult['destination_key'], cols=coefs, response=y) # sapply(c('CLM_AMT', 'CAR_USE', 'REVOLKED', 'GENDER', 'AREA', 'MARRIED', 'CAR_TYPE'), function(x) which(x==colnames(AutoClaim)) - 1) kwargs = { 'family': 'tweedie', 'tweedie_variance_power': 1.36, 'response': y, 'ignored_cols' : ignored_cols, 'max_iter': 10, 'lambda': 0, 'alpha': 0, 'n_folds': 0, 'beta_epsilon': 1e-4, } glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=15, **kwargs) coefficientsExpected = {'Intercept': 0, 'GENDER.M': 0.0014842488782470984, 'CAR_TYPE.Sports Car': 0.07786742314454961, 'MARRIED.Yes': 0.0007748552195851079, 'CAR_TYPE.SUV': 0.07267702940249621, 'CAR_TYPE.Pickup': 0.04952083408742968, 'CAR_TYPE.Van': 0.026422137690691405, 'CAR_TYPE.Sedan': 0.05128350794060489, 'CAR_USE.Private': -0.03050194832853935, 'REVOLKED.Yes': -0.05095942737408699} deltaExpected = 0.05 (warnings, coefficients, intercept) = h2o_glm.simpleCheckGLM(self, glm, None, coefficientsExpected=coefficientsExpected, deltaExpected=deltaExpected, **kwargs) print 'coefficients: %s' % (str(coefficients))
def test_GLM_params_rand2_4082088627997819015(self): csvPathname = 'standard/covtype.data' parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='put', hex_key='covtype.hex') paramDict = define_params() for trial in range(40): # params is mutable. This is default. params = { 'y': 54, 'n_folds' : 3, 'family' : 'binomial', 'max_iter' : 5, 'case': 1, 'alpha': 0, 'lambda': 0 } colX = h2o_glm.pickRandGlmParams(paramDict, params) kwargs = params.copy() start = time.time() timeoutSecs = max(150, params['n_folds']*10 + params['max_iter']*10) glm = h2o_cmd.runGLM(timeoutSecs=timeoutSecs, parseResult=parseResult, **kwargs) elapsed = time.time() - start h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) # FIX! I suppose we have the problem of stdout/stderr not having flushed? # should hook in some way of flushing the remote node stdout/stderr h2o.check_sandbox_for_errors() print "glm end on ", csvPathname, 'took', elapsed, 'seconds.',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "Trial #", trial, "completed\n"
def test_C_prostate_w_predict(self): h2o.nodes[0].log_view() namelist = h2o.nodes[0].log_download() print "\nStarting prostate.csv" # columns start at 0 y = "1" x = "" csvFilename = "prostate.csv" csvPathname = "logreg/" + csvFilename parseResult = h2i.import_parse(bucket="smalldata", path=csvPathname, hex_key=csvFilename + ".hex", schema="put") for maxx in range(2, 6): x = range(maxx) x.remove(0) # 0 is member ID. not used x.remove(1) # 1 is output x = ",".join(map(str, x)) print "\nx:", x print "y:", y kwargs = {"x": x, "y": y, "n_folds": 5} glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=15, **kwargs) # ID,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON h2o_glm.simpleCheckGLM(self, glm, "AGE", **kwargs) GLMModel = glm["GLMModel"] modelKey = GLMModel["model_key"] print "Doing predict with same dataset, and the GLM model" h2o.nodes[0].generate_predictions( model_key=modelKey, data_key=parseResult["destination_key"], destination_key="Predict.hex" ) h2o.nodes[0].log_view() namelist = h2o.nodes[0].log_download()
def process_dataset(self, parseResult, Y, e_coefs, e_ndev, e_rdev, e_aic, **kwargs):
    """Run an unregularized GLM on the parsed data and print its coefficients.

    Returns a list of error strings. Currently always empty: the deviance
    comparisons against e_ndev/e_rdev (and the AIC check) are disabled
    pending a fix, as noted below.
    """
    # no regularization
    kwargs.update({'alpha': 0, 'lambda': 0, 'response': 'CAPSULE'})
    glmResult = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=10, **kwargs)
    (warnings, coeffs, intercept) = h2o_glm.simpleCheckGLM(self, glmResult, None, **kwargs)

    h2p.green_print("h2o coefficient list:", "".join("%.5e " % c for c in coeffs))
    h2p.green_print("h2o intercept", "%.5e " % intercept)

    # other stuff in the json response
    # the first submodel is the right one, if only one lambda is provided as a parameter above
    validation = glmResult['glm_model']['submodels'][0]['validation']
    null_deviance = validation['null_deviance']
    residual_deviance = validation['residual_deviance']

    errors = []
    # FIX! our null deviance doesn't seem to match
    h2o.verboseprint("Comparing:", null_deviance, e_ndev)
    # if abs(float(nullDev) - e_ndev) > (0.001 * e_ndev):
    #     errors.append('NullDeviance: %f != %s' % (e_ndev,nullDev))

    # FIX! our res deviance doesn't seem to match
    h2o.verboseprint("Comparing:", residual_deviance, e_rdev)
    # if abs(float(resDev) - e_rdev) > (0.001 * e_rdev):
    #     errors.append('ResDeviance: %f != %s' % (e_rdev,resDev))

    # FIX! we don't have an AIC to compare?
    return errors
def test_GLM2_ints_unbalanced(self):
    """Build binomial GLM models on synthetic enum datasets of increasing width,
    then score each model on a dataset generated from a subset of the enums.
    """
    h2o.beta_features = True
    ### h2b.browseTheCloud()
    SYNDATASETS_DIR = h2o.make_syn_dir()
    n = 2000
    # (rowCount, colCount, hex_key, timeoutSecs)
    tryList = [
        (n, 1, 'cD', 300),
        (n, 2, 'cE', 300),
        (n, 4, 'cF', 300),
        (n, 8, 'cG', 300),
        (n, 16, 'cH', 300),
        (n, 32, 'cI', 300),
    ]
    for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
        # using the comma is nice to ensure no craziness
        colSepHexString = '2c'  # comma
        colSepChar = colSepHexString.decode('hex')
        colSepInt = int(colSepHexString, base=16)
        print "colSepChar:", colSepChar

        rowSepHexString = '0a'  # newline
        rowSepChar = rowSepHexString.decode('hex')
        print "rowSepChar:", rowSepChar

        SEEDPERFILE = random.randint(0, sys.maxint)
        csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        csvScoreFilename = 'syn_enums_score_' + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvScorePathname = SYNDATASETS_DIR + '/' + csvScoreFilename

        enumList = create_enum_list()
        # use half of the enums for creating the scoring dataset
        enumListForScore = random.sample(enumList, 5)

        print "Creating random", csvPathname, "for glm model building"
        write_syn_dataset(csvPathname, enumList, rowCount, colCount, SEEDPERFILE,
                          colSepChar=colSepChar, rowSepChar=rowSepChar)

        print "Creating random", csvScorePathname, "for glm scoring with prior model (using enum subset)"
        write_syn_dataset(csvScorePathname, enumListForScore, rowCount, colCount, SEEDPERFILE,
                          colSepChar=colSepChar, rowSepChar=rowSepChar)

        parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key,
                                       timeoutSecs=30, separator=colSepInt)
        print "Parse result['destination_key']:", parseResult['destination_key']
        print "\n" + csvFilename
        (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \
            h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=True)

        # last column is the response
        y = colCount
        modelKey = 'xyz'
        kwargs = {
            'n_folds': 0,
            'destination_key': modelKey,
            'response': y,
            'max_iter': 200,
            'family': 'binomial',
            'alpha': 0,
            'lambda': 0,
        }

        start = time.time()
        # each dict here overrides the alpha/lambda defaults in kwargs for one run
        updateList = [
            {'alpha': 0.5, 'lambda': 1e-5},
            # {'alpha': 0.25, 'lambda': 1e-4},
        ]
        # Try each one
        for updateDict in updateList:
            print "\n#################################################################"
            print updateDict
            kwargs.update(updateDict)
            glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs,
                                 pollTimeoutSecs=180, **kwargs)
            print "glm end on ", parseResult['destination_key'], 'took', time.time() - start, 'seconds'
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)

            # NOTE: parseResult is rebound here to the scoring dataset; a second
            # pass through updateList would train on "B.hex", not the original data
            parseResult = h2i.import_parse(path=csvScorePathname, schema='put', hex_key="B.hex",
                                           timeoutSecs=30, separator=colSepInt)
            h2o_cmd.runScore(dataKey="B.hex", modelKey=modelKey, vactual='C' + str(y + 1),
                             vpredict=1, expectedAuc=0.6)
def test_GLM2_mnist_reals(self):
    """Parse the mnist_reals train/test sets, fit a one-vs-rest binomial GLM for
    selected digits (via Exec recoding of the label column), and print the
    scoring confusion matrix for each digit.
    """
    h2o.beta_features = True
    importFolderPath = "mnist"
    # (trainCsvFilename, testCsvFilename, timeoutSecs)
    csvFilelist = [
        ("mnist_reals_training.csv.gz", "mnist_reals_testing.csv.gz", 600),
    ]
    trial = 0
    for (trainCsvFilename, testCsvFilename, timeoutSecs) in csvFilelist:
        trialStart = time.time()

        # PARSE test****************************************
        testKey = testCsvFilename + "_" + str(trial) + ".hex"
        start = time.time()
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path="mnist/" + testCsvFilename,
            schema='put', hex_key=testKey, timeoutSecs=timeoutSecs)
        elapsed = time.time() - start
        print "parse end on ", testCsvFilename, 'took', elapsed, 'seconds',\
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

        print "We won't use this pruning of x on test data. See if it prunes the same as the training"
        y = 0 # first column is pixel value
        print "y:"
        x = h2o_glm.goodXFromColumnInfo(y, key=parseResult['destination_key'], timeoutSecs=300)

        # PARSE train****************************************
        trainKey = trainCsvFilename + "_" + str(trial) + ".hex"
        start = time.time()
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path="mnist/" + trainCsvFilename,
            schema='put', hex_key=trainKey, timeoutSecs=timeoutSecs)
        elapsed = time.time() - start
        print "parse end on ", trainCsvFilename, 'took', elapsed, 'seconds',\
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
        print "parse result:", parseResult['destination_key']

        # GLM****************************************
        print "This is the pruned x GLM will use"
        x = h2o_glm.goodXFromColumnInfo(y, key=parseResult['destination_key'], timeoutSecs=300)
        print "x:", x

        modelKey = "mnist"
        params = {
            'response': y,
            'family': 'binomial',
            'lambda': 1.0E-5,
            'alpha': 0.0,
            'max_iter': 10,
            'n_folds': 1,
            'beta_epsilon': 1.0E-4,
            'destination_key': modelKey
            }
        # for c in [0,1,2,3,4,5,6,7,8,9]:
        # just do a couple digits
        for c in [0,7]:
            print "Trying binomial with case:", c
            # recode the label column in-place to 1 for digit c, 0 otherwise
            execExpr="A.hex=%s;A.hex[,%s]=(A.hex[,%s]==%s)" % (trainKey, y+1, y+1, c)
            h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)

            kwargs = params.copy()
            timeoutSecs = 1800
            start = time.time()
            # train on the recoded copy, not the original parse key
            aHack = {'destination_key': 'A.hex'}
            glm = h2o_cmd.runGLM(parseResult=aHack, timeoutSecs=timeoutSecs, pollTimeoutSecs=60, **kwargs)
            elapsed = time.time() - start
            print "GLM completed in", elapsed, "seconds.", \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            h2o_glm.simpleCheckGLM(self, glm, None, noPrint=True, **kwargs)

            # Score **********************************************
            # same one-vs-rest recoding on the test set
            execExpr="B.hex=%s;B.hex[,%s]=(B.hex[,%s]==%s)" % (testKey, y+1, y+1, c)
            h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)

            print "Problems with test data having different enums than train? just use train for now"
            predictKey = 'Predict.hex'
            start = time.time()
            predictResult = h2o_cmd.runPredict(
                data_key="B.hex",
                model_key=modelKey,
                destination_key=predictKey,
                timeoutSecs=timeoutSecs)

            predictCMResult = h2o.nodes[0].predict_confusion_matrix(
                actual="B.hex",
                vactual='C' + str(y+1),
                predict=predictKey,
                vpredict='predict',
                )
            cm = predictCMResult['cm']

            # These will move into the h2o_gbm.py
            pctWrong = h2o_gbm.pp_cm_summary(cm);
            # self.assertLess(pctWrong, 8,"Should see less than 7 pct error (class = 4): %s" % pctWrong)
            print "\nTest\n==========\n"
            print h2o_gbm.pp_cm(cm)
def test_c7_rel(self): print "Running with h2o.beta_features=True for all" h2o.beta_features = True print "Since the python is not necessarily run as user=0xcust..., can't use a schema='put' here" print "Want to be able to run python as jenkins" print "I guess for big 0xcust files, we don't need schema='put'" print "For files that we want to put (for testing put), we can get non-private files" csvFilename = 'part-00000b' importFolderPath = '/mnt/0xcustomer-datasets/c2' csvPathname = importFolderPath + "/" + csvFilename # FIX! does 'separator=' take ints or ?? hex format # looks like it takes the hex string (two chars) start = time.time() # hardwire TAB as a separator, as opposed to white space (9) parseResult = h2i.import_parse(path=csvPathname, schema='local', timeoutSecs=500, separator=9, doSummary=False) print "Parse of", parseResult['destination_key'], "took", time.time() - start, "seconds" print "Parse result['destination_key']:", parseResult['destination_key'] start = time.time() inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=500) print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds" h2o_cmd.infoFromInspect(inspect, csvPathname) numRows = inspect['numRows'] numCols = inspect['numCols'] # do summary of the parsed dataset last, since we know it fails on this dataset # does the json fail with too many?? 
#summaryResult = h2o_cmd.runSummary(key=parseResult['destination_key'], max_ncols=2) # summaryResult = h2o_cmd.runSummary(key=parseResult['destination_key'], max_ncols=2500) # can't do more than 1000 summaryResult = h2o_cmd.runSummary(key=parseResult['destination_key'], numCols=numCols, numRows=numRows) keepPattern = "oly_|mt_|b_" y = "is_purchase" print "y:", y # don't need the intermediate Dicts produced from columnInfoFromInspect x = h2o_glm.goodXFromColumnInfo(y, keepPattern=keepPattern, key=parseResult['destination_key'], timeoutSecs=300) print "x:", x kwargs = { 'response': y, 'family': 'binomial', 'lambda': 1.0E-5, 'alpha': 0.5, 'max_iter': 10, # 'thresholds': 0.5, 'n_folds': 1, 'beta_epsilon': 1.0E-4, } timeoutSecs = 3600 start = time.time() glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=60, noPoll=True, **kwargs) statMean = h2j.pollStatsWhileBusy(timeoutSecs=timeoutSecs, pollTimeoutSecs=30, retryDelaySecs=5) num_cpus = statMean['num_cpus'], my_cpu_pct = statMean['my_cpu_%'], sys_cpu_pct = statMean['sys_cpu_%'], system_load = statMean['system_load'] # shouldn't need this? h2j.pollWaitJobs(pattern=None, timeoutSecs=timeoutSecs, pollTimeoutSecs=30, retryDelaySecs=5) # can't figure out how I'm supposed to get the model # GLMModel = glm['GLMModel'] # modelKey = GLMModel['model_key'] # glmView = h2o.nodes[0].glm_view(modelKey=modelKey) elapsed = time.time() - start print "glm completed in", elapsed, "seconds.", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
def test_poisson_alirline87_s3n_thru_hdfs(self): bucket = 'h2o-airlines-unpacked' csvFilename = "year1987.csv" hex_key = "year1987.hex" csvPathname = csvFilename trialMax = 2 timeoutSecs = 500 for trial in range(trialMax): trialStart = time.time() hex_key = csvFilename + "_" + str(trial) + ".hex" start = time.time() parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='s3n', hex_key=hex_key, timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=1200) elapsed = time.time() - start print hex_key, 'h2o reported parse time:', parseResult['response'][ 'time'] print "parse end on ", hex_key, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseResult['destination_key'] kwargs = { # will fail if categorical is chosen # 'y': 'IsArrDelayed', 'y': 'CRSArrTime', 'x': '1,2,3,4,8,9,16,17,18,30', 'family': 'poisson', 'link': 'familyDefault', 'n_folds': 1, 'max_iter': 8, 'beta_epsilon': 1e-3 } timeoutSecs = 500 # L2 kwargs.update({'alpha': 0, 'lambda': 0}) start = time.time() glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=120, **kwargs) elapsed = time.time() - start print "glm (L2) end on ", csvPathname, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) h2o_glm.simpleCheckGLM(self, glm, None, noPrint=True, **kwargs) h2o.check_sandbox_for_errors() # Elastic kwargs.update({'alpha': 0.5, 'lambda': 1e-4}) start = time.time() glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=60, **kwargs) elapsed = time.time() - start print "glm (Elastic) end on ", csvPathname, 'took', elapsed, 'seconds',\ "%d pct. 
of timeout" % ((elapsed*100)/timeoutSecs) h2o_glm.simpleCheckGLM(self, glm, None, noPrint=True, **kwargs) h2o.check_sandbox_for_errors() # L1 kwargs.update({'alpha': 1.0, 'lambda': 1e-4}) start = time.time() glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=60, **kwargs) elapsed = time.time() - start print "glm (L1) end on ", csvPathname, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) h2o_glm.simpleCheckGLM(self, glm, None, noPrint=True, **kwargs) h2o.check_sandbox_for_errors() print "Trial #", trial, "completed in", time.time() - trialStart, "seconds.", \
def test_GLM2_basic(self): h2o.beta_features = True importFolderPath = "logreg" csvFilename = 'prostate.csv' csvPathname = importFolderPath + "/" + csvFilename hex_key = csvFilename + ".hex" parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='local', hex_key=hex_key, timeoutSecs=180) inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print inspect print "\n" + csvPathname, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) x = 'ID' y = 'CAPSULE' family = 'binomial' alpha = '0.5' lambda_ = '1E-4' nfolds = '0' f = 'prostate' modelKey = 'GLM_' + f kwargs = { 'response': y, 'ignored_cols': x, 'family': family, 'lambda': lambda_, 'alpha': alpha, 'n_folds': nfolds, # passes if 0, fails otherwise 'destination_key': modelKey, } timeoutSecs = 60 start = time.time() glmResult = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, retryDelaySecs=0.25, pollTimeoutSecs=180, **kwargs) # this stuff was left over from when we got the result after polling the jobs list # okay to do it again # GLM2: when it redirects to the model view, we no longer have the job_key! (unlike the first response and polling) if 1 == 0: job_key = glmResult['job_key'] # is the job finishing before polling would say it's done? 
params = {'job_key': job_key, 'destination_key': modelKey} glm = h2o.nodes[0].completion_redirect( jsonRequest="2/GLMProgressPage2.json", params=params) print "GLM result from completion_redirect:", h2o.dump_json(a) if 1 == 1: glm = h2o.nodes[0].glm_view(_modelKey=modelKey) ### print "GLM result from glm_view:", h2o.dump_json(a) h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) glm_model = glm['glm_model'] _names = glm_model['_names'] coefficients_names = glm_model['coefficients_names'] submodels = glm_model['submodels'][0] beta = submodels['beta'] norm_beta = submodels['norm_beta'] iteration = submodels['iteration'] validation = submodels['validation'] auc = validation['auc'] aic = validation['aic'] null_deviance = validation['null_deviance'] residual_deviance = validation['residual_deviance'] print '_names', _names print 'coefficients_names', coefficients_names # did beta get shortened? the simple check confirms names/beta/norm_beta are same length print 'beta', beta print 'iteration', iteration print 'auc', auc
def test_GLM2_enums_score_superset(self):
    """Build binomial GLM models on synthetic enum datasets, then score on data
    that contains an enum value ("xyzzy") the model never saw, checking the
    resulting AUC stays near 0.5.
    """
    h2o.beta_features = True
    print "FIX!: this should cause an error. We should detect that it's not causing an error/warning?"
    SYNDATASETS_DIR = h2o.make_syn_dir()
    n = 200
    # (rowCount, colCount, hex_key, timeoutSecs)
    tryList = [
        (n, 1, 'cD', 300),
        (n, 2, 'cE', 300),
        (n, 3, 'cF', 300),
        (n, 4, 'cG', 300),
        (n, 5, 'cH', 300),
        (n, 6, 'cI', 300),
    ]
    for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
        # using the comma is nice to ensure no craziness
        colSepHexString = '2c'  # comma
        colSepChar = colSepHexString.decode('hex')
        colSepInt = int(colSepHexString, base=16)
        print "colSepChar:", colSepChar

        rowSepHexString = '0a'  # newline
        rowSepChar = rowSepHexString.decode('hex')
        print "rowSepChar:", rowSepChar

        SEEDPERFILE = random.randint(0, sys.maxint)
        csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        csvScoreFilename = 'syn_enums_score_' + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvScorePathname = SYNDATASETS_DIR + '/' + csvScoreFilename

        enumList = create_enum_list(listSize=10)
        # use half of the enums for creating the scoring dataset
        enumListForScore = random.sample(enumList, 5)
        # add a extra enum for scoring that's not in the model enumList
        enumListForScore.append("xyzzy")

        print "Creating random", csvPathname, "for glm model building"
        write_syn_dataset(csvPathname, enumList, rowCount, colCount, SEEDPERFILE,
            colSepChar=colSepChar, rowSepChar=rowSepChar)

        print "Creating random", csvScorePathname, "for glm scoring with prior model (using enum subset)"
        write_syn_dataset(csvScorePathname, enumListForScore, rowCount, colCount, SEEDPERFILE,
            colSepChar=colSepChar, rowSepChar=rowSepChar)

        parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key,
            timeoutSecs=30, separator=colSepInt)
        print "Parse result['destination_key']:", parseResult['destination_key']
        print "\n" + csvFilename
        (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \
            h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=True)

        # last column is the response
        y = colCount
        modelKey = 'enums'
        kwargs = {
            'destination_key': modelKey,
            'response': y,
            'max_iter': 1,
            'n_folds': 1,
            'alpha': 0.2,
            'lambda': 1e-5,
            'family': 'binomial'
        }

        start = time.time()
        glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs,
            pollTimeoutSecs=180, **kwargs)
        print "glm end on ", parseResult['destination_key'], 'took', time.time() - start, 'seconds'
        h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)

        scoreDataKey = "score_" + hex_key
        parseResult = h2i.import_parse(path=csvScorePathname, schema='put', hex_key=scoreDataKey,
            timeoutSecs=30, separator=colSepInt)

        # Score *******************************
        # this messes up if you use case_mode/case_vale above
        predictKey = 'Predict.hex'
        start = time.time()
        predictResult = h2o_cmd.runPredict(data_key=scoreDataKey, model_key=modelKey,
            destination_key=predictKey, timeoutSecs=timeoutSecs)

        # just get a predict and AUC on the same data. has to be binomial result
        resultAUC = h2o.nodes[0].generate_auc(thresholds=None, actual=scoreDataKey,
            predict='Predict.hex', vactual=y, vpredict=1)
        auc = resultAUC['AUC']
        self.assertAlmostEqual(auc, 0.5, delta=0.15,
            msg="actual auc: %s not close enough to 0.5" % auc)

        predictCMResult = h2o.nodes[0].predict_confusion_matrix(
            actual=scoreDataKey,
            predict=predictKey,
            vactual='C' + str(y + 1),
            vpredict='predict',
        )
        cm = predictCMResult['cm']

        # These will move into the h2o_gbm.py
        pctWrong = h2o_gbm.pp_cm_summary(cm)
        print "\nTest\n==========\n"
        print h2o_gbm.pp_cm(cm)
def GLM_syn_eqns_data(self, ALGO='binomial', DATA_VALUE_MIN=-1, DATA_VALUE_MAX=1,
        COEFF_VALUE_MIN=-1, COEFF_VALUE_MAX=1, INTCPT_VALUE_MIN=-1, INTCPT_VALUE_MAX=1,
        DATA_DISTS='unique_pos_neg'):
    """Generate synthetic data from a random linear equation, fit a GLM of the
    given family (ALGO), and assert the recovered coefficients and intercept
    are close to the generating values.

    Relies on module-level BINS and SEED, and the sibling helpers
    gen_rand_equation / write_syn_dataset.
    """
    SYNDATASETS_DIR = h2o.make_syn_dir()
    if ALGO == 'poisson':
        tryList = [
            (50000, 5, 'cD', 300),
        ]
    else:
        tryList = [
            # (100, 1, 'cA', 300),
            # (100, 25, 'cB', 300),
            # (1000, 25, 'cC', 300),
            # 50 fails, 40 fails
            # (10000, 50, 'cD', 300),
            # 30 passes
            # (10000, 30, 'cD', 300),
            # 200 passed
            (500, 30, 'cD', 300),
            (5000, 30, 'cD', 300),
        ]

    ### h2b.browseTheCloud()
    lenNodes = len(h2o.nodes)

    for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
        # encode the generation parameters in the filename for traceability
        modeString = \
            "_Bins" + str(BINS) + \
            "_Dmin" + str(DATA_VALUE_MIN) + \
            "_Dmax" + str(DATA_VALUE_MAX) + \
            "_Cmin" + str(COEFF_VALUE_MIN) + \
            "_Cmax" + str(COEFF_VALUE_MAX) + \
            "_Imin" + str(INTCPT_VALUE_MIN) + \
            "_Imax" + str(INTCPT_VALUE_MAX) + \
            "_Ddist" + str(DATA_DISTS)
        print "modeString:", modeString

        SEEDPERFILE = random.randint(0, sys.maxint)
        csvFilename = 'syn_' + modeString + "_" + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        print "Creating random", csvPathname, \
            "using random coefficients and intercept and logit eqn. for output"
        (coefficientsGen, interceptGen) = gen_rand_equation(colCount,
            INTCPT_VALUE_MIN, INTCPT_VALUE_MAX, COEFF_VALUE_MIN, COEFF_VALUE_MAX, SEEDPERFILE)
        print coefficientsGen, interceptGen
        write_syn_dataset(csvPathname, rowCount, colCount, coefficientsGen, interceptGen,
            DATA_VALUE_MIN, DATA_VALUE_MAX, DATA_DISTS, ALGO, SEED)

        parseResult = h2i.import_parse(path=csvPathname, hex_key=hex_key, schema='put', timeoutSecs=60)
        print "Parse result['destination_key']:", parseResult['destination_key']

        # We should be able to see the parse result?
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        print "\n" + csvFilename

        # last column is the response
        y = colCount
        print "GLM is ignoring the thresholds I give it? deciding what's best?"
        kwargs = {
            'standardize': 0,
            # link is default
            # 'link':
            'family': ALGO,
            'response': y,
            'max_iter': 25,
            'lambda': 0,
            'alpha': 0,
            'n_folds': 0,
            'beta_epsilon': 1e-4,
            # 'thresholds': 0.5,
        }

        start = time.time()
        glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
        (warnings, coefficients, intercept) = h2o_glm.simpleCheckGLM(self, glm, 'C1', **kwargs)
        print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds'

        # comparison tolerances; looser for poisson
        if ALGO == 'binomial':
            deltaCoeff = 0.1
            deltaIntcpt = 0.2
        else:
            # poisson needs more?
            deltaCoeff = 0.5
            deltaIntcpt = 1.0

        for i, c in enumerate(coefficients):
            g = coefficientsGen[i]  # generated
            print "coefficient[%d]: %8.4f, generated: %8.4f, delta: %8.4f" % (i, c, g, abs(g - c))
            self.assertAlmostEqual(c, g, delta=deltaCoeff,
                msg="not close enough. coefficient[%d]: %s, generated %s" % (i, c, g))

        c = intercept
        g = interceptGen
        print "intercept: %8.4f, generated: %8.4f, delta: %8.4f" % (c, g, abs(g - c))
        print "need a larger delta compare for intercept?"
        self.assertAlmostEqual(c, g, delta=deltaIntcpt,
            msg="not close enough. intercept: %s, generated %s" % (c, g))
def test_GLM_hdfs_YearPredictionMSD(self): if localhost: csvFilenameList = [ 'YearPredictionMSD.txt', 'YearPredictionMSD.txt' ] else: csvFilenameList = [ 'YearPredictionMSD.txt', 'YearPredictionMSD.txt' ] # a browser window too, just because we can ## h2b.browseTheCloud() validations1 = {} coefficients1 = {} for csvFilename in csvFilenameList: csvPathname = "datasets/" + csvFilename parseResult = h2i.import_parse(path=csvPathname, schema='hdfs', timeoutSecs=60) print csvFilename, 'parse time:', parseResult['response']['time'] print "Parse result['destination_key']:", parseResult[ 'destination_key'] # We should be able to see the parse result? inspect = h2o_cmd.runInspect(key=parseResult['destination_key']) print "\n" + csvFilename start = time.time() # can't pass lamba as kwarg because it's a python reserved word # FIX! just look at X=0:1 for speed, for now kwargs = {'y': 54, 'n_folds': 2, 'family': "binomial", 'case': 1} glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=500, **kwargs) # different when n_foldsidation is used? No trainingErrorDetails? h2o.verboseprint("\nglm:", glm) ### h2b.browseJsonHistoryAsUrlLastMatch("GLM") GLMModel = glm['GLMModel'] print "GLM time", GLMModel['time'] coefficients = GLMModel['coefficients'] validationsList = GLMModel['validations'] validations = validationsList.pop() # validations['err'] if validations1: h2o_glm.compareToFirstGlm(self, 'err', validations, validations1) else: validations1 = copy.deepcopy(validations) if coefficients1: h2o_glm.compareToFirstGlm(self, '0', coefficients, coefficients1) else: coefficients1 = copy.deepcopy(coefficients) sys.stdout.write('.') sys.stdout.flush()
def test_GLM_many_cols_enum(self):
    """Benchmark parse + binomial GLM on synthetic enum datasets with many
    columns, logging per-phase timings via h2o.cloudPerfH2O.
    """
    SYNDATASETS_DIR = h2o.make_syn_dir()
    translateList = [
        'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
        'n', 'o', 'p', 'q', 'r', 's', 't', 'u'
    ]
    # (rowCount, colCount, hex_key, timeoutSecs)
    if getpass.getuser() == 'kevin':
        # longer run
        tryList = [
            (10000, 100, 'cA', 100),
            (10000, 300, 'cB', 300),
            (10000, 500, 'cC', 700),
            (10000, 700, 'cD', 3600),
            (10000, 900, 'cE', 3600),
            (10000, 1000, 'cF', 3600),
            (10000, 1300, 'cG', 3600),
            (10000, 1700, 'cH', 3600),
            (10000, 2000, 'cI', 3600),
            (10000, 2500, 'cJ', 3600),
            (10000, 3000, 'cK', 3600),
        ]
    else:
        tryList = [
            (10000, 100, 'cA', 100),
            (10000, 300, 'cC', 300),
        ]

    ### h2b.browseTheCloud()
    for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
        SEEDPERFILE = random.randint(0, sys.maxint)
        csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        print "\nCreating random", csvPathname
        write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, translateList)

        start = time.time()
        parseResult = h2i.import_parse(path=csvPathname, hex_key=hex_key, schema='put', timeoutSecs=30)
        elapsed = time.time() - start
        print csvFilename, 'parse time:', parseResult['response']['time']
        print "Parse result['destination_key']:", parseResult['destination_key']

        # log the parse timing for perf tracking
        algo = "Parse"
        l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format(
            len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename, elapsed)
        print l
        h2o.cloudPerfH2O.message(l)

        # We should be able to see the parse result?
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        print "\n" + csvFilename

        # last column is the response
        y = colCount
        # just limit to 2 iterations..assume it scales with more iterations
        kwargs = {
            'y': y,
            'max_iter': 2,
            'case': 1,
            'case_mode': '=',
            'family': 'binomial',
            'lambda': 1e-4,
            'alpha': 0.6,
            'weight': 1.0,
            'thresholds': 0.5,
            'n_folds': 1,
            'beta_epsilon': 1e-4,
        }

        start = time.time()
        glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
        elapsed = time.time() - start
        h2o.check_sandbox_for_errors()
        print "glm end on ", csvPathname, 'took', elapsed, 'seconds', \
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
        h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)

        # log the GLM timing (tagged with the iteration count actually used)
        iterations = glm['GLMModel']['iterations']
        algo = "GLM " + str(iterations) + " iterations"
        l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format(
            len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename, elapsed)
        print l
        h2o.cloudPerfH2O.message(l)
def test_GLM_convergence_1(self):
    """Run GLM repeatedly on synthetic wide datasets and fail the test if any
    run produces a 'failed'/'Failed' (to converge) warning.
    """
    SYNDATASETS_DIR = h2o.make_syn_dir()
    # (rowCount, colCount, hex_key, timeoutSecs)
    tryList = [
        (100, 50, 'cD', 300),
        (100, 100, 'cE', 300),
        (100, 200, 'cF', 300),
        (100, 300, 'cG', 300),
        (100, 400, 'cH', 300),
        (100, 500, 'cI', 300),
    ]

    ### h2b.browseTheCloud()
    lenNodes = len(h2o.nodes)

    for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
        SEEDPERFILE = random.randint(0, sys.maxint)
        csvFilename = 'syn_%s_%sx%s.csv' % (SEEDPERFILE, rowCount, colCount)
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        print "\nCreating random", csvPathname
        write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

        parseResult = h2i.import_parse(path=csvPathname, hex_key=hex_key, timeoutSecs=10, schema='put')
        print csvFilename, 'parse time:', parseResult['response']['time']
        print "Parse result['destination_key']:", parseResult['destination_key']
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        print "\n" + csvFilename

        # last column is the response
        y = colCount
        kwargs = {
            'max_iter': 10,
            'lambda': 1e-8,
            'alpha': 0.5,
            'weight': 1.0,
            'link': 'familyDefault',
            'n_folds': 0,
            'beta_epsilon': 1e-4,
            'thresholds': '0:1:0.01',
        }
        kwargs['y'] = y
        emsg = None
        # FIX! how much should we loop here.
        for i in range(3):
            start = time.time()
            glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
            print 'glm #', i, 'end on', csvPathname, 'took', time.time() - start, 'seconds'

            # we can pass the warning, without stopping in the test, so we can
            # redo it in the browser for comparison
            (warnings, coefficients, intercept) = h2o_glm.simpleCheckGLM(self,
                glm, None, allowFailWarning=True, **kwargs)

            if 1 == 0:
                print "\n", "\ncoefficients in col order:"
                # since we're loading the x50 file all the time..the real colCount
                # should be 50 (0 to 49)
                showCols = colCount
                for c in range(showCols):
                    print "%s:\t%.6e" % (c, coefficients[c])
                print "intercept:\t %.6e" % intercept

            # gets the failed to converge, here, after we see it in the browser too
            x = re.compile("[Ff]ailed")
            if warnings:
                for w in warnings:
                    if (re.search(x, w)):
                        # first
                        if emsg is None:
                            emsg = w
                        print w
            # stop retrying as soon as any run reports a convergence failure
            if emsg:
                break

        if not h2o.browse_disable:
            h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
            time.sleep(5)
            h2b.browseJsonHistoryAsUrlLastMatch("GLM")
            time.sleep(5)

        # gets the failed to converge, here, after we see it in the browser too
        if emsg is not None:
            raise Exception(emsg)
def test_GLM2_covtype_1(self):
    """Run GLM2 on covtype (recoded to class-1 vs rest) with L2, Elastic, and
    L1 penalties, using noPoll + a completion hack when DO_POLL is off.
    """
    h2o.beta_features = True
    csvFilename = 'covtype.data'
    csvPathname = 'standard/' + csvFilename
    hex_key = "covtype.hex"
    parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname,
        hex_key=hex_key, schema='local', timeoutSecs=10)

    print "Gratuitous use of frame splitting. result not used"
    fs = h2o.nodes[0].frame_split(source=hex_key, ratios=0.75)
    split0_key = fs['split_keys'][0]
    split1_key = fs['split_keys'][1]
    split0_row = fs['split_rows'][0]
    split1_row = fs['split_rows'][1]
    split0_ratio = fs['split_ratios'][0]
    split1_ratio = fs['split_ratios'][1]

    # print "\n" + csvPathname, \
    #     " num_rows:", "{:,}".format(inspect['num_rows']), \
    #     " num_cols:", "{:,}".format(inspect['num_cols'])

    x = ""
    print "WARNING: max_iter set to 8 for benchmark comparisons"
    max_iter = 8

    # column 54 is the covtype class; GLM2 addresses it as 'C55' (1-based)
    y = 54

    modelKey = "GLMModel"
    kwargs = {
        # 'cols': x, # for 2
        'response': 'C' + str(y+1), # for 2
        'family': 'binomial',
        # 'link': 'logit', # 2 doesn't support
        'n_folds': 2,
        'max_iter': max_iter,
        'beta_epsilon': 1e-3,
        'destination_key': modelKey
    }

    # maybe go back to simpler exec here. this was from when Exec failed unless this was used
    execExpr="A.hex=%s" % parseResult['destination_key']
    h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)
    # class 1=1, all else 0
    execExpr="A.hex[,%s]=(A.hex[,%s]>%s)" % (y+1, y+1, 1)
    h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)
    aHack = {'destination_key': 'A.hex'}

    timeoutSecs = 120

    # L2
    start = time.time()
    kwargs.update({'alpha': 0, 'lambda': 0})

    def completionHack(jobKey, modelKey):
        # when noPoll was used, wait for the job then fetch the model view
        if DO_POLL: # not needed
            pass
        else:
            h2o_jobs.pollStatsWhileBusy(timeoutSecs=300, pollTimeoutSecs=300, retryDelaySecs=5)
            # print "FIX! how do we get the GLM result"
            params = {'_modelKey': modelKey}
            a = h2o.nodes[0].completion_redirect(jsonRequest="2/GLMModelView.json", params=params)
            # print "GLM result from completion_redirect:", h2o.dump_json(a)

    glmFirstResult = h2o_cmd.runGLM(parseResult=aHack, timeoutSecs=timeoutSecs,
        noPoll=not DO_POLL, **kwargs)
    completionHack(glmFirstResult['job_key'], modelKey)
    print "glm (L2) end on ", csvPathname, 'took', time.time() - start, 'seconds'
    ## h2o_glm.simpleCheckGLM(self, glm, 13, **kwargs)

    # Elastic
    kwargs.update({'alpha': 0.5, 'lambda': 1e-4})
    start = time.time()
    glmFirstResult = h2o_cmd.runGLM(parseResult=aHack, timeoutSecs=timeoutSecs,
        noPoll=not DO_POLL, **kwargs)
    completionHack(glmFirstResult['job_key'], modelKey)
    print "glm (Elastic) end on ", csvPathname, 'took', time.time() - start, 'seconds'
    ## h2o_glm.simpleCheckGLM(self, glm, 13, **kwargs)

    # L1
    kwargs.update({'alpha': 1, 'lambda': 1e-4})
    start = time.time()
    glmFirstResult = h2o_cmd.runGLM(parseResult=aHack, timeoutSecs=timeoutSecs,
        noPoll=not DO_POLL, **kwargs)
    completionHack(glmFirstResult['job_key'], modelKey)
    print "glm (L1) end on ", csvPathname, 'took', time.time() - start, 'seconds'
def test_GLM2_mnist(self): if DO_HDFS: importFolderPath = "mnist" bucket = None schema = 'hdfs' else: importFolderPath = "mnist" bucket = 'home-0xdiag-datasets' schema = 'local' csvFilelist = [ ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600), ] trial = 0 for (trainCsvFilename, testCsvFilename, timeoutSecs) in csvFilelist: trialStart = time.time() # PARSE test**************************************** testKey = testCsvFilename + "_" + str(trial) + ".hex" csvPathname = importFolderPath + "/" + testCsvFilename start = time.time() parseTestResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema=schema, hex_key=testKey, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", testCsvFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseTestResult['destination_key'] print "We won't use this pruning of x on test data. See if it prunes the same as the training" y = 0 # first column is pixel value print "y:" ignoreX = h2o_glm.goodXFromColumnInfo( y, key=parseTestResult['destination_key'], timeoutSecs=300, returnIgnoreX=True) # PARSE train**************************************** trainKey = trainCsvFilename + "_" + str(trial) + ".hex" csvPathname = importFolderPath + "/" + testCsvFilename start = time.time() parseTrainResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema=schema, hex_key=trainKey, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", trainCsvFilename, 'took', elapsed, 'seconds',\ "%d pct. 
of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseTrainResult['destination_key'] # GLM**************************************** print "This is the pruned x we'll use" ignoreX = h2o_glm.goodXFromColumnInfo( y, key=parseTrainResult['destination_key'], timeoutSecs=300, returnIgnoreX=True) print "ignoreX:", ignoreX modelKey = 'GLM_model' params = { 'ignored_cols': ignoreX, 'response': 'C' + str(y + 1), 'family': 'binomial', 'lambda': 0.5, 'alpha': 1e-4, 'max_iter': 15, ## 'thresholds': 0.5, 'n_folds': 1, 'beta_epsilon': 1.0E-4, 'destination_key': modelKey, } if DO_ALL_DIGITS: cases = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] else: cases = [8] for c in cases: kwargs = params.copy() print "Trying binomial with case:", c # kwargs['case_val'] = c # do the binomial conversion with Exec2, for both training and test (h2o won't work otherwise) if DO_BUG: execExpr = "A.hex=%s;A.hex[,%s]=(A.hex[,%s]==%s)" % ( trainKey, y + 1, y + 1, c) h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) else: execExpr = "A.hex=%s" % (trainKey) h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) execExpr = "A.hex[,%s]=(A.hex[,%s]==%s)" % (y + 1, y + 1, c) h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) if DO_BUG: execExpr = "B.hex=%s;B.hex[,%s]=(B.hex[,%s]==%s)" % ( testKey, y + 1, y + 1, c) h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) else: execExpr = "B.hex=%s" % (testKey) h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) execExpr = "B.hex[,%s]=(B.hex[,%s]==%s)" % (y + 1, y + 1, c) h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) timeoutSecs = 1800 start = time.time() aHack = {'destination_key': 'A.hex'} glmFirstResult = h2o_cmd.runGLM(parseResult=aHack, timeoutSecs=timeoutSecs, pollTimeoutSecs=60, noPoll=True, **kwargs) print "\nglmFirstResult:", h2o.dump_json(glmFirstResult) job_key = glmFirstResult['job_key'] h2o_jobs.pollStatsWhileBusy(timeoutSecs=timeoutSecs, pollTimeoutSecs=60, retryDelaySecs=5) # double check...how come the model is bogus? 
h2o_jobs.pollWaitJobs() glm = h2o.nodes[0].glm_view(_modelKey=modelKey) elapsed = time.time() - start print "GLM completed in", elapsed, "seconds.", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) h2o_glm.simpleCheckGLM(self, glm, None, noPrint=True, **kwargs) modelKey = glm['glm_model']['_key'] # This seems wrong..what's the format of the cm? cm = glm['glm_model']['submodels'][0]['validation']['_cms'][ -1]['_arr'] print "cm:", cm pctWrong = h2o_gbm.pp_cm_summary(cm) # self.assertLess(pctWrong, 9,"Should see less than 9% error (class = 4)") print "\nTrain\n==========\n" print h2o_gbm.pp_cm(cm) # Score ******************************* # this messes up if you use case_mode/case_vale above predictKey = 'Predict.hex' start = time.time() predictResult = h2o_cmd.runPredict(data_key='B.hex', model_key=modelKey, destination_key=predictKey, timeoutSecs=timeoutSecs) predictCMResult = h2o.nodes[0].predict_confusion_matrix( actual='B.hex', vactual='C' + str(y + 1), predict=predictKey, vpredict='predict', ) cm = predictCMResult['cm'] # These will move into the h2o_gbm.py pctWrong = h2o_gbm.pp_cm_summary(cm) self.assertLess(pctWrong, 9, "Should see less than 9% error (class = 4)") print "\nTest\n==========\n" print h2o_gbm.pp_cm(cm)
def test_GLM_allstate_s3n_thru_hdfs(self): bucket = 'home-0xdiag-datasets' importFolderPath = 'allstate' csvFilename = "train_set.csv" csvPathname = importFolderPath + "/" + csvFilename timeoutSecs = 500 trialMax = 3 for trial in range(trialMax): trialStart = time.time() hex_key = csvFilename + "_" + str(trial) + ".hex" start = time.time() parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='s3n', hex_key=hex_key, timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=60) elapsed = time.time() - start print "parse end on ", hex_key, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseResult['destination_key'] kwargs = { # allstate claim last col 'y': 34, 'case_mode': '>', 'case': 0, 'family': 'binomial', 'link': 'logit', 'n_folds': 2, 'max_iter': 8, 'beta_epsilon': 1e-3 } timeoutSecs = 500 # L2 kwargs.update({'alpha': 0, 'lambda': 0}) start = time.time() glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=60, noise=('JStack', None), **kwargs) elapsed = time.time() - start print "glm (L2) end on ", csvFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) h2o_glm.simpleCheckGLM(self, glm, None, noPrint=True, **kwargs) h2o.check_sandbox_for_errors() # Elastic kwargs.update({'alpha': 0.5, 'lambda': 1e-4}) start = time.time() glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=60, noise=('JStack', None), **kwargs) elapsed = time.time() - start print "glm (Elastic) end on ", csvFilename, 'took', elapsed, 'seconds',\ "%d pct. 
of timeout" % ((elapsed*100)/timeoutSecs) h2o_glm.simpleCheckGLM(self, glm, None, noPrint=True, **kwargs) h2o.check_sandbox_for_errors() # L1 kwargs.update({'alpha': 1.0, 'lambda': 1e-4}) start = time.time() glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=60, noise=('JStack', None), **kwargs) elapsed = time.time() - start print "glm (L1) end on ", csvFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) h2o_glm.simpleCheckGLM(self, glm, None, noPrint=True, **kwargs) h2o.check_sandbox_for_errors() print "Trial #", trial, "completed in", time.time() - trialStart, "seconds.", \
def test_GLM2_covtype20x_train(self): h2o.beta_features = True importFolderPath = "standard" csvFilename = 'covtype20x.data' csvPathname = importFolderPath + "/" + csvFilename hex_key = csvFilename + ".hex" # Parse and Exec************************************************ parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=180) execExpr="A.hex=%s" % parseResult['destination_key'] h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) # use exec to change the output col to binary, case_mode/case_val doesn't work if we use predict # will have to live with random extract. will create variance # class 4 = 1, everything else 0 y = 54 execExpr="A.hex[,%s]=(A.hex[,%s]==%s)" % (y+1, y+1, 4) h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) inspect = h2o_cmd.runInspect(key="A.hex") print "\n" + csvPathname, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) # Split Test/Train************************************************ # how many rows for each pct? 
numRows = inspect['numRows'] pct10 = int(numRows * .1) rowsForPct = [i * pct10 for i in range(0,11)] # this can be slightly less than 10% last10 = numRows - rowsForPct[9] rowsForPct[10] = last10 # use mod below for picking "rows-to-do" in case we do more than 9 trials # use 10 if 0 just to see (we copied 10 to 0 above) rowsForPct[0] = rowsForPct[10] print "Creating the key of the last 10% data, for scoring" trainDataKey = "rTrain" testDataKey = "rTest" # start at 90% rows + 1 # GLM, predict, CM*******************************************************8 kwargs = { 'response': 'C' + str(y), 'max_iter': 20, 'n_folds': 0, 'alpha': 0.1, 'lambda': 1e-5, 'family': 'binomial', 'classification': 1, } timeoutSecs = 60 for trial in range(100): # always slice from the beginning rowsToUse = rowsForPct[trial%10] # test/train split **********************************************8 h2o_cmd.createTestTrain(srcKey='A.hex', trainDstKey=trainDataKey, testDstKey=testDataKey, trainPercent=90) aHack = {'destination_key': trainDataKey} parseKey = trainDataKey # GLM **********************************************8 start = time.time() glm = h2o_cmd.runGLM(parseResult=aHack, timeoutSecs=timeoutSecs, pollTimeoutSecs=180, **kwargs) print "glm end on ", parseResult['destination_key'], 'took', time.time() - start, 'seconds' h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) modelKey = glm['glm_model']['_key'] # Score ********************************************** predictKey = 'Predict.hex' start = time.time() predictResult = h2o_cmd.runPredict( data_key=testDataKey, model_key=modelKey, destination_key=predictKey, timeoutSecs=timeoutSecs) predictCMResult = h2o.nodes[0].predict_confusion_matrix( actual=testDataKey, vactual='C' + str(y), predict=predictKey, vpredict='predict', ) cm = predictCMResult['cm'] # These will move into the h2o_gbm.py pctWrong = h2o_gbm.pp_cm_summary(cm); self.assertLess(pctWrong, 8,"Should see less than 7% error (class = 4)") print "\nTest\n==========\n" print h2o_gbm.pp_cm(cm) 
print "Trial #", trial, "completed", "using %6.2f" % (rowsToUse*100.0/numRows), "pct. of all rows"
def test_GLM1_GLM2_predict(self):
    """Compare GLM1 vs GLM2 on the same data, then score GLM2 with predict.

    Runs GLM1 (case/case_mode binarization) and GLM2 (Exec2 binarization into
    B.hex) with the same alpha/lambda, compares intercept/coefficients/auc
    between the two, prints a side-by-side delta table, and finally checks the
    predict-vs-actual error on the training data. Knobs (LSM_SOLVER, FAMILY,
    MAX_ITER, TRY_ALPHA, TRY_LAMBDA, ...) are module-level globals.
    """
    # h2b.browseTheCloud()
    SYNDATASETS_DIR = h2o.make_syn_dir()
    trees = 15
    timeoutSecs = 120
    predictHexKey = 'predict_0.hex'
    predictCsv = 'predict_0.csv'
    actualCsv = 'actual_0.csv'

    # dataset selection: only the last (1==1) branch is active
    if 1 == 0:
        skipSrcOutputHeader = 1
        skipPredictHeader = 1
        bucket = 'home-0xdiag-datasets'
        csvPathname = 'standard/covtype.data'
        hexKey = 'covtype.data.hex'
        y = 54
        expectedPctWrong = 0

    if 1 == 0:
        skipSrcOutputHeader = 1
        skipPredictHeader = 1
        bucket = 'home-0xdiag-datasets'
        csvPathname = 'standard/covtype.shuffled.10pct.data'
        hexKey = 'covtype.shuffled.10pct.data.hex'
        y = 54
        expectedPctWrong = 0

    if 1 == 1:
        skipSrcOutputHeader = 1
        skipPredictHeader = 1
        bucket = 'smalldata'
        # no header
        csvPathname = 'iris/iris.csv'
        hexKey = 'iris.hex'
        y = 4
        expectedPctWrong = 26

    csvPredictPathname = SYNDATASETS_DIR + "/" + predictCsv
    csvSrcOutputPathname = SYNDATASETS_DIR + "/" + actualCsv
    # for using below in csv reader
    csvFullname = h2i.find_folder_and_filename(bucket, csvPathname, schema='put', returnFullPath=True)

    parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey)
    h2o_cmd.runSummary(key=hexKey)

    # do the binomial conversion with Exec2, for both training and test (h2o won't work otherwise)
    trainKey = parseResult['destination_key']

    # just to check. are there any NA/constant cols?
    ignore_x = h2o_glm.goodXFromColumnInfo(
        y, key=parseResult['destination_key'], timeoutSecs=300)

    #**************************************************************************
    # first glm1
    CLASS = 1
    # try ignoring the constant col to see if it makes a diff
    kwargs = {
        'lsm_solver': LSM_SOLVER,
        'standardize': STANDARDIZE,
        'y': 'C' + str(y + 1),
        'family': FAMILY,
        'n_folds': 0,
        'max_iter': MAX_ITER,
        'beta_epsilon': BETA_EPSILON,
        'case': CLASS,
        'case_mode': '=',
    }
    timeoutSecs = 120
    kwargs.update({'alpha': TRY_ALPHA, 'lambda': TRY_LAMBDA})
    start = time.time()
    glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
    # hack. fix bad 'family' ('link' is bad too)..so h2o_glm.py works right
    glm['GLMModel']['GLMParams']['family'] = FAMILY
    print "glm1 end on ", csvPathname, 'took', time.time(
    ) - start, 'seconds'
    (warnings, coefficients1, intercept1) = h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
    # stash GLM1's convergence + validation stats for comparison below
    iterations1 = glm['GLMModel']['iterations']
    err1 = glm['GLMModel']['validations'][0]['err']
    nullDev1 = glm['GLMModel']['validations'][0]['nullDev']
    resDev1 = glm['GLMModel']['validations'][0]['resDev']
    if FAMILY == 'binomial':
        classErr1 = glm['GLMModel']['validations'][0]['classErr']
        auc1 = glm['GLMModel']['validations'][0]['auc']

    #**************************************************************************
    # then glm2
    kwargs = {
        # 'ignored_cols': 'C29',
        'standardize': STANDARDIZE,
        'response': 'C' + str(y + 1),
        'family': FAMILY,
        'n_folds': 0,
        'max_iter': MAX_ITER,
        'beta_epsilon': BETA_EPSILON
    }
    timeoutSecs = 120
    # class 1=1, all else 0 (GLM2 has no case/case_mode, so binarize via Exec2)
    if FAMILY == 'binomial':
        execExpr = "B.hex=%s; B.hex[,%s]=(%s[,%s]==%s)" % (
            trainKey, y + 1, trainKey, y + 1, CLASS)
        h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)
        bHack = {'destination_key': 'B.hex'}
    else:
        bHack = parseResult
    kwargs.update({'alpha': TRY_ALPHA, 'lambda': TRY_LAMBDA})
    # kwargs.update({'alpha': 0.0, 'lambda': 0})
    # kwargs.update({'alpha': 0.5, 'lambda': 1e-4})
    # kwargs.update({'alpha': 0.5, 'lambda': 1e-4}) # bad model (auc=0.5)
    # kwargs.update({'alpha': 0.0, 'lambda': 0.0})
    start = time.time()
    glm = h2o_cmd.runGLM(parseResult=bHack, timeoutSecs=timeoutSecs, **kwargs)
    print "glm2 end on ", csvPathname, 'took', time.time(
    ) - start, 'seconds'
    (warnings, coefficients, intercept) = h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)

    #**************************************************************************
    modelKey = glm['glm_model']['_key']
    submodels = glm['glm_model']['submodels']
    # hackery to make it work when there's just one
    validation = submodels[-1]['validation']
    iteration = submodels[-1]['iteration']
    resDev = validation['residual_deviance']
    nullDev = validation['null_deviance']
    if FAMILY == 'binomial':
        auc = validation['auc']

    # both solvers should converge before the iteration cap
    self.assertLess(iterations1, MAX_ITER - 1,
        msg="GLM1: Too many iterations, didn't converge %s" % iterations1)
    self.assertLess(iteration, MAX_ITER - 1,
        msg="GLM2: Too many iterations, didn't converge %s" % iteration)

    nullDevExpected = nullDev1
    # self.assertAlmostEqual(nullDev, nullDevExpected, delta=2,
    #     msg='GLM2 nullDev %s is too different from GLM1 %s' % (nullDev, nullDevExpected))

    iterationExpected = iterations1
    # self.assertAlmostEqual(iteration, iterationExpected, delta=2,
    #     msg='GLM2 iteration %s is too different from GLM1 %s' % (iteration, iterationExpected))

    # coefficients is a list. compare a couple of spot coefficients between runs
    coeff0 = coefficients[0]
    coeff0Expected = coefficients1[0]
    print "coeff0 pct delta:", "%0.3f" % (
        100.0 * (abs(coeff0) - abs(coeff0Expected)) / abs(coeff0Expected))
    self.assertTrue(
        h2o_util.approxEqual(coeff0, coeff0Expected, rel=0.5),
        msg='GLM2 coefficient 0 %s is too different from GLM1 %s' % (coeff0, coeff0Expected))

    coeff2 = coefficients[2]
    coeff2Expected = coefficients1[2]
    print "coeff2 pct delta:", "%0.3f" % (
        100.0 * (abs(coeff2) - abs(coeff2Expected)) / abs(coeff2Expected))
    self.assertTrue(
        h2o_util.approxEqual(coeff2, coeff2Expected, rel=0.5),
        msg='GLM2 coefficient 2 %s is too different from GLM1 %s' % (coeff2, coeff2Expected))

    # compare to known values GLM1 got for class 1 case, with these parameters
    # aucExpected = 0.8428
    if FAMILY == 'binomial':
        aucExpected = auc1
        # NOTE(review): delta=10 on an auc in [0,1] makes this assert vacuous
        self.assertAlmostEqual(
            auc, aucExpected, delta=10,
            msg='GLM2 auc %s is too different from GLM1 %s' % (auc, aucExpected))

    interceptExpected = intercept1
    print "intercept pct delta:", 100.0 * (
        abs(intercept) - abs(interceptExpected)) / abs(interceptExpected)
    self.assertTrue(h2o_util.approxEqual(intercept, interceptExpected, rel=0.5),
        msg='GLM2 intercept %s is too different from GLM1 %s' % (intercept, interceptExpected))

    # avg_errExpected = 0.2463
    avg_errExpected = err1
    # self.assertAlmostEqual(avg_err, avg_errExpected, delta=0.50*avg_errExpected,
    #     msg='GLM2 avg_err %s is too different from GLM1 %s' % (avg_err, avg_errExpected))
    # self.assertAlmostEqual(best_threshold, 0.35, delta=0.10*best_threshold,
    #     msg='GLM2 best_threshold %s is too different from GLM1 %s' % (best_threshold, 0.35))

    #********************
    # Print comparison
    #********************
    interceptDelta = abs(abs(intercept1) - abs(intercept))
    cDelta = [
        abs(abs(a) - abs(b)) for a, b in zip(coefficients1, coefficients)
    ]

    def printit(self, a, b, c, d):
        # one row of the GLM1-vs-GLM2 delta table: value, pct diff, abs diff
        pctDiff = abs(d / c) * 100
        print "%-20s %-20.5e %8s %5.2f%% %10s %-20.5e" % \
            ("GLM2: " + a + " " + b + ":", c, "pct. diff:", pctDiff, "abs diff:", d)
        # self.assertLess(pctDiff,1,"Expect <1% difference between H2O and R coefficient/intercept")

    printit(self, "intercept", "", intercept1, interceptDelta)
    print "compare lengths coefficients1, coefficients, cDelta:", len(
        coefficients1), len(coefficients), len(cDelta)
    print "GLM1:", coefficients1
    print "GLM2:", coefficients
    print "cDelta:", cDelta

    for i, cValue in enumerate(coefficients):
        printit(self, "coefficient", "C" + str(i), cValue, cDelta[i])

    # score GLM2 on the (binarized) training frame and compare CSVs on disk
    hexKey = 'B.hex'
    pctWrong = h2o_rf.predict_and_compare_csvs(modelKey, hexKey, predictHexKey,
        csvSrcOutputPathname, csvPredictPathname,
        skipSrcOutputHeader, skipPredictHeader, translate=None, y=y)

    # we are predicting using training data...so error is really low
    # self.assertAlmostEqual(pctWrong, classification_error, delta = 0.2,
    #     msg="predicted pctWrong: %s should be close to training classification error %s" % (pctWrong, classification_error))
    self.assertAlmostEqual(
        pctWrong, expectedPctWrong, delta=2.0,
        msg=
        "predicted pctWrong: %s should be small because we're predicting with training data %s"
        % (pctWrong, expectedPctWrong))
def test_GLM2_covtype_train_predict_all_all(self): h2o.beta_features = True importFolderPath = "standard" csvFilename = 'covtype.shuffled.data' csvPathname = importFolderPath + "/" + csvFilename hex_key = csvFilename + ".hex" # Parse and Exec************************************************ parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=180) execExpr = "A.hex=%s" % parseResult['destination_key'] h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) # use exec to change the output col to binary, case_mode/case_val doesn't work if we use predict # will have to live with random extract. will create variance # class 4 = 1, everything else 0 y = 54 execExpr = "A.hex[,%s]=(A.hex[,%s]==%s)" % (y + 1, y + 1, 1) # class 1 h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) inspect = h2o_cmd.runInspect(key="A.hex") print "\n" + csvPathname, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) print "Use same data (full) for train and test" trainDataKey = "A.hex" testDataKey = "A.hex" # start at 90% rows + 1 # GLM, predict, CM*******************************************************8 kwargs = { 'response': 'C' + str(y + 1), 'max_iter': 20, 'n_folds': 0, # 'alpha': 0.1, # 'lambda': 1e-5, 'alpha': 0.0, 'lambda': None, 'family': 'binomial', } timeoutSecs = 60 for trial in range(1): # test/train split **********************************************8 aHack = {'destination_key': trainDataKey} # GLM **********************************************8 start = time.time() glm = h2o_cmd.runGLM(parseResult=aHack, timeoutSecs=timeoutSecs, pollTimeoutSecs=180, **kwargs) print "glm end on ", parseResult[ 'destination_key'], 'took', time.time() - start, 'seconds' h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) modelKey = glm['glm_model']['_key'] submodels = glm['glm_model']['submodels'] # hackery to make it work when there's just one validation = submodels[-1]['validation'] 
best_threshold = validation['best_threshold'] thresholds = validation['thresholds'] # have to look up the index for the cm, from the thresholds list best_index = None for i, t in enumerate(thresholds): if t == best_threshold: best_index = i break cms = validation['_cms'] cm = cms[best_index] trainPctWrong = h2o_gbm.pp_cm_summary(cm['_arr']) # Score ********************************************** predictKey = 'Predict.hex' start = time.time() predictResult = h2o_cmd.runPredict(data_key=testDataKey, model_key=modelKey, destination_key=predictKey, timeoutSecs=timeoutSecs) predictCMResult = h2o.nodes[0].predict_confusion_matrix( actual=testDataKey, vactual='C' + str(y + 1), predict=predictKey, vpredict='predict', ) cm = predictCMResult['cm'] # These will move into the h2o_gbm.py pctWrong = h2o_gbm.pp_cm_summary(cm) self.assertEqual( pctWrong, trainPctWrong, "Should see the same error rate on train and predict? (same data set)" ) print "\nTest\n==========\n" print h2o_gbm.pp_cm(cm) print "Trial #", trial, "completed"
def test_GLM_moneypuck(self):
    """GLM1 poisson regression over a list of hockey/basketball stats CSVs.

    For each file: parse, inspect for row/col counts, assume the last column is
    the response, then run Elastic and L1 GLM (the L2 branch is disabled).
    """
    if 1 == 1:
        # None is okay for hex_key
        csvFilenameList = [
            # ('hdb-2007-02-05/Goalies.csv',240,'Goalies'),
            # ('hdb-2007-02-05/GoaliesSC.csv',240,'GoaliesSC'),
            # ('hdb-2007-02-05/Master.csv',240,'Master'),
            ('hdb-2007-02-05/Scoring.csv', 240, 'Scoring'),
            ('hdb-2007-02-05/ScoringSC.csv', 240, 'ScoringSC'),
            ('hdb-2007-02-05/Teams.csv', 240, 'Teams'),
            ('hdb-2007-02-05/TeamsHalf.csv', 240, 'TeamsHalf'),
            ('hdb-2007-02-05/TeamsPost.csv', 240, 'TeamsPost'),
            ('hdb-2007-02-05/TeamsSC.csv', 240, 'TeamsSC'),
            ('tricks-2012-06-23/HatTricks.csv', 240, 'HatTricks'),
            ('bkb090621/abbrev.csv', 240, 'abbrev'),
            ('bkb090621/AwardsCoaches.csv', 240, 'AwardsCoaches'),
            ('bkb090621/AwardsPlayers.csv', 240, 'AwardsPlayers'),
            ('bkb090621/Coaches.csv', 240, 'Coaches'),
            # never finishes?
            # ('bkb090621/Draft.csv',240,'Draft'),
            # ('bkb090621/Master.csv',240,'Master'),
            ('bkb090621/PlayersAllstar.csv', 240, 'PlayersAllstar'),
            ('bkb090621/Players.csv', 240, 'Players'),
            ('bkb090621/PlayersPlayoffs.csv', 240, 'PlayersPlayoffs'),
            ('bkb090621/Teams.csv', 240, 'Teams'),
            ('hdb-2007-02-05/abbrev.csv', 240, 'abbrev'),
            # SPD without regularization
            # can't solve, when regularization added
            # ('hdb-2007-02-05/AwardsCoaches.csv',240,'AwardsCoaches'),
            # ('hdb-2007-02-05/AwardsMisc.csv',240,'AwardsMisc'),
            ('hdb-2007-02-05/AwardsPlayers.csv', 240, 'AwardsPlayers'),
            # can't solve, when regularization added
            # ('hdb-2007-02-05/Coaches.csv',240,'Coaches'),
        ]

    # a browser window too, just because we can
    h2b.browseTheCloud()

    importFolderPath = "hockey"
    for csvFilename, timeoutSecs, hex_key in csvFilenameList:
        # creates csvFilename.hex from file in importFolder dir
        csvPathname = importFolderPath + "/" + csvFilename
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets',
            path=csvPathname,
            timeoutSecs=2000,
            hex_key=hex_key)
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        numRows = inspect['numRows']
        numCols = inspect['numCols']
        print "\n" + csvPathname, \
            " numRows:", "{:,}".format(numRows), \
            " numCols:", "{:,}".format(numCols)

        max_iter = 9
        # assume the last col is the output!
        y = numCols - 1
        kwargs = {
            'y': y,
            'family': 'poisson',
            'link': 'log',
            'n_folds': 0,
            'max_iter': max_iter,
            'beta_epsilon': 1e-3
        }

        # L2 (disabled branch kept for easy re-enable)
        if 1 == 0:
            kwargs.update({'alpha': 0, 'lambda': 0})
            start = time.time()
            glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
            print "glm (L2) end on ", csvPathname, 'took', time.time(
            ) - start, 'seconds'
            # assume each one has a header and you have to indirect thru 'column_names'
            column_names = glm['GLMModel']['column_names']
            print "column_names[0]:", column_names[0]
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
            h2b.browseJsonHistoryAsUrlLastMatch("GLM")

        # Elastic
        kwargs.update({'alpha': 0.5, 'lambda': 1e-4})
        start = time.time()
        glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
        print "glm (Elastic) end on ", csvPathname, 'took', time.time(
        ) - start, 'seconds'
        h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)

        # L1
        kwargs.update({'alpha': 1.0, 'lambda': 1e-4})
        start = time.time()
        glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
        print "glm (L1) end on ", csvPathname, 'took', time.time(
        ) - start, 'seconds'
        h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
def test_four_billion_rows_fvec(self):
    """Scale test: parse a 4-billion-row 2-col file, then run KMeans and GLM on it.

    Asserts the parsed byteSize and row/col counts match the generator's known
    output, then exercises KMeans and a binomial GLM (L2) on the binarized frame.
    """
    h2o.beta_features = True
    timeoutSecs = 1500

    importFolderPath = "billions"
    csvFilenameList = [
        "four_billion_rows.csv",
    ]
    for csvFilename in csvFilenameList:
        csvPathname = importFolderPath + "/" + csvFilename
        start = time.time()

        # Parse*********************************
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets',
            path=csvPathname,
            schema='local',
            timeoutSecs=timeoutSecs,
            pollTimeoutSecs=180,
            retryDelaySecs=3)
        elapsed = time.time() - start
        print "Parse result['destination_key']:", parseResult[
            'destination_key']
        print csvFilename, "completed in", elapsed, "seconds.", "%d pct. of timeout" % (
            (elapsed * 100) / timeoutSecs)

        # Inspect*********************************
        # We should be able to see the parse result?
        inspect = h2o_cmd.runInspect(key=parseResult['destination_key'])
        numCols = inspect['numCols']
        numRows = inspect['numRows']
        byteSize = inspect['byteSize']
        print "\n" + csvFilename, \
            " numRows:", "{:,}".format(numRows), \
            " numCols:", "{:,}".format(numCols), \
            " byteSize:", "{:,}".format(byteSize)

        expectedRowSize = numCols * 1 # plus output
        # expectedValueSize = expectedRowSize * numRows
        # hard-coded byteSize observed for this exact dataset/build
        expectedValueSize = 8001271520
        self.assertEqual(byteSize, expectedValueSize,
            msg='byteSize %s is not expected: %s' % \
            (byteSize, expectedValueSize))

        summaryResult = h2o_cmd.runSummary(
            key=parseResult['destination_key'], timeoutSecs=timeoutSecs)
        h2o_cmd.infoFromSummary(summaryResult, noPrint=True)

        self.assertEqual(
            2,
            numCols,
            msg="generated %s cols (including output). parsed to %s cols" %
            (2, numCols))
        self.assertEqual(4 * 1000000000,
                         numRows,
                         msg="generated %s rows, parsed to %s rows" %
                         (4 * 1000000000, numRows))

        # KMeans*********************************
        kwargs = {
            'k': 3,
            'initialization': 'Furthest',
            'max_iter': 10,
            'normalize': 0,
            'destination_key': 'junk.hex',
            'seed': 265211114317615310,
        }
        timeoutSecs = 900
        start = time.time()
        kmeans = h2o_cmd.runKMeans(parseResult=parseResult,
                                   timeoutSecs=timeoutSecs,
                                   retryDelaySecs=4,
                                   **kwargs)

        # GLM*********************************
        print "\n" + csvFilename
        kwargs = {
            'response': 'C1',
            'n_folds': 0,
            'family': 'binomial',
        }
        # one coefficient is checked a little more
        colX = 1

        # convert to binomial: copy to A.hex, then col 1 becomes (C1 == 1)
        execExpr = "A.hex=%s" % parseResult['destination_key']
        h2e.exec_expr(execExpr=execExpr, timeoutSecs=180)
        execExpr = "A.hex[,%s]=(A.hex[,%s]==%s)" % ('1', '1', 1)
        h2e.exec_expr(execExpr=execExpr, timeoutSecs=180)
        aHack = {'destination_key': "A.hex"}

        # L2
        timeoutSecs = 900
        kwargs.update({'alpha': 0, 'lambda': 0})
        start = time.time()
        glm = h2o_cmd.runGLM(parseResult=aHack, timeoutSecs=timeoutSecs, **kwargs)
        elapsed = time.time() - start
        print "glm (L2) end on ", csvFilename, 'took', elapsed, 'seconds.', "%d pct. of timeout" % (
            (elapsed / timeoutSecs) * 100)
        h2o_glm.simpleCheckGLM(self, glm, "C" + str(colX), **kwargs)
def sub_c3_nongz_fvec_long(self, csvFilenameList):
    """Benchmark helper: parse non-gz manyfiles-nflx files and optionally run GLM.

    csvFilenameList: list of (csvFilepattern, csvFilename, totalBytes, timeoutSecs)
    tuples; totalBytes may be None to skip the MB/sec report. Parse and GLM
    timings are appended to the shared benchmark log (h2o.cloudPerfH2O).
    """
    # a kludge
    h2o.setup_benchmark_log()

    bucket = 'home-0xdiag-datasets'
    importFolderPath = 'manyfiles-nflx'
    print "Using nongz'ed files in", importFolderPath

    if LOG_MACHINE_STATS:
        benchmarkLogging = ['cpu', 'disk', 'network']
    else:
        benchmarkLogging = []

    pollTimeoutSecs = 120
    retryDelaySecs = 10

    for trial, (csvFilepattern, csvFilename, totalBytes, timeoutSecs) in enumerate(csvFilenameList):
        csvPathname = importFolderPath + "/" + csvFilepattern

        if DO_DOUBLE_IMPORT:
            (importResult, importPattern) = h2i.import_only(bucket=bucket, path=csvPathname, schema='local')
            importFullList = importResult['files']
            importFailList = importResult['fails']
            print "\n Problem if this is not empty: importFailList:", h2o.dump_json(importFailList)

        # this accumulates performance stats into a benchmark log over multiple runs
        # good for tracking whether we're getting slower or faster
        h2o.cloudPerfH2O.change_logfile(csvFilename)
        h2o.cloudPerfH2O.message("")
        h2o.cloudPerfH2O.message("Parse " + csvFilename + " Start--------------------------------")

        start = time.time()
        parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local', hex_key="A.hex",
            timeoutSecs=timeoutSecs, retryDelaySecs=retryDelaySecs,
            pollTimeoutSecs=pollTimeoutSecs, benchmarkLogging=benchmarkLogging)
        elapsed = time.time() - start
        print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

        print "Parse result['destination_key']:", parseResult['destination_key']
        h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=False)

        if totalBytes is not None:
            # parse throughput for the benchmark log
            fileMBS = (totalBytes/1e6)/elapsed
            msg = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format(
                len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern, csvFilename, fileMBS, elapsed)
            print msg
            h2o.cloudPerfH2O.message(msg)

        if DO_GLM:
            # output 378 can't be in this
            ignore_x = [3,4,5,6,7,8,9,10,11,14,16,17,18,19,20,424,425,426,540,541]
            # convert 0-based indices to 1-based 'C<n>' column names
            ignore_x = ",".join(map(lambda x: "C" + str(x+1), ignore_x))

            GLMkwargs = {
                'ignored_cols': ignore_x,
                'response': 'C379',
                'max_iter': 10,
                'n_folds': 1,
                'family': 'binomial',
                'alpha': 0.2,
                'lambda': 1e-5
            }

            # convert to binomial
            # execExpr="A.hex=%s" % parseResult['destination_key']
            # h2e.exec_expr(execExpr=execExpr, timeoutSecs=180)

            # are the unparsed keys slowing down exec?
            h2i.delete_keys_at_all_nodes(pattern="manyfile")

            # binarize the response in place: col 379 becomes (C379 > 15)
            execExpr = 'A.hex[,378+1]=(A.hex[,378+1]>15)'
            h2e.exec_expr(execExpr=execExpr, timeoutSecs=180)
            aHack = {'destination_key': "A.hex"}

            start = time.time()
            glm = h2o_cmd.runGLM(parseResult=aHack, timeoutSecs=timeoutSecs, **GLMkwargs)
            elapsed = time.time() - start
            h2o.check_sandbox_for_errors()

            h2o_glm.simpleCheckGLM(self, glm, None, **GLMkwargs)

            msg = '{:d} jvms, {:d}GB heap, {:s} {:s} GLM: {:6.2f} secs'.format(
                len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern, csvFilename, elapsed)
            print msg
            h2o.cloudPerfH2O.message(msg)

        h2o_cmd.checkKeyDistribution()
def test_GLM_covtype20x(self): if localhost: csvFilenameList = [ # 68 secs on my laptop? ('covtype20x.data', 480, 'cA'), ] else: # None is okay for hex_key csvFilenameList = [ ('covtype20x.data', 480, 'cA'), # ('covtype200x.data', 1000,'cE'), ] # a browser window too, just because we can ### h2b.browseTheCloud() importFolderPath = "standard" for csvFilename, timeoutSecs, hex_key in csvFilenameList: csvPathname = importFolderPath + "/" + csvFilename start = time.time() parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, timeoutSecs=2000, hex_key=hex_key) print "parse end on ", csvPathname, 'took', time.time( ) - start, 'seconds' h2o.check_sandbox_for_errors() inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print "\n" + csvPathname, \ " num_rows:", "{:,}".format(inspect['num_rows']), \ " num_cols:", "{:,}".format(inspect['num_cols']) # this will make it fvec print "Touching %s with exec to make it fvec" % hex_key h2o_cmd.runExec(str='%s[0,]=%s[0,]' % (hex_key, hex_key)) print "WARNING: max_iter set to 8 for benchmark comparisons" max_iter = 8 y = "54" x = "" kwargs = { 'x': x, 'y': y, 'family': 'binomial', 'link': 'logit', 'n_folds': 1, 'case_mode': '=', 'case': 1, 'max_iter': max_iter, 'beta_epsilon': 1e-3 } # L2 kwargs.update({'alpha': 0, 'lambda': 0}) start = time.time() glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, noise=('JStack', None), **kwargs) print "glm (L2) end on ", csvPathname, 'took', time.time( ) - start, 'seconds' h2o_glm.simpleCheckGLM(self, glm, 'C14', **kwargs) h2o.check_sandbox_for_errors() # Elastic kwargs.update({'alpha': 0.5, 'lambda': 1e-4}) start = time.time() glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, noise=('JStack', None), **kwargs) print "glm (Elastic) end on ", csvPathname, 'took', time.time( ) - start, 'seconds' h2o_glm.simpleCheckGLM(self, glm, 'C14', **kwargs) h2o.check_sandbox_for_errors() # L1 kwargs.update({'alpha': 1.0, 'lambda': 
1e-4}) start = time.time() glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, noise=('JStack', None), **kwargs) print "glm (L1) end on ", csvPathname, 'took', time.time( ) - start, 'seconds' h2o_glm.simpleCheckGLM(self, glm, 'C14', **kwargs) h2o.check_sandbox_for_errors()
def test_GLM_ints_unbalanced(self): ### h2b.browseTheCloud() SYNDATASETS_DIR = h2o.make_syn_dir() n = 2000 tryList = [ (n, 1, 'cD', 300), (n, 2, 'cE', 300), (n, 4, 'cF', 300), (n, 8, 'cG', 300), (n, 16, 'cH', 300), (n, 32, 'cI', 300), ] for (rowCount, colCount, hex_key, timeoutSecs) in tryList: # using the comma is nice to ensure no craziness colSepHexString = '2c' # comma colSepChar = colSepHexString.decode('hex') colSepInt = int(colSepHexString, base=16) print "colSepChar:", colSepChar rowSepHexString = '0a' # newline rowSepChar = rowSepHexString.decode('hex') print "rowSepChar:", rowSepChar SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str( colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename csvScoreFilename = 'syn_enums_score_' + str(rowCount) + 'x' + str( colCount) + '.csv' csvScorePathname = SYNDATASETS_DIR + '/' + csvScoreFilename enumList = create_enum_list() # use half of the enums for creating the scoring dataset enumListForScore = random.sample(enumList, 5) print "Creating random", csvPathname, "for glm model building" write_syn_dataset(csvPathname, enumList, rowCount, colCount, SEEDPERFILE, colSepChar=colSepChar, rowSepChar=rowSepChar) print "Creating random", csvScorePathname, "for glm scoring with prior model (using enum subset)" write_syn_dataset(csvScorePathname, enumListForScore, rowCount, colCount, SEEDPERFILE, colSepChar=colSepChar, rowSepChar=rowSepChar) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30, separator=colSepInt) print csvFilename, 'parse time:', parseResult['response']['time'] print "Parse result['destination_key']:", parseResult[ 'destination_key'] print "\n" + csvFilename (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \ h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=True) y = colCount kwargs = { 'y': y, 'max_iter': 200, 'family': 'binomial', 
'n_folds': 10, 'alpha': 0, 'lambda': 0, 'thresholds': 0.5, # 'case_mode': '=', # 'case': 0, } start = time.time() updateList = [ { 'alpha': 0.5, 'lambda': 1e-4 }, { 'alpha': 0.25, 'lambda': 1e-6 }, { 'alpha': 0.0, 'lambda': 1e-8 }, { 'alpha': 0.5, 'lambda': 0.0 }, { 'alpha': 0.0, 'lambda': 0.0 }, ] # Try each one for updateDict in updateList: print "\n#################################################################" print updateDict kwargs.update(updateDict) glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=180, **kwargs) print "glm end on ", parseResult[ 'destination_key'], 'took', time.time() - start, 'seconds' GLMModel = glm['GLMModel'] # submodels0 = GLMModel['submodels'][0] iterations = GLMModel['iterations'] modelKey = GLMModel['model_key'] h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) # if iterations > 20: # raise Exception("Why take so many iterations: %s in this glm training?" % iterations) parseResult = h2i.import_parse(path=csvScorePathname, schema='put', hex_key="score_" + hex_key, timeoutSecs=30, separator=colSepInt) start = time.time() # score with same dataset (will change to recreated dataset with one less enum glmScore = h2o_cmd.runGLMScore( key=parseResult['destination_key'], model_key=modelKey, thresholds="0.5", timeoutSecs=timeoutSecs) print "glm end on ", parseResult[ 'destination_key'], 'took', time.time() - start, 'seconds' ### print h2o.dump_json(glmScore) classErr = glmScore['validation']['classErr'] auc = glmScore['validation']['auc'] err = glmScore['validation']['err'] nullDev = glmScore['validation']['nullDev'] resDev = glmScore['validation']['resDev'] h2o_glm.simpleCheckGLMScore(self, glmScore, **kwargs) print "classErr:", classErr print "err:", err print "auc:", auc print "resDev:", resDev print "nullDev:", nullDev if math.isnan(resDev): emsg = "Why is this resDev = 'nan'?? %6s %s" % ( "resDev:\t", validation['resDev']) raise Exception(emsg) # what is reasonable? 
# self.assertAlmostEqual(err, 0.3, delta=0.15, msg="actual err: %s not close enough to 0.3" % err) # self.assertAlmostEqual(auc, 0.5, delta=0.15, msg="actual auc: %s not close enough to 0.5" % auc) if math.isnan(err): emsg = "Why is this err = 'nan'?? %6s %s" % ("err:\t", err) raise Exception(emsg) if math.isnan(resDev): emsg = "Why is this resDev = 'nan'?? %6s %s" % ( "resDev:\t", resDev) raise Exception(emsg) if math.isnan(nullDev): emsg = "Why is this nullDev = 'nan'?? %6s %s" % ( "nullDev:\t", nullDev)
def sub_c2_rel_long(self): # a kludge h2o.setup_benchmark_log() avgMichalSize = 116561140 bucket = 'home-0xdiag-datasets' ### importFolderPath = 'more1_1200_link' importFolderPath = 'manyfiles-nflx-gz' print "Using .gz'ed files in", importFolderPath if len(h2o.nodes)==1: csvFilenameList= [ ("*[1][0][0-9].dat.gz", "file_10_A.dat.gz", 10 * avgMichalSize, 600), ] else: csvFilenameList= [ ("*[1][0-4][0-9].dat.gz", "file_50_A.dat.gz", 50 * avgMichalSize, 1800), # ("*[1][0-9][0-9].dat.gz", "file_100_A.dat.gz", 100 * avgMichalSize, 3600), ] if LOG_MACHINE_STATS: benchmarkLogging = ['cpu', 'disk', 'network'] else: benchmarkLogging = [] pollTimeoutSecs = 120 retryDelaySecs = 10 for trial, (csvFilepattern, csvFilename, totalBytes, timeoutSecs) in enumerate(csvFilenameList): csvPathname = importFolderPath + "/" + csvFilepattern (importResult, importPattern) = h2i.import_only(bucket=bucket, path=csvPathname, schema='local') # this accumulates performance stats into a benchmark log over multiple runs # good for tracking whether we're getting slower or faster h2o.cloudPerfH2O.change_logfile(csvFilename) h2o.cloudPerfH2O.message("") h2o.cloudPerfH2O.message("Parse " + csvFilename + " Start--------------------------------") start = time.time() parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local', hex_key=csvFilename + ".hex", timeoutSecs=timeoutSecs, retryDelaySecs=retryDelaySecs, pollTimeoutSecs=pollTimeoutSecs, benchmarkLogging=benchmarkLogging) elapsed = time.time() - start print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \ "%d pct. 
of timeout" % ((elapsed*100)/timeoutSecs) print "Parse result['destination_key']:", parseResult['destination_key'] h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=False) if totalBytes is not None: fileMBS = (totalBytes/1e6)/elapsed msg = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern, csvFilename, fileMBS, elapsed) print msg h2o.cloudPerfH2O.message(msg) if DO_GLM: # these are all the columns that are enums in the dataset...too many for GLM! x = range(542) # don't include the output column # remove the output too! (378) ignore_x = [] # for i in [3,4,5,6,7,8,9,10,11,14,16,17,18,19,20,424,425,426,540,541]: for i in [3,4,5,6,7,8,9,10,11,14,16,17,18,19,20,424,425,426,540,541,378]: x.remove(i) ignore_x.append(i) # increment by one, because we are no long zero offset! x = ",".join(map(lambda x: "C" + str(x+1), x)) ignore_x = ",".join(map(lambda x: "C" + str(x+1), ignore_x)) GLMkwargs = { 'family': 'binomial', 'x': x, 'y': 'C379', 'case': 15, 'case_mode': '>', 'max_iter': 4, 'n_folds': 1, 'family': 'binomial', 'alpha': 0.2, 'lambda': 1e-5 } start = time.time() glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **GLMkwargs) elapsed = time.time() - start h2o.check_sandbox_for_errors() h2o_glm.simpleCheckGLM(self, glm, None, **GLMkwargs) msg = '{:d} jvms, {:d}GB heap, {:s} {:s} GLM: {:6.2f} secs'.format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern, csvFilename, elapsed) print msg h2o.cloudPerfH2O.message(msg) h2o_cmd.checkKeyDistribution()
def test_GLM_both(self): h2o.beta_features = True if (1==1): csvFilenameList = [ ('logreg', 'benign.csv', 'binomial', 3, 10), # col is zero based # FIX! what's wrong here? index error ## ('uis.dat', 'binomial', 8, 5, False), ## ('pros.dat', 'binomial', 1, 10, False), ## ('chdage.dat', 'binomial', 2, 5, True), ## ('icu.dat', 'binomial', 1, 10, False), # how to ignore 6? '1,2,3,4,5', False), ## ('clslowbwt.dat', 'binomial', 7, 10, False), # ('cgd.dat', 'gaussian', 12, 5, False), # ('meexp.dat', 'gaussian', 3, 10, None), ] else: csvFilenameList = [ # leave out ID and birth weight ('logreg', 'benign.csv', 'gaussian', 3, 10), (None, 'icu.dat', 'binomial', 1, 10), # need to exclude col 0 (ID) and col 10 (bwt) # but -x doesn't work..so do 2:9...range doesn't work? FIX! (None, 'nhanes3.dat', 'binomial', 15, 10), (None, 'lowbwt.dat', 'binomial', 1, 10), (None, 'lowbwtm11.dat', 'binomial', 1, 10), (None, 'meexp.dat', 'gaussian', 3, 10), # FIX! does this one hang in R? (None, 'nhanes3.dat', 'binomial', 15, 10), (None, 'pbc.dat', 'gaussian', 1, 10), (None, 'pharynx.dat', 'gaussian', 12, 10), (None, 'uis.dat', 'binomial', 8, 10), ] trial = 0 for (offset, csvFilename, family, y, timeoutSecs) in csvFilenameList: # FIX! 
do something about this file munging if offset: csvPathname1 = offset + "/" + csvFilename else: csvPathname1 = 'logreg/umass_statdata/' + csvFilename fullPathname = h2i.find_folder_and_filename('smalldata', csvPathname1, returnFullPath=True) csvPathname2 = SYNDATASETS_DIR + '/' + csvFilename + '_2.csv' h2o_util.file_clean_for_R(fullPathname, csvPathname2) # we can inspect this to get the number of cols in the dataset (trust H2O here) parseResult = h2i.import_parse(path=csvPathname2, schema='put', hex_key=csvFilename, timeoutSecs=10) # we could specify key2 above but this is fine destination_key = parseResult['destination_key'] inspect = h2o_cmd.runInspect(None, destination_key) if h2o.beta_features: num_cols = inspect['numCols'] num_rows = inspect['numRows'] else: num_cols = inspect['num_cols'] num_rows = inspect['num_rows'] print "num_cols", num_cols, "num_rows", num_rows ## print h2o.dump_json(inspect) # create formula and the x for H2O GLM formula = "V" + str(y+1) + " ~ " x = None col_names = "" for c in range(0,num_cols): if csvFilename=='clslowbwt.dat' and c==6: print "Not including col 6 for this dataset from x" if csvFilename=='benign.csv' and (c==0 or c==1): print "Not including col 0,1 for this dataset from x" else: # don't add the output col to the RHS of formula if x is None: col_names += "V" + str(c+1) else: col_names += ",V" + str(c+1) if c!=y: if x is None: x = str(c) formula += "V" + str(c+1) else: x += "," + str(c) formula += "+V" + str(c+1) print 'formula:', formula print 'col_names:', col_names print 'x:', x if h2o.beta_features: kwargs = { 'n_folds': 0, 'response': y, # what about x? 
'family': family, 'alpha': 0, 'lambda': 0, 'beta_epsilon': 1.0E-4, 'max_iter': 50 } else: kwargs = { 'n_folds': 0, 'y': y, 'x': x, 'family': family, 'alpha': 0, 'lambda': 1e-4, 'beta_epsilon': 1.0E-4, 'max_iter': 50 } if csvFilename=='benign.csv': kwargs['ignored_cols'] = '0,1' if csvFilename=='clslowbwt.dat': kwargs['ignored_cols'] = '6' start = time.time() glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) print "glm end (w/check) on ", csvPathname2, 'took', time.time()-start, 'seconds' h2oResults = h2o_glm.simpleCheckGLM(self, glm, None, prettyPrint=True, **kwargs) # now do it thru R and compare (warningsR, cListR, interceptR) = glm_R_and_compare(self, csvPathname2, family, formula, y, h2oResults=h2oResults) trial += 1 print "\nTrial #", trial
def test_four_billion_rows(self): h2o.beta_features = False timeoutSecs = 1500 importFolderPath = "billions" csvFilenameList = [ "four_billion_rows.csv", ] for csvFilename in csvFilenameList: csvPathname = importFolderPath + "/" + csvFilename start = time.time() # Parse********************************* parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='local', timeoutSecs=timeoutSecs, pollTimeoutSecs=180) elapsed = time.time() - start print "Parse result['destination_key']:", parseResult[ 'destination_key'] print csvFilename, "completed in", elapsed, "seconds.", "%d pct. of timeout" % ( (elapsed * 100) / timeoutSecs) # Inspect********************************* # We should be able to see the parse result? inspect = h2o_cmd.runInspect(key=parseResult['destination_key']) num_cols = inspect['num_cols'] num_rows = inspect['num_rows'] # forget about checking the bytesize print "\n" + csvFilename, \ " num_rows:", "{:,}".format(num_rows), \ " num_cols:", "{:,}".format(num_cols) expectedRowSize = num_cols * 1 # plus output # expectedValueSize = expectedRowSize * num_rows summaryResult = h2o_cmd.runSummary( key=parseResult['destination_key'], timeoutSecs=timeoutSecs) h2o_cmd.infoFromSummary(summaryResult, noPrint=True) self.assertEqual( 2, num_cols, msg="generated %s cols (including output). 
parsed to %s cols" % (2, num_cols)) self.assertEqual(4 * 1000000000, num_rows, msg="generated %s rows, parsed to %s rows" % (4 * 1000000000, num_rows)) # KMeans********************************* kwargs = { 'k': 3, 'cols': 'C1, C2', 'initialization': 'Furthest', 'max_iter': 4, 'normalize': 0, 'destination_key': 'junk.hex', 'seed': 265211114317615310, } timeoutSecs = 900 start = time.time() kmeans = h2o_cmd.runKMeans(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) # GLM********************************* print "\n" + csvFilename kwargs = { 'y': 'C2', 'n_folds': 0, 'family': 'binomial', 'case_mode': '=', 'case': 1 } # L2 timeoutSecs = 900 kwargs.update({'alpha': 0, 'lambda': 0}) start = time.time() glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) elapsed = time.time() - start print "glm (L2) end on ", csvFilename, 'took', elapsed, 'seconds.', "%d pct. of timeout" % ( (elapsed / timeoutSecs) * 100) h2o_glm.simpleCheckGLM(self, glm, 'C1', **kwargs)
def test_parse_nflx_loop_s3n_hdfs(self):
    """Benchmark: build a cloud, import/parse nflx .gz patterns from s3n/hdfs,
    optionally fire extra no-poll parses ahead, run GLM or GLMGrid, then tear down.

    NOTE(review): this function has several latent issues preserved as-is:
    `URI` and `importFolderPath` are undefined in the noPoll i+2 branch, and the
    GLM column-removal loop reuses `i`, shadowing the enumerate index.
    """
    DO_GLM = True
    DO_GLMGRID = False
    USE_S3 = False
    noPoll = False
    # last assignment wins: benchmark logging is effectively disabled
    benchmarkLogging = ['jstack','iostats']
    benchmarkLogging = ['iostats']
    benchmarkLogging = []
    # typical size of the michal files
    avgMichalSize = 116561140
    avgSynSize = 4020000
    synSize = 183
    # (folder list, import glob pattern, hex name stem, expected bytes, timeout)
    csvFilenameList = [
        (["manyfiles-nflx-gz"], "*file_1[0-9][0-9].dat.gz", "file_100_A.dat.gz", 100 * avgMichalSize, 3600),
        (["manyfiles-nflx-gz"], "*file_[1-2][0-5][0-9].dat.gz", "file_120_A.dat.gz", 120 * avgMichalSize, 3600),
        (["manyfiles-nflx-gz"], "*file_[1-2][0-6][0-9].dat.gz", "file_140_A.dat.gz", 140 * avgMichalSize, 3600),
        (["manyfiles-nflx-gz"], "*file_[1-2][0-7][0-9].dat.gz", "file_160_A.dat.gz", 160 * avgMichalSize, 3600),
        (["manyfiles-nflx-gz"], "*file_[1-2][0-8][0-9].dat.gz", "file_180_A.dat.gz", 180 * avgMichalSize, 3600),
        (["manyfiles-nflx-gz"], "*file_[12][0-9][0-9].dat.gz", "file_200_A.dat.gz", 200 * avgMichalSize, 3600),
        (["manyfiles-nflx-gz"], "*file_[123][0-9][0-9].dat.gz", "file_300_A.dat.gz", 300 * avgMichalSize, 3600),
        (["manyfiles-nflx-gz"], "*file_[123][0-9][0-9].dat.gz", "file_300_B.dat.gz", 300 * avgMichalSize, 3600),
        (["manyfiles-nflx-gz"], "*file_[123][0-9][0-9].dat.gz", "file_300_C.dat.gz", 300 * avgMichalSize, 3600),
        (["manyfiles-nflx-gz"], "*file_1.dat.gz", "file_1.dat.gz", 1 * avgMichalSize, 300),
        (["manyfiles-nflx-gz"], "*file_[2][0-9].dat.gz", "file_10.dat.gz", 10 * avgMichalSize, 700),
        (["manyfiles-nflx-gz"], "*file_[34][0-9].dat.gz", "file_20.dat.gz", 20 * avgMichalSize, 900),
        (["manyfiles-nflx-gz"], "*file_[5-9][0-9].dat.gz", "file_50_A.dat.gz", 50 * avgMichalSize, 3600),
        (["manyfiles-nflx-gz"], "*file_1[0-4][0-9].dat.gz", "file_50_B.dat.gz", 50 * avgMichalSize, 3600),
        (["manyfiles-nflx-gz"], "*file_1[0-9][0-9].dat.gz", "file_100_A.dat.gz", 100 * avgMichalSize, 3600),
        (["manyfiles-nflx-gz"], "*file_2[0-9][0-9].dat.gz", "file_100_B.dat.gz", 100 * avgMichalSize, 3600),
        # beware: the files should be non-overlapping sequentially if noPoll is used, to avoid deleting keys in use
        (["A-800-manyfiles-nflx-gz"],
            "*file_[0-9]*.dat.gz", "file_A_200_x55.dat.gz", 200 * (avgMichalSize/2), 7200),
        (["A-800-manyfiles-nflx-gz", "B-800-manyfiles-nflx-gz"],
            "*file_[0-9]*.dat.gz", "file_A_400_x55.dat.gz", 400 * (avgMichalSize/2), 7200),
        (["A-800-manyfiles-nflx-gz", "B-800-manyfiles-nflx-gz", "C-800-manyfiles-nflx-gz", "D-800-manyfiles-nflx-gz"],
            "*file_[0-9]*.dat.gz", "file_A_800_x55.dat.gz", 800 * (avgMichalSize/2), 7200),
    ]
    print "Using the -.gz files from s3"
    # want just s3n://home-0xdiag-datasets/manyfiles-nflx-gz/file_1.dat.gz
    # split out the pattern match and the filename used for the hex
    trialMax = 1
    pollTimeoutSecs = 180
    retryDelaySecs = 10
    # use i to forward reference in the list, so we can do multiple outstanding parses below
    for i, (csvFolderList, csvFilepattern, csvFilename, totalBytes, timeoutSecs) in enumerate(csvFilenameList):
        bucket = "home-0xdiag-datasets"
        ## for tryHeap in [54, 28]:
        h2oPerNode = 1
        # h1.4xlarge 60.5GB dram
        for tryHeap in [28]:
            if USE_S3:
                protocol = "s3"
            else:
                protocol = "s3n"
            print "\n", tryHeap,"GB heap,", h2oPerNode, "jvm per host, import", protocol, "then parse"
            # jea = "-XX:+UseParNewGC -XX:+UseConcMarkSweepGC"
            # jea = "-Dh2o.find-ByteBuffer-leaks=true"
            h2o.init(h2oPerNode, java_heap_GB=tryHeap, enable_benchmark_log=True,
                timeoutSecs=120, retryDelaySecs=10)
            # java_extra_args=jea,
            # don't raise exception if we find something bad in h2o stdout/stderr?
            h2o.nodes[0].sandboxIgnoreErrors = True
            for trial in range(trialMax):
                # import a list of folders, one at a time (hdfs import can't take pattern match
                # want to be able to parse 800 files, but only 200 per folder. Don't want to import the full bucket
                # too slow
                for csvFolder in csvFolderList:
                    # since we delete the key, we have to re-import every iteration, to get it again
                    # s3n URI thru HDFS is not typical.
                    if USE_S3:
                        (importResult, importPattern) = h2i.import_only(
                            bucket=bucket, path=csvFolder + "/" + csvFilepattern, schema='s3')
                    else:
                        (importResult, importPattern) = h2i.import_only(
                            bucket=bucket, path=csvFolder + "/" + csvFilepattern, schema='hdfs')
                    foundKeys = 0
                    for s in importResult['succeeded']:
                        # just print the first tile
                        # if 'nflx' in key and 'file_1.dat.gz' in key:
                        if csvFilepattern in s['key']:
                            # should be s3n://home-0xdiag-datasets/manyfiles-nflx-gz/file_1.dat.gz
                            print "example file we'll use:", s['key']
                            break
                        else:
                            pass
                        # NOTE(review): counts keys seen before the first pattern match
                        foundKeys += 1
                    ### print "s3nFullList:", h2o.dump_json(s3nFullList)
                    # error if none?
                    self.assertGreater(foundKeys,8,"Didn't see more than 8 files in s3n?")
                src_key = csvFilepattern
                hex_key = csvFilename + "_" + str(trial) + ".hex"
                print "Loading", protocol, "key:", src_key, "to", hex_key
                start = time.time()
                parseResult = h2i.import_parse(bucket='home-0xdiag-datasets',
                    path=csvFolder + "/" + csvFilepattern,
                    timeoutSecs=timeoutSecs,
                    retryDelaySecs=retryDelaySecs,
                    pollTimeoutSecs=pollTimeoutSecs,
                    noPoll=noPoll,
                    benchmarkLogging=benchmarkLogging)
                if noPoll:
                    # kick off up to two more parses without polling, using the
                    # NEXT entries in csvFilenameList (forward reference via i)
                    if (i+1) < len(csvFilenameList):
                        time.sleep(1)
                        h2o.check_sandbox_for_errors()
                        (csvFilepattern, csvFilename, totalBytes2, timeoutSecs) = csvFilenameList[i+1]
                        src_key = csvFilepattern
                        hex_key = csvFilename + "_" + str(trial) + ".hex"
                        print "Loading", protocol, "key:", src_key, "to", hex_key
                        parse2Result = h2i.import_parse(bucket='home-0xdiag-datasets',
                            path=csvFolder + "/" + csvFilepattern,
                            timeoutSecs=timeoutSecs,
                            retryDelaySecs=retryDelaySecs,
                            pollTimeoutSecs=pollTimeoutSecs,
                            noPoll=noPoll,
                            benchmarkLogging=benchmarkLogging)
                    if (i+2) < len(csvFilenameList):
                        time.sleep(1)
                        h2o.check_sandbox_for_errors()
                        (csvFilepattern, csvFilename, totalBytes3, timeoutSecs) = csvFilenameList[i+2]
                        # NOTE(review): URI is not defined anywhere in this function
                        # (NameError if this branch runs) -- confirm intended prefix
                        src_key = URI + csvFilepattern
                        hex_key = csvFilename + "_" + str(trial) + ".hex"
                        print "Loading", protocol, "key:", src_key, "to", hex_key
                        # NOTE(review): importFolderPath is also undefined here;
                        # presumably should be csvFolder -- confirm
                        parse3Result = h2i.import_parse(bucket='home-0xdiag-datasets',
                            path=importFolderPath + "/" + csvFilepattern,
                            timeoutSecs=timeoutSecs,
                            retryDelaySecs=retryDelaySecs,
                            pollTimeoutSecs=pollTimeoutSecs,
                            noPoll=noPoll,
                            benchmarkLogging=benchmarkLogging)
                elapsed = time.time() - start
                print "parse result:", parseResult['destination_key']
                print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \
                    "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
                # print stats on all three if noPoll
                if noPoll:
                    # does it take a little while to show up in Jobs, from where we issued the parse?
                    time.sleep(2)
                    # FIX! use the last (biggest?) timeoutSecs? maybe should increase since parallel
                    h2o_jobs.pollWaitJobs(pattern=csvFilename,
                        timeoutSecs=timeoutSecs, benchmarkLogging=benchmarkLogging)
                    # for getting the MB/sec closer to 'right'
                    # NOTE(review): totalBytes2/totalBytes3 exist only if the
                    # forward-reference branches above ran -- confirm
                    totalBytes += totalBytes2 + totalBytes3
                    elapsed = time.time() - start
                h2o.check_sandbox_for_errors()
                if totalBytes is not None:
                    fileMBS = (totalBytes/1e6)/elapsed
                    l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} MB/sec for {:6.2f} secs'.format(
                        len(h2o.nodes), tryHeap, csvFilepattern, csvFilename, fileMBS, elapsed)
                    print l
                    h2o.cloudPerfH2O.message(l)
                y = 378
                if not noPoll:
                    x = h2o_glm.goodXFromColumnInfo(y, key=parseResult['destination_key'], timeoutSecs=300)
                #**********************************************************************************
                # Do GLM too
                # Argument case error: Value 0.0 is not between 12.0 and 9987.0 (inclusive)
                if DO_GLM or DO_GLMGRID:
                    # these are all the columns that are enums in the dataset...too many for GLM!
                    x = range(542)  # don't include the output column
                    # remove the output too! (378)
                    # NOTE(review): this loop variable shadows the outer enumerate `i`
                    for i in [3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 16, 17, 18, 19, 20, 424, 425, 426, 540, 541, y]:
                        x.remove(i)
                    x = ",".join(map(str,x))
                    if DO_GLM:
                        algo = 'GLM'
                        GLMkwargs = {'x': x, 'y': y, 'case': 15, 'case_mode': '>',
                            'family': 'binomial', 'max_iter': 10, 'n_folds': 1,
                            'alpha': 0.2, 'lambda': 1e-5}
                        start = time.time()
                        glm = h2o_cmd.runGLM(parseResult=parseResult,
                            timeoutSecs=timeoutSecs, retryDelaySecs=retryDelaySecs,
                            pollTimeoutSecs=pollTimeoutSecs,
                            benchmarkLogging=benchmarkLogging, **GLMkwargs)
                        elapsed = time.time() - start
                        h2o_glm.simpleCheckGLM(self, glm, None, **GLMkwargs)
                    else:
                        algo = 'GLMGrid'
                        GLMkwargs = {'x': x, 'y': y, 'case': 15, 'case_mode': '>',
                            'family': 'binomial', 'max_iter': 10, 'n_folds': 1,
                            'beta_epsilon': 1e-4, 'lambda': '1e-4', 'alpha': '0,0.5',
                            'thresholds': '0.5'}
                        start = time.time()
                        glm = h2o_cmd.runGLMGrid(parseResult=parseResult,
                            timeoutSecs=timeoutSecs, retryDelaySecs=retryDelaySecs,
                            pollTimeoutSecs=pollTimeoutSecs,
                            benchmarkLogging=benchmarkLogging, **GLMkwargs)
                        elapsed = time.time() - start
                        h2o_glm.simpleCheckGLMGrid(self, glm, None, **GLMkwargs)
                    h2o.check_sandbox_for_errors()
                    l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:s} {:6.2f} secs'.format(
                        len(h2o.nodes), tryHeap, algo, csvFilepattern, csvFilename, elapsed)
                    print l
                    h2o.cloudPerfH2O.message(l)
                #**********************************************************************************
                print "Deleting key in H2O so we get it from S3 (if ec2) or nfs again.", \
                    "Otherwise it would just parse the cached key."
                ### storeView = h2o.nodes[0].store_view()
                ### print "storeView:", h2o.dump_json(storeView)
                # "key": "s3n://home-0xdiag-datasets/manyfiles-nflx-gz/file_84.dat.gz"
                # have to do the pattern match ourself, to figure out what keys to delete
                # we're deleting the keys in the initial import. We leave the keys we created
                # by the parse. We use unique dest keys for those, so no worries.
                # Leaving them is good because things fill up! (spill)
                h2o_cmd.checkKeyDistribution()
                h2i.delete_keys_from_import_result(pattern=csvFilename, importResult=importResult)
            h2o.tear_down_cloud()
            # sticky ports? wait a bit.
            print "Waiting 30 secs before building cloud again (sticky ports?)"
            time.sleep(30)
def test_c9_GLM_airlines_fvec(self): h2o.beta_features = True files = [('airlines', 'airlines_all.csv', 'airlines_all.hex', 1800, 'IsDepDelayed')] for importFolderPath, csvFilename, trainKey, timeoutSecs, response in files: # PARSE train**************************************** csvPathname = importFolderPath + "/" + csvFilename start = time.time() parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='local', hex_key=trainKey, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", csvFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseResult['destination_key'] # GLM (train)**************************************** params = { # 'lambda': 1e-4, # 'alpha': 0.5, 'lambda': 1e-8, 'alpha': 0.0, 'max_iter': 30, 'n_folds': 3, 'family': 'binomial', 'destination_key': "GLMKEY", 'response': response, 'ignored_cols': 'CRSDepTime,CRSArrTime,ActualElapsedTime,CRSElapsedTime,AirTime,ArrDelay,DepDelay,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,IsArrDelayed' } kwargs = params.copy() timeoutSecs = 1800 start = time.time() glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) elapsed = time.time() - start print "GLM training completed in", elapsed, "seconds. On dataset: ", csvFilename h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) if h2o.beta_features: modelKey = glm['glm_model']['_key'] submodels = glm['glm_model']['submodels'] # hackery to make it work when there's just one validation = submodels[-1]['validation'] best_threshold = validation['best_threshold'] thresholds = validation['thresholds'] # have to look up the index for the cm, from the thresholds list best_index = None for i, t in enumerate(thresholds): if t == best_threshold: best_index = i break cms = validation['_cms'] cm = cms[best_index] pctWrong = h2o_gbm.pp_cm_summary(cm['_arr']) # FIX! 
should look at prediction error/class error? # self.assertLess(pctWrong, 9,"Should see less than 40% error") print "\nTrain\n==========\n" print h2o_gbm.pp_cm(cm['_arr']) # Score ******************************* # this messes up if you use case_mode/case_vale above predictKey = 'Predict.hex' start = time.time() predictResult = h2o_cmd.runPredict(data_key=trainKey, model_key=modelKey, destination_key=predictKey, timeoutSecs=timeoutSecs) predictCMResult = h2o.nodes[0].predict_confusion_matrix( actual=trainKey, vactual=response, predict=predictKey, vpredict='predict', ) cm = predictCMResult['cm'] # These will move into the h2o_gbm.py pctWrong = h2o_gbm.pp_cm_summary(cm) # self.assertLess(pctWrong, 40,"Should see less than 40% error") print "\nTest\n==========\n" print h2o_gbm.pp_cm(cm) h2i.delete_keys_at_all_nodes(timeoutSecs=600)
def test_four_billion_rows(self): timeoutSecs = 1500 importFolderPath = "billions" csvFilenameList = [ ("four_billion_rows.csv", "a.hex"), ("four_billion_rows.csv", "b.hex"), ] for (csvFilename, hex_key) in csvFilenameList: csvPathname = importFolderPath + "/" + csvFilename start = time.time() # Parse********************************* parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='local', hex_key=hex_key, timeoutSecs=timeoutSecs, pollTimeoutSecs=60) elapsed = time.time() - start print csvFilename, 'parse time:', parseResult['response']['time'] print "Parse result['destination_key']:", parseResult[ 'destination_key'] print csvFilename, "completed in", elapsed, "seconds.", "%d pct. of timeout" % ( (elapsed * 100) / timeoutSecs) # Inspect********************************* # We should be able to see the parse result? inspect = h2o_cmd.runInspect(key=parseResult['destination_key']) num_cols = inspect['num_cols'] num_rows = inspect['num_rows'] value_size_bytes = inspect['value_size_bytes'] row_size = inspect['row_size'] print "\n" + csvFilename, \ " num_rows:", "{:,}".format(num_rows), \ " num_cols:", "{:,}".format(num_cols), \ " value_size_bytes:", "{:,}".format(value_size_bytes), \ " row_size:", "{:,}".format(row_size) expectedRowSize = num_cols * 1 # plus output expectedValueSize = expectedRowSize * num_rows self.assertEqual(row_size, expectedRowSize, msg='row_size %s is not expected num_cols * 1 byte: %s' % \ (row_size, expectedRowSize)) self.assertEqual(value_size_bytes, expectedValueSize, msg='value_size_bytes %s is not expected row_size * rows: %s' % \ (value_size_bytes, expectedValueSize)) summaryResult = h2o_cmd.runSummary( key=parseResult['destination_key'], timeoutSecs=timeoutSecs) h2o_cmd.infoFromSummary(summaryResult, noPrint=True) self.assertEqual( 2, num_cols, msg="generated %s cols (including output). 
parsed to %s cols" % (2, num_cols)) self.assertEqual(4 * 1000000000, num_rows, msg="generated %s rows, parsed to %s rows" % (4 * 1000000000, num_rows)) # KMeans********************************* kwargs = { 'k': 3, 'initialization': 'Furthest', 'epsilon': 1e-6, 'max_iter': 20, 'cols': None, 'normalize': 0, 'destination_key': 'junk.hex', 'seed': 265211114317615310, } timeoutSecs = 900 start = time.time() kmeans = h2o_cmd.runKMeans(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) # GLM********************************* print "\n" + csvFilename kwargs = { 'x': 0, 'y': 1, 'n_folds': 0, 'case_mode': '=', 'case': 1 } # one coefficient is checked a little more colX = 0 # L2 timeoutSecs = 900 kwargs.update({'alpha': 0, 'lambda': 0}) start = time.time() glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) elapsed = time.time() - start print "glm (L2) end on ", csvFilename, 'took', elapsed, 'seconds.', "%d pct. of timeout" % ( (elapsed / timeoutSecs) * 100) h2o_glm.simpleCheckGLM(self, glm, colX, **kwargs)
def do_h2o_glm(self, bucket, csvPathname, L, family='binomial'): h2p.red_print("\nNow doing h2o") h2o.beta_features=True parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='local', timeoutSecs=180) # save the resolved pathname for use in the sklearn csv read below inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print inspect print "\n" + csvPathname, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) x = 'ID' y = 'CAPSULE' family = family alpha = '0' lambda_ = L nfolds = '0' f = 'prostate' modelKey = 'GLM_' + f kwargs = { 'response' : y, 'ignored_cols' : x, 'family' : family, 'lambda' : lambda_, 'alpha' : alpha, 'n_folds' : nfolds, # passes if 0, fails otherwise 'destination_key' : modelKey, } timeoutSecs = 60 start = time.time() glmResult = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) # this stuff was left over from when we got the result after polling the jobs list # okay to do it again # GLM2: when it redirects to the model view, we no longer have the job_key! 
(unlike the first response and polling) (warnings, clist, intercept) = h2o_glm.simpleCheckGLM(self, glmResult, None, **kwargs) cstring = "".join([("%.5e " % c) for c in clist]) h2p.green_print("h2o alpha ", alpha) h2p.green_print("h2o lambda ", lambda_) h2p.green_print("h2o coefficient list:", cstring) h2p.green_print("h2o intercept", "%.5e " % intercept) # other stuff in the json response glm_model = glmResult['glm_model'] _names = glm_model['_names'] coefficients_names = glm_model['coefficients_names'] # the first submodel is the right one, if onely one lambda is provided as a parameter above submodels = glm_model['submodels'][0] beta = submodels['beta'] h2p.red_print("beta:", beta) norm_beta = submodels['norm_beta'] iteration = submodels['iteration'] validation = submodels['validation'] avg_err = validation['avg_err'] auc = validation['auc'] aic = validation['aic'] null_deviance = validation['null_deviance'] residual_deviance = validation['residual_deviance'] print '_names', _names print 'coefficients_names', coefficients_names # did beta get shortened? the simple check confirms names/beta/norm_beta are same length print 'beta', beta print 'iteration', iteration print 'avg_err', avg_err print 'auc', auc
def test_benchmark_import(self): covtype200xSize = 15033863400 csvFilenameList = [ ("covtype200x.data", "covtype200x.data", covtype200xSize, 700), ] trialMax = 1 base_port = 54321 tryHeap = 28 # can fire a parse off and go wait on the jobs queue (inspect afterwards is enough?) DO_GLM = False noPoll = False benchmarkLogging = ['cpu', 'disk' 'network'] pollTimeoutSecs = 120 retryDelaySecs = 10 bucket = 'home-0xdiag-datasets' importFolderPath = 'standard' for i, (csvFilepattern, csvFilename, totalBytes, timeoutSecs) in enumerate(csvFilenameList): localhost = h2o.decide_if_localhost() if (localhost): h2o.build_cloud(2, java_heap_GB=tryHeap, base_port=base_port, enable_benchmark_log=True) else: h2o_hosts.build_cloud_with_hosts(1, java_heap_GB=tryHeap / 2, base_port=base_port, enable_benchmark_log=True) for trial in range(trialMax): csvPathname = importFolderPath + "/" + csvFilepattern h2o.cloudPerfH2O.change_logfile(csvFilename) h2o.cloudPerfH2O.message("") h2o.cloudPerfH2O.message( "Parse " + csvFilename + " Start--------------------------------") start = time.time() parseResult = h2i.import_parse( bucket=bucket, path=csvPathname, schema='local', hex_key=csvFilename + ".hex", timeoutSecs=timeoutSecs, retryDelaySecs=retryDelaySecs, pollTimeoutSecs=pollTimeoutSecs, noPoll=noPoll, benchmarkLogging=benchmarkLogging) elapsed = time.time() - start print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) if noPoll: # does it take a little while to show up in Jobs, from where we issued the parse? time.sleep(2) # FIX! use the last (biggest?) timeoutSecs? 
maybe should increase since parallel h2o_jobs.pollWaitJobs(pattern=csvFilename, timeoutSecs=timeoutSecs, benchmarkLogging=benchmarkLogging) # for getting the MB/sec closer to 'right' totalBytes += totalBytes2 + totalBytes3 elapsed = time.time() - start h2o.check_sandbox_for_errors() if totalBytes is not None: fileMBS = (totalBytes / 1e6) / elapsed l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format( len(h2o.nodes), tryHeap, csvFilepattern, csvFilename, fileMBS, elapsed) print l h2o.cloudPerfH2O.message(l) print csvFilepattern, 'parse time:', parseResult['response'][ 'time'] print "Parse result['destination_key']:", parseResult[ 'destination_key'] # BUG here? if not noPoll: # We should be able to see the parse result? h2o_cmd.check_enums_from_inspect(parseResult) # use exec to randomFilter out 200 rows for a quick RF. that should work for everyone? origKey = parseResult['destination_key'] # execExpr = 'a = randomFilter('+origKey+',200,12345678)' execExpr = 'a = slice(' + origKey + ',1,200)' h2e.exec_expr(h2o.nodes[0], execExpr, "a", timeoutSecs=30) # runRF takes the parseResult directly newParseKey = {'destination_key': 'a'} print "\n" + csvFilepattern #********************************************************************************** if DO_GLM: # these are all the columns that are enums in the dataset...too many for GLM! 
x = range(54) # don't include the output column x = ",".join(map(str, x)) GLMkwargs = { 'x': x, 'y': 54, 'case': 1, 'case_mode': '>', 'max_iter': 10, 'n_folds': 1, 'alpha': 0.2, 'lambda': 1e-5 } start = time.time() glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **GLMkwargs) h2o_glm.simpleCheckGLM(self, glm, None, **GLMkwargs) elapsed = time.time() - start h2o.check_sandbox_for_errors() l = '{:d} jvms, {:d}GB heap, {:s} {:s} GLM: {:6.2f} secs'.format( len(h2o.nodes), tryHeap, csvFilepattern, csvFilename, elapsed) print l h2o.cloudPerfH2O.message(l) #********************************************************************************** h2o_cmd.checkKeyDistribution() h2o.tear_down_cloud() sys.stdout.write('.') sys.stdout.flush()
def test_GLM2_covtype20x_1(self): csvFilenameList = [ ('covtype20x.data', 800), ] # a browser window too, just because we can # h2b.browseTheCloud() importFolderPath = 'standard' for csvFilename, timeoutSecs in csvFilenameList: csvPathname = importFolderPath + "/" + csvFilename hex_key = "A.hex" parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=2000, pollTimeoutSecs=60) inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print "\n" + csvPathname, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) print "WARNING: max_iter set to 8 for benchmark comparisons" max_iter = 8 y = 54 kwargs = { 'response': 'C' + str(y + 1), # for 2 'family': 'binomial', 'n_folds': 2, 'max_iter': max_iter, 'beta_epsilon': 1e-3, # 'destination_key': modelKey } execExpr = "A.hex[,%s]=(A.hex[,%s]>%s)" % (y + 1, y + 1, 1) h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) aHack = {'destination_key': 'A.hex'} # L2 kwargs.update({'alpha': 0, 'lambda': 0}) start = time.time() glm = h2o_cmd.runGLM(parseResult=aHack, timeoutSecs=timeoutSecs, **kwargs) elapsed = time.time() - start print "glm (L2) end on ", csvPathname, 'took', elapsed, 'seconds.', "%d pct. of timeout" % ( (elapsed / timeoutSecs) * 100) h2o_glm.simpleCheckGLM(self, glm, "C14", **kwargs) # Elastic kwargs.update({'alpha': 0.5, 'lambda': 1e-4}) start = time.time() glm = h2o_cmd.runGLM(parseResult=aHack, timeoutSecs=timeoutSecs, **kwargs) elapsed = time.time() - start print "glm (Elastic) end on ", csvPathname, 'took', elapsed, 'seconds.', "%d pct. of timeout" % ( (elapsed / timeoutSecs) * 100) h2o_glm.simpleCheckGLM(self, glm, "C14", **kwargs) # L1 kwargs.update({'alpha': 1.0, 'lambda': 1e-4}) start = time.time() glm = h2o_cmd.runGLM(parseResult=aHack, timeoutSecs=timeoutSecs, **kwargs) elapsed = time.time() - start print "glm (L1) end on ", csvPathname, 'took', elapsed, 'seconds.', "%d pct. 
of timeout" % ( (elapsed / timeoutSecs) * 100) h2o_glm.simpleCheckGLM(self, glm, "C14", **kwargs)