def runScore(node=None, dataKey=None, modelKey=None, predictKey='Predict.hex', vactual='C1', vpredict=1, expectedAuc=None, expectedAucTol=0.15, doAUC=True, timeoutSecs=200): # Score ******************************* # this messes up if you use case_mode/case_vale above predictKey = 'Predict.hex' start = time.time() predictResult = runPredict(data_key=dataKey, model_key=modelKey, destination_key=predictKey, timeoutSecs=timeoutSecs) # inspect = runInspect(key=dataKey) # print dataKey, dump_json(inspect) # just get a predict and AUC on the same data. has to be binomial result if doAUC: resultAUC = h2o_nodes.nodes[0].generate_auc(thresholds=None, actual=dataKey, predict='Predict.hex', vactual=vactual, vpredict=vpredict) auc = resultAUC['aucdata']['AUC'] if expectedAuc: h2o_util.assertApproxEqual( auc, expectedAuc, tol=expectedAucTol, msg="actual auc: %s not close enough to %s" % (auc, expectedAuc)) # don't do this unless binomial predictCMResult = h2o_nodes.nodes[0].predict_confusion_matrix( actual=dataKey, predict=predictKey, vactual=vactual, vpredict='predict', ) # print "cm", dump_json(predictCMResult) # These will move into the h2o_gbm.py # if doAUC=False, means we're not binomial, and the cm is not what we expect if doAUC: cm = predictCMResult['cm'] pctWrong = h2o_gbm.pp_cm_summary(cm) print h2o_gbm.pp_cm(cm) return predictCMResult
def scoreRF(scoreParseResult, trainResult, vactual=None, timeoutSecs=120, **kwargs): # Run validation on dataset parseKey = scoreParseResult['destination_key'] if h2o.beta_features: # this is how we're supposed to do scorin? rfModelKey = trainResult['drf_model']['_key'] predictKey = 'Predict.hex' start = time.time() predictResult = h2o_cmd.runPredict(data_key=parseKey, model_key=rfModelKey, destination_key=predictKey, timeoutSecs=timeoutSecs, **kwargs) h2o_cmd.runInspect(key='Predict.hex', verbose=True) predictCMResult = h2o.nodes[0].predict_confusion_matrix( actual=parseKey, vactual=vactual, predict=predictKey, vpredict='predict', timeoutSecs=timeoutSecs, **kwargs) rftime = time.time() - start cm = predictCMResult['cm'] # These will move into the h2o_gbm.py pctWrong = h2o_gbm.pp_cm_summary(cm) print "\nTest\n==========\n" print h2o_gbm.pp_cm(cm) scoreResult = predictCMResult else: ntree = trainResult['ntree'] rfModelKey = trainResult['model_key'] start = time.time() # NOTE: response_variable is required, and passed from kwargs here # out_of_bag_error_estimate=0 is required for scoring. H2O will assert if 1 and different data set # compared to training kwargs['out_of_bag_error_estimate'] = 0 scoreResult = h2o_cmd.runRFView(None, parseKey, rfModelKey, ntree=ntree, timeoutSecs=timeoutSecs, **kwargs) rftime = time.time() - start h2o.verboseprint("RF score results: ", scoreResult) h2o.verboseprint("RF computation took {0} sec".format(rftime)) scoreResult['python_call_timer'] = rftime return scoreResult
def test_GBM_mnist_fvec(self): h2o.beta_features = True importFolderPath = "mnist" csvFilename = "mnist_training.csv.gz" timeoutSecs = 1800 trialStart = time.time() # PARSE train**************************************** trainKey = csvFilename + "_" + ".hex" start = time.time() parseResult = h2i.import_parse( bucket="home-0xdiag-datasets", path=importFolderPath + "/" + csvFilename, schema="put", hex_key=trainKey, timeoutSecs=timeoutSecs, ) elapsed = time.time() - start print "parse end on ", csvFilename, "took", elapsed, "seconds", "%d pct. of timeout" % ( (elapsed * 100) / timeoutSecs ) print "parse result:", parseResult["destination_key"] # GBM (train)**************************************** modelKey = "GBM_model" params = { "classification": 1, # faster? "destination_key": modelKey, "learn_rate": 0.1, "ntrees": 3, "max_depth": 8, "min_rows": 1, "response": 0, # this dataset has the response in the last col (0-9 to check) # 'ignored_cols_by_name': range(200,784) # only use the first 200 for speed? } kwargs = params.copy() timeoutSecs = 1800 # noPoll -> False when GBM finished start = time.time() GBMFirstResult = h2o_cmd.runGBM(parseResult=parseResult, noPoll=True, **kwargs) h2o_jobs.pollStatsWhileBusy(timeoutSecs=1200, pollTimeoutSecs=120, retryDelaySecs=5) elapsed = time.time() - start print "GBM training completed in", elapsed, "seconds.", "%d pct. of timeout" % ((elapsed * 100) / timeoutSecs) gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey) errsLast = gbmTrainView["gbm_model"]["errs"][-1] print "GBM 'errsLast'", errsLast if DO_CLASSIFICATION: cm = gbmTrainView["gbm_model"]["cms"][-1] # use the last one pctWrongTrain = h2o_gbm.pp_cm_summary(cm) print "\nTrain\n==========\n" print h2o_gbm.pp_cm(cm) else: print "GBMTrainView:", h2o.dump_json(gbmTrainView["gbm_model"]["errs"])
def test_GBM_mnist_fvec(self): h2o.beta_features = True importFolderPath = "mnist" csvFilename = "mnist_training.csv.gz" timeoutSecs=1800 trialStart = time.time() # PARSE train**************************************** trainKey = csvFilename + "_" + ".hex" start = time.time() parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=importFolderPath + "/" + csvFilename, schema='put', hex_key=trainKey, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", csvFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseResult['destination_key'] # GBM (train)**************************************** modelKey = "GBM_model" params = { 'classification': 1, # faster? 'destination_key': modelKey, 'learn_rate': .1, 'ntrees': 3, 'max_depth': 8, 'min_rows': 1, 'response': 0, # this dataset has the response in the last col (0-9 to check) # 'ignored_cols_by_name': range(200,784) # only use the first 200 for speed? } kwargs = params.copy() timeoutSecs = 1800 #noPoll -> False when GBM finished start = time.time() GBMFirstResult = h2o_cmd.runGBM(parseResult=parseResult, noPoll=True, **kwargs) h2o_jobs.pollStatsWhileBusy(timeoutSecs=1200, pollTimeoutSecs=120, retryDelaySecs=5) elapsed = time.time() - start print "GBM training completed in", elapsed, "seconds.", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey) errsLast = gbmTrainView['gbm_model']['errs'][-1] print "GBM 'errsLast'", errsLast if DO_CLASSIFICATION: cms = gbmTrainView['gbm_model']['cms'] cm = cms[-1]['_arr'] # use the last one print "GBM cms[-1]['_predErr']:", cms[-1]['_predErr'] print "GBM cms[-1]['_classErr']:", cms[-1]['_classErr'] pctWrongTrain = h2o_gbm.pp_cm_summary(cm); print "\nTrain\n==========\n" print h2o_gbm.pp_cm(cm) else: print "GBMTrainView:", h2o.dump_json(gbmTrainView['gbm_model']['errs'])
def test_GBM_basic_prostate(self): csvFilename = "prostate.csv" print "\nStarting", csvFilename # columns start at 0 csvPathname = 'logreg/' + csvFilename parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=csvFilename + ".hex", schema='put') colNames = [ 'ID', 'CAPSULE', 'AGE', 'RACE', 'DPROS', 'DCAPS', 'PSA', 'VOL', 'GLEASON' ] modelKey = 'GBM_prostate' # 'cols', 'ignored_cols_by_name', and 'ignored_cols' have to be exclusive params = { 'destination_key': modelKey, 'validation': parseResult['destination_key'], 'ignored_cols_by_name': 'ID', 'learn_rate': .1, 'ntrees': 10, 'max_depth': 20, 'min_rows': 1, 'response': 'CAPSULE', 'classification': 1 if DO_CLASSIFICATION else 0, } kwargs = params.copy() timeoutSecs = 1800 start = time.time() GBMFirstResult = h2o_cmd.runGBM(parseResult=parseResult, noPoll=True, **kwargs) print "\nGBMFirstResult:", h2o.dump_json(GBMFirstResult) # no pattern waits for all h2o_jobs.pollWaitJobs(pattern=None, timeoutSecs=300, pollTimeoutSecs=10, retryDelaySecs=5) elapsed = time.time() - start print "GBM training completed in", elapsed, "seconds." gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey) errsLast = gbmTrainView['gbm_model']['errs'][-1] print "GBM 'errsLast'", errsLast if DO_CLASSIFICATION: cm = gbmTrainView['gbm_model']['cms'][-1][ '_arr'] # use the last one pctWrongTrain = h2o_gbm.pp_cm_summary(cm) print "\nTrain\n==========\n" print h2o_gbm.pp_cm(cm) else: print "GBMTrainView:", h2o.dump_json( gbmTrainView['gbm_model']['errs'])
def test_GBM_basic_benign(self): h2o.beta_features = True csvFilename = "benign.csv" print "\nStarting", csvFilename csvPathname = 'logreg/' + csvFilename parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=csvFilename + ".hex", schema='put') # columns start at 0 # cols 0-13. 3 is output # no member id in this one # fails with n_folds # check the first in the models list. It should be the best colNames = [ 'STR','OBS','AGMT','FNDX','HIGD','DEG','CHK', 'AGP1','AGMN','NLV','LIV','WT','AGLP','MST' ] modelKey = 'GBM_benign' # 'cols', 'ignored_cols_by_name', and 'ignored_cols' have to be exclusive params = { 'destination_key': modelKey, 'validation': parseResult['destination_key'], 'ignored_cols_by_name': 'STR', 'learn_rate': .1, 'ntrees': 10, 'max_depth': 20, 'min_rows': 1, 'response': 'FNDX', 'classification': 1 if DO_CLASSIFICATION else 0, } kwargs = params.copy() timeoutSecs = 1800 start = time.time() GBMFirstResult = h2o_cmd.runGBM(parseResult=parseResult, noPoll=True,**kwargs) print "\nGBMFirstResult:", h2o.dump_json(GBMFirstResult) # no pattern waits for all h2o_jobs.pollWaitJobs(pattern=None, timeoutSecs=300, pollTimeoutSecs=10, retryDelaySecs=5) elapsed = time.time() - start print "GBM training completed in", elapsed, "seconds." gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey) # errrs from end of list? is that the last tree? errsLast = gbmTrainView['gbm_model']['errs'][-1] print "GBM 'errsLast'", errsLast if DO_CLASSIFICATION: print h2o.dump_json(gbmTrainView['gbm_model']['cms'][-1]) cm = gbmTrainView['gbm_model']['cms'][-1]['_arr']# use the last one pctWrongTrain = h2o_gbm.pp_cm_summary(cm); print "\nTrain\n==========\n" print h2o_gbm.pp_cm(cm) else: print "GBMTrainView:", h2o.dump_json(gbmTrainView['gbm_model']['errs'])
def scoreRF(scoreParseResult, trainResult, vactual=None, timeoutSecs=120, **kwargs): # Run validation on dataset parseKey = scoreParseResult['destination_key'] if h2o.beta_features: # this is how we're supposed to do scorin? rfModelKey = trainResult['drf_model']['_key'] predictKey = 'Predict.hex' start = time.time() predictResult = h2o_cmd.runPredict( data_key=parseKey, model_key=rfModelKey, destination_key=predictKey, timeoutSecs=timeoutSecs, **kwargs) h2o_cmd.runInspect(key='Predict.hex', verbose=True) predictCMResult = h2o.nodes[0].predict_confusion_matrix( actual=parseKey, vactual=vactual, predict=predictKey, vpredict='predict', timeoutSecs=timeoutSecs, **kwargs) rftime = time.time()-start cm = predictCMResult['cm'] # These will move into the h2o_gbm.py pctWrong = h2o_gbm.pp_cm_summary(cm); print "\nTest\n==========\n" print h2o_gbm.pp_cm(cm) scoreResult = predictCMResult else: ntree = trainResult['ntree'] rfModelKey = trainResult['model_key'] start = time.time() # NOTE: response_variable is required, and passed from kwargs here # out_of_bag_error_estimate=0 is required for scoring. H2O will assert if 1 and different data set # compared to training kwargs['out_of_bag_error_estimate'] = 0 scoreResult = h2o_cmd.runRFView(None, parseKey, rfModelKey, ntree=ntree, timeoutSecs=timeoutSecs, **kwargs) rftime = time.time()-start h2o.verboseprint("RF score results: ", scoreResult) h2o.verboseprint("RF computation took {0} sec".format(rftime)) scoreResult['python_call_timer'] = rftime return scoreResult
def runScore(node=None, dataKey=None, modelKey=None, predictKey='Predict.hex', vactual='C1', vpredict=1, expectedAuc=None, doAUC=True, timeoutSecs=200): # Score ******************************* # this messes up if you use case_mode/case_vale above predictKey = 'Predict.hex' start = time.time() predictResult = runPredict( data_key=dataKey, model_key=modelKey, destination_key=predictKey, timeoutSecs=timeoutSecs) # inspect = runInspect(key=dataKey) # print dataKey, dump_json(inspect) # just get a predict and AUC on the same data. has to be binomial result if doAUC: resultAUC = h2o_nodes.nodes[0].generate_auc( thresholds=None, actual=dataKey, predict='Predict.hex', vactual=vactual, vpredict=vpredict) auc = resultAUC['aucdata']['AUC'] if expectedAuc: h2o_util.assertApproxEqual(auc, expectedAuc, tol=0.15, msg="actual auc: %s not close enough to %s" % (auc, expectedAuc)) # don't do this unless binomial predictCMResult = h2o_nodes.nodes[0].predict_confusion_matrix( actual=dataKey, predict=predictKey, vactual=vactual, vpredict='predict', ) # print "cm", dump_json(predictCMResult) # These will move into the h2o_gbm.py # if doAUC=False, means we're not binomial, and the cm is not what we expect if doAUC: cm = predictCMResult['cm'] pctWrong = h2o_gbm.pp_cm_summary(cm); print h2o_gbm.pp_cm(cm) return predictCMResult
def scoreRF(scoreParseResult, trainResult, vactual=None, timeoutSecs=120, **kwargs): # Run validation on dataset parseKey = scoreParseResult['destination_key'] # this is how we're supposed to do scorin? rfModelKey = trainResult['drf_model']['_key'] predictKey = 'Predict.hex' start = time.time() predictResult = h2o_cmd.runPredict(data_key=parseKey, model_key=rfModelKey, destination_key=predictKey, timeoutSecs=timeoutSecs, **kwargs) h2o_cmd.runInspect(key='Predict.hex', verbose=True) predictCMResult = h2o.nodes[0].predict_confusion_matrix( actual=parseKey, vactual=vactual, predict=predictKey, vpredict='predict', timeoutSecs=timeoutSecs, **kwargs) rftime = time.time() - start cm = predictCMResult['cm'] # These will move into the h2o_gbm.py pctWrong = h2o_gbm.pp_cm_summary(cm) print "\nTest\n==========\n" print h2o_gbm.pp_cm(cm) scoreResult = predictCMResult rftime = time.time() - start h2o.verboseprint("RF score results: ", scoreResult) h2o.verboseprint("RF computation took {0} sec".format(rftime)) scoreResult['python_call_timer'] = rftime return scoreResult
def test_GBM_basic_prostate(self): csvFilename = "prostate.csv" print "\nStarting", csvFilename # columns start at 0 csvPathname = 'logreg/' + csvFilename parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=csvFilename + ".hex", schema='put') colNames = ['ID','CAPSULE','AGE','RACE','DPROS','DCAPS','PSA','VOL','GLEASON'] modelKey = 'GBM_prostate' # 'cols', 'ignored_cols_by_name', and 'ignored_cols' have to be exclusive params = { 'destination_key': modelKey, 'validation': parseResult['destination_key'], 'ignored_cols_by_name': 'ID', 'learn_rate': .1, 'ntrees': 10, 'max_depth': 20, 'min_rows': 1, 'response': 'CAPSULE', 'classification': 1 if DO_CLASSIFICATION else 0, } kwargs = params.copy() timeoutSecs = 1800 start = time.time() GBMFirstResult = h2o_cmd.runGBM(parseResult=parseResult, noPoll=True,**kwargs) print "\nGBMFirstResult:", h2o.dump_json(GBMFirstResult) # no pattern waits for all h2o_jobs.pollWaitJobs(pattern=None, timeoutSecs=300, pollTimeoutSecs=10, retryDelaySecs=5) elapsed = time.time() - start print "GBM training completed in", elapsed, "seconds." gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey) errsLast = gbmTrainView['gbm_model']['errs'][-1] print "GBM 'errsLast'", errsLast if DO_CLASSIFICATION: cm = gbmTrainView['gbm_model']['cms'][-1]['_arr'] # use the last one pctWrongTrain = h2o_gbm.pp_cm_summary(cm); print "\nTrain\n==========\n" print h2o_gbm.pp_cm(cm) else: print "GBMTrainView:", h2o.dump_json(gbmTrainView['gbm_model']['errs'])
def test_GLM2_big1_nopoll(self): h2o.beta_features = True csvPathname = 'hhp_107_01.data.gz' print "\n" + csvPathname y = "106" x = "" parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put', timeoutSecs=15) glmInitial = [] # dispatch multiple jobs back to back start = time.time() for jobDispatch in range(5): kwargs = {'response': y, 'n_folds': 1, 'family': 'binomial'} # FIX! what model keys do these get? glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=300, noPoll=True, **kwargs) glmInitial.append(glm) print "glm job dispatch end on ", csvPathname, 'took', time.time() - start, 'seconds' print "\njobDispatch #", jobDispatch timeoutSecs = 200 h2o_jobs.pollWaitJobs(pattern='GLM', timeoutSecs=timeoutSecs, retryDelaySecs=10) elapsed = time.time() - start print "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100) # we saved the initial response? # if we do another poll they should be done now, and better to get it that # way rather than the inspect (to match what simpleCheckGLM is expected for g in glmInitial: print "Checking completed job, with no polling using initial response:" # this format is only in the first glm response (race?) modelKey = g['destination_key'] glm = h2o.nodes[0].glm_view(_modelKey=modelKey) h2o_glm.simpleCheckGLM(self, glm, None, noPrint=True, **kwargs) cm = glm['glm_model']['submodels'][0]['validation']['_cms'][-1]['_arr'] print "cm:", cm pctWrong = h2o_gbm.pp_cm_summary(cm); # self.assertLess(pctWrong, 9,"Should see less than 9% error (class = 4)") print "\nTrain\n==========\n" print h2o_gbm.pp_cm(cm)
def scoreRF(scoreParseResult, trainResult, vactual=None, timeoutSecs=120, **kwargs): # Run validation on dataset parseKey = scoreParseResult['destination_key'] # this is how we're supposed to do scorin? rfModelKey = trainResult['drf_model']['_key'] predictKey = 'Predict.hex' start = time.time() predictResult = h2o_cmd.runPredict( data_key=parseKey, model_key=rfModelKey, destination_key=predictKey, timeoutSecs=timeoutSecs, **kwargs) h2o_cmd.runInspect(key='Predict.hex', verbose=True) predictCMResult = h2o.nodes[0].predict_confusion_matrix( actual=parseKey, vactual=vactual, predict=predictKey, vpredict='predict', timeoutSecs=timeoutSecs, **kwargs) rftime = time.time()-start cm = predictCMResult['cm'] # These will move into the h2o_gbm.py pctWrong = h2o_gbm.pp_cm_summary(cm); print "\nTest\n==========\n" print h2o_gbm.pp_cm(cm) scoreResult = predictCMResult rftime = time.time()-start h2o.verboseprint("RF score results: ", scoreResult) h2o.verboseprint("RF computation took {0} sec".format(rftime)) scoreResult['python_call_timer'] = rftime return scoreResult
def test_GBM_many_cols(self): SYNDATASETS_DIR = h2o.make_syn_dir() if localhost: tryList = [(10000, 100, "cA", 300)] else: tryList = [ # (10000, 10, 'cB', 300), # (10000, 50, 'cC', 300), (10000, 100, "cD", 300), (10000, 200, "cE", 300), (10000, 300, "cF", 300), (10000, 400, "cG", 300), (10000, 500, "cH", 300), (10000, 1000, "cI", 300), ] ### h2b.browseTheCloud() for (rowCount, colCount, hex_key, timeoutSecs) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) # csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv' csvFilename = "syn_" + "binary" + "_" + str(rowCount) + "x" + str(colCount) + ".csv" hdrFilename = "hdr_" + "binary" + "_" + str(rowCount) + "x" + str(colCount) + ".csv" csvPathname = SYNDATASETS_DIR + "/" + csvFilename hdrPathname = SYNDATASETS_DIR + "/" + hdrFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE) # PARSE train**************************************** h2o.beta_features = False # turn off beta_features start = time.time() xList = [] eList = [] fList = [] modelKey = "GBMModelKey" # Parse (train)**************************************** if h2o.beta_features: print "Parsing to fvec directly! Have to noPoll=true!, and doSummary=False!" parseTrainResult = h2i.import_parse( bucket=None, path=csvPathname, schema="put", hex_key=hex_key, timeoutSecs=timeoutSecs, noPoll=h2o.beta_features, doSummary=False, ) # hack if h2o.beta_features: h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs) print "Filling in the parseTrainResult['destination_key'] for h2o" parseTrainResult["destination_key"] = trainKey elapsed = time.time() - start print "train parse end on ", csvPathname, "took", elapsed, "seconds", "%d pct. of timeout" % ( (elapsed * 100) / timeoutSecs ) print "train parse result:", parseTrainResult["destination_key"] # Logging to a benchmark file algo = "Parse" l = "{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs".format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename, elapsed ) print l h2o.cloudPerfH2O.message(l) # if you set beta_features here, the fvec translate will happen with the Inspect not the GBM # h2o.beta_features = True inspect = h2o_cmd.runInspect(key=parseTrainResult["destination_key"]) print "\n" + csvPathname, " num_rows:", "{:,}".format( inspect["num_rows"] ), " num_cols:", "{:,}".format(inspect["num_cols"]) num_rows = inspect["num_rows"] num_cols = inspect["num_cols"] ### h2o_cmd.runSummary(key=parsTraineResult['destination_key']) # GBM(train iterate)**************************************** ntrees = 5 prefixList = ["A", "B", "C", "D", "E", "F", "G", "H"] # for max_depth in [5,10,20,40]: for max_depth in [5, 10, 20]: # PARSE a new header**************************************** print "Creating new header", hdrPathname prefix = prefixList.pop(0) write_syn_header(hdrPathname, rowCount, colCount, prefix) # upload and parse the header to a hex h2o.beta_features = False # can't put with fvec yet hdr_hex_key = prefix + "_hdr.hex" parseHdrResult = h2i.import_parse( bucket=None, path=hdrPathname, schema="put", header=1, # REQUIRED! otherwise will interpret as enums hex_key=hdr_hex_key, timeoutSecs=timeoutSecs, noPoll=h2o.beta_features, doSummary=False, ) # Set Column Names (before autoframe is created) h2o.nodes[0].set_column_names(target=hex_key, copy_from=hdr_hex_key) # GBM print "The response col name is changing each iteration, since we're parsing a new header" params = { "learn_rate": 0.2, "nbins": 1024, "ntrees": ntrees, "max_depth": max_depth, "min_rows": 10, "response": prefix + "_response", "ignored_cols_by_name": None, } print "Using these parameters for GBM: ", params kwargs = params.copy() h2o.beta_features = True trainStart = time.time() gbmTrainResult = h2o_cmd.runGBM( parseResult=parseTrainResult, noPoll=h2o.beta_features, timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs ) # hack if h2o.beta_features: h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs) trainElapsed = time.time() - trainStart print "GBM training completed in", trainElapsed, "seconds. On dataset: ", csvPathname # Logging to a benchmark file algo = "GBM " + " ntrees=" + str(ntrees) + " max_depth=" + str(max_depth) l = "{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs".format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename, trainElapsed ) print l h2o.cloudPerfH2O.message(l) gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey) # errrs from end of list? is that the last tree? errsLast = gbmTrainView["gbm_model"]["errs"][-1] print "GBM 'errsLast'", errsLast cm = gbmTrainView["gbm_model"]["cms"][-1]["_arr"] # use the last one pctWrongTrain = h2o_gbm.pp_cm_summary(cm) print "\nTrain\n==========\n" print h2o_gbm.pp_cm(cm) # xList.append(ntrees) xList.append(max_depth) eList.append(pctWrongTrain) fList.append(trainElapsed) # works if you delete the autoframe ### h2o_import.delete_keys_at_all_nodes(pattern='autoframe') h2o.beta_features = False # just plot the last one if DO_PLOT: xLabel = "max_depth" eLabel = "pctWrong" fLabel = "trainElapsed" eListTitle = "" fListTitle = "" h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
def test_GBM_manyfiles_train_test(self): bucket = 'home-0xdiag-datasets' modelKey = 'GBMModelKey' if h2o.localhost: files = [ # None forces num_cols to be used. assumes you set it from Inspect ('manyfiles-nflx-gz', 'file_1[0-9][0-9].dat.gz', 'file_100.hex', 1800, None, 'file_1.dat.gz', 'file_1_test.hex') ] else: files = [ # None forces num_cols to be used. assumes you set it from Inspect ('manyfiles-nflx-gz', 'file_[0-9].dat.gz', 'file_10.hex', 1800, None, 'file_1[0-9].dat.gz', 'file_10_test.hex') ] # if I got to hdfs, it's here # hdfs://172.16.2.176/datasets/manyfiles-nflx-gz/file_99.dat.gz # h2b.browseTheCloud() for (importFolderPath, trainFilename, trainKey, timeoutSecs, response, testFilename, testKey) in files: # PARSE train**************************************** start = time.time() xList = [] eList = [] fList = [] # Parse (train)**************************************** csvPathname = importFolderPath + "/" + trainFilename parseTrainResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='s3n', hex_key=trainKey, timeoutSecs=timeoutSecs, doSummary=False) elapsed = time.time() - start print "train parse end on ", trainFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "train parse result:", parseTrainResult['destination_key'] ### h2o_cmd.runSummary(key=parsTraineResult['destination_key']) inspect = h2o_cmd.runInspect(key=parseTrainResult['destination_key']) print "\n" + csvPathname, \ " num_rows:", "{:,}".format(inspect['num_rows']), \ " num_cols:", "{:,}".format(inspect['num_cols']) num_rows = inspect['num_rows'] num_cols = inspect['num_cols'] # Make col 378 it something we can do binomial regression on! execExpr = '%s[,378] = %s[,378]>15 ? 1 : 0' % (trainKey, trainKey) resultExec = h2o_cmd.runExec(str=execExpr, timeoutSecs=500) # Parse (test)**************************************** parseTestResult = h2i.import_parse(bucket=bucket, path=importFolderPath + "/" + testFilename, schema='local', hex_key=testKey, timeoutSecs=timeoutSecs, doSummary=False) elapsed = time.time() - start print "test parse end on ", testFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "test parse result:", parseTestResult['destination_key'] # Make col 378 it something we can do binomial regression on! print "Slow! exec is converting all imported keys?, not just what was parsed" execExpr = '%s[,378] = %s[,378]>15 ? 1 : 0' % (testKey, testKey, testKey) resultExec = h2o_cmd.runExec(str=execExpr, timeoutSecs=300) # Note ..no inspect of test data here..so translate happens later? # GBM (train iterate)**************************************** # if not response: # response = num_cols - 1 response = 378 print "Using the same response %s for train and test (which should have a output value too)" % response ntrees = 10 for max_depth in [5,10,20,40]: params = { 'learn_rate': .2, 'nbins': 1024, 'ntrees': ntrees, 'max_depth': max_depth, 'min_rows': 10, 'response': response, # 'ignored_cols': } print "Using these parameters for GBM: ", params kwargs = params.copy() # GBM train**************************************** trainStart = time.time() gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult, timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs) trainElapsed = time.time() - trainStart print "GBM training completed in", trainElapsed, "seconds. On dataset: ", trainFilename gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey) # errrs from end of list? is that the last tree? errsLast = gbmTrainView['gbm_model']['errs'][-1] print "GBM 'errsLast'", errsLast cm = gbmTrainView['gbm_model']['cm'] pctWrongTrain = h2o_gbm.pp_cm_summary(cm); print "Last line of this cm might be NAs, not CM" print "\nTrain\n==========\n" print h2o_gbm.pp_cm(cm) # GBM test**************************************** if doPredict: predictKey = 'Predict.hex' ### h2o_cmd.runInspect(key=parseTestResult['destination_key']) start = time.time() gbmTestResult = h2o_cmd.runPredict( data_key=parseTestResult['destination_key'], model_key=modelKey, destination_key=predictKey, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "GBM predict completed in", elapsed, "seconds. On dataset: ", testFilename print "This is crazy!" gbmPredictCMResult =h2o.nodes[0].predict_confusion_matrix( actual=parseTestResult['destination_key'], vactual=response, predict=predictKey, vpredict='predict', # choices are 0 and 'predict' ) # errrs from end of list? is that the last tree? # all we get is cm cm = gbmPredictCMResult['cm'] # These will move into the h2o_gbm.py pctWrong = h2o_gbm.pp_cm_summary(cm); print "Last line of this cm is really NAs, not CM" print "\nTest\n==========\n" print h2o_gbm.pp_cm(cm) # xList.append(ntrees) xList.append(max_depth) eList.append(pctWrong) fList.append(trainElapsed) if doPredict: xLabel = 'max_depth' eLabel = 'pctWrong' fLabel = 'trainElapsed' eListTitle = "" fListTitle = "" h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
def simpleCheckGLM(self, glm, colX, allowFailWarning=False, allowZeroCoeff=False, prettyPrint=False, noPrint=False, maxExpectedIterations=None, doNormalized=False, **kwargs): # if we hit the max_iter, that means it probably didn't converge. should be 1-maxExpectedIter # h2o GLM will verboseprint the result and print errors. # so don't have to do that # different when cross validation is used? No trainingErrorDetails? GLMModel = glm['glm_model'] if not GLMModel: raise Exception("GLMModel didn't exist in the glm response? %s" % h2o.dump_json(glm)) warnings = None if 'warnings' in GLMModel and GLMModel['warnings']: warnings = GLMModel['warnings'] # stop on failed x = re.compile("failed", re.IGNORECASE) # don't stop if fail to converge c = re.compile("converge", re.IGNORECASE) for w in warnings: print "\nwarning:", w if re.search(x,w) and not allowFailWarning: if re.search(c,w): # ignore the fail to converge warning now pass else: # stop on other 'fail' warnings (are there any? fail to solve? raise Exception(w) # for key, value in glm.iteritems(): print key # not in GLMGrid? # FIX! don't get GLMParams if it can't solve? GLMParams = GLMModel['glm'] family = GLMParams["family"] # number of submodels = number of lambda # min of 2. lambda_max is first submodels = GLMModel['submodels'] # since all our tests?? only use one lambda, the best_lamda_idx should = 1 best_lambda_idx = GLMModel['best_lambda_idx'] print "best_lambda_idx:", best_lambda_idx lambda_max = GLMModel['lambda_max'] print "lambda_max:", lambda_max # currently lambda_max is not set by tomas. ..i.e.not valid if 1==0 and (lambda_max <= submodels[best_lambda_idx].lambda_value): raise Exception("lambda_max %s should always be > the lambda result %s we're checking" % (lambda_max, submodels[best_lambda_idx].lambda_value)) # submodels0 = submodels[0] # submodels1 = submodels[-1] # hackery to make it work when there's just one if (best_lambda_idx >= len(submodels)) or (best_lambda_idx < 0): raise Exception("best_lambda_idx: %s should point to one of lambdas (which has len %s)" % (best_lambda_idx, len(submodels))) if (best_lambda_idx >= len(submodels)) or (best_lambda_idx < 0): raise Exception("best_lambda_idx: %s should point to one of submodels (which has len %s)" % (best_lambda_idx, len(submodels))) submodels1 = submodels[best_lambda_idx] # hackery to make it work when there's just one iterations = submodels1['iteration'] print "GLMModel/iterations:", iterations # if we hit the max_iter, that means it probably didn't converge. should be 1-maxExpectedIter if maxExpectedIterations is not None and iterations > maxExpectedIterations: raise Exception("Convergence issue? GLM did iterations: %d which is greater than expected: %d" % (iterations, maxExpectedIterations) ) if 'validation' not in submodels1: raise Exception("Should be a 'validation' key in submodels1: %s" % h2o.dump_json(submodels1)) validationsList = submodels1['validation'] validations = validationsList # xval. compare what we asked for and what we got. n_folds = kwargs.setdefault('n_folds', None) print "GLMModel/validations" validations['null_deviance'] = h2o_util.cleanseInfNan(validations['null_deviance']) validations['residual_deviance'] = h2o_util.cleanseInfNan(validations['residual_deviance']) print "%15s %s" % ("null_deviance:\t", validations['null_deviance']) print "%15s %s" % ("residual_deviance:\t", validations['residual_deviance']) # threshold only there if binomial? # auc only for binomial if family=="binomial": print "%15s %s" % ("auc:\t", validations['auc']) best_threshold = validations['best_threshold'] thresholds = validations['thresholds'] print "%15s %s" % ("best_threshold:\t", best_threshold) # have to look up the index for the cm, from the thresholds list best_index = None # FIX! best_threshold isn't necessarily in the list. jump out if >= for i,t in enumerate(thresholds): if t >= best_threshold: # ends up using next one if not present best_index = i break assert best_index!=None, "%s %s" % (best_threshold, thresholds) print "Now printing the right 'best_threshold' %s from '_cms" % best_threshold # cm = glm['glm_model']['submodels'][0]['validation']['_cms'][-1] submodels = glm['glm_model']['submodels'] cms = submodels[0]['validation']['_cms'] assert best_index<len(cms), "%s %s" % (best_index, len(cms)) # if we want 0.5..rounds to int # mid = len(cms)/2 # cm = cms[mid] cm = cms[best_index] print "cm:", h2o.dump_json(cm['_arr']) predErr = cm['_predErr'] classErr = cm['_classErr'] # compare to predErr pctWrong = h2o_gbm.pp_cm_summary(cm['_arr']); print "predErr:", predErr print "calculated pctWrong from cm:", pctWrong print "classErr:", classErr # self.assertLess(pctWrong, 9,"Should see less than 9% error (class = 4)") print "\nTrain\n==========\n" print h2o_gbm.pp_cm(cm['_arr']) if family=="poisson" or family=="gaussian": print "%15s %s" % ("aic:\t", validations['aic']) coefficients_names = GLMModel['coefficients_names'] # print "coefficients_names:", coefficients_names idxs = submodels1['idxs'] print "idxs:", idxs coefficients_names = coefficients_names # always check both normalized and normal coefficients norm_beta = submodels1['norm_beta'] # if norm_beta and len(coefficients_names)!=len(norm_beta): # print len(coefficients_names), len(norm_beta) # raise Exception("coefficients_names and normalized_norm_beta from h2o json not same length. coefficients_names: %s normalized_norm_beta: %s" % (coefficients_names, norm_beta)) # beta = submodels1['beta'] # print "beta:", beta # if len(coefficients_names)!=len(beta): # print len(coefficients_names), len(beta) # raise Exception("coefficients_names and beta from h2o json not same length. coefficients_names: %s beta: %s" % (coefficients_names, beta)) # test wants to use normalized? if doNormalized: beta_used = norm_beta else: beta_used = beta coefficients = {} # create a dictionary with name, beta (including intercept) just like v1 for i,b in zip(idxs, beta_used[:-1]): name = coefficients_names[i] coefficients[name] = b print "len(idxs)", len(idxs), "len(beta_used)", len(beta_used) print "coefficients:", coefficients print "beta:", beta print "norm_beta:", norm_beta coefficients['Intercept'] = beta_used[-1] print "len(coefficients_names)", len(coefficients_names) print "len(idxs)", len(idxs) print "idxs[-1]", idxs[-1] print "intercept demapping info:", \ "coefficients_names[-i]:", coefficients_names[-1], \ "idxs[-1]:", idxs[-1], \ "coefficients_names[idxs[-1]]:", coefficients_names[idxs[-1]], \ "beta_used[-1]:", beta_used[-1], \ "coefficients['Intercept']", coefficients['Intercept'] # last one is intercept interceptName = coefficients_names[idxs[-1]] if interceptName != "Intercept" or abs(beta_used[-1])<1e-26: raise Exception("'Intercept' should be last in coefficients_names and beta %s %s %s" %\ (idxs[-1], beta_used[-1], "-"+interceptName+"-")) # idxs has the order for non-zero coefficients, it's shorter than beta_used and coefficients_names # new 5/28/14. glm can point to zero coefficients # for i in idxs: # if beta_used[i]==0.0: ## raise Exception("idxs shouldn't point to any 0 coefficients i: %s %s:" % (i, beta_used[i])) if len(idxs) > len(beta_used): raise Exception("idxs shouldn't be longer than beta_used %s %s" % (len(idxs), len(beta_used))) intercept = coefficients.pop('Intercept', None) # intercept demapping info: idxs[-1]: 54 coefficients_names[[idxs[-1]]: Intercept beta_used[-1]: -6.6866753099 # the last one shoudl be 'Intercept' ? coefficients_names.pop() # have to skip the output col! get it from kwargs # better always be there! y = kwargs['response'] # the dict keys are column headers if they exist...how to order those? new: use the 'coefficients_names' # from the response # Tomas created 'coefficients_names which is the coefficient list in order. # Just use it to index coefficients! works for header or no-header cases # I guess now we won't print the "None" cases for dropped columns (constant columns!) # Because Tomas doesn't get everything in 'coefficients_names' if dropped by GLMQuery before # he gets it? def add_to_coefficient_list_and_string(c, cList, cString): if c in coefficients: cValue = coefficients[c] cValueString = "%s: %.5e " % (c, cValue) else: print "Warning: didn't see '" + c + "' in json coefficient response.",\ "Inserting 'None' with assumption it was dropped due to constant column)" cValue = None cValueString = "%s: %s " % (c, cValue) cList.append(cValue) # we put each on newline for easy comparison to R..otherwise keep condensed if prettyPrint: cValueString = "H2O coefficient " + cValueString + "\n" # not mutable? return cString + cValueString # creating both a string for printing and a list of values cString = "" cList = [] # print in order using col_names # coefficients_names is input only now..same for header or no header, or expanded enums for c in coefficients_names: cString = add_to_coefficient_list_and_string(c, cList, cString) if prettyPrint: print "\nH2O intercept:\t\t%.5e" % intercept print cString else: if not noPrint: print "\nintercept:", intercept, cString print "\nTotal # of coefficients:", len(coefficients_names) # pick out the coefficent for the column we enabled for enhanced checking. Can be None. # FIX! temporary hack to deal with disappearing/renaming columns in GLM if (not allowZeroCoeff) and (colX is not None): absXCoeff = abs(float(coefficients[str(colX)])) self.assertGreater(absXCoeff, 1e-26, ( "abs. value of GLM coefficients['" + str(colX) + "'] is " + str(absXCoeff) + ", not >= 1e-26 for X=" + str(colX) )) # intercept is buried in there too absIntercept = abs(float(intercept)) self.assertGreater(absIntercept, 1e-26, ( "abs. value of GLM coefficients['Intercept'] is " + str(absIntercept) + ", not >= 1e-26 for Intercept" )) # this is good if we just want min or max # maxCoeff = max(coefficients, key=coefficients.get) # for more, just invert the dictionary and ... if (len(coefficients)>0): maxKey = max([(abs(coefficients[x]),x) for x in coefficients])[1] print "H2O Largest abs. coefficient value:", maxKey, coefficients[maxKey] minKey = min([(abs(coefficients[x]),x) for x in coefficients])[1] print "H2O Smallest abs. coefficient value:", minKey, coefficients[minKey] else: print "Warning, no coefficients returned. Must be intercept only?" # many of the GLM tests aren't single column though. # quick and dirty check: if all the coefficients are zero, # something is broken # intercept is in there too, but this will get it okay # just sum the abs value up..look for greater than 0 # skip this test if there is just one coefficient. Maybe pointing to a non-important coeff? if (not allowZeroCoeff) and (len(coefficients)>1): s = 0.0 for c in coefficients: v = coefficients[c] s += abs(float(v)) self.assertGreater(s, 1e-26, ( "sum of abs. value of GLM coefficients/intercept is " + str(s) + ", not >= 1e-26" )) print "submodels1, run_time (milliseconds):", submodels1['run_time'] # shouldn't have any errors h2o.check_sandbox_for_errors() return (warnings, cList, intercept)
def test_GBM_with_cancels(self): print "do import/parse with VA" h2o.beta_features = False importFolderPath = 'standard' timeoutSecs = 500 csvFilenameAll = [ # have to use col name for response? # ("manyfiles-nflx-gz", "file_1.dat.gz", 378), # ("manyfiles-nflx-gz", "file_1.dat.gz", 378), # ("manyfiles-nflx-gz", "file_[1-9].dat.gz", 378), ("standard", "covtype.data", 54), # ("standard", "covtype20x.data", 54), ] # csvFilenameList = random.sample(csvFilenameAll,1) csvFilenameList = csvFilenameAll # pop open a browser on the cloud # h2b.browseTheCloud() for (importFolderPath, csvFilename, response) in csvFilenameList: # creates csvFilename.hex from file in importFolder dir csvPathname = importFolderPath + "/" + csvFilename ### h2o.beta_features = False (importResult, importPattern) = h2i.import_only(bucket='home-0xdiag-datasets', path=csvPathname, schema='local', timeoutSecs=50) parseResult = h2i.import_parse( bucket='home-0xdiag-datasets', path=csvPathname, schema='local', hex_key='c.hex', timeoutSecs=500, noPoll=False, doSummary=False ) # can't do summary until parse result is correct json h2o.check_sandbox_for_errors() # wait for it to show up in jobs? ## time.sleep(2) # no pattern waits for all ## h2o_jobs.pollWaitJobs(pattern=None, timeoutSecs=300, pollTimeoutSecs=10, retryDelaySecs=5) # hack it because no response from Parse2 if h2o.beta_features: parseResult = {'destination_key': 'c.hex'} print "\nparseResult", h2o.dump_json(parseResult) print "Parse result['destination_key']:", parseResult[ 'destination_key'] ## What's wrong here? too big? ### inspect = h2o_cmd.runInspect(key=parseResult['destination_key'], timeoutSecs=30, verbose=True) h2o.check_sandbox_for_errors() # have to avoid this on nflx data. colswap with exec # Exception: rjson error in gbm: Argument 'response' error: Only integer or enum/factor columns can be classified if importFolderPath == 'manyfiles-nflx-gz': if DO_CLASSIFICATION: # need to flip the right col! (R wise) execExpr = 'c.hex[,%s]=c.hex[,%s]>15' % (response + 1, response + 1) kwargs = {'str': execExpr} resultExec = h2o_cmd.runExec(**kwargs) # lets look at the response column now h2o.beta_features = True s = h2o_cmd.runSummary(key="c.hex", cols=response, max_ncols=1) # x = range(542) # remove the output too! (378) xIgnore = [] # BUG if you add unsorted 378 to end. remove for now for i in [ 3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 16, 17, 18, 19, 20, 424, 425, 426, 540, 541, response ]: # have to add 1 for col start with 1, now. plus the C xIgnore.append("C" + str(i + 1)) else: # leave one col ignored, just to see? xIgnore = 'C1' modelKey = "GBMGood" params = { 'destination_key': modelKey, 'ignored_cols_by_name': xIgnore, 'learn_rate': .1, 'ntrees': 2, 'max_depth': 8, 'min_rows': 1, 'response': "C" + str(response + 1), 'classification': 1 if DO_CLASSIFICATION else 0, 'grid_parallelism': 4, } kwargs = params.copy() timeoutSecs = 1800 start = time.time() GBMFirstResult = h2o_cmd.runGBM(parseResult=parseResult, noPoll=True, **kwargs) print "\nGBMFirstResult:", h2o.dump_json(GBMFirstResult) # no pattern waits for all for i in range(15): # now issue a couple background GBM jobs that we'll kill jobids = [] for j in range(5): # FIX! apparently we can't reuse a model key after a cancel kwargs['destination_key'] = 'GBMBad' + str(i) + str(j) GBMFirstResult = h2o_cmd.runGBM(parseResult=parseResult, noPoll=True, **kwargs) jobids.append(GBMFirstResult['job_key']) # have to pass the job id for j in jobids: h2o.nodes[0].jobs_cancel(key=j) h2o_jobs.pollWaitJobs(pattern='GBMGood', timeoutSecs=300, pollTimeoutSecs=10, retryDelaySecs=5) elapsed = time.time() - start print "GBM training completed in", elapsed, "seconds." gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey) # errrs from end of list? is that the last tree? errsLast = gbmTrainView['gbm_model']['errs'][-1] print "GBM 'errsLast'", errsLast if DO_CLASSIFICATION: cm = gbmTrainView['gbm_model']['cms'][-1][ '_arr'] # use the last one pctWrongTrain = h2o_gbm.pp_cm_summary(cm) print "Last line of this cm might be NAs, not CM" print "\nTrain\n==========\n" print h2o_gbm.pp_cm(cm) else: print "GBMTrainView:", h2o.dump_json( gbmTrainView['gbm_model']['errs']) h2o.check_sandbox_for_errors() if DELETE_KEYS: h2i.delete_keys_from_import_result(pattern=csvFilename, importResult=importResult)
def test_GBM_params_rand2(self): h2o.beta_features = True bucket = 'home-0xdiag-datasets' modelKey = 'GBMModelKey' files = [ # ('standard', 'covtype.shuffled.90pct.data', 'covtype.train.hex', 1800, 54, 'covtype.shuffled.10pct.data', 'covtype.test.hex') ('standard', 'covtype.shuffled.10pct.sorted.data', 'covtype.train.hex', 1800, 'C54', 'covtype.shuffled.10pct.data', 'covtype.test.hex') ] for (importFolderPath, trainFilename, trainKey, timeoutSecs, response, testFilename, testKey) in files: # PARSE train**************************************** start = time.time() xList = [] eList = [] fList = [] # Parse (train)**************************************** parseTrainResult = h2i.import_parse(bucket=bucket, path=importFolderPath + "/" + trainFilename, schema='local', hex_key=trainKey, timeoutSecs=timeoutSecs, doSummary=False) elapsed = time.time() - start print "train parse end on ", trainFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "train parse result:", trainKey # Parse (test)**************************************** parseTestResult = h2i.import_parse(bucket=bucket, path=importFolderPath + "/" + testFilename, schema='local', hex_key=testKey, timeoutSecs=timeoutSecs, doSummary=False) elapsed = time.time() - start print "test parse end on ", testFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "test parse result:", testKey paramsDict = define_gbm_params() for trial in range(3): # use this to set any defaults you want if the pick doesn't set print "Regression!" params = {'response': 'C54', 'ignored_cols_by_name': 'C5,C6,C7,C8,C9', 'ntrees': 2, 'classification': 0} h2o_gbm.pickRandGbmParams(paramsDict, params) print "Using these parameters for GBM: ", params kwargs = params.copy() # GBM train**************************************** trainStart = time.time() gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult, timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs) trainElapsed = time.time() - trainStart print "GBM training completed in", trainElapsed, "seconds. On dataset: ", trainFilename gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey) # errrs from end of list? is that the last tree? errsLast = gbmTrainView['gbm_model']['errs'][-1] print "GBM 'errsLast'", errsLast cm = gbmTrainView['gbm_model']['cms'][-1] # use the last one pctWrongTrain = h2o_gbm.pp_cm_summary(cm); print "\nTrain\n==========\n" print h2o_gbm.pp_cm(cm) # GBM test**************************************** predictKey = 'Predict.hex' start = time.time() gbmTestResult = h2o_cmd.runPredict( data_key=testKey, model_key=modelKey, destination_key=predictKey, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "GBM predict completed in", elapsed, "seconds. On dataset: ", testFilename print "FIX! where do we get the summary info on the test data after predict?"
def test_c10_rel_gbm(self): print "not necessarily run as user=0xcust..., can't use a schema='put' here" print "Want to be able to run python as jenkins" print "I guess for big 0xcust files, we don't need schema='put'" print "For files that we want to put (for testing put), we can get non-private files" # Parse Test*********************************************************** importFolderPath = '/mnt/0xcustomer-datasets/c3' testFilename = 'classification1Test.txt' testPathname = importFolderPath + "/" + testFilename start = time.time() parseTestResult = h2i.import_parse(path=testPathname, schema='local', timeoutSecs=500, doSummary=True) print "Parse of", parseTestResult['destination_key'], "took", time.time() - start, "seconds" # Parse Train*********************************************************** importFolderPath = '/mnt/0xcustomer-datasets/c3' trainFilename = 'classification1Train.txt' trainPathname = importFolderPath + "/" + trainFilename start = time.time() parseTrainResult = h2i.import_parse(path=trainPathname, schema='local', timeoutSecs=500, doSummary=True) print "Parse of", parseTrainResult['destination_key'], "took", time.time() - start, "seconds" start = time.time() inspect = h2o_cmd.runInspect(None, parseTrainResult['destination_key'], timeoutSecs=500) print "Inspect:", parseTrainResult['destination_key'], "took", time.time() - start, "seconds" h2o_cmd.infoFromInspect(inspect, trainPathname) # num_rows = inspect['num_rows'] # num_cols = inspect['num_cols'] # do summary of the parsed dataset last, since we know it fails on this dataset summaryResult = h2o_cmd.runSummary(key=parseTrainResult['destination_key']) h2o_cmd.infoFromSummary(summaryResult, noPrint=False) # GBM Train*********************************************************** x = [6,7,8,10,12,31,32,33,34,35,36,37,40,41,42,43,44,45,46,47,49,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70] # response = 0 # doesn't work if index is used? response = 'outcome' # x = range(inspect['num_cols']) # del x[response] ntrees = 10 # fails with 40 params = { 'learn_rate': .2, 'nbins': 1024, 'ntrees': ntrees, 'max_depth': 20, 'min_rows': 2, 'response': response, 'cols': x, # 'ignored_cols_by_name': None, } print "Using these parameters for GBM: ", params kwargs = params.copy() modelKey = 'GBMModelKey' timeoutSecs = 900 trainStart = time.time() gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult, timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs) trainElapsed = time.time() - trainStart print "GBM training completed in", trainElapsed, "seconds. On dataset: ", trainFilename gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey) # errrs from end of list? is that the last tree? errsLast = gbmTrainView['gbm_model']['errs'][-1] print "GBM 'errsLast'", errsLast # get the last cm cm = gbmTrainView['gbm_model']['cms'][-1]['_arr'] pctWrongTrain = h2o_gbm.pp_cm_summary(cm); print "Last line of this cm might be NAs, not CM" print "\nTrain\n==========\n" print h2o_gbm.pp_cm(cm) # GBM test**************************************** predictKey = 'Predict.hex' h2o_cmd.runInspect(key=parseTestResult['destination_key']) start = time.time() gbmTestResult = h2o_cmd.runPredict( data_key=parseTestResult['destination_key'], model_key=modelKey, destination_key=predictKey, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "GBM predict completed in", elapsed, "seconds. On dataset: ", testFilename if DO_PREDICT_CM: gbmPredictCMResult =h2o.nodes[0].predict_confusion_matrix( actual=parseTestResult['destination_key'], vactual='predict', predict=predictKey, vpredict='predict', # choices are 7 (now) and 'predict' ) # errrs from end of list? is that the last tree? # all we get is cm cm = gbmPredictCMResult['cm'] # These will move into the h2o_gbm.py pctWrong = h2o_gbm.pp_cm_summary(cm); print "Last line of this cm is really NAs, not CM" print "\nTest\n==========\n" print h2o_gbm.pp_cm(cm)
def test_GBM_params_rand2(self): h2o.beta_features = False bucket = 'home-0xdiag-datasets' modelKey = 'GBMModelKey' files = [ # ('standard', 'covtype.shuffled.90pct.data', 'covtype.train.hex', 1800, 54, 'covtype.shuffled.10pct.data', 'covtype.test.hex') ('standard', 'covtype.shuffled.10pct.sorted.data', 'covtype.train.hex', 1800, 54, 'covtype.shuffled.10pct.data', 'covtype.test.hex') ] for (importFolderPath, trainFilename, trainKey, timeoutSecs, response, testFilename, testKey) in files: h2o.beta_features = False #turn off beta_features # PARSE train**************************************** start = time.time() xList = [] eList = [] fList = [] # Parse (train)**************************************** if h2o.beta_features: print "Parsing to fvec directly! Have to noPoll=true!, and doSummary=False!" parseTrainResult = h2i.import_parse(bucket=bucket, path=importFolderPath + "/" + trainFilename, schema='local', hex_key=trainKey, timeoutSecs=timeoutSecs, noPoll=h2o.beta_features, doSummary=False) # hack if h2o.beta_features: h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs) print "Filling in the parseTrainResult['destination_key'] for h2o" parseTrainResult['destination_key'] = trainKey elapsed = time.time() - start print "train parse end on ", trainFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "train parse result:", parseTrainResult['destination_key'] # Parse (test)**************************************** if h2o.beta_features: print "Parsing to fvec directly! Have to noPoll=true!, and doSummary=False!" parseTestResult = h2i.import_parse(bucket=bucket, path=importFolderPath + "/" + testFilename, schema='local', hex_key=testKey, timeoutSecs=timeoutSecs, noPoll=h2o.beta_features, doSummary=False) # hack if h2o.beta_features: h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs) print "Filling in the parseTestResult['destination_key'] for h2o" parseTestResult['destination_key'] = testKey elapsed = time.time() - start print "test parse end on ", testFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "test parse result:", parseTestResult['destination_key'] # GBM (train iterate)**************************************** inspect = h2o_cmd.runInspect(key=parseTestResult['destination_key']) paramsDict = define_gbm_params() for trial in range(3): h2o.beta_features = True # translate it (only really need to do once . out of loop? h2o_cmd.runInspect(key=parseTrainResult['destination_key']) ### h2o_cmd.runSummary(key=parsTraineResult['destination_key']) # use this to set any defaults you want if the pick doesn't set params = { 'response': 54, 'ignored_cols_by_name': '0,1,2,3,4', 'ntrees': 2, 'validation': parseTestResult['destination_key'], } h2o_gbm.pickRandGbmParams(paramsDict, params) print "Using these parameters for GBM: ", params kwargs = params.copy() # GBM train**************************************** trainStart = time.time() gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult, noPoll=True, timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs) # hack if h2o.beta_features: h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs) trainElapsed = time.time() - trainStart print "GBM training completed in", trainElapsed, "seconds. On dataset: ", trainFilename gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey) # errrs from end of list? is that the last tree? errsLast = gbmTrainView['gbm_model']['errs'][-1] print "GBM 'errsLast'", errsLast cm = gbmTrainView['gbm_model']['cms'][-1] # use the last one pctWrongTrain = h2o_gbm.pp_cm_summary(cm); print "Last line of this cm might be NAs, not CM" print "\nTrain\n==========\n" print h2o_gbm.pp_cm(cm) # GBM test**************************************** predictKey = 'Predict.hex' h2o_cmd.runInspect(key=parseTestResult['destination_key']) start = time.time() gbmTestResult = h2o_cmd.runPredict( data_key=parseTestResult['destination_key'], model_key=modelKey, destination_key=predictKey, timeoutSecs=timeoutSecs) # hack if h2o.beta_features: h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs) elapsed = time.time() - start print "GBM predict completed in", elapsed, "seconds. On dataset: ", testFilename if DO_PREDICT_CM: gbmPredictCMResult = h2o.nodes[0].predict_confusion_matrix( actual=parseTestResult['destination_key'], vactual='predict', predict=predictKey, vpredict='predict', # choices are 7 (now) and 'predict' ) # errrs from end of list? is that the last tree? # all we get is cm cm = gbmPredictCMResult['cms'][-1] # use the last one # These will move into the h2o_gbm.py pctWrong = h2o_gbm.pp_cm_summary(cm); print "Last line of this cm is really NAs, not CM" print "\nTest\n==========\n" print h2o_gbm.pp_cm(cm) # xList.append(ntrees) if 'max_depth' in params and params['max_depth']: xList.append(params['max_depth']) eList.append(pctWrongTrain) fList.append(trainElapsed) h2o.beta_features = False xLabel = 'max_depth' eLabel = 'pctWrongTrain' fLabel = 'trainElapsed' eListTitle = "" fListTitle = "" h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
def test_GLM2_mnist(self): if not SCIPY_INSTALLED: pass else: h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() csvFilelist = [ (10000, 500, 'cA', 60), ] trial = 0 for (rowCount, colCount, hex_key, timeoutSecs) in csvFilelist: trialStart = time.time() # PARSE test**************************************** csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + "/" + csvFilename write_syn_dataset(csvPathname, rowCount, colCount) start = time.time() parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", csvFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) # GLM**************************************** modelKey = 'GLM_model' y = colCount kwargs = { 'response': 'C' + str(y+1), 'family': 'binomial', 'lambda': 1e-4, 'alpha': 0, 'max_iter': 15, 'n_folds': 1, 'beta_epsilon': 1.0E-4, 'destination_key': modelKey, } # GLM wants the output col to be strictly 0,1 integer execExpr = "aHack=%s; aHack[,%s] = aHack[,%s]==1" % (hex_key, y+1, y+1) h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) aHack = {'destination_key': 'aHack'} timeoutSecs = 1800 start = time.time() glm = h2o_cmd.runGLM(parseResult=aHack, timeoutSecs=timeoutSecs, pollTimeoutSecs=60, **kwargs) elapsed = time.time() - start print "GLM completed in", elapsed, "seconds.", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) h2o_glm.simpleCheckGLM(self, glm, None, noPrint=True, **kwargs) modelKey = glm['glm_model']['_key'] # This seems wrong..what's the format of the cm? lambdaMax = glm['glm_model']['lambda_max'] print "lambdaMax:", lambdaMax best_threshold= glm['glm_model']['submodels'][0]['validation']['best_threshold'] print "best_threshold", best_threshold # pick the middle one? cm = glm['glm_model']['submodels'][0]['validation']['_cms'][5]['_arr'] print "cm:", cm pctWrong = h2o_gbm.pp_cm_summary(cm); # self.assertLess(pctWrong, 9,"Should see less than 9% error (class = 4)") print "\nTrain\n==========\n" print h2o_gbm.pp_cm(cm) # Score ******************************* # this messes up if you use case_mode/case_vale above print "\nPredict\n==========\n" predictKey = 'Predict.hex' start = time.time() predictResult = h2o_cmd.runPredict( data_key='aHack', model_key=modelKey, destination_key=predictKey, timeoutSecs=timeoutSecs) predictCMResult = h2o.nodes[0].predict_confusion_matrix( actual='aHack', vactual='C' + str(y+1), predict=predictKey, vpredict='predict', ) cm = predictCMResult['cm'] # These will move into the h2o_gbm.py pctWrong = h2o_gbm.pp_cm_summary(cm); self.assertLess(pctWrong, 50,"Should see less than 50% error") print "\nTest\n==========\n" print h2o_gbm.pp_cm(cm)
def test_c9_GLM_airlines_hdfs(self): files = [ ('datasets', 'airlines_all.csv', 'airlines_all.hex', 1800, 'IsDepDelayed') ] for importFolderPath, csvFilename, trainKey, timeoutSecs, response in files: # PARSE train**************************************** csvPathname = importFolderPath + "/" + csvFilename start = time.time() parseResult = h2i.import_parse(path=csvPathname, schema='hdfs', hex_key=trainKey, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", csvFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseResult['destination_key'] # GLM (train)**************************************** params = { # 'lambda': 1e-4, # 'alpha': 0.5, 'lambda': 1e-8, 'alpha': 0.0, 'max_iter': 10, 'n_folds': 3, 'family': 'binomial', 'destination_key': "GLMKEY", 'response': response, 'ignored_cols': 'CRSDepTime,CRSArrTime,ActualElapsedTime,CRSElapsedTime,AirTime,ArrDelay,DepDelay,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,IsArrDelayed' } kwargs = params.copy() timeoutSecs = 1800 start = time.time() glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs,**kwargs) elapsed = time.time() - start print "GLM training completed in", elapsed, "seconds. On dataset: ", csvFilename h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) modelKey = glm['glm_model']['_key'] submodels = glm['glm_model']['submodels'] # hackery to make it work when there's just one validation = submodels[-1]['validation'] best_threshold = validation['best_threshold'] thresholds = validation['thresholds'] # have to look up the index for the cm, from the thresholds list best_index = None for i,t in enumerate(thresholds): if t == best_threshold: best_index = i break cms = validation['_cms'] cm = cms[best_index] pctWrong = h2o_gbm.pp_cm_summary(cm['_arr']); # FIX! should look at prediction error/class error? # self.assertLess(pctWrong, 9,"Should see less than 40% error") print "\nTrain\n==========\n" print h2o_gbm.pp_cm(cm['_arr']) # Score ******************************* # this messes up if you use case_mode/case_vale above predictKey = 'Predict.hex' start = time.time() predictResult = h2o_cmd.runPredict( data_key=trainKey, model_key=modelKey, destination_key=predictKey, timeoutSecs=timeoutSecs) predictCMResult = h2o.nodes[0].predict_confusion_matrix( actual=trainKey, vactual=response, predict=predictKey, vpredict='predict', ) cm = predictCMResult['cm'] # These will move into the h2o_gbm.py pctWrong = h2o_gbm.pp_cm_summary(cm); # self.assertLess(pctWrong, 40,"Should see less than 40% error") print "\nTest\n==========\n" print h2o_gbm.pp_cm(cm) h2i.delete_keys_at_all_nodes(timeoutSecs=600)
def test_GBM_manyfiles_train_test(self): h2o.beta_features = True bucket = 'home-0xdiag-datasets' modelKey = 'GBMModelKey' if localhost: files = [ # None forces numCols to be used. assumes you set it from Inspect # problems with categoricals not in the train data set? (warnings in h2o stdout) ## ('manyfiles-nflx-gz', 'file_1.dat.gz', 'file_1.hex', 1800, None, 'file_11.dat.gz', 'test.hex') # just use matching ('manyfiles-nflx-gz', 'file_1.dat.gz', 'train.hex', 1800, None, 'file_1.dat.gz', 'test.hex') ] else: files = [ # None forces numCols to be used. assumes you set it from Inspect ('manyfiles-nflx-gz', 'file_[0-9].dat.gz', 'train.hex', 1800, None, 'file_1[0-9].dat.gz', 'test.hex') ] # if I got to hdfs, it's here # hdfs://172.16.2.176/datasets/manyfiles-nflx-gz/file_99.dat.gz h2b.browseTheCloud() for (importFolderPath, trainFilename, trainKey, timeoutSecs, response, testFilename, testKey) in files: # PARSE train**************************************** start = time.time() xList = [] eList = [] fList = [] # Parse (train)**************************************** csvPathname = importFolderPath + "/" + trainFilename parseTrainResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local', hex_key=trainKey, timeoutSecs=timeoutSecs, doSummary=False) elapsed = time.time() - start print "train parse end on ", trainFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "train parse result:", parseTrainResult['destination_key'] ### h2o_cmd.runSummary(key=parsTraineResult['destination_key']) inspect = h2o_cmd.runInspect(key=parseTrainResult['destination_key']) print "\n" + csvPathname, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) numRows = inspect['numRows'] numCols = inspect['numCols'] # Make col 378 it something we can do binomial regression on! execExpr = '%s[,378+1]=%s[,378+1]>15' % (trainKey, trainKey) resultExec = h2o_cmd.runExec(str=execExpr, timeoutSecs=300) # Parse (test)**************************************** parseTestResult = h2i.import_parse(bucket=bucket, path=importFolderPath + "/" + testFilename, schema='local', hex_key=testKey, timeoutSecs=timeoutSecs, doSummary=False) elapsed = time.time() - start print "test parse end on ", testFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "test parse result:", parseTestResult['destination_key'] # Make col 378 it something we can do binomial regression on! execExpr = '%s[,378+1]=%s[,378+1]>15' % (testKey, testKey) resultExec = h2o_cmd.runExec(str=execExpr, timeoutSecs=300) # Note ..no inspect of test data here..so translate happens later? # GBM (train iterate)**************************************** # if not response: # response = numCols - 1 response = 378 # randomly ignore a bunch of cols, just to make it go faster x = range(numCols) del x[response] ignored_cols_by_name = ",".join(map(lambda x: 'C' + str(x+1), random.sample(x, 300))) print "Using the same response %s for train and test (which should have a output value too)" % "C" + str(response+1) ntrees = 10 # ignore 200 random cols (not the response) for max_depth in [5, 40]: params = { 'learn_rate': .2, 'nbins': 1024, 'ntrees': ntrees, 'max_depth': max_depth, 'min_rows': 10, 'response': 'C' + str(response+1), 'ignored_cols_by_name': ignored_cols_by_name, } ### print "Using these parameters for GBM: ", params kwargs = params.copy() # GBM train**************************************** trainStart = time.time() gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult, timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs) # hack trainElapsed = time.time() - trainStart print "GBM training completed in", trainElapsed, "seconds. On dataset: ", trainFilename gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey) # errrs from end of list? is that the last tree? errsLast = gbmTrainView['gbm_model']['errs'][-1] print "GBM 'errsLast'", errsLast cm = gbmTrainView['gbm_model']['cms'][-1]['_arr'] # use the last one pctWrongTrain = h2o_gbm.pp_cm_summary(cm); print "Last line of this cm might be NAs, not CM" print "\nTrain\n==========\n" print h2o_gbm.pp_cm(cm) # GBM test**************************************** predictKey = 'Predict.hex' ### h2o_cmd.runInspect(key=parseTestResult['destination_key']) start = time.time() gbmTestResult = h2o_cmd.runPredict( data_key=parseTestResult['destination_key'], model_key=modelKey, destination_key=predictKey, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "GBM predict completed in", elapsed, "seconds. On dataset: ", testFilename gbmPredictCMResult =h2o.nodes[0].predict_confusion_matrix( actual=parseTestResult['destination_key'], vactual='C' + str(response+1), predict=predictKey, vpredict='predict', # choices are 0 and 'predict' ) # errrs from end of list? is that the last tree? # all we get is cm cm = gbmPredictCMResult['cm'] # These will move into the h2o_gbm.py pctWrong = h2o_gbm.pp_cm_summary(cm); print "Last line of this cm is really NAs, not CM" print "\nTest\n==========\n" print h2o_gbm.pp_cm(cm) # xList.append(ntrees) xList.append(max_depth) eList.append(pctWrong) fList.append(trainElapsed) xLabel = 'max_depth' eLabel = 'pctWrong' fLabel = 'trainElapsed' eListTitle = "" fListTitle = "" h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
def test_GLM2_covtype_train_predict_all_all(self): h2o.beta_features = True importFolderPath = "standard" csvFilename = 'covtype.shuffled.data' csvPathname = importFolderPath + "/" + csvFilename hex_key = csvFilename + ".hex" # Parse and Exec************************************************ parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=180) execExpr = "A.hex=%s" % parseResult['destination_key'] h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) # use exec to change the output col to binary, case_mode/case_val doesn't work if we use predict # will have to live with random extract. will create variance # class 4 = 1, everything else 0 y = 54 execExpr = "A.hex[,%s]=(A.hex[,%s]==%s)" % (y + 1, y + 1, 1) # class 1 h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) inspect = h2o_cmd.runInspect(key="A.hex") print "\n" + csvPathname, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) print "Use same data (full) for train and test" trainDataKey = "A.hex" testDataKey = "A.hex" # start at 90% rows + 1 # GLM, predict, CM*******************************************************8 kwargs = { 'response': 'C' + str(y + 1), 'max_iter': 20, 'n_folds': 0, # 'alpha': 0.1, # 'lambda': 1e-5, 'alpha': 0.0, 'lambda': None, 'family': 'binomial', } timeoutSecs = 60 for trial in range(1): # test/train split **********************************************8 aHack = {'destination_key': trainDataKey} # GLM **********************************************8 start = time.time() glm = h2o_cmd.runGLM(parseResult=aHack, timeoutSecs=timeoutSecs, pollTimeoutSecs=180, **kwargs) print "glm end on ", parseResult[ 'destination_key'], 'took', time.time() - start, 'seconds' h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) modelKey = glm['glm_model']['_key'] submodels = glm['glm_model']['submodels'] # hackery to make it work when there's just one validation = submodels[-1]['validation'] best_threshold = validation['best_threshold'] thresholds = validation['thresholds'] # have to look up the index for the cm, from the thresholds list best_index = None for i, t in enumerate(thresholds): if t == best_threshold: best_index = i break cms = validation['_cms'] cm = cms[best_index] trainPctWrong = h2o_gbm.pp_cm_summary(cm['_arr']) # Score ********************************************** predictKey = 'Predict.hex' start = time.time() predictResult = h2o_cmd.runPredict(data_key=testDataKey, model_key=modelKey, destination_key=predictKey, timeoutSecs=timeoutSecs) predictCMResult = h2o.nodes[0].predict_confusion_matrix( actual=testDataKey, vactual='C' + str(y + 1), predict=predictKey, vpredict='predict', ) cm = predictCMResult['cm'] # These will move into the h2o_gbm.py pctWrong = h2o_gbm.pp_cm_summary(cm) self.assertEqual( pctWrong, trainPctWrong, "Should see the same error rate on train and predict? (same data set)" ) print "\nTest\n==========\n" print h2o_gbm.pp_cm(cm) print "Trial #", trial, "completed"
def test_GBM_with_cancels(self): print "do import/parse with VA" h2o.beta_features = False importFolderPath = "standard" timeoutSecs = 500 csvFilenameAll = [ # have to use col name for response? # ("manyfiles-nflx-gz", "file_1.dat.gz", 378), # ("manyfiles-nflx-gz", "file_1.dat.gz", 378), # ("manyfiles-nflx-gz", "file_[1-9].dat.gz", 378), ("standard", "covtype.data", 54), # ("standard", "covtype20x.data", 54), ] # csvFilenameList = random.sample(csvFilenameAll,1) csvFilenameList = csvFilenameAll # pop open a browser on the cloud # h2b.browseTheCloud() for (importFolderPath, csvFilename, response) in csvFilenameList: # creates csvFilename.hex from file in importFolder dir csvPathname = importFolderPath + "/" + csvFilename ### h2o.beta_features = False (importResult, importPattern) = h2i.import_only( bucket="home-0xdiag-datasets", path=csvPathname, schema="local", timeoutSecs=50 ) parseResult = h2i.import_parse( bucket="home-0xdiag-datasets", path=csvPathname, schema="local", hex_key="c.hex", timeoutSecs=500, noPoll=False, doSummary=False, ) # can't do summary until parse result is correct json h2o.check_sandbox_for_errors() # wait for it to show up in jobs? ## time.sleep(2) # no pattern waits for all ## h2o_jobs.pollWaitJobs(pattern=None, timeoutSecs=300, pollTimeoutSecs=10, retryDelaySecs=5) # hack it because no response from Parse2 if h2o.beta_features: parseResult = {"destination_key": "c.hex"} print "\nparseResult", h2o.dump_json(parseResult) print "Parse result['destination_key']:", parseResult["destination_key"] ## What's wrong here? too big? ### inspect = h2o_cmd.runInspect(key=parseResult['destination_key'], timeoutSecs=30, verbose=True) h2o.check_sandbox_for_errors() # have to avoid this on nflx data. colswap with exec # Exception: rjson error in gbm: Argument 'response' error: Only integer or enum/factor columns can be classified if importFolderPath == "manyfiles-nflx-gz": if DO_CLASSIFICATION: # need to flip the right col! (R wise) execExpr = "c.hex[,%s]=c.hex[,%s]>15" % (response + 1, response + 1) kwargs = {"str": execExpr} resultExec = h2o_cmd.runExec(**kwargs) # lets look at the response column now h2o.beta_features = True s = h2o_cmd.runSummary(key="c.hex", cols=response, max_ncols=1) # x = range(542) # remove the output too! (378) xIgnore = [] # BUG if you add unsorted 378 to end. remove for now for i in [3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 16, 17, 18, 19, 20, 424, 425, 426, 540, 541, response]: # have to add 1 for col start with 1, now. plus the C xIgnore.append("C" + str(i + 1)) else: # leave one col ignored, just to see? xIgnore = "C1" modelKey = "GBMGood" params = { "destination_key": modelKey, "ignored_cols_by_name": xIgnore, "learn_rate": 0.1, "ntrees": 2, "max_depth": 8, "min_rows": 1, "response": "C" + str(response + 1), "classification": 1 if DO_CLASSIFICATION else 0, "grid_parallelism": 4, } kwargs = params.copy() timeoutSecs = 1800 start = time.time() GBMFirstResult = h2o_cmd.runGBM(parseResult=parseResult, noPoll=True, **kwargs) print "\nGBMFirstResult:", h2o.dump_json(GBMFirstResult) # no pattern waits for all for i in range(20): # now issue a couple background GBM jobs that we'll kill jobids = [] for j in range(5): # FIX! apparently we can't reuse a model key after a cancel kwargs["destination_key"] = "GBMBad" + str(i) + str(j) GBMFirstResult = h2o_cmd.runGBM(parseResult=parseResult, noPoll=True, **kwargs) jobids.append(GBMFirstResult["job_key"]) # have to pass the job id for j in jobids: h2o.nodes[0].jobs_cancel(key=j) h2o_jobs.pollWaitJobs(pattern="GBMGood", timeoutSecs=300, pollTimeoutSecs=10, retryDelaySecs=5) elapsed = time.time() - start print "GBM training completed in", elapsed, "seconds." gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey) # errrs from end of list? is that the last tree? errsLast = gbmTrainView["gbm_model"]["errs"][-1] print "GBM 'errsLast'", errsLast if DO_CLASSIFICATION: cm = gbmTrainView["gbm_model"]["cms"][-1]["_arr"] # use the last one pctWrongTrain = h2o_gbm.pp_cm_summary(cm) print "Last line of this cm might be NAs, not CM" print "\nTrain\n==========\n" print h2o_gbm.pp_cm(cm) else: print "GBMTrainView:", h2o.dump_json(gbmTrainView["gbm_model"]["errs"]) h2o.check_sandbox_for_errors() if DELETE_KEYS: h2i.delete_keys_from_import_result(pattern=csvFilename, importResult=importResult)
def test_RF_many_cols_enum(self): SYNDATASETS_DIR = h2o.make_syn_dir() translateList = [ 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u' ] tryList = [ (10000, 100, 'cA', 300), (10000, 300, 'cB', 500), # (10000, 500, 'cC', 700), # (10000, 700, 'cD', 3600), # (10000, 900, 'cE', 3600), # (10000, 1000, 'cF', 3600), # (10000, 1300, 'cG', 3600), # (10000, 1700, 'cH', 3600), # (10000, 2000, 'cI', 3600), # (10000, 2500, 'cJ', 3600), # (10000, 3000, 'cK', 3600), ] ### h2b.browseTheCloud() for (rowCount, colCount, hex_key, timeoutSecs) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) # csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv' csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str( colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, translateList) # PARSE train**************************************** start = time.time() xList = [] eList = [] fList = [] modelKey = 'RFModelKey' # Parse (train)**************************************** start = time.time() parseTrainResult = h2i.import_parse(bucket=None, path=csvPathname, schema='put', header=0, hex_key=hex_key, timeoutSecs=timeoutSecs, doSummary=False) elapsed = time.time() - start print "train parse end on ", csvPathname, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "train parse result:", parseTrainResult['destination_key'] # Logging to a benchmark file algo = "Parse" l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename, elapsed) print l h2o.cloudPerfH2O.message(l) inspect = h2o_cmd.runInspect( key=parseTrainResult['destination_key']) print "\n" + csvPathname, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) numRows = inspect['numRows'] numCols = inspect['numCols'] ### h2o_cmd.runSummary(key=parsTraineResult['destination_key']) # RF(train iterate)**************************************** ntrees = 10 for max_depth in [5, 10, 20, 40]: params = { 'nbins': 1024, 'classification': 1, 'ntrees': ntrees, 'max_depth': max_depth, 'min_rows': 10, 'response': 'C' + str(numCols - 1), 'ignored_cols_by_name': None, } print "Using these parameters for RF: ", params kwargs = params.copy() trainStart = time.time() rfResult = h2o_cmd.runRF(parseResult=parseTrainResult, timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs) trainElapsed = time.time() - trainStart print "RF training completed in", trainElapsed, "seconds. On dataset: ", csvPathname # Logging to a benchmark file algo = "RF " + " ntrees=" + str(ntrees) + " max_depth=" + str( max_depth) l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename, trainElapsed) print l h2o.cloudPerfH2O.message(l) errsLast = rfResult['drf_model']['errs'][-1] print "RF 'errsLast'", errsLast cm = rfResult['drf_model']['cms'][-1][ '_arr'] # use the last one pctWrongTrain = h2o_gbm.pp_cm_summary(cm) print "\nTrain\n==========\n" print h2o_gbm.pp_cm(cm) # xList.append(ntrees) xList.append(max_depth) eList.append(pctWrongTrain) fList.append(trainElapsed) # just plot the last one if 1 == 1: xLabel = 'max_depth' eLabel = 'pctWrong' fLabel = 'trainElapsed' eListTitle = "" fListTitle = "" h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
def test_NN_mnist(self): #h2b.browseTheCloud() csvPathname_train = 'standard/covtype.shuffled.90pct.data' csvPathname_test = 'standard/covtype.shuffled.10pct.data' hex_key = 'covtype.hex' validation_key = 'covtype.hex' timeoutSecs = 30 parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname_train, schema='local', hex_key=hex_key, timeoutSecs=timeoutSecs) parseResultV = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname_test, schema='local', hex_key=validation_key, timeoutSecs=timeoutSecs) inspect = h2o_cmd.runInspect(None, hex_key) print "\n" + csvPathname_train, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) response = inspect['numCols'] - 1 #Making random id identifier = ''.join( random.sample(string.ascii_lowercase + string.digits, 10)) model_key = 'nn_' + identifier + '.hex' kwargs = { 'ignored_cols': None, 'response': response, 'classification': 1, 'activation': 'RectifierWithDropout', 'input_dropout_ratio': 0.2, 'hidden': '117,131,129', 'adaptive_rate': 0, 'rate': 0.005, 'rate_annealing': 1e-6, 'momentum_start': 0.5, 'momentum_ramp': 100000, 'momentum_stable': 0.9, 'l1': 0.00001, 'l2': 0.0000001, 'seed': 98037452452, 'loss': 'CrossEntropy', 'max_w2': 15, 'initial_weight_distribution': 'UniformAdaptive', #'initial_weight_scale' : 0.01, 'epochs': 96.0, 'destination_key': model_key, 'validation': validation_key, 'score_interval': 10000 } expectedErr = 0.24 ## expected validation error for the above model relTol = 0.20 ## 20% rel. error tolerance due to Hogwild! timeoutSecs = 600 start = time.time() nn = h2o_cmd.runDeepLearning(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) print "neural net end on ", csvPathname_train, " and ", csvPathname_test, 'took', time.time( ) - start, 'seconds' predict_key = 'score_' + identifier + '.hex' kwargs = { 'data_key': validation_key, 'destination_key': predict_key, 'model_key': model_key } predictResult = h2o_cmd.runPredict(timeoutSecs=timeoutSecs, **kwargs) h2o_cmd.runInspect(key=predict_key, verbose=True) kwargs = {} predictCMResult = h2o.nodes[0].predict_confusion_matrix( actual=validation_key, vactual=response, predict=predict_key, vpredict='predict', timeoutSecs=timeoutSecs, **kwargs) cm = predictCMResult['cm'] print h2o_gbm.pp_cm(cm) actualErr = h2o_gbm.pp_cm_summary(cm) / 100. print "actual classification error:" + format(actualErr) print "expected classification error:" + format(expectedErr) if actualErr != expectedErr and abs( (expectedErr - actualErr) / expectedErr) > relTol: raise Exception( "Scored classification error of %s is not within %s %% relative error of %s" % (actualErr, float(relTol) * 100, expectedErr))
def test_DeepLearning_mnist(self): #h2b.browseTheCloud() h2o.beta_features = True csvPathname_train = 'mnist/train.csv.gz' csvPathname_test = 'mnist/test.csv.gz' hex_key = 'mnist_train.hex' validation_key = 'mnist_test.hex' timeoutSecs = 300 parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname_train, schema='put', hex_key=hex_key, timeoutSecs=timeoutSecs) parseResultV = h2i.import_parse(bucket='smalldata', path=csvPathname_test, schema='put', hex_key=validation_key, timeoutSecs=timeoutSecs) inspect = h2o_cmd.runInspect(None, hex_key) print "\n" + csvPathname_train, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) response = inspect['numCols'] - 1 #Making random id identifier = ''.join(random.sample(string.ascii_lowercase + string.digits, 10)) model_key = 'deeplearning_' + identifier + '.hex' kwargs = { 'ignored_cols' : None, 'response' : response, 'classification' : 1, 'activation' : 'RectifierWithDropout', 'input_dropout_ratio' : 0.2, 'hidden' : '1024,1024,2048', 'adaptive_rate' : 1, 'rho' : 0.99, 'epsilon' : 1e-8, 'train_samples_per_iteration' : -1, ## 0: better accuracy! -1: best scalability! 10000: best accuracy? # 'rate' : 0.01, # 'rate_annealing' : 1e-6, # 'momentum_start' : 0.5, # 'momentum_ramp' : 1800000, # 'momentum_stable' : 0.99, 'l1' : 1e-5, 'l2' : 0.0, 'seed' : 98037452452, 'loss' : 'CrossEntropy', 'max_w2' : 15, 'initial_weight_distribution' : 'UniformAdaptive', 'epochs' : 128, #enough for 64 nodes 'destination_key' : model_key, 'validation' : validation_key, 'score_interval' : 10000 #don't score until the end } timeoutSecs = 7200 start = time.time() deeplearning = h2o_cmd.runDeepLearning(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) print "neural net end on ", csvPathname_train, " and ", csvPathname_test, 'took', time.time() - start, 'seconds' predict_key = 'score_' + identifier + '.hex' kwargs = { 'data_key': validation_key, 'destination_key': predict_key, 'model_key': model_key } h2o.beta_features = True predictResult = h2o_cmd.runPredict(timeoutSecs=timeoutSecs, **kwargs) h2o_cmd.runInspect(key=predict_key, verbose=True) kwargs = { } predictCMResult = h2o.nodes[0].predict_confusion_matrix( actual=validation_key, vactual=response, predict=predict_key, vpredict='predict', timeoutSecs=timeoutSecs, **kwargs) cm = predictCMResult['cm'] print h2o_gbm.pp_cm(cm) actualErr = h2o_gbm.pp_cm_summary(cm)/100.; print "actual classification error:" + format(actualErr) h2o.beta_features = False
def test_RF_many_cols_enum(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() translateList = ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u'] tryList = [ (10000, 100, 'cA', 300), (10000, 300, 'cB', 500), # (10000, 500, 'cC', 700), # (10000, 700, 'cD', 3600), # (10000, 900, 'cE', 3600), # (10000, 1000, 'cF', 3600), # (10000, 1300, 'cG', 3600), # (10000, 1700, 'cH', 3600), # (10000, 2000, 'cI', 3600), # (10000, 2500, 'cJ', 3600), (10000, 3000, 'cK', 3600), ] ### h2b.browseTheCloud() for (rowCount, colCount, hex_key, timeoutSecs) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) # csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv' csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, translateList) # PARSE train**************************************** start = time.time() xList = [] eList = [] fList = [] modelKey = 'RFModelKey' # Parse (train)**************************************** start = time.time() parseTrainResult = h2i.import_parse(bucket=None, path=csvPathname, schema='put', header=0, hex_key=hex_key, timeoutSecs=timeoutSecs, doSummary=False) elapsed = time.time() - start print "train parse end on ", csvPathname, 'took', elapsed, 'seconds', \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "train parse result:", parseTrainResult['destination_key'] # Logging to a benchmark file algo = "Parse" l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename, elapsed) print l h2o.cloudPerfH2O.message(l) inspect = h2o_cmd.runInspect(key=parseTrainResult['destination_key']) print "\n" + csvPathname, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) numRows = inspect['numRows'] numCols = inspect['numCols'] ### h2o_cmd.runSummary(key=parsTraineResult['destination_key']) # RF(train iterate)**************************************** ntrees = 10 for max_depth in [5,10,20,40]: params = { 'nbins': 1024, 'classification': 1, 'ntrees': ntrees, 'max_depth': max_depth, 'response': 'C' + str(numCols-1), 'ignored_cols_by_name': None, } print "Using these parameters for RF: ", params kwargs = params.copy() trainStart = time.time() rfResult = h2o_cmd.runSpeeDRF(parseResult=parseTrainResult, timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs) trainElapsed = time.time() - trainStart print "RF training completed in", trainElapsed, "seconds. On dataset: ", csvPathname # Logging to a benchmark file algo = "RF " + " ntrees=" + str(ntrees) + " max_depth=" + str(max_depth) l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename, trainElapsed) print l h2o.cloudPerfH2O.message(l) rfResult["drf_model"] = rfResult.pop("speedrf_model") errsLast = rfResult['drf_model']['errs'][-1] print "RF 'errsLast'", errsLast cm = rfResult['drf_model']['cms'][-1]['_arr'] # use the last one pctWrongTrain = h2o_gbm.pp_cm_summary(cm); print "\nTrain\n==========\n" print h2o_gbm.pp_cm(cm) # xList.append(ntrees) xList.append(max_depth) eList.append(pctWrongTrain) fList.append(trainElapsed) # just plot the last one if 1==1: xLabel = 'max_depth' eLabel = 'pctWrong' fLabel = 'trainElapsed' eListTitle = "" fListTitle = "" h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
def test_c9_GLM_rc_fvec(self): h2o.beta_features = True files = [ ('c16', '140k_train_anonymised.csv', 'rc.hex', 1800, None) ] for importFolderPath, csvFilename, trainKey, timeoutSecs, response in files: # PARSE train**************************************** csvPathname = importFolderPath + "/" + csvFilename start = time.time() # avoid printing the coefficient names in jenkins output # the last col is the response, so we use a number to point to it below parseResult = h2i.import_parse(bucket='0xcustomer-datasets', path=csvPathname, schema='local', hex_key=trainKey, header=0, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", csvFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) inspect = h2o_cmd.runInspect(key=parseResult['destination_key']) numRows = inspect['numRows'] numCols = inspect['numCols'] response = numCols-1 # GLM (train)**************************************** params = { # 'lambda': 1e-4, # 'alpha': 0.5, 'lambda': 1e-8, 'alpha': 0.0, 'max_iter': 10, 'n_folds': 0, 'family': 'binomial', 'destination_key': "GLMKEY", 'response': response, } kwargs = params.copy() timeoutSecs = 1800 start = time.time() glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs,**kwargs) elapsed = time.time() - start print "GLM training completed in", elapsed, "seconds. On dataset: ", csvFilename h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) if h2o.beta_features: modelKey = glm['glm_model']['_key'] submodels = glm['glm_model']['submodels'] # hackery to make it work when there's just one validation = submodels[-1]['validation'] best_threshold = validation['best_threshold'] thresholds = validation['thresholds'] # have to look up the index for the cm, from the thresholds list best_index = None for i,t in enumerate(thresholds): if t == best_threshold: best_index = i break cms = validation['_cms'] cm = cms[best_index] pctWrong = h2o_gbm.pp_cm_summary(cm['_arr']); # FIX! should look at prediction error/class error? # self.assertLess(pctWrong, 9,"Should see less than 40% error") print "\nTrain\n==========\n" print h2o_gbm.pp_cm(cm['_arr']) # Score ******************************* # this messes up if you use case_mode/case_vale above predictKey = 'Predict.hex' start = time.time() predictResult = h2o_cmd.runPredict( data_key=trainKey, model_key=modelKey, destination_key=predictKey, timeoutSecs=timeoutSecs) predictCMResult = h2o.nodes[0].predict_confusion_matrix( actual=trainKey, vactual=response, predict=predictKey, vpredict='predict', ) cm = predictCMResult['cm'] # These will move into the h2o_gbm.py pctWrong = h2o_gbm.pp_cm_summary(cm); # self.assertLess(pctWrong, 40,"Should see less than 40% error") print "\nTest\n==========\n" print h2o_gbm.pp_cm(cm) h2i.delete_keys_at_all_nodes(timeoutSecs=600)
h2o_cmd.runInspect(key=parseTrainResult['destination_key']) ### h2o_cmd.runSummary(key=parsTraineResult['destination_key']) # GBM train**************************************** trainStart = time.time() gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult, timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs) trainElapsed = time.time() - trainStart print "GBM training completed in", trainElapsed, "seconds. On dataset: ", trainFilename gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey) # errrs from end of list? is that the last tree? errsLast = gbmTrainView['gbm_model']['errs'][-1] print "GBM 'errsLast'", errsLast cm = gbmTrainView['gbm_model']['cms'][-1]['_arr'] pctWrongTrain = h2o_gbm.pp_cm_summary(cm); print "Last line of this cm might be NAs, not CM" print "\nTrain\n==========\n" print h2o_gbm.pp_cm(cm) # GBM test**************************************** predictKey = 'Predict.hex' h2o_cmd.runInspect(key=parseTestResult['destination_key']) start = time.time() gbmTestResult = h2o_cmd.runPredict( data_key=parseTestResult['destination_key'], model_key=modelKey, destination_key=predictKey, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "GBM predict completed in", elapsed, "seconds. On dataset: ", testFilename
def test_GLM2_covtype20x_train(self): h2o.beta_features = True importFolderPath = "standard" csvFilename = 'covtype20x.data' csvPathname = importFolderPath + "/" + csvFilename hex_key = csvFilename + ".hex" # Parse and Exec************************************************ parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=180) execExpr="A.hex=%s" % parseResult['destination_key'] h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) # use exec to change the output col to binary, case_mode/case_val doesn't work if we use predict # will have to live with random extract. will create variance # class 4 = 1, everything else 0 y = 54 execExpr="A.hex[,%s]=(A.hex[,%s]==%s)" % (y+1, y+1, 4) h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) inspect = h2o_cmd.runInspect(key="A.hex") print "\n" + csvPathname, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) # Split Test/Train************************************************ # how many rows for each pct? numRows = inspect['numRows'] pct10 = int(numRows * .1) rowsForPct = [i * pct10 for i in range(0,11)] # this can be slightly less than 10% last10 = numRows - rowsForPct[9] rowsForPct[10] = last10 # use mod below for picking "rows-to-do" in case we do more than 9 trials # use 10 if 0 just to see (we copied 10 to 0 above) rowsForPct[0] = rowsForPct[10] print "Creating the key of the last 10% data, for scoring" trainDataKey = "rTrain" testDataKey = "rTest" # start at 90% rows + 1 # GLM, predict, CM*******************************************************8 kwargs = { 'response': 'C' + str(y), 'max_iter': 20, 'n_folds': 0, 'alpha': 0.1, 'lambda': 1e-5, 'family': 'binomial', 'classification': 1, } timeoutSecs = 60 for trial in range(100): # always slice from the beginning rowsToUse = rowsForPct[trial%10] # test/train split **********************************************8 h2o_cmd.createTestTrain(srcKey='A.hex', trainDstKey=trainDataKey, testDstKey=testDataKey, trainPercent=90) aHack = {'destination_key': trainDataKey} parseKey = trainDataKey # GLM **********************************************8 start = time.time() glm = h2o_cmd.runGLM(parseResult=aHack, timeoutSecs=timeoutSecs, pollTimeoutSecs=180, **kwargs) print "glm end on ", parseResult['destination_key'], 'took', time.time() - start, 'seconds' h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) modelKey = glm['glm_model']['_key'] # Score ********************************************** predictKey = 'Predict.hex' start = time.time() predictResult = h2o_cmd.runPredict( data_key=testDataKey, model_key=modelKey, destination_key=predictKey, timeoutSecs=timeoutSecs) predictCMResult = h2o.nodes[0].predict_confusion_matrix( actual=testDataKey, vactual='C' + str(y), predict=predictKey, vpredict='predict', ) cm = predictCMResult['cm'] # These will move into the h2o_gbm.py pctWrong = h2o_gbm.pp_cm_summary(cm); self.assertLess(pctWrong, 8,"Should see less than 7% error (class = 4)") print "\nTest\n==========\n" print h2o_gbm.pp_cm(cm) print "Trial #", trial, "completed", "using %6.2f" % (rowsToUse*100.0/numRows), "pct. of all rows"
def test_DeepLearning_twovalues(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() csvFilename = "syn_twovalues.csv" csvPathname = SYNDATASETS_DIR + '/' + csvFilename rowDataTrue = "1, 0, 65, 1, 2, 1, 1, 4, 1, 4, 1, 4" rowDataFalse = "0, 1, 0, -1, -2, -1, -1, -4, -1, -4, -1, -4" twoValueList = [ ('A','B',0, 14), ('A','B',1, 14), (0,1,0, 12), (0,1,1, 12), (0,1,'NaN', 12), (1,0,'NaN', 12), (-1,1,0, 12), (-1,1,1, 12), (-1e1,1e1,1e1, 12), (-1e1,1e1,-1e1, 12), ] trial = 0 for (outputTrue, outputFalse, case, coeffNum) in twoValueList: write_syn_dataset(csvPathname, 20, rowDataTrue, rowDataFalse, str(outputTrue), str(outputFalse)) start = time.time() hex_key = csvFilename + "_" + str(trial) model_key = 'trial_' + str(trial) + '.hex' validation_key = hex_key parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key) print "using outputTrue: %s outputFalse: %s" % (outputTrue, outputFalse) inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print "\n" + csvPathname, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) response = inspect['numCols'] response = 'C' + str(response) kwargs = { 'ignored_cols' : None, 'response' : response, 'classification' : 1, 'activation' : 'Tanh', #'input_dropout_ratio' : 0.2, 'hidden' : '113,71,54', 'rate' : 0.01, 'rate_annealing' : 1e-6, 'momentum_start' : 0, 'momentum_stable' : 0, 'l1' : 0.0, 'l2' : 1e-6, 'seed' : 80023842348, 'loss' : 'CrossEntropy', #'max_w2' : 15, 'initial_weight_distribution' : 'UniformAdaptive', #'initial_weight_scale' : 0.01, 'epochs' : 100, 'destination_key' : model_key, 'validation' : hex_key, } timeoutSecs = 60 start = time.time() h2o_cmd.runDeepLearning(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) print "trial #", trial, "Deep Learning end on ", csvFilename, ' took', time.time() - start, 'seconds' #### Now score using the model, and check the validation error expectedErr = 0.00 relTol = 0.01 predict_key = 'Predict.hex' kwargs = { 'data_key': validation_key, 'destination_key': predict_key, 'model_key': model_key } predictResult = h2o_cmd.runPredict(timeoutSecs=timeoutSecs, **kwargs) h2o_cmd.runInspect(key=predict_key, verbose=True) kwargs = { } predictCMResult = h2o.nodes[0].predict_confusion_matrix( actual=validation_key, vactual=response, predict=predict_key, vpredict='predict', timeoutSecs=timeoutSecs, **kwargs) cm = predictCMResult['cm'] print h2o_gbm.pp_cm(cm) actualErr = h2o_gbm.pp_cm_summary(cm)/100. print "actual classification error:" + format(actualErr) print "expected classification error:" + format(expectedErr) if actualErr != expectedErr and abs((expectedErr - actualErr)/expectedErr) > relTol: raise Exception("Scored classification error of %s is not within %s %% relative error of %s" % (actualErr, float(relTol)*100, expectedErr)) trial += 1
def test_GBM_manyfiles_train_test(self): bucket = 'home-0xdiag-datasets' modelKey = 'GBMModelKey' if localhost: files = [ # None forces num_cols to be used. assumes you set it from Inspect # problems with categoricals not in the train data set? (warnings in h2o stdout) ## ('manyfiles-nflx-gz', 'file_1.dat.gz', 'file_1.hex', 1800, None, 'file_11.dat.gz', 'test.hex') # just use matching ('manyfiles-nflx-gz', 'file_1.dat.gz', 'train.hex', 1800, None, 'file_1.dat.gz', 'test.hex') ] else: files = [ # None forces num_cols to be used. assumes you set it from Inspect ('manyfiles-nflx-gz', 'file_[0-9].dat.gz', 'train.hex', 1800, None, 'file_1[0-9].dat.gz', 'test.hex') ] # if I got to hdfs, it's here # hdfs://192.168.1.176/datasets/manyfiles-nflx-gz/file_99.dat.gz # h2b.browseTheCloud() for (importFolderPath, trainFilename, trainKey, timeoutSecs, response, testFilename, testKey) in files: h2o.beta_features = False #turn off beta_features # PARSE train**************************************** start = time.time() xList = [] eList = [] fList = [] # Parse (train)**************************************** if h2o.beta_features: print "Parsing to fvec directly! Have to noPoll=true!, and doSummary=False!" csvPathname = importFolderPath + "/" + trainFilename parseTrainResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local', hex_key=trainKey, timeoutSecs=timeoutSecs, noPoll=h2o.beta_features, doSummary=False) # hack if h2o.beta_features: h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs) print "Filling in the parseTrainResult['destination_key'] for h2o" parseTrainResult['destination_key'] = trainKey elapsed = time.time() - start print "train parse end on ", trainFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "train parse result:", parseTrainResult['destination_key'] ### h2o_cmd.runSummary(key=parsTraineResult['destination_key']) # if you set beta_features here, the fvec translate will happen with the Inspect not the GBM # h2o.beta_features = True inspect = h2o_cmd.runInspect(key=parseTrainResult['destination_key']) print "\n" + csvPathname, \ " num_rows:", "{:,}".format(inspect['num_rows']), \ " num_cols:", "{:,}".format(inspect['num_cols']) num_rows = inspect['num_rows'] num_cols = inspect['num_cols'] # Make col 378 it something we can do binomial regression on! execExpr = '%s=colSwap(%s,378,(%s[378]>15 ? 1 : 0))' % (trainKey, trainKey, trainKey) resultExec = h2o_cmd.runExec(expression=execExpr, timeoutSecs=60) # Parse (test)**************************************** if h2o.beta_features: print "Parsing to fvec directly! Have to noPoll=true!, and doSummary=False!" parseTestResult = h2i.import_parse(bucket=bucket, path=importFolderPath + "/" + testFilename, schema='local', hex_key=testKey, timeoutSecs=timeoutSecs, noPoll=h2o.beta_features, doSummary=False) # hack if h2o.beta_features: h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs) print "Filling in the parseTestResult['destination_key'] for h2o" parseTestResult['destination_key'] = testKey elapsed = time.time() - start print "test parse end on ", testFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "test parse result:", parseTestResult['destination_key'] # Make col 378 it something we can do binomial regression on! execExpr = '%s=colSwap(%s,378,(%s[378]>15 ? 1 : 0))' % (testKey, testKey, testKey) resultExec = h2o_cmd.runExec(expression=execExpr, timeoutSecs=60) # Note ..no inspect of test data here..so translate happens later? # GBM (train iterate)**************************************** # if not response: # response = num_cols - 1 response = 378 # randomly ignore a bunch of cols, just to make it go faster x = range(num_cols) del x[response] ignored_cols_by_name = ",".join(map(str,random.sample(x, 300))) print "Using the same response %s for train and test (which should have a output value too)" % response ntrees = 10 # ignore 200 random cols (not the response) for max_depth in [5, 40]: params = { 'learn_rate': .2, 'nbins': 1024, 'ntrees': ntrees, 'max_depth': max_depth, 'min_rows': 10, 'response': response, 'ignored_cols_by_name': ignored_cols_by_name, } if FORCE_FAIL_CASE: params = {'learn_rate': 0.2, 'classification': None, 'min_rows': 10, 'ntrees': 10, 'response': 378, 'nbins': 1024, 'ignored_cols_by_name': '256, 382, 399, 50, 176, 407, 375, 113, 170, 313, 364, 33, 361, 426, 121, 371, 232, 327, 480, 75, 37, 312, 225, 195, 244, 406, 268, 230, 321, 257, 274, 197, 35, 501, 360, 72, 213, 79, 1, 466, 362, 160, 444, 437, 5, 59, 108, 454, 73, 374, 509, 337, 183, 252, 21, 314, 100, 200, 159, 379, 405, 367, 432, 181, 8, 420, 118, 284, 281, 465, 456, 359, 291, 330, 258, 523, 243, 487, 408, 392, 15, 231, 482, 481, 70, 171, 182, 31, 409, 492, 471, 53, 45, 448, 83, 527, 452, 350, 423, 93, 447, 130, 126, 54, 354, 169, 253, 49, 42, 431, 305, 498, 216, 189, 508, 122, 308, 228, 190, 293, 451, 63, 133, 304, 397, 425, 333, 19, 158, 391, 153, 282, 112, 64, 502, 7, 16, 469, 163, 136, 40, 99, 302, 264, 325, 434, 187, 311, 286, 278, 179, 109, 348, 287, 467, 400, 164, 384, 422, 43, 117, 91, 276, 211, 175, 329, 541, 438, 145, 534, 218, 177, 317, 222, 210, 162, 402, 98, 299, 245, 385, 233, 188, 516, 143, 13, 532, 429, 172, 455, 470, 518, 236, 296, 388, 468, 110, 395, 185, 25, 489, 196, 120, 435, 165, 168, 271, 74, 510, 36, 76, 208, 223, 270, 515, 421, 87, 66, 473, 220, 46, 486, 102, 38, 156, 48, 132, 331, 51, 403, 234, 23, 449, 341, 303, 410, 479, 203, 413, 512, 513, 9, 446, 511, 55, 6, 339, 418, 476, 178, 266, 22, 141, 259, 349, 86, 144, 34, 290, 326, 318, 519, 424, 127, 174, 472, 116, 17, 152, 280, 215, 514, 103, 377, 537, 373, 238, 47, 353, 428, 94, 214, 61, 123, 386, 351, 246, 411, 101, 249, 240, 520, 307, 288, 199, 147, 436, 77, 464, 414', 'source': u'test.hex', 'validation': u'test.hex', 'max_depth': 5} ### print "Using these parameters for GBM: ", params kwargs = params.copy() h2o.beta_features = True # GBM train**************************************** trainStart = time.time() gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult, noPoll=True, timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs) # hack if h2o.beta_features: h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs) trainElapsed = time.time() - trainStart print "GBM training completed in", trainElapsed, "seconds. On dataset: ", trainFilename gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey) # errrs from end of list? is that the last tree? errsLast = gbmTrainView['gbm_model']['errs'][-1] print "GBM 'errsLast'", errsLast cm = gbmTrainView['gbm_model']['cm'] pctWrongTrain = h2o_gbm.pp_cm_summary(cm); print "Last line of this cm might be NAs, not CM" print "\nTrain\n==========\n" print h2o_gbm.pp_cm(cm) # GBM test**************************************** predictKey = 'Predict.hex' ### h2o_cmd.runInspect(key=parseTestResult['destination_key']) start = time.time() gbmTestResult = h2o_cmd.runPredict( data_key=parseTestResult['destination_key'], model_key=modelKey, destination_key=predictKey, timeoutSecs=timeoutSecs) # hack if h2o.beta_features: h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs) elapsed = time.time() - start print "GBM predict completed in", elapsed, "seconds. On dataset: ", testFilename print "This is crazy!" gbmPredictCMResult =h2o.nodes[0].predict_confusion_matrix( actual=parseTestResult['destination_key'], vactual=response, predict=predictKey, vpredict='predict', # choices are 0 and 'predict' ) # errrs from end of list? is that the last tree? # all we get is cm cm = gbmPredictCMResult['cm'] # These will move into the h2o_gbm.py pctWrong = h2o_gbm.pp_cm_summary(cm); print "Last line of this cm is really NAs, not CM" print "\nTest\n==========\n" print h2o_gbm.pp_cm(cm) # xList.append(ntrees) xList.append(max_depth) eList.append(pctWrong) fList.append(trainElapsed) h2o.beta_features = False xLabel = 'max_depth' eLabel = 'pctWrong' fLabel = 'trainElapsed' eListTitle = "" fListTitle = "" h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
def test_GLM2_mnist(self): if DO_HDFS: importFolderPath = "mnist" bucket = None schema = 'hdfs' else: importFolderPath = "mnist" bucket = 'home-0xdiag-datasets' schema = 'local' csvFilelist = [ ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600), ] trial = 0 for (trainCsvFilename, testCsvFilename, timeoutSecs) in csvFilelist: trialStart = time.time() # PARSE test**************************************** testKey = testCsvFilename + "_" + str(trial) + ".hex" csvPathname = importFolderPath + "/" + testCsvFilename start = time.time() parseTestResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema=schema, hex_key=testKey, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", testCsvFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseTestResult['destination_key'] print "We won't use this pruning of x on test data. See if it prunes the same as the training" y = 0 # first column is pixel value print "y:" ignoreX = h2o_glm.goodXFromColumnInfo( y, key=parseTestResult['destination_key'], timeoutSecs=300, returnIgnoreX=True) # PARSE train**************************************** trainKey = trainCsvFilename + "_" + str(trial) + ".hex" csvPathname = importFolderPath + "/" + testCsvFilename start = time.time() parseTrainResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema=schema, hex_key=trainKey, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", trainCsvFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseTrainResult['destination_key'] # GLM**************************************** print "This is the pruned x we'll use" ignoreX = h2o_glm.goodXFromColumnInfo( y, key=parseTrainResult['destination_key'], timeoutSecs=300, returnIgnoreX=True) print "ignoreX:", ignoreX modelKey = 'GLM_model' params = { 'ignored_cols': ignoreX, 'response': 'C' + str(y + 1), 'family': 'binomial', 'lambda': 0.5, 'alpha': 1e-4, 'max_iter': 15, ## 'thresholds': 0.5, 'n_folds': 1, 'beta_epsilon': 1.0E-4, 'destination_key': modelKey, } if DO_ALL_DIGITS: cases = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] else: cases = [8] for c in cases: kwargs = params.copy() print "Trying binomial with case:", c # kwargs['case_val'] = c # do the binomial conversion with Exec2, for both training and test (h2o won't work otherwise) if DO_BUG: execExpr = "A.hex=%s;A.hex[,%s]=(A.hex[,%s]==%s)" % ( trainKey, y + 1, y + 1, c) h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) else: execExpr = "A.hex=%s" % (trainKey) h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) execExpr = "A.hex[,%s]=(A.hex[,%s]==%s)" % (y + 1, y + 1, c) h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) if DO_BUG: execExpr = "B.hex=%s;B.hex[,%s]=(B.hex[,%s]==%s)" % ( testKey, y + 1, y + 1, c) h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) else: execExpr = "B.hex=%s" % (testKey) h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) execExpr = "B.hex[,%s]=(B.hex[,%s]==%s)" % (y + 1, y + 1, c) h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) timeoutSecs = 1800 start = time.time() aHack = {'destination_key': 'A.hex'} glmFirstResult = h2o_cmd.runGLM(parseResult=aHack, timeoutSecs=timeoutSecs, pollTimeoutSecs=60, noPoll=True, **kwargs) print "\nglmFirstResult:", h2o.dump_json(glmFirstResult) job_key = glmFirstResult['job_key'] h2o_jobs.pollStatsWhileBusy(timeoutSecs=timeoutSecs, pollTimeoutSecs=60, retryDelaySecs=5) # double check...how come the model is bogus? h2o_jobs.pollWaitJobs() glm = h2o.nodes[0].glm_view(_modelKey=modelKey) elapsed = time.time() - start print "GLM completed in", elapsed, "seconds.", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) h2o_glm.simpleCheckGLM(self, glm, None, noPrint=True, **kwargs) modelKey = glm['glm_model']['_key'] # This seems wrong..what's the format of the cm? cm = glm['glm_model']['submodels'][0]['validation']['_cms'][ -1]['_arr'] print "cm:", cm pctWrong = h2o_gbm.pp_cm_summary(cm) # self.assertLess(pctWrong, 9,"Should see less than 9% error (class = 4)") print "\nTrain\n==========\n" print h2o_gbm.pp_cm(cm) # Score ******************************* # this messes up if you use case_mode/case_vale above predictKey = 'Predict.hex' start = time.time() predictResult = h2o_cmd.runPredict(data_key='B.hex', model_key=modelKey, destination_key=predictKey, timeoutSecs=timeoutSecs) predictCMResult = h2o.nodes[0].predict_confusion_matrix( actual='B.hex', vactual='C' + str(y + 1), predict=predictKey, vpredict='predict', ) cm = predictCMResult['cm'] # These will move into the h2o_gbm.py pctWrong = h2o_gbm.pp_cm_summary(cm) self.assertLess(pctWrong, 9, "Should see less than 9% error (class = 4)") print "\nTest\n==========\n" print h2o_gbm.pp_cm(cm)
def test_GBM_manyfiles_train_test(self): bucket = 'home-0xdiag-datasets' modelKey = 'GBMModelKey' if localhost: files = [ # None forces num_cols to be used. assumes you set it from Inspect ('manyfiles-nflx-gz', 'file_1[0-9][0-9].dat.gz', 'file_100.hex', 1800, None, 'file_1.dat.gz', 'file_1_test.hex' ) ] else: files = [ # None forces num_cols to be used. assumes you set it from Inspect ('manyfiles-nflx-gz', 'file_[0-9].dat.gz', 'file_10.hex', 1800, None, 'file_1[0-9].dat.gz', 'file_10_test.hex') ] # if I got to hdfs, it's here # hdfs://192.168.1.176/datasets/manyfiles-nflx-gz/file_99.dat.gz # h2b.browseTheCloud() for (importFolderPath, trainFilename, trainKey, timeoutSecs, response, testFilename, testKey) in files: h2o.beta_features = False #turn off beta_features # PARSE train**************************************** start = time.time() xList = [] eList = [] fList = [] # Parse (train)**************************************** if h2o.beta_features: print "Parsing to fvec directly! Have to noPoll=true!, and doSummary=False!" csvPathname = importFolderPath + "/" + trainFilename parseTrainResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='s3n', hex_key=trainKey, timeoutSecs=timeoutSecs, noPoll=h2o.beta_features, doSummary=False) # hack if h2o.beta_features: h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs) print "Filling in the parseTrainResult['destination_key'] for h2o" parseTrainResult['destination_key'] = trainKey elapsed = time.time() - start print "train parse end on ", trainFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "train parse result:", parseTrainResult['destination_key'] ### h2o_cmd.runSummary(key=parsTraineResult['destination_key']) # if you set beta_features here, the fvec translate will happen with the Inspect not the GBM # h2o.beta_features = True inspect = h2o_cmd.runInspect( key=parseTrainResult['destination_key']) print "\n" + csvPathname, \ " num_rows:", "{:,}".format(inspect['num_rows']), \ " num_cols:", "{:,}".format(inspect['num_cols']) num_rows = inspect['num_rows'] num_cols = inspect['num_cols'] # Make col 378 it something we can do binomial regression on! execExpr = '%s[,378] = %s[,378]>15 ? 1 : 0' % (trainKey, trainKey) resultExec = h2o_cmd.runExec(str=execExpr, timeoutSecs=500) # Parse (test)**************************************** if h2o.beta_features: print "Parsing to fvec directly! Have to noPoll=true!, and doSummary=False!" parseTestResult = h2i.import_parse(bucket=bucket, path=importFolderPath + "/" + testFilename, schema='local', hex_key=testKey, timeoutSecs=timeoutSecs, noPoll=h2o.beta_features, doSummary=False) # hack if h2o.beta_features: h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs) print "Filling in the parseTestResult['destination_key'] for h2o" parseTestResult['destination_key'] = testKey elapsed = time.time() - start print "test parse end on ", testFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "test parse result:", parseTestResult['destination_key'] # Make col 378 it something we can do binomial regression on! print "Slow! exec is converting all imported keys?, not just what was parsed" execExpr = '%s[,378] = %s[,378]>15 ? 1 : 0' % (testKey, testKey, testKey) resultExec = h2o_cmd.runExec(str=execExpr, timeoutSecs=300) # Note ..no inspect of test data here..so translate happens later? # GBM (train iterate)**************************************** # if not response: # response = num_cols - 1 response = 378 print "Using the same response %s for train and test (which should have a output value too)" % response ntrees = 10 for max_depth in [5, 10, 20, 40]: params = { 'learn_rate': .2, 'nbins': 1024, 'ntrees': ntrees, 'max_depth': max_depth, 'min_rows': 10, 'response': response, # 'ignored_cols': } print "Using these parameters for GBM: ", params kwargs = params.copy() h2o.beta_features = True # GBM train**************************************** trainStart = time.time() gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult, noPoll=True, timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs) # hack if h2o.beta_features: h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs) trainElapsed = time.time() - trainStart print "GBM training completed in", trainElapsed, "seconds. On dataset: ", trainFilename gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey) # errrs from end of list? is that the last tree? errsLast = gbmTrainView['gbm_model']['errs'][-1] print "GBM 'errsLast'", errsLast cm = gbmTrainView['gbm_model']['cm'] pctWrongTrain = h2o_gbm.pp_cm_summary(cm) print "Last line of this cm might be NAs, not CM" print "\nTrain\n==========\n" print h2o_gbm.pp_cm(cm) # GBM test**************************************** if doPredict: predictKey = 'Predict.hex' ### h2o_cmd.runInspect(key=parseTestResult['destination_key']) start = time.time() gbmTestResult = h2o_cmd.runPredict( data_key=parseTestResult['destination_key'], model_key=modelKey, destination_key=predictKey, timeoutSecs=timeoutSecs) # hack if h2o.beta_features: h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs) elapsed = time.time() - start print "GBM predict completed in", elapsed, "seconds. On dataset: ", testFilename print "This is crazy!" gbmPredictCMResult = h2o.nodes[0].predict_confusion_matrix( actual=parseTestResult['destination_key'], vactual=response, predict=predictKey, vpredict='predict', # choices are 0 and 'predict' ) # errrs from end of list? is that the last tree? # all we get is cm cm = gbmPredictCMResult['cm'] # These will move into the h2o_gbm.py pctWrong = h2o_gbm.pp_cm_summary(cm) print "Last line of this cm is really NAs, not CM" print "\nTest\n==========\n" print h2o_gbm.pp_cm(cm) # xList.append(ntrees) xList.append(max_depth) eList.append(pctWrong) fList.append(trainElapsed) h2o.beta_features = False if doPredict: xLabel = 'max_depth' eLabel = 'pctWrong' fLabel = 'trainElapsed' eListTitle = "" fListTitle = "" h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
def simpleCheckGLM(self, glm, colX, allowFailWarning=False, allowZeroCoeff=False, prettyPrint=False, noPrint=False, maxExpectedIterations=None, doNormalized=False, **kwargs): # if we hit the max_iter, that means it probably didn't converge. should be 1-maxExpectedIter # h2o GLM will verboseprint the result and print errors. # so don't have to do that # different when cross validation is used? No trainingErrorDetails? if h2o.beta_features: GLMModel = glm['glm_model'] else: GLMModel = glm['GLMModel'] if not GLMModel: raise Exception("GLMModel didn't exist in the glm response? %s" % h2o.dump_json(glm)) warnings = None if 'warnings' in GLMModel and GLMModel['warnings']: warnings = GLMModel['warnings'] # stop on failed x = re.compile("failed", re.IGNORECASE) # don't stop if fail to converge c = re.compile("converge", re.IGNORECASE) for w in warnings: print "\nwarning:", w if re.search(x, w) and not allowFailWarning: if re.search(c, w): # ignore the fail to converge warning now pass else: # stop on other 'fail' warnings (are there any? fail to solve? raise Exception(w) # for key, value in glm.iteritems(): print key # not in GLMGrid? # FIX! don't get GLMParams if it can't solve? if h2o.beta_features: GLMParams = GLMModel['glm'] else: GLMParams = GLMModel["GLMParams"] family = GLMParams["family"] if h2o.beta_features: # number of submodels = number of lambda # min of 2. lambda_max is first submodels = GLMModel['submodels'] lambdas = GLMModel['lambdas'] # since all our tests?? only use one lambda, the best_lamda_idx should = 1 best_lambda_idx = GLMModel['best_lambda_idx'] print "best_lambda_idx:", best_lambda_idx lambda_max = GLMModel['lambda_max'] print "lambda_max:", lambda_max # currently lambda_max is not set by tomas. ..i.e.not valid if 1 == 0 and lambda_max <= lambdas[best_lambda_idx]: raise Exception( "lambda_max %s should always be > the lambda result %s we're checking" % (lambda_max, lambdas[best_lambda_idx])) # submodels0 = submodels[0] # submodels1 = submodels[-1] # hackery to make it work when there's just one if (best_lambda_idx >= len(lambdas)) or (best_lambda_idx < 0): raise Exception( "best_lambda_idx: %s should point to one of lambdas (which has len %s)" % (best_lambda_idx, len(lambdas))) if (best_lambda_idx >= len(submodels)) or (best_lambda_idx < 0): raise Exception( "best_lambda_idx: %s should point to one of submodels (which has len %s)" % (best_lambda_idx, len(submodels))) submodels1 = submodels[ best_lambda_idx] # hackery to make it work when there's just one iterations = submodels1['iteration'] else: iterations = GLMModel['iterations'] print "GLMModel/iterations:", iterations # if we hit the max_iter, that means it probably didn't converge. should be 1-maxExpectedIter if maxExpectedIterations is not None and iterations > maxExpectedIterations: raise Exception( "Convergence issue? GLM did iterations: %d which is greater than expected: %d" % (iterations, maxExpectedIterations)) if h2o.beta_features: if 'validation' not in submodels1: raise Exception("Should be a 'validation' key in submodels1: %s" % h2o.dump_json(submodels1)) validationsList = submodels1['validation'] validations = validationsList else: # pop the first validation from the list if 'validations' not in GLMModel: raise Exception("Should be a 'validations' key in GLMModel: %s" % h2o.dump_json(GLMModel)) validationsList = GLMModel['validations'] # don't want to modify validationsList in case someone else looks at it validations = validationsList[0] # xval. compare what we asked for and what we got. n_folds = kwargs.setdefault('n_folds', None) # not checked in v2? if not h2o.beta_features: if not 'xval_models' in validations: if n_folds > 1: raise Exception( "No cross validation models returned. Asked for " + n_folds) else: xval_models = validations['xval_models'] if n_folds and n_folds > 1: if len(xval_models) != n_folds: raise Exception( len(xval_models) + " cross validation models returned. Asked for " + n_folds) else: # should be default 10? if len(xval_models) != 10: raise Exception( str(len(xval_models)) + " cross validation models returned. Default should be 10" ) if h2o.beta_features: print "GLMModel/validations" validations['null_deviance'] = h2o_util.cleanseInfNan( validations['null_deviance']) validations['residual_deviance'] = h2o_util.cleanseInfNan( validations['residual_deviance']) print "%15s %s" % ("null_deviance:\t", validations['null_deviance']) print "%15s %s" % ("residual_deviance:\t", validations['residual_deviance']) else: print "GLMModel/validations" validations['err'] = h2o_util.cleanseInfNan(validations['err']) validations['nullDev'] = h2o_util.cleanseInfNan(validations['nullDev']) validations['resDev'] = h2o_util.cleanseInfNan(validations['resDev']) print "%15s %s" % ("err:\t", validations['err']) print "%15s %s" % ("nullDev:\t", validations['nullDev']) print "%15s %s" % ("resDev:\t", validations['resDev']) # threshold only there if binomial? # auc only for binomial if family == "binomial": print "%15s %s" % ("auc:\t", validations['auc']) if h2o.beta_features: best_threshold = validations['best_threshold'] thresholds = validations['thresholds'] print "%15s %s" % ("best_threshold:\t", best_threshold) # have to look up the index for the cm, from the thresholds list best_index = None # FIX! best_threshold isn't necessarily in the list. jump out if >= for i, t in enumerate(thresholds): if t >= best_threshold: # ends up using next one if not present best_index = i break assert best_index != None, "%s %s" % (best_threshold, thresholds) print "Now printing the right 'best_threshold' %s from '_cms" % best_threshold # cm = glm['glm_model']['submodels'][0]['validation']['_cms'][-1] submodels = glm['glm_model']['submodels'] cms = submodels[0]['validation']['_cms'] assert best_index < len(cms), "%s %s" % (best_index, len(cms)) # if we want 0.5..rounds to int # mid = len(cms)/2 # cm = cms[mid] cm = cms[best_index] print "cm:", h2o.dump_json(cm['_arr']) predErr = cm['_predErr'] classErr = cm['_classErr'] # compare to predErr pctWrong = h2o_gbm.pp_cm_summary(cm['_arr']) print "predErr:", predErr print "calculated pctWrong from cm:", pctWrong print "classErr:", classErr # self.assertLess(pctWrong, 9,"Should see less than 9% error (class = 4)") print "\nTrain\n==========\n" print h2o_gbm.pp_cm(cm['_arr']) else: print "%15s %s" % ("threshold:\t", validations['threshold']) if family == "poisson" or family == "gaussian": print "%15s %s" % ("aic:\t", validations['aic']) if not h2o.beta_features: if math.isnan(validations['err']): emsg = "Why is this err = 'nan'?? %6s %s" % ("err:\t", validations['err']) raise Exception(emsg) if math.isnan(validations['resDev']): emsg = "Why is this resDev = 'nan'?? %6s %s" % ( "resDev:\t", validations['resDev']) raise Exception(emsg) # legal? if math.isnan(validations['nullDev']): pass # get a copy, so we don't destroy the original when we pop the intercept if h2o.beta_features: coefficients_names = GLMModel['coefficients_names'] # print "coefficients_names:", coefficients_names idxs = submodels1['idxs'] print "idxs:", idxs column_names = coefficients_names # always check both normalized and normal coefficients norm_beta = submodels1['norm_beta'] # if norm_beta and len(column_names)!=len(norm_beta): # print len(column_names), len(norm_beta) # raise Exception("column_names and normalized_norm_beta from h2o json not same length. column_names: %s normalized_norm_beta: %s" % (column_names, norm_beta)) # beta = submodels1['beta'] # print "beta:", beta # if len(column_names)!=len(beta): # print len(column_names), len(beta) # raise Exception("column_names and beta from h2o json not same length. column_names: %s beta: %s" % (column_names, beta)) # test wants to use normalized? if doNormalized: beta_used = norm_beta else: beta_used = beta coefficients = {} # create a dictionary with name, beta (including intercept) just like v1 for i, b in zip(idxs, beta_used[:-1]): name = coefficients_names[i] coefficients[name] = b print "len(idxs)", len(idxs), "len(beta_used)", len(beta_used) print "coefficients:", coefficients print "beta:", beta print "norm_beta:", norm_beta coefficients['Intercept'] = beta_used[-1] print "intercept demapping info:", \ "column_names[-i]:", column_names[-1], \ "idxs[-1]:", idxs[-1], \ "coefficients_names[idxs[-1]]:", coefficients_names[idxs[-1]], \ "beta_used[-1]:", beta_used[-1], \ "coefficients['Intercept']", coefficients['Intercept'] # last one is intercept interceptName = coefficients_names[idxs[-1]] if interceptName != "Intercept" or abs(beta_used[-1]) < 1e-26: raise Exception("'Intercept' should be last in coefficient_names and beta %s %s %s" %\ (idxs[-1], beta_used[-1], "-"+interceptName+"-")) # idxs has the order for non-zero coefficients, it's shorter than beta_used and column_names # new 5/28/14. glm can point to zero coefficients # for i in idxs: # if beta_used[i]==0.0: ## raise Exception("idxs shouldn't point to any 0 coefficients i: %s %s:" % (i, beta_used[i])) if len(idxs) > len(beta_used): raise Exception("idxs shouldn't be longer than beta_used %s %s" % (len(idxs), len(beta_used))) intercept = coefficients.pop('Intercept', None) # intercept demapping info: idxs[-1]: 54 coefficient_names[[idxs[-1]]: Intercept beta_used[-1]: -6.6866753099 # the last one shoudl be 'Intercept' ? column_names.pop() else: if doNormalized: coefficients = GLMModel['normalized_coefficients'].copy() else: coefficients = GLMModel['coefficients'].copy() column_names = GLMModel['column_names'] # get the intercept out of there into it's own dictionary intercept = coefficients.pop('Intercept', None) print "First intercept:", intercept # have to skip the output col! get it from kwargs # better always be there! if h2o.beta_features: y = kwargs['response'] else: y = kwargs['y'] # the dict keys are column headers if they exist...how to order those? new: use the 'column_names' # from the response # Tomas created 'column_names which is the coefficient list in order. # Just use it to index coefficients! works for header or no-header cases # I guess now we won't print the "None" cases for dropped columns (constant columns!) # Because Tomas doesn't get everything in 'column_names' if dropped by GLMQuery before # he gets it? def add_to_coefficient_list_and_string(c, cList, cString): if c in coefficients: cValue = coefficients[c] cValueString = "%s: %.5e " % (c, cValue) else: print "Warning: didn't see '" + c + "' in json coefficient response.",\ "Inserting 'None' with assumption it was dropped due to constant column)" cValue = None cValueString = "%s: %s " % (c, cValue) cList.append(cValue) # we put each on newline for easy comparison to R..otherwise keep condensed if prettyPrint: cValueString = "H2O coefficient " + cValueString + "\n" # not mutable? return cString + cValueString # creating both a string for printing and a list of values cString = "" cList = [] # print in order using col_names # column_names is input only now..same for header or no header, or expanded enums for c in column_names: cString = add_to_coefficient_list_and_string(c, cList, cString) if prettyPrint: print "\nH2O intercept:\t\t%.5e" % intercept print cString else: if not noPrint: print "\nintercept:", intercept, cString print "\nTotal # of coefficients:", len(column_names) # pick out the coefficent for the column we enabled for enhanced checking. Can be None. # FIX! temporary hack to deal with disappearing/renaming columns in GLM if (not allowZeroCoeff) and (colX is not None): absXCoeff = abs(float(coefficients[str(colX)])) self.assertGreater( absXCoeff, 1e-26, ("abs. value of GLM coefficients['" + str(colX) + "'] is " + str(absXCoeff) + ", not >= 1e-26 for X=" + str(colX))) # intercept is buried in there too absIntercept = abs(float(intercept)) self.assertGreater(absIntercept, 1e-26, ("abs. value of GLM coefficients['Intercept'] is " + str(absIntercept) + ", not >= 1e-26 for Intercept")) # this is good if we just want min or max # maxCoeff = max(coefficients, key=coefficients.get) # for more, just invert the dictionary and ... if (len(coefficients) > 0): maxKey = max([(abs(coefficients[x]), x) for x in coefficients])[1] print "H2O Largest abs. coefficient value:", maxKey, coefficients[ maxKey] minKey = min([(abs(coefficients[x]), x) for x in coefficients])[1] print "H2O Smallest abs. coefficient value:", minKey, coefficients[ minKey] else: print "Warning, no coefficients returned. Must be intercept only?" # many of the GLM tests aren't single column though. # quick and dirty check: if all the coefficients are zero, # something is broken # intercept is in there too, but this will get it okay # just sum the abs value up..look for greater than 0 # skip this test if there is just one coefficient. Maybe pointing to a non-important coeff? if (not allowZeroCoeff) and (len(coefficients) > 1): s = 0.0 for c in coefficients: v = coefficients[c] s += abs(float(v)) self.assertGreater( s, 1e-26, ("sum of abs. value of GLM coefficients/intercept is " + str(s) + ", not >= 1e-26")) if h2o.beta_features: print "submodels1, run_time (milliseconds):", submodels1['run_time'] else: print "GLMModel model time (milliseconds):", GLMModel['model_time'] print "GLMModel validation time (milliseconds):", validations[ 'val_time'] print "GLMModel lsm time (milliseconds):", GLMModel['lsm_time'] # shouldn't have any errors h2o.check_sandbox_for_errors() return (warnings, cList, intercept)
def test_rf_covtype20x_fvec(self): h2o.beta_features = True importFolderPath = 'standard' if DO_SMALL: csvFilenameTrain = 'covtype.data' hex_key = 'covtype1x.data.A.hex' else: csvFilenameTrain = 'covtype20x.data' hex_key = 'covtype20x.data.A.hex' csvPathname = importFolderPath + "/" + csvFilenameTrain parseResultTrain = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, hex_key=hex_key, timeoutSecs=500) inspect = h2o_cmd.runInspect(key=parseResultTrain['destination_key']) dataKeyTrain = parseResultTrain['destination_key'] print "Parse end", dataKeyTrain # have to re import since source key is gone # we could just copy the key, but sometimes we change the test/train data to covtype.data if DO_SMALL: csvFilenameTest = 'covtype.data' hex_key = 'covtype1x.data.B.hex' dataKeyTest2 = 'covtype1x.data.C.hex' else: csvFilenameTest = 'covtype20x.data' hex_key = 'covtype20x.data.B.hex' dataKeyTest2 = 'covtype20x.data.C.hex' csvPathname = importFolderPath + "/" + csvFilenameTest parseResultTest = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, hex_key=hex_key, timeoutSecs=500) print "Parse result['destination_key']:", parseResultTest['destination_key'] inspect = h2o_cmd.runInspect(key=parseResultTest['destination_key']) dataKeyTest = parseResultTest['destination_key'] print "Parse end", dataKeyTest # make a 3rd key so the predict is uncached too! execExpr = dataKeyTest2 + "=" + dataKeyTest if h2o.beta_features: kwargs = {'str': execExpr, 'timeoutSecs': 15} else: kwargs = {'expression': execExpr, 'timeoutSecs': 15} resultExec = h2o_cmd.runExec(**kwargs) # train # this does RFView to understand when RF completes, so the time reported for RFView here, should be # considered the "first RFView" times..subsequent have some caching?. # unless the no_confusion_matrix works # params is mutable. This is default. if h2o.beta_features: paramDict = drf2ParamDict params = { 'ntrees': 20, 'destination_key': 'RF_model' } else: paramDict = drf1ParamDict params = { 'ntree': 20, 'out_of_bag_error_estimate': 1, 'model_key': 'RF_model' } colX = h2o_rf.pickRandRfParams(paramDict, params) kwargs = params.copy() if h2o.beta_features: timeoutSecs = 30 + kwargs['ntrees'] * 60 else: timeoutSecs = 30 + kwargs['ntree'] * 60 start = time.time() rf = h2o_cmd.runRF(parseResult=parseResultTrain, timeoutSecs=timeoutSecs, retryDelaySecs=1, **kwargs) print "rf job end on ", dataKeyTrain, 'took', time.time() - start, 'seconds' print "\nRFView start after job completion" if h2o.beta_features: model_key = kwargs['destination_key'] ntree = kwargs['ntrees'] else: model_key = kwargs['model_key'] ntree = kwargs['ntree'] start = time.time() # this does the RFModel view for v2. but only model_key is used. Data doesn't matter? (nor ntree) h2o_cmd.runRFView(None, dataKeyTrain, model_key, ntree=ntree, timeoutSecs=timeoutSecs) print "First rfview end on ", dataKeyTrain, 'took', time.time() - start, 'seconds' for trial in range(1): # scoring start = time.time() rfView = h2o_cmd.runRFView(None, dataKeyTest, model_key, ntree=ntree, timeoutSecs=timeoutSecs, out_of_bag_error_estimate=0, retryDelaySecs=1) print "rfview", trial, "end on ", dataKeyTest, 'took', time.time() - start, 'seconds.' (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView, ntree=ntree) self.assertAlmostEqual(classification_error, 50, delta=50, msg="Classification error %s differs too much" % classification_error) start = time.time() predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key=dataKeyTest2) print "predict", trial, "end on ", dataKeyTest, 'took', time.time() - start, 'seconds.' parseKey = parseResultTrain['destination_key'] rfModelKey = rfView['drf_model']['_key'] predictKey = 'Predict.hex' start = time.time() predictResult = h2o_cmd.runPredict( data_key=parseKey, model_key=rfModelKey, destination_key=predictKey, timeoutSecs=timeoutSecs) predictCMResult = h2o.nodes[0].predict_confusion_matrix( actual=parseKey, vactual='C54', predict=predictKey, vpredict='predict', ) cm = predictCMResult['cm'] # These will move into the h2o_gbm.py pctWrong = h2o_gbm.pp_cm_summary(cm); print "\nTest\n==========\n" print h2o_gbm.pp_cm(cm) print "Trial #", trial, "completed"
def test_GLM2_mnist_short(self): h2o.beta_features = True importFolderPath = "mnist" bucket = 'home-0xdiag-datasets' schema = 'local' csvFilelist = [ ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600), ] trial = 0 for (trainCsvFilename, testCsvFilename, timeoutSecs) in csvFilelist: trialStart = time.time() # PARSE test**************************************** testKey = testCsvFilename + "_" + str(trial) + ".hex" csvPathname = importFolderPath + "/" + testCsvFilename start = time.time() parseTestResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema=schema, hex_key=testKey, timeoutSecs=timeoutSecs, doSummary=False) elapsed = time.time() - start print "parse end on ", testCsvFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseTestResult['destination_key'] print "We won't use this pruning of x on test data. See if it prunes the same as the training" # first col is pixel value ..use 0 here y = 0 ignoreX = h2o_glm.goodXFromColumnInfo( y, key=parseTestResult['destination_key'], timeoutSecs=300, forRF=True) # PARSE train**************************************** trainKey = trainCsvFilename + "_" + str(trial) + ".hex" csvPathname = importFolderPath + "/" + testCsvFilename start = time.time() parseTrainResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema=schema, hex_key=trainKey, timeoutSecs=timeoutSecs, doSummary=False) elapsed = time.time() - start print "parse end on ", trainCsvFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseTrainResult['destination_key'] # GLM**************************************** print "This is the pruned x we'll use" ignoreX = h2o_glm.goodXFromColumnInfo( y, key=parseTrainResult['destination_key'], timeoutSecs=300, forRF=True) print "ignoreX:", ignoreX modelKey = 'GLM_model' params = { 'ignored_cols': ignoreX, # first column is pixel value 'response': 'C' + str(y + 1), 'family': 'binomial', 'lambda': 0.5, 'alpha': 1e-4, 'max_iter': 15, ## 'thresholds': 0.5, 'n_folds': 1, 'beta_epsilon': 1.0E-4, 'destination_key': modelKey, } cases = [8] for c in cases: kwargs = params.copy() print "Trying binomial with case:", c # kwargs['case_val'] = c # do the binomial conversion with Exec2, for both training and test (h2o won't work otherwise) execExpr = "A.hex=%s" % (trainKey) h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) execExpr = "A.hex[,%s]=(A.hex[,%s]==%s)" % (y + 1, y + 1, c) h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) h2o_cmd.runSummary(key=trainKey, cols=0, max_ncols=1, noPrint=False) h2o_cmd.runSummary(key='A.hex', cols=0, max_ncols=1, noPrint=False) execExpr = "B.hex=%s" % (testKey) h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) execExpr = "B.hex[,%s]=(B.hex[,%s]==%s)" % (y + 1, y + 1, c) h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) h2o_cmd.runSummary(key=testKey, cols=0, max_ncols=1, noPrint=False) h2o_cmd.runSummary(key='B.hex', cols=0, max_ncols=1, noPrint=False) timeoutSecs = 1800 start = time.time() aHack = {'destination_key': 'A.hex'} glmFirstResult = h2o_cmd.runGLM(parseResult=aHack, timeoutSecs=timeoutSecs, pollTimeoutSecs=60, noPoll=True, **kwargs) print "\nglmFirstResult:", h2o.dump_json(glmFirstResult) job_key = glmFirstResult['job_key'] h2o_jobs.pollStatsWhileBusy(timeoutSecs=timeoutSecs, pollTimeoutSecs=60, retryDelaySecs=5) # double check...how come the model is bogus? h2o_jobs.pollWaitJobs() glm = h2o.nodes[0].glm_view(_modelKey=modelKey) elapsed = time.time() - start print "GLM completed in", elapsed, "seconds.", "%d pct. of timeout" % ( (elapsed * 100) / timeoutSecs) h2o_glm.simpleCheckGLM(self, glm, None, noPrint=True, **kwargs) modelKey = glm['glm_model']['_key'] cm = glm['glm_model']['submodels'][0]['validation']['_cms'][ -1]['_arr'] print "cm:", cm pctWrong = h2o_gbm.pp_cm_summary(cm) # self.assertLess(pctWrong, 9,"Should see less than 9% error (class = 4)") print "\nTrain\n==========\n" print h2o_gbm.pp_cm(cm)
def test_NN_airlines_small(self): #h2b.browseTheCloud() csvPathname_train = 'airlines/AirlinesTrain.csv.zip' csvPathname_test = 'airlines/AirlinesTest.csv.zip' hex_key = 'airlines_train.hex' validation_key = 'airlines_test.hex' timeoutSecs = 30 parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname_train, schema='put', hex_key=hex_key, timeoutSecs=timeoutSecs) parseResultV = h2i.import_parse(bucket='smalldata', path=csvPathname_test, schema='put', hex_key=validation_key, timeoutSecs=timeoutSecs) inspect = h2o_cmd.runInspect(None, hex_key) print "\n" + csvPathname_train, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) # this gives the last col number, which is IsDepDelayed_REC (1 or -1) # response = inspect['numCols'] - 1 # this is "YES"/"NO" response = 'IsDepDelayed' #Making random id identifier = ''.join( random.sample(string.ascii_lowercase + string.digits, 10)) model_key = 'nn_' + identifier + '.hex' # get the column names colNames = [c['name'] for c in inspect['cols']] print "colNames:", colNames usedCols = ("Origin", "Dest", "fDayofMonth", "fYear", "UniqueCarrier", "fDayOfWeek", "fMonth", "DepTime", "ArrTime", "Distance") ignoredCols = [] for c in colNames: # don't put the response in the ignore list (is there a problem if so?) if c not in usedCols and c != response: ignoredCols.append(c) ignoredColsString = ",".join(ignoredCols) print "Telling h2o to ignore these cols:" print ignoredColsString kwargs = { 'ignored_cols': ignoredColsString, 'response': response, 'classification': 1, 'destination_key': model_key, } expectedErr = 0.45 ## expected validation error for the above model relTol = 0.50 ## 20% rel. error tolerance due to Hogwild! timeoutSecs = 600 start = time.time() nn = h2o_cmd.runDeepLearning(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) print "neural net end on ", csvPathname_train, " and ", csvPathname_test, 'took', time.time( ) - start, 'seconds' predict_key = 'score_' + identifier + '.hex' kwargs = { 'data_key': validation_key, 'destination_key': predict_key, 'model_key': model_key } predictResult = h2o_cmd.runPredict(timeoutSecs=timeoutSecs, **kwargs) h2o_cmd.runInspect(key=predict_key, verbose=True) kwargs = {} predictCMResult = h2o.nodes[0].predict_confusion_matrix( actual=validation_key, vactual=response, predict=predict_key, vpredict='predict', timeoutSecs=timeoutSecs, **kwargs) cm = predictCMResult['cm'] print h2o_gbm.pp_cm(cm) actualErr = h2o_gbm.pp_cm_summary(cm) / 100. print "actual classification error:" + format(actualErr) print "expected classification error:" + format(expectedErr) if actualErr != expectedErr and abs( (expectedErr - actualErr) / expectedErr) > relTol: raise Exception( "Scored classification error of %s is not within %s %% relative error of %s" % (actualErr, float(relTol) * 100, expectedErr))
def test_rf_covtype20x_fvec(self): h2o.beta_features = True importFolderPath = 'standard' if DO_SMALL: csvFilenameTrain = 'covtype.data' hex_key = 'covtype1x.data.A.hex' else: csvFilenameTrain = 'covtype20x.data' hex_key = 'covtype20x.data.A.hex' csvPathname = importFolderPath + "/" + csvFilenameTrain parseResultTrain = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, hex_key=hex_key, timeoutSecs=500) inspect = h2o_cmd.runInspect(key=parseResultTrain['destination_key']) dataKeyTrain = parseResultTrain['destination_key'] print "Parse end", dataKeyTrain # have to re import since source key is gone # we could just copy the key, but sometimes we change the test/train data to covtype.data if DO_SMALL: csvFilenameTest = 'covtype.data' hex_key = 'covtype1x.data.B.hex' dataKeyTest2 = 'covtype1x.data.C.hex' else: csvFilenameTest = 'covtype20x.data' hex_key = 'covtype20x.data.B.hex' dataKeyTest2 = 'covtype20x.data.C.hex' csvPathname = importFolderPath + "/" + csvFilenameTest parseResultTest = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, hex_key=hex_key, timeoutSecs=500) print "Parse result['destination_key']:", parseResultTest[ 'destination_key'] inspect = h2o_cmd.runInspect(key=parseResultTest['destination_key']) dataKeyTest = parseResultTest['destination_key'] print "Parse end", dataKeyTest # make a 3rd key so the predict is uncached too! execExpr = dataKeyTest2 + "=" + dataKeyTest kwargs = {'str': execExpr, 'timeoutSecs': 15} resultExec = h2o_cmd.runExec(**kwargs) # train # this does RFView to understand when RF completes, so the time reported for RFView here, should be # considered the "first RFView" times..subsequent have some caching?. # unless the no_confusion_matrix works # params is mutable. This is default. paramDict = drf2ParamDict params = {'ntrees': 20, 'destination_key': 'RF_model'} colX = h2o_rf.pickRandRfParams(paramDict, params) kwargs = params.copy() timeoutSecs = 30 + kwargs['ntrees'] * 60 start = time.time() rf = h2o_cmd.runRF(parseResult=parseResultTrain, timeoutSecs=timeoutSecs, retryDelaySecs=1, **kwargs) print "rf job end on ", dataKeyTrain, 'took', time.time( ) - start, 'seconds' print "\nRFView start after job completion" model_key = kwargs['destination_key'] ntree = kwargs['ntrees'] start = time.time() # this does the RFModel view for v2. but only model_key is used. Data doesn't matter? (nor ntree) h2o_cmd.runRFView(None, dataKeyTrain, model_key, ntree=ntree, timeoutSecs=timeoutSecs) print "First rfview end on ", dataKeyTrain, 'took', time.time( ) - start, 'seconds' for trial in range(1): # scoring start = time.time() rfView = h2o_cmd.runRFView(None, dataKeyTest, model_key, ntree=ntree, timeoutSecs=timeoutSecs, out_of_bag_error_estimate=0, retryDelaySecs=1) print "rfview", trial, "end on ", dataKeyTest, 'took', time.time( ) - start, 'seconds.' (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView, ntree=ntree) self.assertAlmostEqual( classification_error, 50, delta=50, msg="Classification error %s differs too much" % classification_error) start = time.time() predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key=dataKeyTest2) print "predict", trial, "end on ", dataKeyTest, 'took', time.time( ) - start, 'seconds.' parseKey = parseResultTrain['destination_key'] rfModelKey = rfView['drf_model']['_key'] predictKey = 'Predict.hex' start = time.time() predictResult = h2o_cmd.runPredict(data_key=parseKey, model_key=rfModelKey, destination_key=predictKey, timeoutSecs=timeoutSecs) predictCMResult = h2o.nodes[0].predict_confusion_matrix( actual=parseKey, vactual='C55', predict=predictKey, vpredict='predict', ) cm = predictCMResult['cm'] # These will move into the h2o_gbm.py pctWrong = h2o_gbm.pp_cm_summary(cm) print "\nTest\n==========\n" print h2o_gbm.pp_cm(cm) print "Trial #", trial, "completed"
def test_NN_mnist(self): #h2b.browseTheCloud() csvPathname_train = 'mnist/train.csv.gz' csvPathname_test = 'mnist/test.csv.gz' hex_key = 'mnist_train.hex' validation_key = 'mnist_test.hex' timeoutSecs = 30 parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname_train, schema='put', hex_key=hex_key, timeoutSecs=timeoutSecs) parseResultV = h2i.import_parse(bucket='smalldata', path=csvPathname_test, schema='put', hex_key=validation_key, timeoutSecs=timeoutSecs) inspect = h2o_cmd.runInspect(None, hex_key) print "\n" + csvPathname_train, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) response = inspect['numCols'] - 1 #Making random id identifier = ''.join(random.sample(string.ascii_lowercase + string.digits, 10)) model_key = 'nn_' + identifier + '.hex' kwargs = { 'ignored_cols' : None, 'response' : response, 'classification' : 1, 'activation' : 'RectifierWithDropout', 'input_dropout_ratio' : 0.2, 'hidden' : '117,131,129', 'adaptive_rate' : 0, 'rate' : 0.005, 'rate_annealing' : 1e-6, 'momentum_start' : 0.5, 'momentum_ramp' : 100000, 'momentum_stable' : 0.9, 'l1' : 0.00001, 'l2' : 0.0000001, 'seed' : 98037452452, 'loss' : 'CrossEntropy', 'max_w2' : 15, 'initial_weight_distribution' : 'UniformAdaptive', #'initial_weight_scale' : 0.01, 'epochs' : 2.0, 'destination_key' : model_key, 'validation' : validation_key, 'score_interval' : 10000 } expectedErr = 0.057 ## expected validation error for the above model relTol = 0.20 ## 20% rel. error tolerance due to Hogwild! timeoutSecs = 600 start = time.time() nn = h2o_cmd.runDeepLearning(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) print "neural net end on ", csvPathname_train, " and ", csvPathname_test, 'took', time.time() - start, 'seconds' predict_key = 'score_' + identifier + '.hex' kwargs = { 'data_key': validation_key, 'destination_key': predict_key, 'model_key': model_key } predictResult = h2o_cmd.runPredict(timeoutSecs=timeoutSecs, **kwargs) h2o_cmd.runInspect(key=predict_key, verbose=True) kwargs = { } predictCMResult = h2o.nodes[0].predict_confusion_matrix( actual=validation_key, vactual=response, predict=predict_key, vpredict='predict', timeoutSecs=timeoutSecs, **kwargs) cm = predictCMResult['cm'] print h2o_gbm.pp_cm(cm) actualErr = h2o_gbm.pp_cm_summary(cm)/100.; print "actual classification error:" + format(actualErr) print "expected classification error:" + format(expectedErr) if actualErr != expectedErr and abs((expectedErr - actualErr)/expectedErr) > relTol: raise Exception("Scored classification error of %s is not within %s %% relative error of %s" % (actualErr, float(relTol)*100, expectedErr))
def test_GLM2_mnist_reals(self): h2o.beta_features = True importFolderPath = "mnist" csvFilelist = [ ("mnist_reals_training.csv.gz", "mnist_reals_testing.csv.gz", 600), ] trial = 0 for (trainCsvFilename, testCsvFilename, timeoutSecs) in csvFilelist: trialStart = time.time() # PARSE test**************************************** testKey = testCsvFilename + "_" + str(trial) + ".hex" start = time.time() parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path="mnist/" + testCsvFilename, schema='put', hex_key=testKey, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", testCsvFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "We won't use this pruning of x on test data. See if it prunes the same as the training" y = 0 # first column is pixel value print "y:" x = h2o_glm.goodXFromColumnInfo(y, key=parseResult['destination_key'], timeoutSecs=300) # PARSE train**************************************** trainKey = trainCsvFilename + "_" + str(trial) + ".hex" start = time.time() parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path="mnist/" + trainCsvFilename, schema='put', hex_key=trainKey, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", trainCsvFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseResult['destination_key'] # GLM**************************************** print "This is the pruned x GLM will use" x = h2o_glm.goodXFromColumnInfo(y, key=parseResult['destination_key'], timeoutSecs=300) print "x:", x modelKey = "mnist" params = { 'response': y, 'family': 'binomial', 'lambda': 1.0E-5, 'alpha': 0.0, 'max_iter': 10, 'n_folds': 1, 'beta_epsilon': 1.0E-4, 'destination_key': modelKey } # for c in [0,1,2,3,4,5,6,7,8,9]: # just do a couple digits for c in [0,7]: print "Trying binomial with case:", c execExpr="A.hex=%s;A.hex[,%s]=(A.hex[,%s]==%s)" % (trainKey, y+1, y+1, c) h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) kwargs = params.copy() timeoutSecs = 1800 start = time.time() aHack = {'destination_key': 'A.hex'} glm = h2o_cmd.runGLM(parseResult=aHack, timeoutSecs=timeoutSecs, pollTimeoutSecs=60, **kwargs) elapsed = time.time() - start print "GLM completed in", elapsed, "seconds.", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) h2o_glm.simpleCheckGLM(self, glm, None, noPrint=True, **kwargs) # Score ********************************************** execExpr="B.hex=%s;B.hex[,%s]=(B.hex[,%s]==%s)" % (testKey, y+1, y+1, c) h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) print "Problems with test data having different enums than train? just use train for now" predictKey = 'Predict.hex' start = time.time() predictResult = h2o_cmd.runPredict( data_key="B.hex", model_key=modelKey, destination_key=predictKey, timeoutSecs=timeoutSecs) predictCMResult = h2o.nodes[0].predict_confusion_matrix( actual="B.hex", vactual='C' + str(y+1), predict=predictKey, vpredict='predict', ) cm = predictCMResult['cm'] # These will move into the h2o_gbm.py pctWrong = h2o_gbm.pp_cm_summary(cm); # self.assertLess(pctWrong, 8,"Should see less than 7 pct error (class = 4): %s" % pctWrong) print "\nTest\n==========\n" print h2o_gbm.pp_cm(cm)
def test_GBM_covtype_train_test(self): h2o.beta_features = False bucket = 'home-0xdiag-datasets' modelKey = 'GBMModelKey' files = [ ('standard', 'covtype.shuffled.90pct.data', 'covtype.train.hex', 1800, 'C55', 'covtype.shuffled.10pct.data', 'covtype.test.hex') ] # h2b.browseTheCloud() for (importFolderPath, trainFilename, trainKey, timeoutSecs, response, testFilename, testKey) in files: h2o.beta_features = False #turn off beta_features # PARSE train**************************************** start = time.time() xList = [] eList = [] fList = [] # Parse (train)**************************************** if h2o.beta_features: print "Parsing to fvec directly! Have to noPoll=true!, and doSummary=False!" parseTrainResult = h2i.import_parse(bucket=bucket, path=importFolderPath + "/" + trainFilename, schema='local', hex_key=trainKey, timeoutSecs=timeoutSecs, noPoll=h2o.beta_features, doSummary=False) # hack if h2o.beta_features: h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs) print "Filling in the parseTrainResult['destination_key'] for h2o" parseTrainResult['destination_key'] = trainKey elapsed = time.time() - start print "train parse end on ", trainFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "train parse result:", parseTrainResult['destination_key'] # Parse (test)**************************************** if h2o.beta_features: print "Parsing to fvec directly! Have to noPoll=true!, and doSummary=False!" parseTestResult = h2i.import_parse(bucket=bucket, path=importFolderPath + "/" + testFilename, schema='local', hex_key=testKey, timeoutSecs=timeoutSecs, noPoll=h2o.beta_features, doSummary=False) # hack if h2o.beta_features: h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs) print "Filling in the parseTestResult['destination_key'] for h2o" parseTestResult['destination_key'] = testKey elapsed = time.time() - start print "test parse end on ", testFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "test parse result:", parseTestResult['destination_key'] # GBM (train iterate)**************************************** inspect = h2o_cmd.runInspect(key=parseTestResult['destination_key']) ntrees = 2 # fails with 40 for max_depth in [40, 5]: params = { 'learn_rate': .2, 'nbins': 1024, 'ntrees': ntrees, 'max_depth': max_depth, 'min_rows': 10, 'response': response, 'ignored_cols_by_name': None, } print "Using these parameters for GBM: ", params kwargs = params.copy() h2o.beta_features = True # translate it (only really need to do once . out of loop? h2o_cmd.runInspect(key=parseTrainResult['destination_key']) ### h2o_cmd.runSummary(key=parsTraineResult['destination_key']) # GBM train**************************************** trainStart = time.time() gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult, noPoll=True, timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs) # hack if h2o.beta_features: h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs) trainElapsed = time.time() - trainStart print "GBM training completed in", trainElapsed, "seconds. On dataset: ", trainFilename gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey) # errrs from end of list? is that the last tree? errsLast = gbmTrainView['gbm_model']['errs'][-1] print "GBM 'errsLast'", errsLast cm = gbmTrainView['gbm_model']['cms'][-1]['_arr'] pctWrongTrain = h2o_gbm.pp_cm_summary(cm); print "Last line of this cm might be NAs, not CM" print "\nTrain\n==========\n" print h2o_gbm.pp_cm(cm) # GBM test**************************************** predictKey = 'Predict.hex' h2o_cmd.runInspect(key=parseTestResult['destination_key']) start = time.time() gbmTestResult = h2o_cmd.runPredict( data_key=parseTestResult['destination_key'], model_key=modelKey, destination_key=predictKey, timeoutSecs=timeoutSecs) # hack if h2o.beta_features: h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs) elapsed = time.time() - start print "GBM predict completed in", elapsed, "seconds. On dataset: ", testFilename gbmPredictCMResult =h2o.nodes[0].predict_confusion_matrix( actual=parseTestResult['destination_key'], vactual=response, predict=predictKey, vpredict='predict', # choices are 7 (now) and 'predict' ) # errrs from end of list? is that the last tree? # all we get is cm cm = gbmPredictCMResult['cm'] # These will move into the h2o_gbm.py pctWrong = h2o_gbm.pp_cm_summary(cm); print "\nTest\n==========\n" print h2o_gbm.pp_cm(cm) # xList.append(ntrees) xList.append(max_depth) eList.append(pctWrong) fList.append(trainElapsed) h2o.beta_features = False xLabel = 'max_depth' eLabel = 'pctWrong' fLabel = 'trainElapsed' eListTitle = "" fListTitle = "" h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
def test_NN2_mnist_multi(self): #h2b.browseTheCloud() h2o.beta_features = True csvPathname_train = 'mnist/train.csv.gz' csvPathname_test = 'mnist/test.csv.gz' hex_key = 'mnist_train.hex' validation_key = 'mnist_test.hex' timeoutSecs = 90 parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname_train, schema='put', hex_key=hex_key, timeoutSecs=timeoutSecs) parseResultV = h2i.import_parse(bucket='smalldata', path=csvPathname_test, schema='put', hex_key=validation_key, timeoutSecs=timeoutSecs) inspect = h2o_cmd.runInspect(None, hex_key) print "\n" + csvPathname_train, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) response = inspect['numCols'] - 1 #Making random id identifier = ''.join( random.sample(string.ascii_lowercase + string.digits, 10)) model_key = 'nn_' + identifier + '.hex' kwargs = { 'ignored_cols': None, 'response': response, 'classification': 1, 'activation': 'RectifierWithDropout', 'input_dropout_ratio': 0.2, 'hidden': '117,131,129', 'rate': 0.005, 'rate_annealing': 1e-6, 'momentum_start': 0.5, 'momentum_ramp': 100000, 'momentum_stable': 0.9, 'l1': 0.00001, 'l2': 0.0000001, 'seed': 98037452452, 'loss': 'CrossEntropy', 'max_w2': 15, 'initial_weight_distribution': 'UniformAdaptive', #'initial_weight_scale' : 0.01, 'epochs': 20.0, 'destination_key': model_key, 'validation': validation_key, } ###expectedErr = 0.0362 ## from single-threaded mode expectedErr = 0.03 ## observed actual value with Hogwild timeoutSecs = 600 start = time.time() nn = h2o_cmd.runDeepLearning(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) print "neural net end on ", csvPathname_train, " and ", csvPathname_test, 'took', time.time( ) - start, 'seconds' #### Now score using the model, and check the validation error expectedErr = 0.046 relTol = 0.1 predict_key = 'Predict.hex' kwargs = { 'data_key': validation_key, 'destination_key': predict_key, 'model_key': model_key } predictResult = h2o_cmd.runPredict(timeoutSecs=timeoutSecs, **kwargs) h2o_cmd.runInspect(key=predict_key, verbose=True) kwargs = {} predictCMResult = h2o.nodes[0].predict_confusion_matrix( actual=validation_key, vactual=response, predict=predict_key, vpredict='predict', timeoutSecs=timeoutSecs, **kwargs) cm = predictCMResult['cm'] print h2o_gbm.pp_cm(cm) actualErr = h2o_gbm.pp_cm_summary(cm) / 100. print "actual classification error:" + format(actualErr) print "expected classification error:" + format(expectedErr) if actualErr != expectedErr and abs( (expectedErr - actualErr) / expectedErr) > relTol: raise Exception( "Scored classification error of %s is not within %s %% relative error of %s" % (actualErr, float(relTol) * 100, expectedErr))
def test_GBM_with_cancels(self): print "Sets h2o.beta_features like -bf at command line" print "this will redirect import and parse to the 2 variants" h2o.beta_features = True importFolderPath = 'standard' timeoutSecs = 500 csvFilenameAll = [ # have to use col name for response? ("manyfiles-nflx-gz", "file_1.dat.gz", 378), # ("manyfiles-nflx-gz", "file_[1-9].dat.gz", 378), # ("standard", "covtype.data", 54), # ("standard", "covtype20x.data", 54), ] # csvFilenameList = random.sample(csvFilenameAll,1) csvFilenameList = csvFilenameAll # pop open a browser on the cloud # h2b.browseTheCloud() for (importFolderPath, csvFilename, response) in csvFilenameList: # creates csvFilename.hex from file in importFolder dir csvPathname = importFolderPath + "/" + csvFilename ### h2o.beta_features = False (importResult, importPattern) = h2i.import_only(bucket='home-0xdiag-datasets', path=csvPathname, schema='local', timeoutSecs=50) parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='local', hex_key='c.hex', timeoutSecs=500, noPoll=False, doSummary=False) # can't do summary until parse result is correct json h2o.check_sandbox_for_errors() # wait for it to show up in jobs? ## time.sleep(2) # no pattern waits for all ## h2o_jobs.pollWaitJobs(pattern=None, timeoutSecs=300, pollTimeoutSecs=10, retryDelaySecs=5) # hack it because no response from Parse2 if h2o.beta_features: parseResult = {'destination_key': 'c.hex'} print "\nparseResult", h2o.dump_json(parseResult) print "Parse result['destination_key']:", parseResult['destination_key'] ## What's wrong here? too big? ### inspect = h2o_cmd.runInspect(key=parseResult['destination_key'], timeoutSecs=30, verbose=True) h2o.check_sandbox_for_errors() # have to avoid this on nflx data. colswap with exec # Exception: rjson error in gbm: Argument 'response' error: Only integer or enum/factor columns can be classified if importFolderPath=='manyfiles-nflx-gz': if DO_CLASSIFICATION: # need to flip the right col! (R wise) execExpr = 'c.hex[,%s]=c.hex[,%s]>15' % (response+1,response+1) kwargs = { 'str': execExpr } resultExec = h2o_cmd.runExec(**kwargs) # lets look at the response column now s = h2o_cmd.runSummary(key="c.hex", cols=response, max_ncols=1) x = range(542) # remove the output too! (378) xIgnore = [] # BUG if you add unsorted 378 to end. remove for now for i in [3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 16, 17, 18, 19, 20, 424, 425, 426, 540, 541, response]: if i not in x: print "x:", x print 'missing?', i x.remove(i) xIgnore.append(i) x = ",".join(map(str,x)) def colIt(x): return "C" + str(x) xIgnore = ",".join(map(colIt, xIgnore)) else: # leave one col ignored, just to see? xIgnore = 0 modelKey = "GBMGood" params = { 'destination_key': modelKey, 'ignored_cols_by_name': xIgnore, 'learn_rate': .1, 'ntrees': 2, 'max_depth': 8, 'min_rows': 1, 'response': "C" + str(response), 'classification': 1 if DO_CLASSIFICATION else 0, 'grid_parallelism': 4, } kwargs = params.copy() timeoutSecs = 1800 start = time.time() GBMFirstResult = h2o_cmd.runGBM(parseResult=parseResult, noPoll=True,**kwargs) print "\nGBMFirstResult:", h2o.dump_json(GBMFirstResult) # no pattern waits for all for i in range(20): # now issue a couple background GBM jobs that we'll kill jobids = [] for j in range(5): kwargs['destination_key'] = 'GBMBad' + str(j) GBMFirstResult = h2o_cmd.runGBM(parseResult=parseResult, noPoll=True,**kwargs) jobids.append(GBMFirstResult['job_key']) # have to pass the job id for j in jobids: h2o.nodes[0].jobs_cancel(key=j) h2o_jobs.pollWaitJobs(pattern='GBMGood', timeoutSecs=300, pollTimeoutSecs=10, retryDelaySecs=5) elapsed = time.time() - start print "GBM training completed in", elapsed, "seconds." gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey) # errrs from end of list? is that the last tree? errsLast = gbmTrainView['gbm_model']['errs'][-1] print "GBM 'errsLast'", errsLast if DO_CLASSIFICATION: cm = gbmTrainView['gbm_model']['cm'] pctWrongTrain = h2o_gbm.pp_cm_summary(cm); print "Last line of this cm might be NAs, not CM" print "\nTrain\n==========\n" print h2o_gbm.pp_cm(cm) else: print "GBMTrainView:", h2o.dump_json(gbmTrainView['gbm_model']['errs']) h2o.check_sandbox_for_errors() if DELETE_KEYS: h2i.delete_keys_from_import_result(pattern=csvFilename, importResult=importResult)
def test_GLM2_mnist(self): if not SCIPY_INSTALLED: pass else: SYNDATASETS_DIR = h2o.make_syn_dir() csvFilelist = [ (10000, 500, 'cA', 60), ] trial = 0 for (rowCount, colCount, hex_key, timeoutSecs) in csvFilelist: trialStart = time.time() # PARSE test**************************************** csvFilename = 'syn_' + "binary" + "_" + str( rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + "/" + csvFilename write_syn_dataset(csvPathname, rowCount, colCount) start = time.time() parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", csvFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) # GLM**************************************** modelKey = 'GLM_model' y = colCount kwargs = { 'response': 'C' + str(y + 1), 'family': 'binomial', 'lambda': 1e-4, 'alpha': 0, 'max_iter': 15, 'n_folds': 1, 'beta_epsilon': 1.0E-4, 'destination_key': modelKey, } # GLM wants the output col to be strictly 0,1 integer execExpr = "aHack=%s; aHack[,%s] = aHack[,%s]==1" % ( hex_key, y + 1, y + 1) h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) aHack = {'destination_key': 'aHack'} timeoutSecs = 1800 start = time.time() glm = h2o_cmd.runGLM(parseResult=aHack, timeoutSecs=timeoutSecs, pollTimeoutSecs=60, **kwargs) elapsed = time.time() - start print "GLM completed in", elapsed, "seconds.", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) h2o_glm.simpleCheckGLM(self, glm, None, noPrint=True, **kwargs) modelKey = glm['glm_model']['_key'] # This seems wrong..what's the format of the cm? lambdaMax = glm['glm_model']['lambda_max'] print "lambdaMax:", lambdaMax best_threshold = glm['glm_model']['submodels'][0][ 'validation']['best_threshold'] print "best_threshold", best_threshold # pick the middle one? cm = glm['glm_model']['submodels'][0]['validation']['_cms'][5][ '_arr'] print "cm:", cm pctWrong = h2o_gbm.pp_cm_summary(cm) # self.assertLess(pctWrong, 9,"Should see less than 9% error (class = 4)") print "\nTrain\n==========\n" print h2o_gbm.pp_cm(cm) # Score ******************************* # this messes up if you use case_mode/case_vale above print "\nPredict\n==========\n" predictKey = 'Predict.hex' start = time.time() predictResult = h2o_cmd.runPredict(data_key='aHack', model_key=modelKey, destination_key=predictKey, timeoutSecs=timeoutSecs) predictCMResult = h2o.nodes[0].predict_confusion_matrix( actual='aHack', vactual='C' + str(y + 1), predict=predictKey, vpredict='predict', ) cm = predictCMResult['cm'] # These will move into the h2o_gbm.py pctWrong = h2o_gbm.pp_cm_summary(cm) self.assertLess(pctWrong, 50, "Should see less than 50% error") print "\nTest\n==========\n" print h2o_gbm.pp_cm(cm)
def test_GBM_many_cols(self): SYNDATASETS_DIR = h2o.make_syn_dir() if localhost: tryList = [ (100000, 400, 'cA', 300), ] else: tryList = [ # (10000, 10, 'cB', 300), # (10000, 50, 'cC', 300), (100000, 100, 'cD', 300), (100000, 200, 'cE', 300), (100000, 500, 'cG', 300), (100000, 1000, 'cI', 300), ] ### h2b.browseTheCloud() for (rowCount, colCount, hex_key, timeoutSecs) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) # csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv' csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE) # PARSE train**************************************** h2o.beta_features = False #turn off beta_features start = time.time() xList = [] eList = [] fList = [] h2o.beta_features = False modelKey = 'GBMModelKey' # Parse (train)**************************************** if h2o.beta_features: print "Parsing to fvec directly! Have to noPoll=true!, and doSummary=False!" parseTrainResult = h2i.import_parse(bucket=None, path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=timeoutSecs, noPoll=h2o.beta_features, doSummary=False) # hack if h2o.beta_features: h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs) print "Filling in the parseTrainResult['destination_key'] for h2o" parseTrainResult['destination_key'] = trainKey elapsed = time.time() - start print "train parse end on ", csvPathname, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "train parse result:", parseTrainResult['destination_key'] # Logging to a benchmark file algo = "Parse" # l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format( # len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename, elapsed) l = '{:d} jvms, {:d}MB heap, {:s} {:s} {:6.2f} secs'.format( len(h2o.nodes), h2o.nodes[0].java_heap_MB, algo, csvFilename, elapsed) print l h2o.cloudPerfH2O.message(l) # if you set beta_features here, the fvec translate will happen with the Inspect not the GBM # h2o.beta_features = True inspect = h2o_cmd.runInspect(key=parseTrainResult['destination_key']) print "\n" + csvPathname, \ " num_rows:", "{:,}".format(inspect['num_rows']), \ " num_cols:", "{:,}".format(inspect['num_cols']) num_rows = inspect['num_rows'] num_cols = inspect['num_cols'] h2o_cmd.infoFromInspect(inspect, csvPathname) ### h2o_cmd.runSummary(key=parsTraineResult['destination_key']) # GBM(train iterate)**************************************** h2o.beta_features = True # was failing with 100 trees # ntrees = 100 # for max_depth in [5,10,20,40]: ntrees = 10 for max_depth in [5]: params = { 'learn_rate': .2, 'nbins': 10, # 1024 fail 'ntrees': ntrees, 'max_depth': max_depth, 'min_rows': 10, 'response': num_cols-1, 'ignored_cols_by_name': None, } print "Using these parameters for GBM: ", params kwargs = params.copy() h2o.beta_features = True trainStart = time.time() gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult, noPoll=h2o.beta_features, timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs) # hack if h2o.beta_features: h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs) trainElapsed = time.time() - trainStart print "GBM training completed in", trainElapsed, "seconds. On dataset: ", csvPathname # Logging to a benchmark file algo = "GBM " + " ntrees=" + str(ntrees) + " max_depth=" + str(max_depth) l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format( len(h2o.nodes), h2o.nodes[0].java_heap_MB, algo, csvFilename, trainElapsed) print l h2o.cloudPerfH2O.message(l) gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey) # errrs from end of list? is that the last tree? errsLast = gbmTrainView['gbm_model']['errs'][-1] print "GBM 'errsLast'", errsLast cm = gbmTrainView['gbm_model']['cm'] pctWrongTrain = h2o_gbm.pp_cm_summary(cm); print "Last line of this cm might be NAs, not CM" print "\nTrain\n==========\n" print h2o_gbm.pp_cm(cm) # xList.append(ntrees) xList.append(max_depth) eList.append(pctWrongTrain) fList.append(trainElapsed) h2o.beta_features = False # just plot the last one if DO_PLOT_IF_KEVIN: xLabel = 'max_depth' eLabel = 'pctWrong' fLabel = 'trainElapsed' eListTitle = "" fListTitle = "" h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
def test_GBM_params_rand2(self): bucket = 'home-0xdiag-datasets' modelKey = 'GBMModelKey' files = [ # ('standard', 'covtype.shuffled.90pct.data', 'covtype.train.hex', 1800, 54, 'covtype.shuffled.10pct.data', 'covtype.test.hex') ('standard', 'covtype.shuffled.10pct.sorted.data', 'covtype.train.hex', 1800, 54, 'covtype.shuffled.10pct.data', 'covtype.test.hex') ] for (importFolderPath, trainFilename, trainKey, timeoutSecs, response, testFilename, testKey) in files: # PARSE train**************************************** start = time.time() xList = [] eList = [] fList = [] # Parse (train)**************************************** parseTrainResult = h2i.import_parse(bucket=bucket, path=importFolderPath + "/" + trainFilename, schema='local', hex_key=trainKey, timeoutSecs=timeoutSecs, doSummary=False) elapsed = time.time() - start print "train parse end on ", trainFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "train parse result:", parseTrainResult['destination_key'] # Parse (test)**************************************** parseTestResult = h2i.import_parse(bucket=bucket, path=importFolderPath + "/" + testFilename, schema='local', hex_key=testKey, timeoutSecs=timeoutSecs, doSummary=False) elapsed = time.time() - start print "test parse end on ", testFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "test parse result:", parseTestResult['destination_key'] # GBM (train iterate)**************************************** inspect = h2o_cmd.runInspect( key=parseTestResult['destination_key']) paramsDict = define_gbm_params() for trial in range(3): # translate it (only really need to do once . out of loop? h2o_cmd.runInspect(key=parseTrainResult['destination_key']) ### h2o_cmd.runSummary(key=parsTraineResult['destination_key']) # use this to set any defaults you want if the pick doesn't set params = { 'response': 54, 'ignored_cols_by_name': 'C1,C2,C3,C4,C5', 'ntrees': 2, 'validation': parseTestResult['destination_key'], } h2o_gbm.pickRandGbmParams(paramsDict, params) print "Using these parameters for GBM: ", params kwargs = params.copy() # GBM train**************************************** trainStart = time.time() gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult, timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs) trainElapsed = time.time() - trainStart print "GBM training completed in", trainElapsed, "seconds. On dataset: ", trainFilename gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey) # errrs from end of list? is that the last tree? errsLast = gbmTrainView['gbm_model']['errs'][-1] print "GBM 'errsLast'", errsLast cm = gbmTrainView['gbm_model']['cms'][-1][ '_arr'] # use the last one pctWrongTrain = h2o_gbm.pp_cm_summary(cm) print "Last line of this cm might be NAs, not CM" print "\nTrain\n==========\n" print h2o_gbm.pp_cm(cm) # GBM test**************************************** predictKey = 'Predict.hex' h2o_cmd.runInspect(key=parseTestResult['destination_key']) start = time.time() gbmTestResult = h2o_cmd.runPredict( data_key=parseTestResult['destination_key'], model_key=modelKey, destination_key=predictKey, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "GBM predict completed in", elapsed, "seconds. On dataset: ", testFilename if DO_PREDICT_CM: gbmPredictCMResult = h2o.nodes[0].predict_confusion_matrix( actual=parseTestResult['destination_key'], vactual='predict', predict=predictKey, vpredict='predict', # choices are 7 (now) and 'predict' ) # errrs from end of list? is that the last tree? # all we get is cm cm = gbmPredictCMResult['cm'] # These will move into the h2o_gbm.py pctWrong = h2o_gbm.pp_cm_summary(cm) print "Last line of this cm is really NAs, not CM" print "\nTest\n==========\n" print h2o_gbm.pp_cm(cm) # xList.append(ntrees) if 'max_depth' in params and params['max_depth']: xList.append(params['max_depth']) eList.append(pctWrongTrain) fList.append(trainElapsed) xLabel = 'max_depth' eLabel = 'pctWrongTrain' fLabel = 'trainElapsed' eListTitle = "" fListTitle = "" h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
def test_c9_GLM_airlines_fvec(self): h2o.beta_features = True files = [('airlines', 'airlines_all.csv', 'airlines_all.hex', 1800, 'IsDepDelayed')] for importFolderPath, csvFilename, trainKey, timeoutSecs, response in files: # PARSE train**************************************** csvPathname = importFolderPath + "/" + csvFilename start = time.time() parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='local', hex_key=trainKey, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", csvFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseResult['destination_key'] # GLM (train)**************************************** params = { # 'lambda': 1e-4, # 'alpha': 0.5, 'lambda': 1e-8, 'alpha': 0.0, 'max_iter': 30, 'n_folds': 3, 'family': 'binomial', 'destination_key': "GLMKEY", 'response': response, 'ignored_cols': 'CRSDepTime,CRSArrTime,ActualElapsedTime,CRSElapsedTime,AirTime,ArrDelay,DepDelay,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,IsArrDelayed' } kwargs = params.copy() timeoutSecs = 1800 start = time.time() glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) elapsed = time.time() - start print "GLM training completed in", elapsed, "seconds. On dataset: ", csvFilename h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) if h2o.beta_features: modelKey = glm['glm_model']['_key'] submodels = glm['glm_model']['submodels'] # hackery to make it work when there's just one validation = submodels[-1]['validation'] best_threshold = validation['best_threshold'] thresholds = validation['thresholds'] # have to look up the index for the cm, from the thresholds list best_index = None for i, t in enumerate(thresholds): if t == best_threshold: best_index = i break cms = validation['_cms'] cm = cms[best_index] pctWrong = h2o_gbm.pp_cm_summary(cm['_arr']) # FIX! should look at prediction error/class error? # self.assertLess(pctWrong, 9,"Should see less than 40% error") print "\nTrain\n==========\n" print h2o_gbm.pp_cm(cm['_arr']) # Score ******************************* # this messes up if you use case_mode/case_vale above predictKey = 'Predict.hex' start = time.time() predictResult = h2o_cmd.runPredict(data_key=trainKey, model_key=modelKey, destination_key=predictKey, timeoutSecs=timeoutSecs) predictCMResult = h2o.nodes[0].predict_confusion_matrix( actual=trainKey, vactual=response, predict=predictKey, vpredict='predict', ) cm = predictCMResult['cm'] # These will move into the h2o_gbm.py pctWrong = h2o_gbm.pp_cm_summary(cm) # self.assertLess(pctWrong, 40,"Should see less than 40% error") print "\nTest\n==========\n" print h2o_gbm.pp_cm(cm) h2i.delete_keys_at_all_nodes(timeoutSecs=600)
def test_GBM_poker_1m(self): for trial in range(2): # PARSE train**************************************** h2o.beta_features = False #turn off beta_features start = time.time() xList = [] eList = [] fList = [] modelKey = 'GBMModelKey' timeoutSecs = 900 # Parse (train)**************************************** if h2o.beta_features: print "Parsing to fvec directly! Have to noPoll=true!, and doSummary=False!" csvPathname = 'poker/poker-hand-testing.data' hex_key = 'poker-hand-testing.data.hex' parseTrainResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=timeoutSecs, noPoll=h2o.beta_features, doSummary=False) # hack if h2o.beta_features: h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs) print "Filling in the parseTrainResult['destination_key'] for h2o" parseTrainResult['destination_key'] = trainKey elapsed = time.time() - start print "train parse end on ", csvPathname, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "train parse result:", parseTrainResult['destination_key'] # Logging to a benchmark file algo = "Parse" l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvPathname, elapsed) print l h2o.cloudPerfH2O.message(l) # if you set beta_features here, the fvec translate will happen with the Inspect not the GBM # h2o.beta_features = True inspect = h2o_cmd.runInspect(key=parseTrainResult['destination_key']) print "\n" + csvPathname, \ " num_rows:", "{:,}".format(inspect['num_rows']), \ " num_cols:", "{:,}".format(inspect['num_cols']) num_rows = inspect['num_rows'] num_cols = inspect['num_cols'] ### h2o_cmd.runSummary(key=parsTraineResult['destination_key']) # GBM(train iterate)**************************************** h2o.beta_features = True ntrees = 2 for max_depth in [5,10,20]: params = { 'learn_rate': .1, 'nbins': 10, 'ntrees': ntrees, 'max_depth': max_depth, 'min_rows': 10, 'response': num_cols-1, 'ignored_cols_by_name': None, } print "Using these parameters for GBM: ", params kwargs = params.copy() h2o.beta_features = True trainStart = time.time() gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult, noPoll=h2o.beta_features, timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs) # hack if h2o.beta_features: h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs) trainElapsed = time.time() - trainStart print "GBM training completed in", trainElapsed, "seconds. On dataset: ", csvPathname # Logging to a benchmark file algo = "GBM " + " ntrees=" + str(ntrees) + " max_depth=" + str(max_depth) l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvPathname, trainElapsed) print l h2o.cloudPerfH2O.message(l) gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey) # errrs from end of list? is that the last tree? errsLast = gbmTrainView['gbm_model']['errs'][-1] print "GBM 'errsLast'", errsLast cm = gbmTrainView['gbm_model']['cms'][-1]['_arr'] # use the last one pctWrongTrain = h2o_gbm.pp_cm_summary(cm); print "Last line of this cm might be NAs, not CM" print "\nTrain\n==========\n" print h2o_gbm.pp_cm(cm) # xList.append(ntrees) xList.append(max_depth) eList.append(pctWrongTrain) fList.append(trainElapsed) h2o.beta_features = False # just plot the last one if DO_PLOT_IF_KEVIN: xLabel = 'max_depth' eLabel = 'pctWrong' fLabel = 'trainElapsed' eListTitle = "" fListTitle = "" h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
def test_GLM2_mnist(self): h2o.beta_features = True if DO_HDFS: importFolderPath = "mnist" bucket = None schema = 'hdfs' else: importFolderPath = "mnist" bucket = 'home-0xdiag-datasets' schema = 'local' csvFilelist = [ ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600), ] trial = 0 for (trainCsvFilename, testCsvFilename, timeoutSecs) in csvFilelist: trialStart = time.time() # PARSE test**************************************** testKey = testCsvFilename + "_" + str(trial) + ".hex" csvPathname = importFolderPath + "/" + testCsvFilename start = time.time() parseTestResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema=schema, hex_key=testKey, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", testCsvFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseTestResult['destination_key'] print "We won't use this pruning of x on test data. See if it prunes the same as the training" y = 0 # first column is pixel value print "y:" ignoreX = h2o_glm.goodXFromColumnInfo(y, key=parseTestResult['destination_key'], timeoutSecs=300, forRF=True) # PARSE train**************************************** trainKey = trainCsvFilename + "_" + str(trial) + ".hex" csvPathname = importFolderPath + "/" + testCsvFilename start = time.time() parseTrainResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema=schema, hex_key=trainKey, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", trainCsvFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseTrainResult['destination_key'] # GLM**************************************** print "This is the pruned x we'll use" ignoreX = h2o_glm.goodXFromColumnInfo(y, key=parseTrainResult['destination_key'], timeoutSecs=300, forRF=True) print "ignoreX:", ignoreX modelKey = 'GLM_model' params = { 'ignored_cols': ignoreX, 'response': 'C' + str(y), # 'case_mode': '=', # 'case_val': 0, 'family': 'binomial', 'lambda': 0.5, 'alpha': 1e-4, 'max_iter': 15, ## 'thresholds': 0.5, ## 'weight': 1.0, 'n_folds': 1, 'beta_epsilon': 1.0E-4, 'destination_key': modelKey, } if DO_ALL_DIGITS: cases = [0,1,2,3,4,5,6,7,8,9] else: cases = [8] for c in cases: kwargs = params.copy() print "Trying binomial with case:", c # kwargs['case_val'] = c # do the binomial conversion with Exec2, for both training and test (h2o won't work otherwise) if DO_BUG: execExpr="A.hex=%s;A.hex[,%s]=(A.hex[,%s]==%s)" % (trainKey, y+1, y+1, c) h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) else: execExpr="A.hex=%s" % (trainKey) h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) execExpr="A.hex[,%s]=(A.hex[,%s]==%s)" % (y+1, y+1, c) h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) if DO_BUG: execExpr="B.hex=%s;B.hex[,%s]=(B.hex[,%s]==%s)" % (testKey, y+1, y+1, c) h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) else: execExpr="B.hex=%s" % (testKey) h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) execExpr="B.hex[,%s]=(B.hex[,%s]==%s)" % (y+1, y+1, c) h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) timeoutSecs = 1800 start = time.time() aHack = {'destination_key': 'A.hex'} glmFirstResult = h2o_cmd.runGLM(parseResult=aHack, timeoutSecs=timeoutSecs, pollTimeoutSecs=60, noPoll=True, **kwargs) h2o_jobs.pollStatsWhileBusy(timeoutSecs=timeoutSecs, pollTimeoutSecs=60, retryDelaySecs=5) glm = h2o.nodes[0].glm_view(_modelKey=modelKey) elapsed = time.time() - start print "GLM completed in", elapsed, "seconds.", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) h2o_glm.simpleCheckGLM(self, glm, None, noPrint=True, **kwargs) modelKey = glm['glm_model']['_selfKey'] # This seems wrong..what's the format of the cm? if 1==0: cm = glm['glm_model']['submodels'][0]['validation']['_cms'][0]['_arr'] print "cm:", cm pctWrong = h2o_gbm.pp_cm_summary(cm); # self.assertLess(pctWrong, 9,"Should see less than 9% error (class = 4)") print "\nTrain\n==========\n" print h2o_gbm.pp_cm(cm) # Score ******************************* # this messes up if you use case_mode/case_vale above predictKey = 'Predict.hex' start = time.time() predictResult = h2o_cmd.runPredict( data_key='B.hex', model_key=modelKey, destination_key=predictKey, timeoutSecs=timeoutSecs) predictCMResult = h2o.nodes[0].predict_confusion_matrix( actual='B.hex', vactual='C' + str(y), predict=predictKey, vpredict='predict', ) cm = predictCMResult['cm'] # These will move into the h2o_gbm.py pctWrong = h2o_gbm.pp_cm_summary(cm); self.assertLess(pctWrong, 9,"Should see less than 9% error (class = 4)") print "\nTest\n==========\n" print h2o_gbm.pp_cm(cm)
def test_GLM2_enums_score_superset(self): h2o.beta_features = True print "FIX!: this should cause an error. We should detect that it's not causing an error/warning?" SYNDATASETS_DIR = h2o.make_syn_dir() n = 200 tryList = [ (n, 1, 'cD', 300), (n, 2, 'cE', 300), (n, 3, 'cF', 300), (n, 4, 'cG', 300), (n, 5, 'cH', 300), (n, 6, 'cI', 300), ] for (rowCount, colCount, hex_key, timeoutSecs) in tryList: # using the comma is nice to ensure no craziness colSepHexString = '2c' # comma colSepChar = colSepHexString.decode('hex') colSepInt = int(colSepHexString, base=16) print "colSepChar:", colSepChar rowSepHexString = '0a' # newline rowSepChar = rowSepHexString.decode('hex') print "rowSepChar:", rowSepChar SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str( colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename csvScoreFilename = 'syn_enums_score_' + str(rowCount) + 'x' + str( colCount) + '.csv' csvScorePathname = SYNDATASETS_DIR + '/' + csvScoreFilename enumList = create_enum_list(listSize=10) # use half of the enums for creating the scoring dataset enumListForScore = random.sample(enumList, 5) # add a extra enum for scoring that's not in the model enumList enumListForScore.append("xyzzy") print "Creating random", csvPathname, "for glm model building" write_syn_dataset(csvPathname, enumList, rowCount, colCount, SEEDPERFILE, colSepChar=colSepChar, rowSepChar=rowSepChar) print "Creating random", csvScorePathname, "for glm scoring with prior model (using enum subset)" write_syn_dataset(csvScorePathname, enumListForScore, rowCount, colCount, SEEDPERFILE, colSepChar=colSepChar, rowSepChar=rowSepChar) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30, separator=colSepInt) print "Parse result['destination_key']:", parseResult[ 'destination_key'] print "\n" + csvFilename (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \ h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=True) y = colCount modelKey = 'enums' kwargs = { 'destination_key': modelKey, 'response': y, 'max_iter': 1, 'n_folds': 1, 'alpha': 0.2, 'lambda': 1e-5, 'family': 'binomial' } start = time.time() glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=180, **kwargs) print "glm end on ", parseResult[ 'destination_key'], 'took', time.time() - start, 'seconds' h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) scoreDataKey = "score_" + hex_key parseResult = h2i.import_parse(path=csvScorePathname, schema='put', hex_key=scoreDataKey, timeoutSecs=30, separator=colSepInt) # Score ******************************* # this messes up if you use case_mode/case_vale above predictKey = 'Predict.hex' start = time.time() predictResult = h2o_cmd.runPredict(data_key=scoreDataKey, model_key=modelKey, destination_key=predictKey, timeoutSecs=timeoutSecs) # just get a predict and AUC on the same data. has to be binomial result resultAUC = h2o.nodes[0].generate_auc(thresholds=None, actual=scoreDataKey, predict='Predict.hex', vactual=y, vpredict=1) auc = resultAUC['AUC'] self.assertAlmostEqual( auc, 0.5, delta=0.15, msg="actual auc: %s not close enough to 0.5" % auc) predictCMResult = h2o.nodes[0].predict_confusion_matrix( actual=scoreDataKey, predict=predictKey, vactual='C' + str(y + 1), vpredict='predict', ) cm = predictCMResult['cm'] # These will move into the h2o_gbm.py pctWrong = h2o_gbm.pp_cm_summary(cm) print "\nTest\n==========\n" print h2o_gbm.pp_cm(cm)
def test_GLM2_covtype_train(self): h2o.beta_features = True importFolderPath = "standard" csvFilename = 'covtype.shuffled.data' csvPathname = importFolderPath + "/" + csvFilename hex_key = csvFilename + ".hex" # Parse and Exec************************************************ parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=180) execExpr="A.hex=%s" % parseResult['destination_key'] h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) # use exec to change the output col to binary, case_mode/case_val doesn't work if we use predict # will have to live with random extract. will create variance # class 4 = 1, everything else 0 y = 54 execExpr="A.hex[,%s]=(A.hex[,%s]==%s)" % (y+1, y+1, 4) h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) inspect = h2o_cmd.runInspect(key="A.hex") print "\n" + csvPathname, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) # Split Test/Train************************************************ # how many rows for each pct? numRows = inspect['numRows'] pct10 = int(numRows * .1) rowsForPct = [i * pct10 for i in range(0,11)] # this can be slightly less than 10% last10 = numRows - rowsForPct[9] rowsForPct[10] = last10 # use mod below for picking "rows-to-do" in case we do more than 9 trials # use 10 if 0 just to see (we copied 10 to 0 above) rowsForPct[0] = rowsForPct[10] print "Creating the key of the last 10% data, for scoring" trainDataKey = "rTrain" testDataKey = "rTest" # start at 90% rows + 1 # GLM, predict, CM*******************************************************8 kwargs = { 'response': 'C' + str(y+1), 'max_iter': 20, 'n_folds': 0, 'alpha': 0.1, 'lambda': 1e-5, 'family': 'binomial', } timeoutSecs = 180 for trial in range(10): # always slice from the beginning rowsToUse = rowsForPct[trial%10] # test/train split **********************************************8 h2o_cmd.createTestTrain(srcKey='A.hex', trainDstKey=trainDataKey, testDstKey=testDataKey, trainPercent=90) aHack = {'destination_key': trainDataKey} parseKey = trainDataKey # GLM **********************************************8 start = time.time() glm = h2o_cmd.runGLM(parseResult=aHack, timeoutSecs=timeoutSecs, pollTimeoutSecs=180, **kwargs) print "glm end on ", parseResult['destination_key'], 'took', time.time() - start, 'seconds' h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) modelKey = glm['glm_model']['_key'] # Score ********************************************** predictKey = 'Predict.hex' start = time.time() predictResult = h2o_cmd.runPredict( data_key=testDataKey, model_key=modelKey, destination_key=predictKey, timeoutSecs=timeoutSecs) predictCMResult = h2o.nodes[0].predict_confusion_matrix( actual=testDataKey, vactual='C' + str(y+1), predict=predictKey, vpredict='predict', ) cm = predictCMResult['cm'] # These will move into the h2o_gbm.py pctWrong = h2o_gbm.pp_cm_summary(cm); self.assertLess(pctWrong, 8,"Should see less than 7% error (class = 4)") print "\nTest\n==========\n" print h2o_gbm.pp_cm(cm) print "Trial #", trial, "completed", "using %6.2f" % (rowsToUse*100.0/numRows), "pct. of all rows"
def test_GBM_mnist_fvec(self): h2o.beta_features = True importFolderPath = "mnist" csvFilename = "mnist_training.csv.gz" timeoutSecs = 1800 trialStart = time.time() # PARSE train**************************************** trainKey = csvFilename + "_" + ".hex" start = time.time() parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=importFolderPath + "/" + csvFilename, schema='put', hex_key=trainKey, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", csvFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseResult['destination_key'] # GBM (train)**************************************** modelKey = "GBM_model" params = { 'classification': 1, # faster? 'destination_key': modelKey, 'learn_rate': .1, 'ntrees': 3, 'max_depth': 8, 'min_rows': 1, 'response': 0, # this dataset has the response in the last col (0-9 to check) # 'ignored_cols_by_name': range(200,784) # only use the first 200 for speed? } kwargs = params.copy() timeoutSecs = 1800 #noPoll -> False when GBM finished start = time.time() GBMFirstResult = h2o_cmd.runGBM(parseResult=parseResult, noPoll=True, **kwargs) h2o_jobs.pollStatsWhileBusy(timeoutSecs=1200, pollTimeoutSecs=120, retryDelaySecs=5) elapsed = time.time() - start print "GBM training completed in", elapsed, "seconds.", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey) errsLast = gbmTrainView['gbm_model']['errs'][-1] print "GBM 'errsLast'", errsLast if DO_CLASSIFICATION: cms = gbmTrainView['gbm_model']['cms'] cm = cms[-1]['_arr'] # use the last one print "GBM cms[-1]['_predErr']:", cms[-1]['_predErr'] print "GBM cms[-1]['_classErr']:", cms[-1]['_classErr'] pctWrongTrain = h2o_gbm.pp_cm_summary(cm) print "\nTrain\n==========\n" print h2o_gbm.pp_cm(cm) else: print "GBMTrainView:", h2o.dump_json( gbmTrainView['gbm_model']['errs'])
def test_GBM_poker_1m(self): h2o.beta_features = True for trial in range(2): # PARSE train**************************************** h2o.beta_features = False #turn off beta_features start = time.time() xList = [] eList = [] fList = [] modelKey = 'GBMModelKey' timeoutSecs = 900 # Parse (train)**************************************** if h2o.beta_features: print "Parsing to fvec directly! Have to noPoll=true!, and doSummary=False!" csvPathname = 'poker/poker-hand-testing.data' hex_key = 'poker-hand-testing.data.hex' parseTrainResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=timeoutSecs, noPoll=h2o.beta_features, doSummary=False) # hack if h2o.beta_features: h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs) print "Filling in the parseTrainResult['destination_key'] for h2o" parseTrainResult['destination_key'] = trainKey elapsed = time.time() - start print "train parse end on ", csvPathname, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "train parse result:", parseTrainResult['destination_key'] # Logging to a benchmark file algo = "Parse" l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvPathname, elapsed) print l h2o.cloudPerfH2O.message(l) # if you set beta_features here, the fvec translate will happen with the Inspect not the GBM # h2o.beta_features = True inspect = h2o_cmd.runInspect(key=parseTrainResult['destination_key']) print "\n" + csvPathname, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) numRows = inspect['numRows'] numCols = inspect['numCols'] ### h2o_cmd.runSummary(key=parsTraineResult['destination_key']) # GBM(train iterate)**************************************** h2o.beta_features = True ntrees = 2 for max_depth in [5,10,20]: params = { 'learn_rate': .1, 'nbins': 10, 'ntrees': ntrees, 'max_depth': max_depth, 'min_rows': 10, 'response': numCols-1, 'ignored_cols_by_name': None, } print "Using these parameters for GBM: ", params kwargs = params.copy() h2o.beta_features = True trainStart = time.time() gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult, noPoll=h2o.beta_features, timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs) # hack if h2o.beta_features: h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs) trainElapsed = time.time() - trainStart print "GBM training completed in", trainElapsed, "seconds. On dataset: ", csvPathname # Logging to a benchmark file algo = "GBM " + " ntrees=" + str(ntrees) + " max_depth=" + str(max_depth) l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvPathname, trainElapsed) print l h2o.cloudPerfH2O.message(l) gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey) # errrs from end of list? is that the last tree? errsLast = gbmTrainView['gbm_model']['errs'][-1] print "GBM 'errsLast'", errsLast cm = gbmTrainView['gbm_model']['cms'][-1]['_arr'] # use the last one pctWrongTrain = h2o_gbm.pp_cm_summary(cm); print "Last line of this cm might be NAs, not CM" print "\nTrain\n==========\n" print h2o_gbm.pp_cm(cm) # xList.append(ntrees) xList.append(max_depth) eList.append(pctWrongTrain) fList.append(trainElapsed) h2o.beta_features = False # just plot the last one if DO_PLOT_IF_KEVIN: xLabel = 'max_depth' eLabel = 'pctWrong' fLabel = 'trainElapsed' eListTitle = "" fListTitle = "" h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
def test_DeepLearning_mnist(self): #h2b.browseTheCloud() csvPathname_train = 'mnist/train.csv.gz' csvPathname_test = 'mnist/test.csv.gz' hex_key = 'mnist_train.hex' validation_key = 'mnist_test.hex' timeoutSecs = 300 parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname_train, schema='put', hex_key=hex_key, timeoutSecs=timeoutSecs) parseResultV = h2i.import_parse(bucket='smalldata', path=csvPathname_test, schema='put', hex_key=validation_key, timeoutSecs=timeoutSecs) inspect = h2o_cmd.runInspect(None, hex_key) print "\n" + csvPathname_train, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) response = inspect['numCols'] - 1 #Making random id identifier = ''.join(random.sample(string.ascii_lowercase + string.digits, 10)) model_key = 'deeplearning_' + identifier + '.hex' kwargs = { 'ignored_cols' : None, 'response' : response, 'classification' : 1, 'activation' : 'RectifierWithDropout', 'input_dropout_ratio' : 0.2, 'hidden' : '1024,1024,2048', 'adaptive_rate' : 1, 'rho' : 0.99, 'epsilon' : 1e-8, 'train_samples_per_iteration' : -1, ## 0: better accuracy! -1: best scalability! 10000: best accuracy? # 'rate' : 0.01, # 'rate_annealing' : 1e-6, # 'momentum_start' : 0.5, # 'momentum_ramp' : 1800000, # 'momentum_stable' : 0.99, 'l1' : 1e-5, 'l2' : 0.0, 'seed' : 98037452452, 'loss' : 'CrossEntropy', 'max_w2' : 15, 'initial_weight_distribution' : 'UniformAdaptive', 'epochs' : 128, #enough for 64 nodes 'destination_key' : model_key, 'validation' : validation_key, 'score_interval' : 10000 #don't score until the end } timeoutSecs = 7200 start = time.time() deeplearning = h2o_cmd.runDeepLearning(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) print "neural net end on ", csvPathname_train, " and ", csvPathname_test, 'took', time.time() - start, 'seconds' predict_key = 'score_' + identifier + '.hex' kwargs = { 'data_key': validation_key, 'destination_key': predict_key, 'model_key': model_key } predictResult = h2o_cmd.runPredict(timeoutSecs=timeoutSecs, **kwargs) h2o_cmd.runInspect(key=predict_key, verbose=True) kwargs = { } predictCMResult = h2o.nodes[0].predict_confusion_matrix( actual=validation_key, vactual=response, predict=predict_key, vpredict='predict', timeoutSecs=timeoutSecs, **kwargs) cm = predictCMResult['cm'] print h2o_gbm.pp_cm(cm) actualErr = h2o_gbm.pp_cm_summary(cm)/100.; print "actual classification error:" + format(actualErr)
def test_GBM_manyfiles_train_test(self): h2o.beta_features = True bucket = 'home-0xdiag-datasets' modelKey = 'GBMModelKey' if localhost: files = [ # None forces numCols to be used. assumes you set it from Inspect # problems with categoricals not in the train data set? (warnings in h2o stdout) ## ('manyfiles-nflx-gz', 'file_1.dat.gz', 'file_1.hex', 1800, None, 'file_11.dat.gz', 'test.hex') # just use matching ('manyfiles-nflx-gz', 'file_1.dat.gz', 'train.hex', 1800, None, 'file_1.dat.gz', 'test.hex') ] else: files = [ # None forces numCols to be used. assumes you set it from Inspect ('manyfiles-nflx-gz', 'file_[0-9].dat.gz', 'train.hex', 1800, None, 'file_1[0-9].dat.gz', 'test.hex') ] # if I got to hdfs, it's here # hdfs://192.168.1.176/datasets/manyfiles-nflx-gz/file_99.dat.gz h2b.browseTheCloud() for (importFolderPath, trainFilename, trainKey, timeoutSecs, response, testFilename, testKey) in files: # PARSE train**************************************** start = time.time() xList = [] eList = [] fList = [] # Parse (train)**************************************** csvPathname = importFolderPath + "/" + trainFilename parseTrainResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local', hex_key=trainKey, timeoutSecs=timeoutSecs, doSummary=False) elapsed = time.time() - start print "train parse end on ", trainFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "train parse result:", parseTrainResult['destination_key'] ### h2o_cmd.runSummary(key=parsTraineResult['destination_key']) inspect = h2o_cmd.runInspect(key=parseTrainResult['destination_key']) print "\n" + csvPathname, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) numRows = inspect['numRows'] numCols = inspect['numCols'] # Make col 378 it something we can do binomial regression on! execExpr = '%s[,378+1]=%s[,378+1]>15' % (trainKey, trainKey) resultExec = h2o_cmd.runExec(str=execExpr, timeoutSecs=60) # Parse (test)**************************************** parseTestResult = h2i.import_parse(bucket=bucket, path=importFolderPath + "/" + testFilename, schema='local', hex_key=testKey, timeoutSecs=timeoutSecs, doSummary=False) elapsed = time.time() - start print "test parse end on ", testFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "test parse result:", parseTestResult['destination_key'] # Make col 378 it something we can do binomial regression on! execExpr = '%s[,378+1]=%s[,378+1]>15' % (testKey, testKey) resultExec = h2o_cmd.runExec(str=execExpr, timeoutSecs=60) # Note ..no inspect of test data here..so translate happens later? # GBM (train iterate)**************************************** # if not response: # response = numCols - 1 # response = 378 response = 'C379' # randomly ignore a bunch of cols, just to make it go faster x = range(numCols) del x[response] ignored_cols_by_name = ",".join(map(lambda x: 'C' + str(x), random.sample(x, 300))) print "Using the same response %s for train and test (which should have a output value too)" % response ntrees = 10 # ignore 200 random cols (not the response) for max_depth in [5, 40]: params = { 'learn_rate': .2, 'nbins': 1024, 'ntrees': ntrees, 'max_depth': max_depth, 'min_rows': 10, 'response': 'C' + str(response), 'ignored_cols_by_name': ignored_cols_by_name, } ### print "Using these parameters for GBM: ", params kwargs = params.copy() # GBM train**************************************** trainStart = time.time() gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult, timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs) # hack trainElapsed = time.time() - trainStart print "GBM training completed in", trainElapsed, "seconds. On dataset: ", trainFilename gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey) # errrs from end of list? is that the last tree? errsLast = gbmTrainView['gbm_model']['errs'][-1] print "GBM 'errsLast'", errsLast cm = gbmTrainView['gbm_model']['cms'][-1]['_arr'] # use the last one pctWrongTrain = h2o_gbm.pp_cm_summary(cm); print "Last line of this cm might be NAs, not CM" print "\nTrain\n==========\n" print h2o_gbm.pp_cm(cm) # GBM test**************************************** predictKey = 'Predict.hex' ### h2o_cmd.runInspect(key=parseTestResult['destination_key']) start = time.time() gbmTestResult = h2o_cmd.runPredict( data_key=parseTestResult['destination_key'], model_key=modelKey, destination_key=predictKey, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "GBM predict completed in", elapsed, "seconds. On dataset: ", testFilename gbmPredictCMResult =h2o.nodes[0].predict_confusion_matrix( actual=parseTestResult['destination_key'], vactual='C' + str(response), predict=predictKey, vpredict='predict', # choices are 0 and 'predict' ) # errrs from end of list? is that the last tree? # all we get is cm cm = gbmPredictCMResult['cm'] # These will move into the h2o_gbm.py pctWrong = h2o_gbm.pp_cm_summary(cm); print "Last line of this cm is really NAs, not CM" print "\nTest\n==========\n" print h2o_gbm.pp_cm(cm) # xList.append(ntrees) xList.append(max_depth) eList.append(pctWrong) fList.append(trainElapsed) xLabel = 'max_depth' eLabel = 'pctWrong' fLabel = 'trainElapsed' eListTitle = "" fListTitle = "" h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
def test_DeepLearning_twovalues(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() csvFilename = "syn_twovalues.csv" csvPathname = SYNDATASETS_DIR + '/' + csvFilename rowDataTrue = "1, 0, 65, 1, 2, 1, 1, 4, 1, 4, 1, 4" rowDataFalse = "0, 1, 0, -1, -2, -1, -1, -4, -1, -4, -1, -4" twoValueList = [ ('A', 'B', 0, 14), ('A', 'B', 1, 14), (0, 1, 0, 12), (0, 1, 1, 12), (0, 1, 'NaN', 12), (1, 0, 'NaN', 12), (-1, 1, 0, 12), (-1, 1, 1, 12), (-1e1, 1e1, 1e1, 12), (-1e1, 1e1, -1e1, 12), ] trial = 0 for (outputTrue, outputFalse, case, coeffNum) in twoValueList: write_syn_dataset(csvPathname, 20, rowDataTrue, rowDataFalse, str(outputTrue), str(outputFalse)) start = time.time() hex_key = csvFilename + "_" + str(trial) model_key = 'trial_' + str(trial) + '.hex' validation_key = hex_key parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key) print "using outputTrue: %s outputFalse: %s" % (outputTrue, outputFalse) inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print "\n" + csvPathname, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) response = inspect['numCols'] - 1 kwargs = { 'ignored_cols': None, 'response': 'C' + str(response), 'classification': 1, 'activation': 'Tanh', #'input_dropout_ratio' : 0.2, 'hidden': '500', 'rate': 0.01, 'rate_annealing': 1e-6, 'momentum_start': 0, 'momentum_stable': 0, 'l1': 0.0, 'l2': 1e-4, 'seed': 80023842348, 'loss': 'CrossEntropy', #'max_w2' : 15, #'warmup_samples' : 0, 'initial_weight_distribution': 'UniformAdaptive', #'initial_weight_scale' : 0.01, 'epochs': 1.0, 'destination_key': model_key, 'validation': hex_key, } timeoutSecs = 60 start = time.time() h2o_cmd.runDeepLearning(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) print "trial #", trial, "Deep Learning end on ", csvFilename, ' took', time.time( ) - start, 'seconds' #### Now score using the model, and check the validation error expectedErr = 0.001 relTol = 0.01 predict_key = 'Predict.hex' kwargs = { 'data_key': validation_key, 'destination_key': predict_key, 'model_key': model_key } predictResult = h2o_cmd.runPredict(timeoutSecs=timeoutSecs, **kwargs) h2o_cmd.runInspect(key=predict_key, verbose=True) kwargs = {} predictCMResult = h2o.nodes[0].predict_confusion_matrix( actual=validation_key, vactual=response, predict=predict_key, vpredict='predict', timeoutSecs=timeoutSecs, **kwargs) cm = predictCMResult['cm'] print h2o_gbm.pp_cm(cm) actualErr = h2o_gbm.pp_cm_summary(cm) / 100. print "actual classification error:" + format(actualErr) print "expected classification error:" + format(expectedErr) if actualErr != expectedErr and abs( (expectedErr - actualErr) / expectedErr) > relTol: raise Exception( "Scored classification error of %s is not within %s %% relative error of %s" % (actualErr, float(relTol) * 100, expectedErr)) trial += 1