def test_GBM_regression_rand2(self): h2o.beta_features = True bucket = 'home-0xdiag-datasets' modelKey = 'GBMModelKey' files = [ # ('standard', 'covtype.shuffled.90pct.data', 'covtype.train.hex', 1800, 54, 'covtype.shuffled.10pct.data', 'covtype.test.hex') ('standard', 'covtype.shuffled.10pct.sorted.data', 'covtype.train.hex', 1800, 'C54', 'covtype.shuffled.10pct.data', 'covtype.test.hex') ] for (importFolderPath, trainFilename, trainKey, timeoutSecs, response, testFilename, testKey) in files: # PARSE train**************************************** start = time.time() xList = [] eList = [] fList = [] # Parse (train)**************************************** parseTrainResult = h2i.import_parse(bucket=bucket, path=importFolderPath + "/" + trainFilename, schema='local', hex_key=trainKey, timeoutSecs=timeoutSecs, doSummary=False) elapsed = time.time() - start print "train parse end on ", trainFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "train parse result:", trainKey # Parse (test)**************************************** parseTestResult = h2i.import_parse(bucket=bucket, path=importFolderPath + "/" + testFilename, schema='local', hex_key=testKey, timeoutSecs=timeoutSecs, doSummary=False) elapsed = time.time() - start print "test parse end on ", testFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "test parse result:", testKey paramsDict = define_gbm_params() for trial in range(3): # use this to set any defaults you want if the pick doesn't set print "Regression!" params = {'response': 'C54', 'ignored_cols_by_name': 'C5,C6,C7,C8,C9', 'ntrees': 2, 'classification': 0} h2o_gbm.pickRandGbmParams(paramsDict, params) print "Using these parameters for GBM: ", params kwargs = params.copy() # GBM train**************************************** trainStart = time.time() gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult, timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs) trainElapsed = time.time() - trainStart print "GBM training completed in", trainElapsed, "seconds. On dataset: ", trainFilename gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey) # errrs from end of list? is that the last tree? errsLast = gbmTrainView['gbm_model']['errs'][-1] print "GBM 'errsLast'", errsLast # for regression, the cms are all null, so don't print # GBM test**************************************** predictKey = 'Predict.hex' start = time.time() gbmTestResult = h2o_cmd.runPredict( data_key=testKey, model_key=modelKey, destination_key=predictKey, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "GBM predict completed in", elapsed, "seconds. On dataset: ", testFilename print "FIX! where do we get the summary info on the test data after predict?"
def test_GBM_regression_rand2(self): bucket = 'home-0xdiag-datasets' modelKey = 'GBMModelKey' files = [ # ('standard', 'covtype.shuffled.90pct.data', 'covtype.train.hex', 1800, 54, 'covtype.shuffled.10pct.data', 'covtype.test.hex') ('standard', 'covtype.shuffled.10pct.sorted.data', 'covtype.train.hex', 1800, 'C55', 'covtype.shuffled.10pct.data', 'covtype.test.hex') ] for (importFolderPath, trainFilename, trainKey, timeoutSecs, response, testFilename, testKey) in files: # PARSE train**************************************** start = time.time() xList = [] eList = [] fList = [] # Parse (train)**************************************** parseTrainResult = h2i.import_parse(bucket=bucket, path=importFolderPath + "/" + trainFilename, schema='local', hex_key=trainKey, timeoutSecs=timeoutSecs, doSummary=False) elapsed = time.time() - start print "train parse end on ", trainFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "train parse result:", trainKey # Parse (test)**************************************** parseTestResult = h2i.import_parse(bucket=bucket, path=importFolderPath + "/" + testFilename, schema='local', hex_key=testKey, timeoutSecs=timeoutSecs, doSummary=False) elapsed = time.time() - start print "test parse end on ", testFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "test parse result:", testKey paramsDict = define_gbm_params() for trial in range(3): # use this to set any defaults you want if the pick doesn't set print "Regression!" params = { 'response': 'C55', # 'ignored_cols_by_name': 'C5,C6,C7,C8,C9', 'ntrees': 2, 'classification': 0, 'validation': testKey, } h2o_gbm.pickRandGbmParams(paramsDict, params) print "Using these parameters for GBM: ", params kwargs = params.copy() # GBM train**************************************** trainStart = time.time() gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult, timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs) trainElapsed = time.time() - trainStart print "GBM training completed in", trainElapsed, "seconds. On dataset: ", trainFilename gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey) print "gbmTrainView:", h2o.dump_json(gbmTrainView) # errrs from end of list? is that the last tree? errsLast = gbmTrainView['gbm_model']['errs'][-1] print "GBM 'errsLast'", errsLast # for regression, the cms are all null, so don't print # GBM test**************************************** predictKey = 'Predict.hex' start = time.time() gbmTestResult = h2o_cmd.runPredict(data_key=testKey, model_key=modelKey, destination_key=predictKey, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "GBM predict completed in", elapsed, "seconds. On dataset: ", testFilename print "FIX! where do we get the summary info on the test data after predict?"
def test_GBM_params_rand2(self): bucket = 'home-0xdiag-datasets' modelKey = 'GBMModelKey' files = [ # ('standard', 'covtype.shuffled.90pct.data', 'covtype.train.hex', 1800, 54, 'covtype.shuffled.10pct.data', 'covtype.test.hex') ('standard', 'covtype.shuffled.10pct.sorted.data', 'covtype.train.hex', 1800, 54, 'covtype.shuffled.10pct.data', 'covtype.test.hex') ] for (importFolderPath, trainFilename, trainKey, timeoutSecs, response, testFilename, testKey) in files: # PARSE train**************************************** start = time.time() xList = [] eList = [] fList = [] # Parse (train)**************************************** parseTrainResult = h2i.import_parse(bucket=bucket, path=importFolderPath + "/" + trainFilename, schema='local', hex_key=trainKey, timeoutSecs=timeoutSecs, doSummary=False) elapsed = time.time() - start print "train parse end on ", trainFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "train parse result:", parseTrainResult['destination_key'] # Parse (test)**************************************** parseTestResult = h2i.import_parse(bucket=bucket, path=importFolderPath + "/" + testFilename, schema='local', hex_key=testKey, timeoutSecs=timeoutSecs, doSummary=False) elapsed = time.time() - start print "test parse end on ", testFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "test parse result:", parseTestResult['destination_key'] # GBM (train iterate)**************************************** inspect = h2o_cmd.runInspect( key=parseTestResult['destination_key']) paramsDict = define_gbm_params() for trial in range(3): # translate it (only really need to do once . out of loop? h2o_cmd.runInspect(key=parseTrainResult['destination_key']) ### h2o_cmd.runSummary(key=parsTraineResult['destination_key']) # use this to set any defaults you want if the pick doesn't set params = { 'response': 54, 'ignored_cols_by_name': 'C1,C2,C3,C4,C5', 'ntrees': 2, 'validation': parseTestResult['destination_key'], } h2o_gbm.pickRandGbmParams(paramsDict, params) print "Using these parameters for GBM: ", params kwargs = params.copy() # GBM train**************************************** trainStart = time.time() gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult, timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs) trainElapsed = time.time() - trainStart print "GBM training completed in", trainElapsed, "seconds. On dataset: ", trainFilename gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey) # errrs from end of list? is that the last tree? errsLast = gbmTrainView['gbm_model']['errs'][-1] print "GBM 'errsLast'", errsLast cm = gbmTrainView['gbm_model']['cms'][-1][ '_arr'] # use the last one pctWrongTrain = h2o_gbm.pp_cm_summary(cm) print "Last line of this cm might be NAs, not CM" print "\nTrain\n==========\n" print h2o_gbm.pp_cm(cm) # GBM test**************************************** predictKey = 'Predict.hex' h2o_cmd.runInspect(key=parseTestResult['destination_key']) start = time.time() gbmTestResult = h2o_cmd.runPredict( data_key=parseTestResult['destination_key'], model_key=modelKey, destination_key=predictKey, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "GBM predict completed in", elapsed, "seconds. On dataset: ", testFilename if DO_PREDICT_CM: gbmPredictCMResult = h2o.nodes[0].predict_confusion_matrix( actual=parseTestResult['destination_key'], vactual='predict', predict=predictKey, vpredict='predict', # choices are 7 (now) and 'predict' ) # errrs from end of list? is that the last tree? # all we get is cm cm = gbmPredictCMResult['cm'] # These will move into the h2o_gbm.py pctWrong = h2o_gbm.pp_cm_summary(cm) print "Last line of this cm is really NAs, not CM" print "\nTest\n==========\n" print h2o_gbm.pp_cm(cm) # xList.append(ntrees) if 'max_depth' in params and params['max_depth']: xList.append(params['max_depth']) eList.append(pctWrongTrain) fList.append(trainElapsed) xLabel = 'max_depth' eLabel = 'pctWrongTrain' fLabel = 'trainElapsed' eListTitle = "" fListTitle = "" h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
def test_GBM_params_rand2(self): h2o.beta_features = False bucket = 'home-0xdiag-datasets' modelKey = 'GBMModelKey' files = [ # ('standard', 'covtype.shuffled.90pct.data', 'covtype.train.hex', 1800, 54, 'covtype.shuffled.10pct.data', 'covtype.test.hex') ('standard', 'covtype.shuffled.10pct.sorted.data', 'covtype.train.hex', 1800, 54, 'covtype.shuffled.10pct.data', 'covtype.test.hex') ] for (importFolderPath, trainFilename, trainKey, timeoutSecs, response, testFilename, testKey) in files: h2o.beta_features = False #turn off beta_features # PARSE train**************************************** start = time.time() xList = [] eList = [] fList = [] # Parse (train)**************************************** if h2o.beta_features: print "Parsing to fvec directly! Have to noPoll=true!, and doSummary=False!" parseTrainResult = h2i.import_parse(bucket=bucket, path=importFolderPath + "/" + trainFilename, schema='local', hex_key=trainKey, timeoutSecs=timeoutSecs, noPoll=h2o.beta_features, doSummary=False) # hack if h2o.beta_features: h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs) print "Filling in the parseTrainResult['destination_key'] for h2o" parseTrainResult['destination_key'] = trainKey elapsed = time.time() - start print "train parse end on ", trainFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "train parse result:", parseTrainResult['destination_key'] # Parse (test)**************************************** if h2o.beta_features: print "Parsing to fvec directly! Have to noPoll=true!, and doSummary=False!" parseTestResult = h2i.import_parse(bucket=bucket, path=importFolderPath + "/" + testFilename, schema='local', hex_key=testKey, timeoutSecs=timeoutSecs, noPoll=h2o.beta_features, doSummary=False) # hack if h2o.beta_features: h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs) print "Filling in the parseTestResult['destination_key'] for h2o" parseTestResult['destination_key'] = testKey elapsed = time.time() - start print "test parse end on ", testFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "test parse result:", parseTestResult['destination_key'] # GBM (train iterate)**************************************** inspect = h2o_cmd.runInspect(key=parseTestResult['destination_key']) paramsDict = define_gbm_params() for trial in range(3): h2o.beta_features = True # translate it (only really need to do once . out of loop? h2o_cmd.runInspect(key=parseTrainResult['destination_key']) ### h2o_cmd.runSummary(key=parsTraineResult['destination_key']) # use this to set any defaults you want if the pick doesn't set params = { 'response': 54, 'ignored_cols_by_name': '0,1,2,3,4', 'ntrees': 2, 'validation': parseTestResult['destination_key'], } h2o_gbm.pickRandGbmParams(paramsDict, params) print "Using these parameters for GBM: ", params kwargs = params.copy() # GBM train**************************************** trainStart = time.time() gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult, noPoll=True, timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs) # hack if h2o.beta_features: h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs) trainElapsed = time.time() - trainStart print "GBM training completed in", trainElapsed, "seconds. On dataset: ", trainFilename gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey) # errrs from end of list? is that the last tree? errsLast = gbmTrainView['gbm_model']['errs'][-1] print "GBM 'errsLast'", errsLast cm = gbmTrainView['gbm_model']['cms'][-1] # use the last one pctWrongTrain = h2o_gbm.pp_cm_summary(cm); print "Last line of this cm might be NAs, not CM" print "\nTrain\n==========\n" print h2o_gbm.pp_cm(cm) # GBM test**************************************** predictKey = 'Predict.hex' h2o_cmd.runInspect(key=parseTestResult['destination_key']) start = time.time() gbmTestResult = h2o_cmd.runPredict( data_key=parseTestResult['destination_key'], model_key=modelKey, destination_key=predictKey, timeoutSecs=timeoutSecs) # hack if h2o.beta_features: h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs) elapsed = time.time() - start print "GBM predict completed in", elapsed, "seconds. On dataset: ", testFilename if DO_PREDICT_CM: gbmPredictCMResult = h2o.nodes[0].predict_confusion_matrix( actual=parseTestResult['destination_key'], vactual='predict', predict=predictKey, vpredict='predict', # choices are 7 (now) and 'predict' ) # errrs from end of list? is that the last tree? # all we get is cm cm = gbmPredictCMResult['cms'][-1] # use the last one # These will move into the h2o_gbm.py pctWrong = h2o_gbm.pp_cm_summary(cm); print "Last line of this cm is really NAs, not CM" print "\nTest\n==========\n" print h2o_gbm.pp_cm(cm) # xList.append(ntrees) if 'max_depth' in params and params['max_depth']: xList.append(params['max_depth']) eList.append(pctWrongTrain) fList.append(trainElapsed) h2o.beta_features = False xLabel = 'max_depth' eLabel = 'pctWrongTrain' fLabel = 'trainElapsed' eListTitle = "" fListTitle = "" h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
def test_GBM_params_rand2(self): h2o.beta_features = False bucket = 'home-0xdiag-datasets' modelKey = 'GBMModelKey' files = [ # ('standard', 'covtype.shuffled.90pct.data', 'covtype.train.hex', 1800, 54, 'covtype.shuffled.10pct.data', 'covtype.test.hex') ('standard', 'covtype.shuffled.10pct.sorted.data', 'covtype.train.hex', 1800, 54, 'covtype.shuffled.10pct.data', 'covtype.test.hex') ] for (importFolderPath, trainFilename, trainKey, timeoutSecs, response, testFilename, testKey) in files: h2o.beta_features = False #turn off beta_features # PARSE train**************************************** start = time.time() xList = [] eList = [] fList = [] # Parse (train)**************************************** if h2o.beta_features: print "Parsing to fvec directly! Have to noPoll=true!, and doSummary=False!" parseTrainResult = h2i.import_parse(bucket=bucket, path=importFolderPath + "/" + trainFilename, schema='local', hex_key=trainKey, timeoutSecs=timeoutSecs, noPoll=h2o.beta_features, doSummary=False) # hack if h2o.beta_features: h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs) print "Filling in the parseTrainResult['destination_key'] for h2o" parseTrainResult['destination_key'] = trainKey elapsed = time.time() - start print "train parse end on ", trainFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "train parse result:", parseTrainResult['destination_key'] # Parse (test)**************************************** if h2o.beta_features: print "Parsing to fvec directly! Have to noPoll=true!, and doSummary=False!" parseTestResult = h2i.import_parse(bucket=bucket, path=importFolderPath + "/" + testFilename, schema='local', hex_key=testKey, timeoutSecs=timeoutSecs, noPoll=h2o.beta_features, doSummary=False) # hack if h2o.beta_features: h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs) print "Filling in the parseTestResult['destination_key'] for h2o" parseTestResult['destination_key'] = testKey elapsed = time.time() - start print "test parse end on ", testFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "test parse result:", parseTestResult['destination_key'] # GBM (train iterate)**************************************** inspect = h2o_cmd.runInspect(key=parseTestResult['destination_key']) paramsDict = define_gbm_params() for trial in range(3): h2o.beta_features = True # translate it (only really need to do once . out of loop? h2o_cmd.runInspect(key=parseTrainResult['destination_key']) ### h2o_cmd.runSummary(key=parsTraineResult['destination_key']) # use this to set any defaults you want if the pick doesn't set print "Regression!" params = {'response': 54, 'ignored_cols_by_name': '5,6,7,8,9', 'ntrees': 2, 'classification': 0} h2o_gbm.pickRandGbmParams(paramsDict, params) print "Using these parameters for GBM: ", params kwargs = params.copy() # GBM train**************************************** trainStart = time.time() gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult, noPoll=True, timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs) # hack if h2o.beta_features: h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs) trainElapsed = time.time() - trainStart print "GBM training completed in", trainElapsed, "seconds. On dataset: ", trainFilename gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey) # errrs from end of list? is that the last tree? errsLast = gbmTrainView['gbm_model']['errs'][-1] print "GBM 'errsLast'", errsLast cm = gbmTrainView['gbm_model']['cm'] pctWrongTrain = h2o_gbm.pp_cm_summary(cm); print "Last line of this cm might be NAs, not CM" print "\nTrain\n==========\n" print h2o_gbm.pp_cm(cm) # GBM test**************************************** predictKey = 'Predict.hex' h2o_cmd.runInspect(key=parseTestResult['destination_key']) start = time.time() gbmTestResult = h2o_cmd.runPredict( data_key=parseTestResult['destination_key'], model_key=modelKey, destination_key=predictKey, timeoutSecs=timeoutSecs) # hack if h2o.beta_features: h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs) elapsed = time.time() - start print "GBM predict completed in", elapsed, "seconds. On dataset: ", testFilename print "FIX! where do we get the summary info on the test data after predict?" h2o.beta_features = False