def test_GBMGrid_basic_prostate(self): h2o.beta_features = True csvFilename = "prostate.csv" print "\nStarting", csvFilename # columns start at 0 csvPathname = 'logreg/' + csvFilename parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=csvFilename + ".hex", schema='put') colNames = [ 'ID', 'CAPSULE', 'AGE', 'RACE', 'DPROS', 'DCAPS', 'PSA', 'VOL', 'GLEASON' ] modelKey = 'GBMGrid_prostate' # 'cols', 'ignored_cols_by_name', and 'ignored_cols' have to be exclusive params = { 'destination_key': modelKey, 'ignored_cols_by_name': 'ID', 'learn_rate': '.1,.2', 'ntrees': '1:3:1', 'max_depth': '8,9', 'min_rows': '1:5:2', 'response': 'CAPSULE', 'classification': 1 if DO_CLASSIFICATION else 0, } kwargs = params.copy() timeoutSecs = 1800 start = time.time() GBMResult = h2o_cmd.runGBM(parseResult=parseResult, **kwargs) elapsed = time.time() - start print "GBM training completed in", elapsed, "seconds." h2o_gbm.showGBMGridResults(GBMResult, 15)
def test_GBMGrid_basic_prostate(self): csvFilename = "prostate.csv" print "\nStarting", csvFilename # columns start at 0 csvPathname = 'logreg/' + csvFilename parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=csvFilename + ".hex", schema='put') colNames = ['ID','CAPSULE','AGE','RACE','DPROS','DCAPS','PSA','VOL','GLEASON'] modelKey = 'GBMGrid_prostate' # 'cols', 'ignored_cols_by_name', and 'ignored_cols' have to be exclusive params = { 'destination_key': modelKey, 'ignored_cols_by_name': 'ID', 'learn_rate': '.1,.2', 'ntrees': '1:3:1', 'max_depth': '8,9', 'min_rows': '1:5:2', 'response': 'CAPSULE', 'classification': 1 if DO_CLASSIFICATION else 0, } kwargs = params.copy() timeoutSecs = 1800 start = time.time() GBMResult = h2o_cmd.runGBM(parseResult=parseResult, **kwargs) elapsed = time.time() - start print "GBM training completed in", elapsed, "seconds." h2o_gbm.showGBMGridResults(GBMResult, 15)
def test_GBMGrid_basic_benign(self): csvFilename = "benign.csv" print "\nStarting", csvFilename csvPathname = 'logreg/' + csvFilename parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=csvFilename + ".hex", schema='put') # columns start at 0 # cols 0-13. 3 is output # no member id in this one # check the first in the models list. It should be the best colNames = [ 'STR','OBS','AGMT','FNDX','HIGD','DEG','CHK', 'AGP1','AGMN','NLV','LIV','WT','AGLP','MST' ] modelKey = 'GBMGrid_benign' # 'cols', 'ignored_cols_by_name', and 'ignored_cols' have to be exclusive params = { 'destination_key': modelKey, 'ignored_cols_by_name': 'STR', 'learn_rate': '.1,.2,.25', 'ntrees': '3:5:1', 'max_depth': '5,7', 'min_rows': '1,2', 'response': 'FNDX', 'classification': 1 if DO_CLASSIFICATION else 0, } kwargs = params.copy() timeoutSecs = 1800 start = time.time() GBMResult = h2o_cmd.runGBM(parseResult=parseResult, **kwargs) elapsed = time.time() - start print "GBM training completed in", elapsed, "seconds." h2o_gbm.showGBMGridResults(GBMResult, 0)
def test_GBMGrid_basic_prostate(self): csvFilename = "prostate.csv" print "\nStarting", csvFilename # columns start at 0 csvPathname = "logreg/" + csvFilename parseResult = h2i.import_parse(bucket="smalldata", path=csvPathname, hex_key=csvFilename + ".hex", schema="put") colNames = ["ID", "CAPSULE", "AGE", "RACE", "DPROS", "DCAPS", "PSA", "VOL", "GLEASON"] modelKey = "GBMGrid_prostate" # 'cols', 'ignored_cols_by_name', and 'ignored_cols' have to be exclusive params = { "destination_key": modelKey, "ignored_cols_by_name": "ID", "learn_rate": ".1,.2", "ntrees": "1:3:1", "max_depth": "8,9", "min_rows": "1:5:2", "response": "CAPSULE", "classification": 1 if DO_CLASSIFICATION else 0, } kwargs = params.copy() timeoutSecs = 1800 start = time.time() GBMResult = h2o_cmd.runGBM(parseResult=parseResult, **kwargs) elapsed = time.time() - start print "GBM training completed in", elapsed, "seconds." h2o_gbm.showGBMGridResults(GBMResult, 10)
def test_GBMGrid_basic_many(self): h2o.beta_features = True csvFilename = "prostate.csv" print "\nStarting", csvFilename # columns start at 0 csvPathname = 'logreg/' + csvFilename parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=csvFilename + ".hex", schema='put') colNames = ['ID','CAPSULE','AGE','RACE','DPROS','DCAPS','PSA','VOL','GLEASON'] modelKey = 'GBMGrid_prostate' # 'cols', 'ignored_cols_by_name', and 'ignored_cols' have to be exclusive params = { 'destination_key': modelKey, 'ignored_cols_by_name': 'ID', 'learn_rate': '.1,.2', 'ntrees': '8,10', 'max_depth': '8,9', 'min_rows': '1,2', 'response': 'CAPSULE', 'classification': 1 if DO_CLASSIFICATION else 0, 'grid_parallelism': 1, } kwargs = params.copy() timeoutSecs = 1800 jobs = [] # kick off 5 of these GBM grid jobs, with different tree choices start = time.time() totalGBMGridJobs = 0 # for more in range(8): # fast # for more in range(9): for i in range(5): kwargs = params.copy() kwargs['min_rows'] = '1,2,3' if DO_FROM_TO_STEP: kwargs['max_depth'] = '5:10:1' else: kwargs['max_depth'] = '5,6,10' GBMResult = h2o_cmd.runGBM(parseResult=parseResult, noPoll=True, **kwargs) # print "GBMResult:", h2o.dump_json(GBMResult) job_key = GBMResult['job_key'] model_key = GBMResult['destination_key'] jobs.append( (job_key, model_key) ) totalGBMGridJobs += 1 h2o_jobs.pollWaitJobs(timeoutSecs=300) elapsed = time.time() - start for job_key, model_key in jobs: GBMResult = h2o.nodes[0].gbm_grid_view(job_key=job_key, destination_key=model_key) h2o_gbm.showGBMGridResults(GBMResult, 15) print "All GBM jobs completed in", elapsed, "seconds." print "totalGBMGridJobs:", totalGBMGridJobs
def test_GBMGrid_basic_many(self): csvFilename = "prostate.csv" print "\nStarting", csvFilename # columns start at 0 csvPathname = 'logreg/' + csvFilename parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=csvFilename + ".hex", schema='put') colNames = ['ID','CAPSULE','AGE','RACE','DPROS','DCAPS','PSA','VOL','GLEASON'] modelKey = 'GBMGrid_prostate' # 'cols', 'ignored_cols_by_name', and 'ignored_cols' have to be exclusive params = { 'destination_key': modelKey, 'ignored_cols_by_name': 'ID', 'learn_rate': '.1,.2', 'ntrees': '8,10', 'max_depth': '8,9', 'min_rows': '1,2', 'response': 'CAPSULE', 'classification': 1 if DO_CLASSIFICATION else 0, 'grid_parallelism': 1, } kwargs = params.copy() timeoutSecs = 1800 jobs = [] # kick off 5 of these GBM grid jobs, with different tree choices start = time.time() totalGBMGridJobs = 0 # for more in range(8): # fast # for more in range(9): for i in range(50 if DO_FAIL_CASE else 10): kwargs = params.copy() kwargs['min_rows'] = '1,2,3' if DO_FROM_TO_STEP: kwargs['max_depth'] = '5:10:1' else: kwargs['max_depth'] = '5,6,10' GBMResult = h2o_cmd.runGBM(parseResult=parseResult, noPoll=True, **kwargs) # print "GBMResult:", h2o.dump_json(GBMResult) job_key = GBMResult['job_key'] model_key = GBMResult['destination_key'] jobs.append( (job_key, model_key) ) totalGBMGridJobs += 1 h2o_jobs.pollWaitJobs(timeoutSecs=300) elapsed = time.time() - start for job_key, model_key in jobs: GBMResult = h2o.nodes[0].gbm_grid_view(job_key=job_key, destination_key=model_key) h2o_gbm.showGBMGridResults(GBMResult, 15) print "All GBM jobs completed in", elapsed, "seconds." print "totalGBMGridJobs:", totalGBMGridJobs
def test_GBMGrid_basic_benign(self): csvFilename = "benign.csv" print "\nStarting", csvFilename csvPathname = "logreg/" + csvFilename parseResult = h2i.import_parse(bucket="smalldata", path=csvPathname, hex_key=csvFilename + ".hex", schema="put") # columns start at 0 # cols 0-13. 3 is output # no member id in this one # check the first in the models list. It should be the best colNames = [ "STR", "OBS", "AGMT", "FNDX", "HIGD", "DEG", "CHK", "AGP1", "AGMN", "NLV", "LIV", "WT", "AGLP", "MST", ] modelKey = "GBMGrid_benign" # 'cols', 'ignored_cols_by_name', and 'ignored_cols' have to be exclusive params = { "destination_key": modelKey, "ignored_cols_by_name": "STR", "learn_rate": ".1,.2,.25", "ntrees": "3:5:1", "max_depth": "5,7", "min_rows": "1,2", "response": "FNDX", "classification": 1 if DO_CLASSIFICATION else 0, } kwargs = params.copy() timeoutSecs = 1800 start = time.time() GBMResult = h2o_cmd.runGBM(parseResult=parseResult, **kwargs) elapsed = time.time() - start print "GBM training completed in", elapsed, "seconds." h2o_gbm.showGBMGridResults(GBMResult, 0)