def delete_keys_at_all_nodes(node=None, pattern=None, timeoutSecs=120):
    """Remove all keys in the cloud.

    Currently short-circuited: `pattern` is IGNORED and the function just calls
    remove_all_keys() on h2o.n0 (presumably the first/directed node -- confirm),
    always returning 0. The original one-key-at-a-time path is kept below as
    dead code behind the always-true `if 1==1`, apparently so it can be
    re-enabled later.

    :param node: node to direct requests to (only used by the dead branch;
        defaults to the first node)
    :param pattern: key-name filter (only used by the dead branch)
    :param timeoutSecs: per-request timeout for the dead delete_keys path
    :return: number of keys deleted (always 0 on the live path)
    """
    print "Frame is too slow to look up key names when a lot of unparsed files were imported"
    print "Just using remove_all_keys and saying 0 removed"
    print "WARNING: pattern is ignored"
    if 1==1:
        # live path: bulk removal, ignores locking, reports nothing deleted
        h2o.n0.remove_all_keys()
        return 0
    else:
        # DEAD CODE from here down (unreachable) -- the old per-key delete path
        print "Going to delete all keys one at a time (slower than 'remove all keys')"
        # TEMP: change this to remove_all_keys which ignores locking and removes keys?
        # getting problems when tests fail in multi-test-on-one-h2o-cluster runner*sh tests
        if not node: node = h2o_nodes.nodes[0]
        print "Will cancel any running jobs, because we can't unlock keys on running jobs"
        # I suppose if we used a pattern, we wouldn't have to worry about running jobs..oh well.
        h2o_jobs.cancelAllJobs()
        print "unlock all keys first to make sure broken keys get removed"
        node.unlock()
        totalDeletedCnt = 0
        deletedCnt = delete_keys(node, pattern=pattern, timeoutSecs=timeoutSecs)
        totalDeletedCnt += deletedCnt
        if pattern:
            print "Total: Deleted", totalDeletedCnt, "keys with filter=", pattern, "at", len(h2o_nodes.nodes), "nodes"
        else:
            print "Total: Deleted", totalDeletedCnt, "keys at", len(h2o_nodes.nodes), "nodes"
        # do a remove_all_keys to clean out any locked keys also (locked keys will complain above)
        # doesn't work if you remove job keys first, since it looks at the job list and gets confused
        ### node.remove_all_keys(timeoutSecs=timeoutSecs)
        return totalDeletedCnt
def delete_keys_at_all_nodes(node=None, pattern=None, timeoutSecs=120):
    """Delete keys (optionally filtered by `pattern`) through one directed node.

    Despite the name, only a single node is queried -- the comment history
    suggests storeview is global, so deleting through one node suffices.
    Running jobs are cancelled first (locked keys on running jobs can't be
    unlocked), then all keys are unlocked so broken/locked keys get removed too.

    :param node: node to direct requests to; defaults to h2o_nodes.nodes[0]
    :param pattern: key-name filter passed through to delete_keys()
    :param timeoutSecs: timeout passed through to delete_keys()
    :return: total number of keys deleted
    """
    print "Going to delete all keys one at a time (slower than 'remove all keys')"
    # TEMP: change this to remove_all_keys which ignores locking and removes keys?
    # getting problems when tests fail in multi-test-on-one-h2o-cluster runner*sh tests
    if not node: node = h2o_nodes.nodes[0]
    print "Will cancel any running jobs, because we can't unlock keys on running jobs"
    # I suppose if we used a pattern, we wouldn't have to worry about running jobs..oh well.
    h2o_jobs.cancelAllJobs()
    print "unlock all keys first to make sure broken keys get removed"
    node.unlock()
    totalDeletedCnt = 0
    deletedCnt = delete_keys(node, pattern=pattern, timeoutSecs=timeoutSecs)
    totalDeletedCnt += deletedCnt
    if pattern:
        print "Total: Deleted", totalDeletedCnt, "keys with filter=", pattern, "at", len(h2o_nodes.nodes), "nodes"
    else:
        print "Total: Deleted", totalDeletedCnt, "keys at", len(h2o_nodes.nodes), "nodes"
    # do a remove_all_keys to clean out any locked keys also (locked keys will complain above)
    # doesn't work if you remove job keys first, since it looks at the job list and gets confused
    ### node.remove_all_keys(timeoutSecs=timeoutSecs)
    return totalDeletedCnt
def test_1ktrees_job_cancel_many(self): SYNDATASETS_DIR = h2o.make_syn_dir() # always match the run below! # just using one file for now for x in [1000]: shCmdString = "perl " + h2o.find_file("syn_scripts/parity.pl") + " 128 4 "+ str(x) + " quad" h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(),4) csvFilename = "parity_128_4_" + str(x) + "_quad.data" csvFilename = "parity_128_4_" + str(1000) + "_quad.data" csvPathname = SYNDATASETS_DIR + '/' + csvFilename hex_key = csvFilename + ".hex" parseResult = h2o_cmd.parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30) print "Kick off twenty, then cancel them all..there's a timeout on the wait after cancelling" for trial in range (1,20): h2o.verboseprint("Trial", trial) start = time.time() h2o_cmd.runRF(parseResult=parseResult, trees=trial, depth=50, rfView=False, noPoll=True, timeoutSecs=600, retryDelaySecs=3) print "RF #", trial, "started on ", csvFilename, 'took', time.time() - start, 'seconds' h2o.check_sandbox_for_errors() h2o_jobs.cancelAllJobs(timeoutSecs=10)
def delete_keys_at_all_nodes(node=None, pattern=None, timeoutSecs=120):
    """Delete keys (optionally filtered by `pattern`) through one directed node.

    Cancels running jobs first (locked keys on running jobs can't be unlocked),
    unlocks everything so broken/locked keys get removed, then delegates the
    actual deletion to delete_keys() on the single directed node.

    :param node: node to direct requests to; defaults to h2o.nodes[0]
    :param pattern: key-name filter passed through to delete_keys()
    :param timeoutSecs: timeout passed through to delete_keys()
    :return: total number of keys deleted
    """
    print "Going to delete all keys one at a time (slower than 'remove all keys')"
    # TEMP: change this to remove_all_keys which ignores locking and removes keys?
    # getting problems when tests fail in multi-test-on-one-h2o-cluster runner*sh tests
    if not node: node = h2o.nodes[0]
    print "Will cancel any running jobs, because we can't unlock keys on running jobs"
    # I suppose if we used a pattern, we wouldn't have to worry about running jobs..oh well.
    h2o_jobs.cancelAllJobs()
    print "unlock all keys first to make sure broken keys get removed"
    node.unlock()
    totalDeletedCnt = 0
    # do it in reverse order, since we always talk to 0 for other stuff
    # this will be interesting if the others don't have a complete set
    # theoretically, the deletes should be 0 after the first node
    # since the deletes should be global
    # for node in reversed(h2o.nodes):
    # new: only use the directed node (node[0] typically)
    # h2o storeview should have a global view now.
    deletedCnt = delete_keys(node, pattern=pattern, timeoutSecs=timeoutSecs)
    totalDeletedCnt += deletedCnt
    if pattern:
        print "Total: Deleted", totalDeletedCnt, "keys with filter=", pattern, "at", len(h2o.nodes), "nodes"
    else:
        print "Total: Deleted", totalDeletedCnt, "keys at", len(h2o.nodes), "nodes"
    # do a remove_all_keys to clean out any locked keys also (locked keys will complain above)
    # doesn't work if you remove job keys first, since it looks at the job list and gets confused
    ### node.remove_all_keys(timeoutSecs=timeoutSecs)
    return totalDeletedCnt
def test_GBM_mnist_restart_many(self):
    """Repeatedly start a GBM on mnist train data, cancel it after 8s, repeat.

    Ten trials: parse train.csv.gz to a per-trial key, kick off GBM without
    polling, sleep 8 seconds (long enough for an early failure to surface),
    cancel all jobs, and check the sandbox for errors. Optionally deletes all
    keys afterward (DO_DELETE_KEYS_AND_CAUSE_PROBLEM is a module-level flag
    defined elsewhere -- the name suggests key deletion here is known-troublesome).
    """
    importFolderPath = "mnist"
    csvFilename = "train.csv.gz"
    timeoutSecs = 1800
    trialStart = time.time()
    for trial in range(10):
        # PARSE train****************************************
        # unique hex key per trial so parses don't collide
        trainKey = csvFilename + "_" + str(trial) + ".hex"
        start = time.time()
        parseResult = h2i.import_parse(bucket='smalldata', path=importFolderPath + "/" + csvFilename,
            schema='put', hex_key=trainKey, timeoutSecs=timeoutSecs)
        elapsed = time.time() - start
        print "parse end on ", csvFilename, 'took', elapsed, 'seconds',\
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
        print "parse result:", parseResult['destination_key']

        # GBM (train)****************************************
        params = {
            'destination_key': "GBMKEY",
            'learn_rate': .1,
            'ntrees': 10,
            'max_depth': 8,
            'min_rows': 1,
            'response': 784, # this dataset has the response in the last col (0-9 to check)
            # 'ignored_cols_by_name': range(200,784) # only use the first 200 for speed?
        }
        kwargs = params.copy()
        # beta_features toggled on only for the GBM call
        h2o.beta_features = True
        timeoutSecs = 1800
        #noPoll -> False when GBM finished
        GBMResult = h2o_cmd.runGBM(parseResult=parseResult, noPoll=True, **kwargs)
        h2o.beta_features = False
        # if it fails, should happen within 8 secs
        time.sleep(8)
        h2j.cancelAllJobs()
        h2o.check_sandbox_for_errors()
        print "Trial %s: GBM start didn't have any errors after 8 seconds. cancelled. Will delete all keys now." % trial
        if DO_DELETE_KEYS_AND_CAUSE_PROBLEM:
            h2i.delete_keys_at_all_nodes()
def test_GBM_mnist_restart_many(self):
    """Repeatedly start a GBM on mnist train data, cancel it after 8s, repeat.

    Ten trials: parse train.csv.gz to a per-trial key, kick off GBM without
    polling, sleep 8 seconds (long enough for an early failure to surface),
    cancel all jobs, and check the sandbox for errors. Optionally deletes all
    keys afterward (DO_DELETE_KEYS_AND_CAUSE_PROBLEM is a module-level flag
    defined elsewhere -- the name suggests key deletion here is known-troublesome).
    """
    importFolderPath = "mnist"
    csvFilename = "train.csv.gz"
    timeoutSecs=1800
    trialStart = time.time()
    for trial in range(10):
        # PARSE train****************************************
        # unique hex key per trial so parses don't collide
        trainKey = csvFilename + "_" + str(trial) + ".hex"
        start = time.time()
        parseResult = h2i.import_parse(bucket='smalldata', path=importFolderPath + "/" + csvFilename,
            schema='put', hex_key=trainKey, timeoutSecs=timeoutSecs)
        elapsed = time.time() - start
        print "parse end on ", csvFilename, 'took', elapsed, 'seconds',\
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
        print "parse result:", parseResult['destination_key']

        # GBM (train)****************************************
        params = {
            'destination_key': "GBMKEY",
            'learn_rate': .1,
            'ntrees': 10,
            'max_depth': 8,
            'min_rows': 1,
            'response': 784, # this dataset has the response in the last col (0-9 to check)
            # 'ignored_cols_by_name': range(200,784) # only use the first 200 for speed?
        }
        kwargs = params.copy()
        # beta_features toggled on only for the GBM call
        h2o.beta_features = True
        timeoutSecs = 1800
        #noPoll -> False when GBM finished
        GBMResult = h2o_cmd.runGBM(parseResult=parseResult, noPoll=True, **kwargs)
        h2o.beta_features = False
        # if it fails, should happen within 8 secs
        time.sleep(8)
        h2j.cancelAllJobs()
        h2o.check_sandbox_for_errors()
        print "Trial %s: GBM start didn't have any errors after 8 seconds. cancelled. Will delete all keys now." % trial
        if DO_DELETE_KEYS_AND_CAUSE_PROBLEM:
            h2i.delete_keys_at_all_nodes()
def test_rf_covtype_fvec(self):
    """Sweep one RF hyperparameter over covtype data, with start/cancel churn.

    For each value of the swept parameter (TRY selects max_depth / ntrees /
    nbins; TRY, paramDict, DO_OOBE, DO_PLOT are module-level globals defined
    elsewhere): start RF TRIES times without polling, cancel all but the last
    start, poll the surviving job to completion, run RFView, sanity-check the
    model stats, and record class-4 error and train time for an optional plot.
    The repeated start/cancel is deliberate ("do ten starts, to see the bad id
    problem?").
    """
    h2o.beta_features = True # fvec
    importFolderPath = "standard"

    # Parse Train ******************************************************
    csvTrainFilename = 'covtype.shuffled.90pct.data'
    csvTrainPathname = importFolderPath + "/" + csvTrainFilename
    hex_key = csvTrainFilename + ".hex"
    parseTrainResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvTrainPathname, hex_key=hex_key,
        timeoutSecs=180, doSummary=False)
    inspect = h2o_cmd.runInspect(None, parseTrainResult['destination_key'])

    # Parse Test ******************************************************
    csvTestFilename = 'covtype.shuffled.10pct.data'
    csvTestPathname = importFolderPath + "/" + csvTestFilename
    hex_key = csvTestFilename + ".hex"
    parseTestResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvTestPathname, hex_key=hex_key, timeoutSecs=180)
    inspect = h2o_cmd.runInspect(None, parseTestResult['destination_key'])

    rfViewInitial = []
    xList = []   # swept-parameter values (x axis for the plot)
    eList = []   # class-4 pct-wrong per sweep value
    fList = []   # train elapsed seconds per sweep value
    trial = 0
    depthList  = [10, 20, 30, 40]
    ntreesList = [5, 10, 20, 30]
    # ntreesList = [2]
    nbinsList  = [10, 100, 1000]

    # pick the sweep list according to the module-level TRY setting
    if TRY == 'max_depth':
        tryList = depthList
    elif TRY == 'ntrees':
        tryList = ntreesList
    elif TRY == 'nbins':
        tryList = nbinsList
    else:
        raise Exception("huh? %s" % TRY)

    for d in tryList:
        if TRY == 'max_depth':
            paramDict['max_depth'] = d
        elif TRY == 'ntrees':
            paramDict['ntrees'] = d
        elif TRY == 'nbins':
            paramDict['nbins'] = d
        else:
            raise Exception("huh? %s" % TRY)

        # adjust timeoutSecs with the number of trees
        # seems ec2 can be really slow
        if DO_OOBE:
            # out-of-bag error estimate: no separate validation set
            paramDict['validation'] = None
        else:
            paramDict['validation'] = parseTestResult['destination_key']
        timeoutSecs = 30 + paramDict['ntrees'] * 200

        # do ten starts, to see the bad id problem?
        TRIES = 5
        for i in range(TRIES):
            # cancel every start except the last one
            lastOne = i == (TRIES - 1)

            # have unique model names
            trial += 1
            kwargs = paramDict.copy()
            model_key = 'RFModel_' + str(trial)
            kwargs['destination_key'] = model_key
            data_key = parseTrainResult['destination_key']

            start = time.time()
            rfResult = h2o_cmd.runRF(parseResult=parseTrainResult, timeoutSecs=timeoutSecs, noPoll=True, rfView=False, **kwargs)
            trainElapsed = time.time() - start
            print 'rf train end', i, 'on', csvTrainPathname, 'took', trainElapsed, 'seconds'

            # don't cancel the last one
            if not lastOne:
                time.sleep(1)
                h2o_jobs.cancelAllJobs(timeoutSecs=2)

        ### print "rfView", h2o.dump_json(rfView)
        print "We have a result from the RF above, completed but didn't do RFView yet"
        # could the RF indicate 'done' too soon?
        # if rfResult['state']=='RUNNING':
        #     raise Exception("Why is this RF still in RUNNING state? %s" % h2o.dump_json(rfResult))
        # if 'drf_model' not in rfResult:
        #     raise Exception("How come there's no drf_model in this RF result? %s" % h2o.dump_json(rfResult))

        # wait for the last (uncancelled) RF job, then view its model
        h2o_jobs.pollWaitJobs(timeoutSecs=300)
        rfView = h2o_cmd.runRFView(None, model_key=model_key, timeoutSecs=60, retryDelaySecs=5, doSimpleCheck=False)
        print "rfView:", h2o.dump_json(rfView)

        rf_model = rfView['drf_model']
        cms = rf_model['cms']
        ### print "cm:", h2o.dump_json(cm)
        ntrees = rf_model['N']
        errs = rf_model['errs']
        N = rf_model['N']
        varimp = rf_model['varimp']
        treeStats = rf_model['treeStats']
        print "maxDepth:", treeStats['maxDepth']
        print "maxLeaves:", treeStats['maxLeaves']
        print "minDepth:", treeStats['minDepth']
        print "minLeaves:", treeStats['minLeaves']
        print "meanLeaves:", treeStats['meanLeaves']
        print "meanDepth:", treeStats['meanDepth']
        print "errs[0]:", errs[0]
        print "errs[-1]:", errs[-1]
        print "errs:", errs

        (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView)
        # we iterate over params, so can't really do this check
        # self.assertAlmostEqual(classification_error, 0.03, delta=0.5, msg="Classification error %s differs too much" % classification_error)
        print "classErrorPctList:", classErrorPctList
        self.assertEqual(
            len(classErrorPctList), 7,
            "Should be 7 output classes, so should have 7 class error percentages from a reasonable predict"
        )
        # FIX! should update this expected classification error
        predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key=data_key)

        eList.append(classErrorPctList[4])
        fList.append(trainElapsed)
        if DO_PLOT:
            if TRY == 'max_depth':
                xLabel = 'max_depth'
            elif TRY == 'ntrees':
                xLabel = 'ntrees'
            elif TRY == 'nbins':
                xLabel = 'nbins'
            else:
                raise Exception("huh? %s" % TRY)
            xList.append(paramDict[xLabel])

    if DO_PLOT:
        eLabel = 'class 4 pctWrong'
        fLabel = 'trainElapsed'
        eListTitle = ""
        fListTitle = ""
        h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
def test_GBM_cancel_model_reuse(self):
    """Start batches of background GBM jobs and cancel them, reusing key names.

    Parses an nflx file to 'c.hex', optionally flips the response column to a
    binary factor via exec (for classification), then in 5 rounds launches 5
    no-poll GBM jobs (each with its own 'GBMBad<j>' destination key, since a
    model key apparently can't be reused after a cancel) and cancels them all.
    DO_CLASSIFICATION and DELETE_KEYS are module-level flags defined elsewhere.
    """
    h2o.beta_features = True
    importFolderPath = 'standard'
    timeoutSecs = 500
    csvFilenameAll = [
        # have to use col name for response?
        ("manyfiles-nflx-gz", "file_1.dat.gz", 378),
        # ("manyfiles-nflx-gz", "file_[1-9].dat.gz", 378),
        # ("standard", "covtype.data", 54),
        # ("standard", "covtype20x.data", 54),
    ]
    # csvFilenameList = random.sample(csvFilenameAll,1)
    csvFilenameList = csvFilenameAll
    # pop open a browser on the cloud
    # h2b.browseTheCloud()

    for (importFolderPath, csvFilename, response) in csvFilenameList:
        # creates csvFilename.hex from file in importFolder dir
        csvPathname = importFolderPath + "/" + csvFilename
        print "FIX! is this guy getting cancelled because he's reusing a key name? but it should be okay?"
        (importResult, importPattern) = h2i.import_only(bucket='home-0xdiag-datasets', path=csvPathname,
            schema='local', timeoutSecs=50)
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='local',
            hex_key='c.hex', timeoutSecs=500, noPoll=False, doSummary=False)
        # can't do summary until parse result is correct json
        h2o.check_sandbox_for_errors()
        # wait for it to show up in jobs?
        ## time.sleep(2)
        # no pattern waits for all
        ## h2o_jobs.pollWaitJobs(pattern=None, timeoutSecs=300, pollTimeoutSecs=10, retryDelaySecs=5)
        # print "\nparseResult", h2o.dump_json(parseResult)
        print "Parse result['destination_key']:", parseResult['destination_key']
        ## What's wrong here? too big?
        ### inspect = h2o_cmd.runInspect(key=parseResult['destination_key'], timeoutSecs=30, verbose=True)
        h2o.check_sandbox_for_errors()

        # have to avoid this on nflx data. colswap with exec
        # Exception: rjson error in gbm: Argument 'response' error:
        #    Only integer or enum/factor columns can be classified
        if DO_CLASSIFICATION:
            # need to flip the right col! (R wise)
            # threshold >15 turns the response into a binary 0/1 column
            execExpr = 'c.hex[,%s]=c.hex[,%s]>15' % (response+1,response+1)
            kwargs = { 'str': execExpr }
            resultExec = h2o_cmd.runExec(**kwargs)

        # lets look at the response column now
        s = h2o_cmd.runSummary(key="c.hex", cols=response, max_ncols=1)
        # x = range(542)
        # remove the output too! (378)
        ignoreIndex = [3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 16, 17, 18, 19, 20, 424, 425, 426, 540, 541, response]
        # have to add 1 for col start with 1, now. plus the C
        xIgnore = ",".join(["C" + str(i+1) for i in ignoreIndex])

        params = {
            'destination_key': None,
            'ignored_cols_by_name': xIgnore,
            'learn_rate': .1,
            'ntrees': 2,
            'max_depth': 8,
            'min_rows': 1,
            'response': "C" + str(response+1),
            'classification': 1 if DO_CLASSIFICATION else 0,
            'grid_parallelism': 4,
        }
        kwargs = params.copy()
        timeoutSecs = 1800

        for i in range(5):
            # now issue a couple background GBM jobs that we'll kill
            jobids = []
            for j in range(5):
                # FIX! apparently we can't reuse a model key after a cancel
                kwargs['destination_key'] = 'GBMBad' + str(j)
                # rjson error in poll_url: Job was cancelled by user!
                GBMFirstResult = h2o_cmd.runGBM(parseResult=parseResult, noPoll=True, **kwargs)
                jobids.append(GBMFirstResult['job_key'])
                h2o.check_sandbox_for_errors()
            # have to pass the job id
            # for j in jobids:
            #     h2o.nodes[0].jobs_cancel(key=j)
            h2o_jobs.cancelAllJobs()
            # PUB-361. going to wait after cancel before reusing keys
            time.sleep(3)
            # am I getting a subsequent parse job cancelled?
            h2o_jobs.showAllJobs()

        if DELETE_KEYS:
            h2i.delete_keys_from_import_result(pattern=csvFilename, importResult=importResult)
def test_rf_covtype_fvec(self):
    """Sweep one SpeeDRF hyperparameter over covtype data, with start/cancel churn.

    Same shape as the plain-RF sweep: for each value of the swept parameter
    (TRY selects max_depth / ntrees / nbins; TRY, paramDict, DO_OOBE, DO_PLOT
    are module-level globals defined elsewhere), start SpeeDRF TRIES times
    without polling, cancel all but the last start, poll the surviving job,
    view the model (renaming 'speedrf_model' to 'drf_model' so the shared
    simpleCheckRFView helper works), sanity-check stats, and record class-4
    error and train time for an optional plot.
    """
    h2o.beta_features = True # fvec
    importFolderPath = "standard"

    # Parse Train ******************************************************
    csvTrainFilename = 'covtype.shuffled.90pct.data'
    csvTrainPathname = importFolderPath + "/" + csvTrainFilename
    hex_key = csvTrainFilename + ".hex"
    parseTrainResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvTrainPathname, hex_key=hex_key,
        timeoutSecs=180, doSummary=False)
    inspect = h2o_cmd.runInspect(None, parseTrainResult['destination_key'])

    # Parse Test ******************************************************
    csvTestFilename = 'covtype.shuffled.10pct.data'
    csvTestPathname = importFolderPath + "/" + csvTestFilename
    hex_key = csvTestFilename + ".hex"
    parseTestResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvTestPathname, hex_key=hex_key, timeoutSecs=180)
    inspect = h2o_cmd.runInspect(None, parseTestResult['destination_key'])

    rfViewInitial = []
    xList = []   # swept-parameter values (x axis for the plot)
    eList = []   # class-4 pct-wrong per sweep value
    fList = []   # train elapsed seconds per sweep value
    trial = 0
    depthList  = [10, 20, 30, 40]
    ntreesList = [5, 10, 20, 30]
    # ntreesList = [2]
    nbinsList  = [10, 100, 1000]

    # pick the sweep list according to the module-level TRY setting
    if TRY == 'max_depth':
        tryList = depthList
    elif TRY == 'ntrees':
        tryList = ntreesList
    elif TRY == 'nbins':
        tryList = nbinsList
    else:
        raise Exception("huh? %s" % TRY)

    for d in tryList:
        if TRY == 'max_depth':
            paramDict['max_depth'] = d
        elif TRY == 'ntrees':
            paramDict['ntrees'] = d
        elif TRY == 'nbins':
            paramDict['nbins'] = d
        else:
            raise Exception("huh? %s" % TRY)

        # adjust timeoutSecs with the number of trees
        # seems ec2 can be really slow
        if DO_OOBE:
            # out-of-bag error estimate: no separate validation set
            paramDict['validation'] = None
        else:
            paramDict['validation'] = parseTestResult['destination_key']
        timeoutSecs = 30 + paramDict['ntrees'] * 200

        # do ten starts, to see the bad id problem?
        TRIES = 5
        for i in range(TRIES):
            # cancel every start except the last one
            lastOne = i==(TRIES-1)

            # have unique model names
            trial += 1
            kwargs = paramDict.copy()
            model_key = 'RFModel_' + str(trial)
            kwargs['destination_key'] = model_key
            data_key = parseTrainResult['destination_key']

            start = time.time()
            rfResult = h2o_cmd.runSpeeDRF(parseResult=parseTrainResult, timeoutSecs=timeoutSecs, noPoll=True, **kwargs)
            trainElapsed = time.time() - start
            print 'rf train end', i, 'on', csvTrainPathname, 'took', trainElapsed, 'seconds'

            # don't cancel the last one
            if not lastOne:
                time.sleep(1)
                h2o_jobs.cancelAllJobs(timeoutSecs=2)

        ### print "rfView", h2o.dump_json(rfView)
        print "We have a result from the RF above, completed but didn't do RFView yet"
        # could the RF indicate 'done' too soon?
        # if rfResult['state']=='RUNNING':
        #     raise Exception("Why is this RF still in RUNNING state? %s" % h2o.dump_json(rfResult))
        # if 'drf_model' not in rfResult:
        #     raise Exception("How come there's no drf_model in this RF result? %s" % h2o.dump_json(rfResult))

        # wait for the last (uncancelled) SpeeDRF job, then view its model
        h2o_jobs.pollWaitJobs(timeoutSecs=300)
        rfView = h2o_cmd.runSpeeDRFView(None, model_key, timeoutSecs=60)
        print "rfView:", h2o.dump_json(rfView)

        # alias the SpeeDRF model under the DRF key so shared helpers below work
        rfView["drf_model"] = rfView.pop("speedrf_model")
        rf_model = rfView['drf_model']
        cms = rf_model['cms']
        ### print "cm:", h2o.dump_json(cm)
        ntrees = rf_model['N']
        errs = rf_model['errs']
        N = rf_model['N']
        varimp = rf_model['varimp']
        treeStats = rf_model['treeStats']
        print "maxDepth:", treeStats['maxDepth']
        print "maxLeaves:", treeStats['maxLeaves']
        print "minDepth:", treeStats['minDepth']
        print "minLeaves:", treeStats['minLeaves']
        print "meanLeaves:", treeStats['meanLeaves']
        print "meanDepth:", treeStats['meanDepth']
        print "errs[0]:", errs[0]
        print "errs[-1]:", errs[-1]
        print "errs:", errs

        (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView)
        # we iterate over params, so can't really do this check
        # self.assertAlmostEqual(classification_error, 0.03, delta=0.5, msg="Classification error %s differs too much" % classification_error)
        print "classErrorPctList:", classErrorPctList
        self.assertEqual(len(classErrorPctList), 7,
            "Should be 7 output classes, so should have 7 class error percentages from a reasonable predict")
        # FIX! should update this expected classification error
        predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key=data_key)

        eList.append(classErrorPctList[4])
        fList.append(trainElapsed)
        if DO_PLOT:
            if TRY == 'max_depth':
                xLabel = 'max_depth'
            elif TRY == 'ntrees':
                xLabel = 'ntrees'
            elif TRY == 'nbins':
                xLabel = 'nbins'
            else:
                raise Exception("huh? %s" % TRY)
            xList.append(paramDict[xLabel])

    if DO_PLOT:
        eLabel = 'class 4 pctWrong'
        fLabel = 'trainElapsed'
        eListTitle = ""
        fListTitle = ""
        h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)