def exec_expr(node, execExpr, resultKey="Result.hex", timeoutSecs=10, ignoreH2oError=False):
    """Run an Exec expression on the given node, then inspect both the
    default "Result.hex" key and the assigned resultKey.

    Returns a tuple (inspect result for resultKey, its scalar value as
    reported by checkScalarResult).
    """
    t0 = time.time()
    # FIX! Exec has 'escape_nan' arg now. should we test?
    # 5/14/13 removed escape_nan=0
    execResult = h2o_cmd.runExecOnly(
        node,
        expression=execExpr,
        timeoutSecs=timeoutSecs,
        ignoreH2oError=ignoreH2oError)
    h2o.verboseprint(execResult)
    h2o.verboseprint('exec took', time.time() - t0, 'seconds')
    ### print 'exec took', time.time() - t0, 'seconds'

    h2o.verboseprint("\nfirst look at the default Result key")
    # new offset=-1 to get the metadata?
    defaultInspect = h2o_cmd.runInspect(None, "Result.hex", offset=-1)
    checkScalarResult(defaultInspect, "Result.hex")

    h2o.verboseprint("\nNow look at the assigned " + resultKey + " key")
    assignedInspect = h2o_cmd.runInspect(None, resultKey, offset=-1)
    scalarValue = checkScalarResult(assignedInspect, resultKey)
    return assignedInspect, scalarValue
def exec_expr(node, execExpr, resultKey="Result.hex", timeoutSecs=10):
    """Run an Exec expression (with escape_nan=0) on the given node, then
    inspect both the default "Result.hex" key and the assigned resultKey.

    Returns a tuple (inspect result for resultKey, its scalar value as
    reported by checkScalarResult).
    """
    t0 = time.time()
    # FIX! Exec has 'escape_nan' arg now. should we test?
    execResult = h2o_cmd.runExecOnly(
        node,
        expression=execExpr,
        timeoutSecs=timeoutSecs,
        escape_nan=0)
    h2o.verboseprint(execResult)
    h2o.verboseprint('exec took', time.time() - t0, 'seconds')
    ### print 'exec took', time.time() - t0, 'seconds'

    h2o.verboseprint("\nfirst look at the default Result key")
    # new offset=-1 to get the metadata?
    defaultInspect = h2o_cmd.runInspect(None, "Result.hex", offset=-1)
    checkScalarResult(defaultInspect, "Result.hex")

    h2o.verboseprint("\nNow look at the assigned " + resultKey + " key")
    assignedInspect = h2o_cmd.runInspect(None, resultKey, offset=-1)
    scalarValue = checkScalarResult(assignedInspect, resultKey)
    return assignedInspect, scalarValue
def test_loop_random_exec_covtype(self):
    """Parse covtype.data, then repeatedly fire randomized Exec expressions
    (templates from the module-level exprList) at random nodes, inspecting
    result keys from both a random node and nodes 0-2 to shake out races.
    """
    lenNodes = len(h2o.nodes)
    csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data')
    key2 = 'c.hex'
    parseKey = h2o_cmd.parseFile(None, csvPathname, 'covtype.data', key2, 10)
    print "\nParse key is:", parseKey['destination_key']
    h2b.browseTheCloud()
    # for trial in range(53):
    trial = 0
    # NOTE(review): trial is only checked at the top of the while, so the
    # inner for loop always runs exprList to completion; total trials can
    # exceed 100 by up to len(exprList)-1.
    while (trial < 100):
        for exprTemplate in exprList:
            trial = trial + 1
            n = trial
            # random column (covtype has 54 predictor columns) and row
            colX = random.randint(1, 54)
            row = random.randint(1, 400000)
            # substitute the randomized values into the expression template
            execExpr = exprTemplate
            execExpr = re.sub('<col1>', str(colX), execExpr)
            execExpr = re.sub('<col2>', str(colX + 1), execExpr)
            execExpr = re.sub('<n>', str(n), execExpr)
            execExpr = re.sub('<row>', str(row), execExpr)
            execExpr = re.sub('<keyX>', str(key2), execExpr)
            # pick a random node to execute it on
            randNode = random.randint(0, lenNodes - 1)
            print "\nexecExpr:", execExpr, "on node", randNode
            start = time.time()
            resultExec = h2o_cmd.runExecOnly(
                node=h2o.nodes[randNode], expression=execExpr, timeoutSecs=15)
            h2o.verboseprint(h2o.dump_json(resultExec))
            # print(h2o.dump_json(resultExec))
            # FIX! race conditions. If json is done, does that mean you can inspect it??
            # wait until the 2nd iteration, which will guarantee both Result1 and Result2 exist
            if trial > 1:
                inspectMe = random.choice(inspectList)
                # NOTE(review): nodes[0..2] are hard-coded here, even though
                # lenNodes was computed above — assumes a cloud of >= 3 nodes;
                # would IndexError on a smaller cloud. TODO confirm intent.
                resultInspect = h2o.nodes[0].inspect(inspectMe)
                h2o.verboseprint(h2o.dump_json(resultInspect))
                resultInspect = h2o.nodes[1].inspect(inspectMe)
                h2o.verboseprint(h2o.dump_json(resultInspect))
                resultInspect = h2o.nodes[2].inspect(inspectMe)
                h2o.verboseprint(h2o.dump_json(resultInspect))
            # FIX! if we race the browser doing the exec too..it shouldn't be a problem?
            # might be a bug?
            # WARNING! we can't browse the Exec url history, since that will
            # cause the Exec to execute again thru the browser..i.e. it has side effects
            # just look at the last inspect, which should be the resultInspect!
            # h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
            h2b.browseJsonHistoryAsUrlLastMatch("Exec")
            # url = "http://192.168.0.37:54321/Exec?Expr=Result3+%3D+c.hex%5B26%5D+%2B+Result1&Key=Result"
            # webbrowser.open_new_tab(url)
            # FIX! I suppose we have the problem of stdout/stderr not having flushed?
            # should hook in some way of flushing the remote node stdout/stderr
            h2o.check_sandbox_for_errors()
            print "exec end on ", "covtype.data", 'took', time.time() - start, 'seconds'
            print "Trial #", trial, "completed\n"
def test_rf_covtype20x(self):
    """Train RF on covtype20x (key A), score against a separately-parsed
    copy (key B), and run predictions against a third Exec-copied key (C)
    so scoring/prediction are not served from a cached result.

    Dispatches RF with noPoll=True and no_confusion_matrix=1 so the RFView
    time after job completion can be measured separately from training.
    """
    importFolderPath = '/home/0xdiag/datasets/standard'
    importFolderResult = h2i.setupImportFolder(None, importFolderPath)
    csvFilenameTrain = 'covtype20x.data'
    key2 = 'covtype20x.data.A.hex'
    parseKeyTrain = h2i.parseImportFolderFile(
        None, csvFilenameTrain, importFolderPath, key2=key2, timeoutSecs=500)
    print csvFilenameTrain, 'parse time:', parseKeyTrain['response']['time']
    inspect = h2o_cmd.runInspect(key=parseKeyTrain['destination_key'])
    dataKeyTrain = parseKeyTrain['destination_key']
    print "Parse end", dataKeyTrain

    # have to re import since source key is gone
    # we could just copy the key, but sometimes we change the test/train data to covtype.data
    importFolderResult = h2i.setupImportFolder(None, importFolderPath)
    csvFilenameTest = 'covtype20x.data'
    key2 = 'covtype20x.data.B.hex'
    parseKeyTest = h2i.parseImportFolderFile(
        None, csvFilenameTest, importFolderPath, key2=key2, timeoutSecs=500)
    print csvFilenameTest, 'parse time:', parseKeyTest['response']['time']
    print "Parse result['destination_key']:", parseKeyTest['destination_key']
    inspect = h2o_cmd.runInspect(key=parseKeyTest['destination_key'])
    dataKeyTest = parseKeyTest['destination_key']
    dataKeyTest2 = 'covtype20x.data.C.hex'
    print "Parse end", dataKeyTest

    # make a 3rd key so the predict is uncached too!
    execExpr = dataKeyTest2 + "=" + dataKeyTest
    resultExec = h2o_cmd.runExecOnly(expression=execExpr, timeoutSecs=15)

    # train
    # this does RFView to understand when RF completes, so the time reported for RFView here, should be
    # considered the "first RFView" times..subsequent have some caching?.
    # unless the no_confusion_matrix works
    # params is mutable. This is default.
    print "RF with no_confusion_matrix=1, so we can 'time' the RFView separately after job completion?"
    params = {
        'ntree': 6,
        'parallel': 1,
        'out_of_bag_error_estimate': 0,
        'no_confusion_matrix': 1,
        'model_key': 'RF_model'
    }
    # NOTE(review): paramDict is not defined in this method — presumably a
    # module-level dict of randomizable RF params; pickRandRfParams mutates
    # params in place. Verify against the rest of the file.
    colX = h2o_rf.pickRandRfParams(paramDict, params)
    kwargs = params.copy()
    # adjust timeoutSecs with the number of trees
    # seems ec2 can be really slow
    timeoutSecs = 30 + kwargs['ntree'] * 60 * (kwargs['parallel'] and 1 or 5)
    start = time.time()
    rfv = h2o_cmd.runRFOnly(
        parseKey=parseKeyTrain, timeoutSecs=timeoutSecs,
        retryDelaySecs=1, noPoll=True, **kwargs)
    print "rf job dispatch end on ", dataKeyTrain, 'took', time.time() - start, 'seconds'
    ### print "rf response:", h2o.dump_json(rfv)

    # wait for the async RF job to finish before timing RFView
    start = time.time()
    h2o_jobs.pollWaitJobs(
        pattern='RF_model', timeoutSecs=180, pollTimeoutSecs=500, retryDelaySecs=5)
    print "rf job end on ", dataKeyTrain, 'took', time.time() - start, 'seconds'

    print "\nRFView start after job completion"
    model_key = kwargs['model_key']
    ntree = kwargs['ntree']
    start = time.time()
    h2o_cmd.runRFView(None, dataKeyTrain, model_key, ntree, timeoutSecs)
    print "First rfview end on ", dataKeyTrain, 'took', time.time() - start, 'seconds'

    for trial in range(3):
        # scoring
        start = time.time()
        h2o_cmd.runRFView(
            None, dataKeyTest, model_key, ntree, timeoutSecs,
            out_of_bag_error_estimate=0, retryDelaySecs=1)
        print "rfview", trial, "end on ", dataKeyTest, 'took', time.time() - start, 'seconds.'
        # predict against the uncached third key
        start = time.time()
        predict = h2o.nodes[0].generate_predictions(
            model_key=model_key, data_key=dataKeyTest2)
        print "predict", trial, "end on ", dataKeyTest, 'took', time.time() - start, 'seconds.'
        print "Trial #", trial, "completed"
def test_loop_random_exec_covtype(self):
    """Parse covtype.data, then repeatedly fire randomized Exec expressions
    (templates from the module-level exprList) at random nodes, inspecting
    result keys from both a random node and nodes 0-2 to shake out races.
    """
    lenNodes = len(h2o.nodes)
    csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data')
    key2 = 'c.hex'
    parseKey = h2o_cmd.parseFile(None, csvPathname, 'covtype.data', key2, 10)
    print "\nParse key is:", parseKey['destination_key']
    h2b.browseTheCloud()
    # for trial in range(53):
    trial = 0
    # NOTE(review): trial is only checked at the top of the while, so the
    # inner for loop always runs exprList to completion; total trials can
    # exceed 100 by up to len(exprList)-1.
    while (trial < 100):
        for exprTemplate in exprList:
            trial = trial + 1
            n = trial
            # random column (covtype has 54 predictor columns) and row
            colX = random.randint(1,54)
            row = random.randint(1,400000)
            # substitute the randomized values into the expression template
            execExpr = exprTemplate
            execExpr = re.sub('<col1>',str(colX),execExpr)
            execExpr = re.sub('<col2>',str(colX+1),execExpr)
            execExpr = re.sub('<n>',str(n),execExpr)
            execExpr = re.sub('<row>',str(row),execExpr)
            execExpr = re.sub('<keyX>',str(key2),execExpr)
            # pick a random node to execute it on
            randNode = random.randint(0,lenNodes-1)
            print "\nexecExpr:", execExpr, "on node", randNode
            start = time.time()
            resultExec = h2o_cmd.runExecOnly(
                node=h2o.nodes[randNode], expression=execExpr, timeoutSecs=15)
            h2o.verboseprint(h2o.dump_json(resultExec))
            # print(h2o.dump_json(resultExec))
            # FIX! race conditions. If json is done, does that mean you can inspect it??
            # wait until the 2nd iteration, which will guarantee both Result1 and Result2 exist
            if trial > 1:
                inspectMe = random.choice(inspectList)
                # NOTE(review): nodes[0..2] are hard-coded here, even though
                # lenNodes was computed above — assumes a cloud of >= 3 nodes;
                # would IndexError on a smaller cloud. TODO confirm intent.
                resultInspect = h2o.nodes[0].inspect(inspectMe)
                h2o.verboseprint(h2o.dump_json(resultInspect))
                resultInspect = h2o.nodes[1].inspect(inspectMe)
                h2o.verboseprint(h2o.dump_json(resultInspect))
                resultInspect = h2o.nodes[2].inspect(inspectMe)
                h2o.verboseprint(h2o.dump_json(resultInspect))
            # FIX! if we race the browser doing the exec too..it shouldn't be a problem?
            # might be a bug?
            # WARNING! we can't browse the Exec url history, since that will
            # cause the Exec to execute again thru the browser..i.e. it has side effects
            # just look at the last inspect, which should be the resultInspect!
            # h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
            h2b.browseJsonHistoryAsUrlLastMatch("Exec")
            # url = "http://192.168.0.37:54321/Exec?Expr=Result3+%3D+c.hex%5B26%5D+%2B+Result1&Key=Result"
            # webbrowser.open_new_tab(url)
            # FIX! I suppose we have the problem of stdout/stderr not having flushed?
            # should hook in some way of flushing the remote node stdout/stderr
            h2o.check_sandbox_for_errors()
            print "exec end on ", "covtype.data" , 'took', time.time() - start, 'seconds'
            print "Trial #", trial, "completed\n"