def exec_expr(node=None, execExpr=None, resultKey=None, timeoutSecs=10, ignoreH2oError=False):
    if not node:
        node = h2o.nodes[0]
    start = time.time()
    # FIX! Exec has 'escape_nan' arg now. should we test?
    # 5/14/13 removed escape_nan=0
    kwargs = {'str': execExpr}
    resultExec = h2o_cmd.runExec(node, timeoutSecs=timeoutSecs, ignoreH2oError=ignoreH2oError, **kwargs)
    h2o.verboseprint('exec took', time.time() - start, 'seconds')
    h2o.verboseprint(resultExec)

    # inspect a result key?
    if resultKey is not None:
        kwargs = {'str': resultKey}
        resultExec2 = h2o_cmd.runExec(node, timeoutSecs=timeoutSecs, ignoreH2oError=ignoreH2oError, **kwargs)
        h2o.verboseprint("resultExec2:", h2o.dump_json(resultExec2))
        # maybe return 'scalar' in some cases?
        return resultExec2, resultExec2['cols'][0]['min']
    else:
        if 'scalar' in resultExec:
            result = resultExec['scalar']
        elif 'result' in resultExec:
            result = resultExec['result']
        else:
            result = None
        return resultExec, result
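A minimal usage sketch for the helper above. Assumptions: a cloud was already built with h2o.build_cloud() and a frame was parsed to the key 'covtype.hex'; the expression and key names are illustrative, not from the original tests.

# Hypothetical usage of exec_expr: run an exec2 expression, read back the result.
execExpr = 'Result1 = mean(covtype.hex[,1])'
(resultExec, result) = exec_expr(node=h2o.nodes[0], execExpr=execExpr, timeoutSecs=30)
print 'mean of col 1:', result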
def test_GLM_covtype(self):
    csvFilename = 'covtype.data'
    csvPathname = 'standard/' + csvFilename
    hex_key = 'covtype.hex'
    parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='put',
        hex_key=hex_key, timeoutSecs=10)

    inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
    print "\n" + csvPathname, \
        " num_rows:", "{:,}".format(inspect['num_rows']), \
        " num_cols:", "{:,}".format(inspect['num_cols'])

    print "WARNING: max_iter set to 8 for benchmark comparisons"
    max_iter = 8
    y = "54"
    x = ""

    print "Touching it with exec to trigger va to fvec (covtype.hex), and then fvec to va (covtype2.hex)"
    h2o_cmd.runExec(str='%s=%s' % ('covtype2.hex', hex_key))
    # hack to use the new one
    parseResult['destination_key'] = 'covtype2.hex'

    # L2
    kwargs = {
        'x': x,
        'y': y,
        'family': 'binomial',
        'link': 'logit',
        'n_folds': 0,
        'case_mode': '=',
        'case': 1,
        'max_iter': max_iter,
        'beta_epsilon': 1e-3}

    timeoutSecs = 120
    start = time.time()
    kwargs.update({'alpha': 0, 'lambda': 0})
    glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
    print "glm (L2) end on ", csvPathname, 'took', time.time() - start, 'seconds'
    h2o_glm.simpleCheckGLM(self, glm, 'C14', **kwargs)

    # Elastic
    kwargs.update({'alpha': 0.5, 'lambda': 1e-4})
    start = time.time()
    glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
    print "glm (Elastic) end on ", csvPathname, 'took', time.time() - start, 'seconds'
    h2o_glm.simpleCheckGLM(self, glm, 'C14', **kwargs)

    # L1
    kwargs.update({'alpha': 1, 'lambda': 1e-4})
    start = time.time()
    glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
    print "glm (L1) end on ", csvPathname, 'took', time.time() - start, 'seconds'
    h2o_glm.simpleCheckGLM(self, glm, 'C14', **kwargs)
def exec_expr(node=None, execExpr=None, resultKey=None, timeoutSecs=10, ignoreH2oError=False):
    if not node:
        node = h2o_nodes.nodes[0]

    kwargs = {'ast': execExpr}
    start = time.time()
    resultExec = h2o_cmd.runExec(node, timeoutSecs=timeoutSecs, ignoreH2oError=ignoreH2oError, **kwargs)
    verboseprint('exec took', time.time() - start, 'seconds')
    print "exec:", dump_json(resultExec)

    # when do I get cols?

    # "result": "1.0351050710011848E-300",
    # "scalar": 1.0351050710011848e-300,
    # "funstr": null,
    # "key": null,
    # "col_names": null,
    # "num_cols": 0,
    # "num_rows": 0,
    # "exception": null,
    # echoing?
    # "string": null
    # "funs": null,
    # "ast": "(= !x (xorsum ([ $r1 \"null\" #0) $TRUE))",
    if 'cols' in resultExec and resultExec['cols']:  # not null
        if 'funstr' in resultExec and resultExec['funstr']:  # not null
            raise Exception("cols and funstr shouldn't both be in resultExec: %s" % dump_json(resultExec))
        else:
            print "Frame return"
            # if test said to look at a resultKey, it should be in the h2o k/v store
            # inspect a result key?
            # Should we get the key name from the exec return?
            if resultKey is not None:
                kwargs = {'ast': resultKey}
                resultExec = h2o_cmd.runExec(node, timeoutSecs=timeoutSecs, ignoreH2oError=ignoreH2oError, **kwargs)
                print "exec key result:", dump_json(resultExec)
            # handles the 1x1 data frame result. Not really interesting if bigger than 1x1?
            result = resultExec['cols'][0]['min']
    else:
        if 'funstr' in resultExec and resultExec['funstr']:  # not null
            print "function return"
            result = resultExec['funstr']
        else:
            print "scalar return"
            result = resultExec['scalar']

    return resultExec, result
def test_GLM2_covtype_exec(self):
    h2o.beta_features = True
    csvFilename = 'covtype.data'
    csvPathname = 'standard/' + csvFilename
    hex_key = 'covtype.hex'
    parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='put',
        hex_key=hex_key, timeoutSecs=30)

    inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
    print "\n" + csvPathname, \
        " numRows:", "{:,}".format(inspect['numRows']), \
        " numCols:", "{:,}".format(inspect['numCols'])

    print "WARNING: max_iter set to 8 for benchmark comparisons"
    max_iter = 8
    y = "54"
    h2o_cmd.runExec(str='%s[,55] = %s[,55]==1' % (hex_key, hex_key))

    # L2
    kwargs = {
        'response': y,
        'family': 'binomial',
        'n_folds': 0,
        'max_iter': max_iter,
        'beta_epsilon': 1e-3}

    timeoutSecs = 120
    start = time.time()
    kwargs.update({'alpha': 0, 'lambda': 0})
    glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
    print "glm (L2) end on ", csvPathname, 'took', time.time() - start, 'seconds'
    h2o_glm.simpleCheckGLM(self, glm, 'C14', **kwargs)

    # Elastic
    kwargs.update({'alpha': 0.5, 'lambda': 1e-4})
    start = time.time()
    glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
    print "glm (Elastic) end on ", csvPathname, 'took', time.time() - start, 'seconds'
    h2o_glm.simpleCheckGLM(self, glm, 'C14', **kwargs)

    # L1
    kwargs.update({'alpha': 1, 'lambda': 1e-4})
    start = time.time()
    glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
    print "glm (L1) end on ", csvPathname, 'took', time.time() - start, 'seconds'
    h2o_glm.simpleCheckGLM(self, glm, 'C14', **kwargs)
def exec_expr(node=None, execExpr=None, resultKey=None, timeoutSecs=10, ignoreH2oError=False, **kwargs):
    if not node:
        node = h2o_nodes.nodes[0]

    start = time.time()
    # FIX! Exec has 'escape_nan' arg now. should we test?
    # 5/14/13 removed escape_nan=0
    kwargs = {'str': execExpr}
    resultExec = h2o_cmd.runExec(node, timeoutSecs=timeoutSecs, ignoreH2oError=ignoreH2oError, **kwargs)
    verboseprint('exec took', time.time() - start, 'seconds')
    verboseprint(resultExec)

    if 'cols' in resultExec and resultExec['cols']:  # not null
        if 'funstr' in resultExec and resultExec['funstr']:  # not null
            raise Exception("cols and funstr shouldn't both be in resultExec: %s" % dump_json(resultExec))
        else:
            print "Frame return"
            # if test said to look at a resultKey, it should be in the h2o k/v store
            # inspect a result key?
            if resultKey is not None:
                kwargs = {'str': resultKey}
                resultExec = h2o_cmd.runExec(node, timeoutSecs=timeoutSecs, ignoreH2oError=ignoreH2oError, **kwargs)
                verboseprint("resultExec2:", dump_json(resultExec))
            # handles the 1x1 data frame result. Not really interesting if bigger than 1x1?
            result = resultExec['cols'][0]['min']
    else:
        if 'funstr' in resultExec and resultExec['funstr']:  # not null
            print "function return"
            result = resultExec['funstr']
        else:
            ### print "scalar return"
            result = resultExec['scalar']

    return resultExec, result
def test_exec2_frame_fail(self):
    csvPathname = 'standard/covtype.data'
    parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='put',
        hex_key='c.hex', timeoutSecs=15)
    print "\nParse key is:", parseResult['destination_key']

    start = time.time()
    execExpr = 'Result2=c.hex[,9]'
    resultExec = h2o_cmd.runExec(str=execExpr, timeoutSecs=60)
    h2o.check_sandbox_for_errors()

    execExpr = 'Result2[,1]=(c.hex[,2]==0) ? 54321 : 54321'
    resultExec = h2o_cmd.runExec(str=execExpr, timeoutSecs=60)
    h2o.check_sandbox_for_errors()

    print "exec end on ", "covtype.data", 'took', time.time() - start, 'seconds'
def test_exec2_frame_fail(self):
    h2o.beta_features = True
    csvPathname = 'standard/covtype.data'
    parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='put',
        hex_key='c.hex', timeoutSecs=15)
    print "\nParse key is:", parseResult['destination_key']

    start = time.time()
    execExpr = 'Result2=c.hex[,9]'
    resultExec = h2o_cmd.runExec(str=execExpr, timeoutSecs=60)
    h2o.check_sandbox_for_errors()

    execExpr = 'Result2[,1]=(c.hex[,2]==0) ? 54321 : 54321'
    resultExec = h2o_cmd.runExec(str=execExpr, timeoutSecs=60)
    h2o.check_sandbox_for_errors()

    print "exec end on ", "covtype.data", 'took', time.time() - start, 'seconds'
def test_json_browse_both_exec(self):
    lenNodes = len(h2o.nodes)
    csvPathname = 'standard/covtype.data'
    hex_key = 'c.hex'
    parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='put',
        hex_key=hex_key, timeoutSecs=10)
    print "\nParse key is:", parseResult['destination_key']
    ## h2b.browseTheCloud()

    # for trial in range(53):
    trial = 0
    while (trial < 100):
        for exprTemplate in exprList:
            trial = trial + 1
            n = trial
            colX = random.randint(1, 54)
            row = random.randint(1, 400000)

            execExpr = exprTemplate
            execExpr = re.sub('<col1>', str(colX), execExpr)
            execExpr = re.sub('<col2>', str(colX + 1), execExpr)
            execExpr = re.sub('<n>', str(n), execExpr)
            execExpr = re.sub('<row>', str(row), execExpr)
            execExpr = re.sub('<keyX>', str(hex_key), execExpr)

            # pick a random node to execute it on
            randNode = random.randint(0, lenNodes - 1)
            print "\nexecExpr:", execExpr, "on node", randNode

            start = time.time()
            resultExec = h2o_cmd.runExec(node=h2o.nodes[randNode], execExpr=execExpr, timeoutSecs=15)
            h2o.verboseprint(h2o.dump_json(resultExec))
            # print(h2o.dump_json(resultExec))

            # FIX! race conditions. If json is done, does that mean you can inspect it??
            # wait until the 2nd iteration, which will guarantee both Result1 and Result2 exist
            if trial > 1:
                inspectMe = random.choice(inspectList)
                resultInspect = h2o.nodes[0].inspect(inspectMe)
                h2o.verboseprint(h2o.dump_json(resultInspect))

                resultInspect = h2o.nodes[1].inspect(inspectMe)
                h2o.verboseprint(h2o.dump_json(resultInspect))

                resultInspect = h2o.nodes[2].inspect(inspectMe)
                h2o.verboseprint(h2o.dump_json(resultInspect))

            # FIX! if we race the browser doing the exec too..it shouldn't be a problem?
            # might be a bug?

            # WARNING! we can't browse the Exec url history, since that will
            # cause the Exec to execute again thru the browser..i.e. it has side effects
            # just look at the last inspect, which should be the resultInspect!
            # h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
            h2b.browseJsonHistoryAsUrlLastMatch("Exec")

            h2o.check_sandbox_for_errors()
            print "exec end on ", "covtype.data", 'took', time.time() - start, 'seconds'
            print "Trial #", trial, "completed\n"
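The exprList and inspectList used above are module-level in the original test and not shown here. A plausible sketch of their shape, inferred from the <n>/<col1>/<col2>/<row>/<keyX> placeholders the loop substitutes; the exact expressions are assumptions, not the originals.

# Hypothetical shapes for the module-level lists the test iterates over.
exprList = [
    'Result<n> = <keyX>[,<col1>] + <keyX>[,<col2>]',
    'Result<n> = min(<keyX>[,<col1>])',
    'Result<n> = <keyX>[<row>,<col1>]',
]
inspectList = ['Result1', 'Result2']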
def exec_expr(node=None, execExpr=None, resultKey=None, timeoutSecs=10, ignoreH2oError=False):
    if not node:
        node = h2o.nodes[0]

    start = time.time()
    # FIX! Exec has 'escape_nan' arg now. should we test?
    # 5/14/13 removed escape_nan=0
    kwargs = {'str': execExpr}
    resultExec = h2o_cmd.runExec(node, timeoutSecs=timeoutSecs, ignoreH2oError=ignoreH2oError, **kwargs)
    h2o.verboseprint('exec took', time.time() - start, 'seconds')
    h2o.verboseprint(resultExec)

    if 'cols' in resultExec and resultExec['cols']:  # not null
        if 'funstr' in resultExec and resultExec['funstr']:  # not null
            raise Exception("cols and funstr shouldn't both be in resultExec: %s" % h2o.dump_json(resultExec))
        else:
            # Frame
            # if test said to look at a resultKey, it should be in the h2o k/v store
            # inspect a result key?
            if resultKey is not None:
                kwargs = {'str': resultKey}
                resultExec = h2o_cmd.runExec(node, timeoutSecs=timeoutSecs, ignoreH2oError=ignoreH2oError, **kwargs)
                h2o.verboseprint("resultExec2:", h2o.dump_json(resultExec))
            # handles the 1x1 data frame result. Not really interesting if bigger than 1x1?
            result = resultExec['cols'][0]['min']
    else:
        if 'funstr' in resultExec and resultExec['funstr']:  # not null
            # function return
            result = resultExec['funstr']
        else:
            # scalar
            result = resultExec['scalar']

    return resultExec, result
def test_rf_covtype20x(self):
    importFolderPath = 'standard'
    csvFilenameTrain = 'covtype20x.data'
    csvPathname = importFolderPath + "/" + csvFilenameTrain
    hex_key = 'covtype20x.data.A.hex'
    parseResultTrain = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, hex_key=hex_key, timeoutSecs=500)
    print csvFilenameTrain, 'parse time:', parseResultTrain['response']['time']
    inspect = h2o_cmd.runInspect(key=parseResultTrain['destination_key'])
    dataKeyTrain = parseResultTrain['destination_key']
    print "Parse end", dataKeyTrain

    # have to re import since source key is gone
    # we could just copy the key, but sometimes we change the test/train data to covtype.data
    csvFilenameTest = 'covtype20x.data'
    csvPathname = importFolderPath + "/" + csvFilenameTest
    hex_key = 'covtype20x.data.B.hex'
    parseResultTest = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, hex_key=hex_key, timeoutSecs=500)
    print csvFilenameTest, 'parse time:', parseResultTest['response']['time']
    print "Parse result['destination_key']:", parseResultTest['destination_key']
    inspect = h2o_cmd.runInspect(key=parseResultTest['destination_key'])
    dataKeyTest = parseResultTest['destination_key']
    dataKeyTest2 = 'covtype20x.data.C.hex'
    print "Parse end", dataKeyTest

    # make a 3rd key so the predict is uncached too!
    execExpr = dataKeyTest2 + "=" + dataKeyTest
    resultExec = h2o_cmd.runExec(expression=execExpr, timeoutSecs=15)

    # train
    # this does RFView to understand when RF completes, so the time reported for RFView here, should be
    # considered the "first RFView" times..subsequent have some caching?.
    # unless the no_confusion_matrix works

    # params is mutable. This is default.
    print "RF with no_confusion_matrix=1, so we can 'time' the RFView separately after job completion?"
    params = {
        'ntree': 6,
        'parallel': 1,
        'out_of_bag_error_estimate': 0,
        # Causes rest api illegal argument error.
        # 'no_confusion_matrix': 1,
        'model_key': 'RF_model'
    }
    colX = h2o_rf.pickRandRfParams(paramDict, params)
    kwargs = params.copy()
    # adjust timeoutSecs with the number of trees
    # seems ec2 can be really slow
    timeoutSecs = 30 + kwargs['ntree'] * 60 * (kwargs['parallel'] and 1 or 5)

    start = time.time()
    rfv = h2o_cmd.runRF(parseResult=parseResultTrain,
        timeoutSecs=timeoutSecs, retryDelaySecs=1, noPoll=True, **kwargs)
    print "rf job dispatch end on ", dataKeyTrain, 'took', time.time() - start, 'seconds'
    ### print "rf response:", h2o.dump_json(rfv)

    start = time.time()
    h2o_jobs.pollWaitJobs(pattern='RF_model', timeoutSecs=300, pollTimeoutSecs=500, retryDelaySecs=5)
    print "rf job end on ", dataKeyTrain, 'took', time.time() - start, 'seconds'

    print "\nRFView start after job completion"
    model_key = kwargs['model_key']
    ntree = kwargs['ntree']
    start = time.time()
    h2o_cmd.runRFView(None, dataKeyTrain, model_key, ntree, timeoutSecs)
    print "First rfview end on ", dataKeyTrain, 'took', time.time() - start, 'seconds'

    for trial in range(3):
        # scoring
        start = time.time()
        rfView = h2o_cmd.runRFView(None, dataKeyTest, model_key, ntree, timeoutSecs,
            out_of_bag_error_estimate=0, retryDelaySecs=1)
        print "rfview", trial, "end on ", dataKeyTest, 'took', time.time() - start, 'seconds.'

        # FIX! should update this expected classification error
        (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView, ntree=ntree)
        self.assertAlmostEqual(classification_error, 0.03, delta=0.5,
            msg="Classification error %s differs too much" % classification_error)

        start = time.time()
        predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key=dataKeyTest2)
        print "predict", trial, "end on ", dataKeyTest, 'took', time.time() - start, 'seconds.'
        print "Trial #", trial, "completed"
def test_rf_covtype20x_fvec(self):
    h2o.beta_features = True
    importFolderPath = 'standard'

    if DO_SMALL:
        csvFilenameTrain = 'covtype.data'
        hex_key = 'covtype1x.data.A.hex'
    else:
        csvFilenameTrain = 'covtype20x.data'
        hex_key = 'covtype20x.data.A.hex'

    csvPathname = importFolderPath + "/" + csvFilenameTrain
    parseResultTrain = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, hex_key=hex_key, timeoutSecs=500)
    inspect = h2o_cmd.runInspect(key=parseResultTrain['destination_key'])
    dataKeyTrain = parseResultTrain['destination_key']
    print "Parse end", dataKeyTrain

    # have to re import since source key is gone
    # we could just copy the key, but sometimes we change the test/train data to covtype.data
    if DO_SMALL:
        csvFilenameTest = 'covtype.data'
        hex_key = 'covtype1x.data.B.hex'
        dataKeyTest2 = 'covtype1x.data.C.hex'
    else:
        csvFilenameTest = 'covtype20x.data'
        hex_key = 'covtype20x.data.B.hex'
        dataKeyTest2 = 'covtype20x.data.C.hex'

    csvPathname = importFolderPath + "/" + csvFilenameTest
    parseResultTest = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, hex_key=hex_key, timeoutSecs=500)
    print "Parse result['destination_key']:", parseResultTest['destination_key']
    inspect = h2o_cmd.runInspect(key=parseResultTest['destination_key'])
    dataKeyTest = parseResultTest['destination_key']
    print "Parse end", dataKeyTest

    # make a 3rd key so the predict is uncached too!
    execExpr = dataKeyTest2 + "=" + dataKeyTest
    kwargs = {'str': execExpr, 'timeoutSecs': 15}
    resultExec = h2o_cmd.runExec(**kwargs)

    # train
    # this does RFView to understand when RF completes, so the time reported for RFView here, should be
    # considered the "first RFView" times..subsequent have some caching?.
    # unless the no_confusion_matrix works

    # params is mutable. This is default.
    paramDict = drf2ParamDict
    params = {'ntrees': 20, 'destination_key': 'RF_model'}
    colX = h2o_rf.pickRandRfParams(paramDict, params)
    kwargs = params.copy()
    timeoutSecs = 30 + kwargs['ntrees'] * 60

    start = time.time()
    rf = h2o_cmd.runRF(parseResult=parseResultTrain,
        timeoutSecs=timeoutSecs, retryDelaySecs=1, **kwargs)
    print "rf job end on ", dataKeyTrain, 'took', time.time() - start, 'seconds'

    print "\nRFView start after job completion"
    model_key = kwargs['destination_key']
    ntree = kwargs['ntrees']
    start = time.time()
    # this does the RFModel view for v2. but only model_key is used. Data doesn't matter? (nor ntree)
    h2o_cmd.runRFView(None, dataKeyTrain, model_key, ntree=ntree, timeoutSecs=timeoutSecs)
    print "First rfview end on ", dataKeyTrain, 'took', time.time() - start, 'seconds'

    for trial in range(1):
        # scoring
        start = time.time()
        rfView = h2o_cmd.runRFView(None, dataKeyTest, model_key, ntree=ntree, timeoutSecs=timeoutSecs,
            out_of_bag_error_estimate=0, retryDelaySecs=1)
        print "rfview", trial, "end on ", dataKeyTest, 'took', time.time() - start, 'seconds.'

        (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView, ntree=ntree)
        self.assertAlmostEqual(classification_error, 50, delta=50,
            msg="Classification error %s differs too much" % classification_error)

        start = time.time()
        predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key=dataKeyTest2)
        print "predict", trial, "end on ", dataKeyTest, 'took', time.time() - start, 'seconds.'

        parseKey = parseResultTrain['destination_key']
        rfModelKey = rfView['drf_model']['_key']
        predictKey = 'Predict.hex'
        start = time.time()
        predictResult = h2o_cmd.runPredict(data_key=parseKey, model_key=rfModelKey,
            destination_key=predictKey, timeoutSecs=timeoutSecs)

        predictCMResult = h2o.nodes[0].predict_confusion_matrix(
            actual=parseKey,
            vactual='C55',
            predict=predictKey,
            vpredict='predict',
        )
        cm = predictCMResult['cm']

        # These will move into the h2o_gbm.py
        pctWrong = h2o_gbm.pp_cm_summary(cm)
        print "\nTest\n==========\n"
        print h2o_gbm.pp_cm(cm)
        print "Trial #", trial, "completed"
def test_GBM_manyfiles_multijob(self):
    h2o.beta_features = True
    bucket = 'home-0xdiag-datasets'
    modelKey = 'GBMModelKey'
    if localhost:
        files = [
            # None forces numCols to be used. assumes you set it from Inspect
            # problems with categoricals not in the train data set? (warnings in h2o stdout)
            ## ('manyfiles-nflx-gz', 'file_1.dat.gz', 'file_1.hex', 1800, None, 'file_11.dat.gz', 'test.hex')
            # just use matching
            ('manyfiles-nflx-gz', 'file_1.dat.gz', 'train.hex', 1800, None, 'file_1.dat.gz', 'test.hex')
        ]
    else:
        files = [
            # None forces numCols to be used. assumes you set it from Inspect
            ('manyfiles-nflx-gz', 'file_[0-9].dat.gz', 'train.hex', 1800, None, 'file_1[0-9].dat.gz', 'test.hex')
        ]

    # if I got to hdfs, it's here
    # hdfs://192.168.1.176/datasets/manyfiles-nflx-gz/file_99.dat.gz

    # h2b.browseTheCloud()
    for (importFolderPath, trainFilename, trainKey, timeoutSecs, response, testFilename, testKey) in files:
        # PARSE train****************************************
        start = time.time()
        xList = []
        eList = []
        fList = []

        # Parse (train)****************************************
        csvPathname = importFolderPath + "/" + trainFilename
        parseTrainResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local',
            hex_key=trainKey, timeoutSecs=timeoutSecs, doSummary=False)
        elapsed = time.time() - start
        print "train parse end on ", trainFilename, 'took', elapsed, 'seconds',\
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
        print "train parse result:", parseTrainResult['destination_key']

        ### h2o_cmd.runSummary(key=parsTraineResult['destination_key'])

        inspect = h2o_cmd.runInspect(key=parseTrainResult['destination_key'])
        print "\n" + csvPathname, \
            " numRows:", "{:,}".format(inspect['numRows']), \
            " numCols:", "{:,}".format(inspect['numCols'])
        numRows = inspect['numRows']
        numCols = inspect['numCols']

        # Make col 378 something we can do binomial regression on!
        # execExpr = '%s=colSwap(%s,378,(%s[378]>15 ? 1 : 0))' % (trainKey, trainKey, trainKey)
        # inc by 1 for R col
        # BUG: if left as integer..GBM changes to Enum. multiple jobs collide on this translate
        # only a problem if they share the dataset, do classification with integers.
        # change to factor here, to avoid the problem
        execExpr = '%s[,378+1]=%s[,378+1]>15' % (trainKey, trainKey)
        if not DO_FAIL:
            execExpr += "; factor(%s[, 378+1]);" % (trainKey)
        resultExec = h2o_cmd.runExec(str=execExpr, timeoutSecs=60)

        # Parse (test)****************************************
        csvPathname = importFolderPath + "/" + testFilename
        parseTestResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local',
            hex_key=testKey, timeoutSecs=timeoutSecs, doSummary=False)
        elapsed = time.time() - start
        print "test parse end on ", testFilename, 'took', elapsed, 'seconds',\
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
        print "test parse result:", parseTestResult['destination_key']

        # Make col 378 something we can do binomial regression on!
        # plus 1 for R indexing
        execExpr = '%s[,378+1]=%s[,378+1]>15' % (testKey, testKey)
        if not DO_FAIL:
            execExpr += "; factor(%s[, 378+1]);" % (testKey)
        resultExec = h2o_cmd.runExec(str=execExpr, timeoutSecs=60)

        # Note ..no inspect of test data here..so translate happens later?

        # GBM (train iterate)****************************************
        # if not response:
        #     response = numCols - 1
        response = 378

        # randomly ignore a bunch of cols, just to make it go faster
        x = range(numCols)
        del x[response]
        # add 1 for start-with-1
        ignored_cols_by_name = ",".join(map(lambda x: "C" + str(x + 1), random.sample(x, 300)))

        print "Using the same response %s for train and test (which should have an output value too)" % \
            ('C' + str(response + 1))

        ntrees = 10
        trial = 0
        # ignore 200 random cols (not the response)
        print "Kicking off multiple GBM jobs at once"
        # GBM train****************************************
        if DO_FAIL:
            cases = [5, 10, 20, 40]
        else:
            cases = [5, 10, 20]

        for max_depth in cases:
            trial += 1

            params = {
                'response': "C" + str(response + 1),
                'learn_rate': .2,
                'nbins': 1024,
                'ntrees': ntrees,
                'max_depth': max_depth,
                'min_rows': 10,
                'validation': parseTestResult['destination_key'],
                'ignored_cols_by_name': ignored_cols_by_name,
                'grid_parallelism': 1,
                'classification': 1 if DO_CLASSIFICATION else 0,
            }
            ### print "Using these parameters for GBM: ", params
            kwargs = params.copy()

            trainStart = time.time()
            # can take 4 times as long with 4 jobs?
            gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult, noPoll=True,
                timeoutSecs=timeoutSecs * 4, destination_key=modelKey + "_" + str(trial), **kwargs)
            trainElapsed = time.time() - trainStart
            print "GBM dispatch completed in", trainElapsed, "seconds. On dataset: ", trainFilename

        statMean = h2j.pollStatsWhileBusy(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs, retryDelaySecs=5)
        num_cpus = statMean['num_cpus']
        my_cpu_pct = statMean['my_cpu_%']
        sys_cpu_pct = statMean['sys_cpu_%']
        system_load = statMean['system_load']

        h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs)
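A tiny restatement of the column-naming convention the ignored_cols_by_name construction above relies on. This helper is not from the original test; it just makes the 0-based-index to 1-based "C"-name mapping explicit.

def col_name(zero_based_index):
    # H2O names columns C1..Cn (1-based); the python lists are 0-based, hence the +1
    return "C" + str(zero_based_index + 1)

assert col_name(378) == "C379"  # response col 378 is named C379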
def test_rf_covtype20x_fvec(self):
    h2o.beta_features = True
    importFolderPath = 'standard'

    if DO_SMALL:
        csvFilenameTrain = 'covtype.data'
        hex_key = 'covtype1x.data.A.hex'
    else:
        csvFilenameTrain = 'covtype20x.data'
        hex_key = 'covtype20x.data.A.hex'

    csvPathname = importFolderPath + "/" + csvFilenameTrain
    parseResultTrain = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, hex_key=hex_key, timeoutSecs=500)
    inspect = h2o_cmd.runInspect(key=parseResultTrain['destination_key'])
    dataKeyTrain = parseResultTrain['destination_key']
    print "Parse end", dataKeyTrain

    # have to re import since source key is gone
    # we could just copy the key, but sometimes we change the test/train data to covtype.data
    if DO_SMALL:
        csvFilenameTest = 'covtype.data'
        hex_key = 'covtype1x.data.B.hex'
        dataKeyTest2 = 'covtype1x.data.C.hex'
    else:
        csvFilenameTest = 'covtype20x.data'
        hex_key = 'covtype20x.data.B.hex'
        dataKeyTest2 = 'covtype20x.data.C.hex'

    csvPathname = importFolderPath + "/" + csvFilenameTest
    parseResultTest = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, hex_key=hex_key, timeoutSecs=500)
    print "Parse result['destination_key']:", parseResultTest['destination_key']
    inspect = h2o_cmd.runInspect(key=parseResultTest['destination_key'])
    dataKeyTest = parseResultTest['destination_key']
    print "Parse end", dataKeyTest

    # make a 3rd key so the predict is uncached too!
    execExpr = dataKeyTest2 + "=" + dataKeyTest
    if h2o.beta_features:
        kwargs = {'str': execExpr, 'timeoutSecs': 15}
    else:
        kwargs = {'expression': execExpr, 'timeoutSecs': 15}
    resultExec = h2o_cmd.runExec(**kwargs)

    # train
    # this does RFView to understand when RF completes, so the time reported for RFView here, should be
    # considered the "first RFView" times..subsequent have some caching?.
    # unless the no_confusion_matrix works

    # params is mutable. This is default.
    if h2o.beta_features:
        paramDict = drf2ParamDict
        params = {
            'ntrees': 20,
            'destination_key': 'RF_model'
        }
    else:
        paramDict = drf1ParamDict
        params = {
            'ntree': 20,
            'out_of_bag_error_estimate': 1,
            'model_key': 'RF_model'
        }

    colX = h2o_rf.pickRandRfParams(paramDict, params)
    kwargs = params.copy()
    if h2o.beta_features:
        timeoutSecs = 30 + kwargs['ntrees'] * 60
    else:
        timeoutSecs = 30 + kwargs['ntree'] * 60

    start = time.time()
    rf = h2o_cmd.runRF(parseResult=parseResultTrain,
        timeoutSecs=timeoutSecs, retryDelaySecs=1, **kwargs)
    print "rf job end on ", dataKeyTrain, 'took', time.time() - start, 'seconds'

    print "\nRFView start after job completion"
    if h2o.beta_features:
        model_key = kwargs['destination_key']
        ntree = kwargs['ntrees']
    else:
        model_key = kwargs['model_key']
        ntree = kwargs['ntree']

    start = time.time()
    # this does the RFModel view for v2. but only model_key is used. Data doesn't matter? (nor ntree)
    h2o_cmd.runRFView(None, dataKeyTrain, model_key, ntree=ntree, timeoutSecs=timeoutSecs)
    print "First rfview end on ", dataKeyTrain, 'took', time.time() - start, 'seconds'

    for trial in range(1):
        # scoring
        start = time.time()
        rfView = h2o_cmd.runRFView(None, dataKeyTest, model_key, ntree=ntree, timeoutSecs=timeoutSecs,
            out_of_bag_error_estimate=0, retryDelaySecs=1)
        print "rfview", trial, "end on ", dataKeyTest, 'took', time.time() - start, 'seconds.'

        (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView, ntree=ntree)
        self.assertAlmostEqual(classification_error, 50, delta=50,
            msg="Classification error %s differs too much" % classification_error)

        start = time.time()
        predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key=dataKeyTest2)
        print "predict", trial, "end on ", dataKeyTest, 'took', time.time() - start, 'seconds.'

        parseKey = parseResultTrain['destination_key']
        rfModelKey = rfView['drf_model']['_key']
        predictKey = 'Predict.hex'
        start = time.time()
        predictResult = h2o_cmd.runPredict(
            data_key=parseKey,
            model_key=rfModelKey,
            destination_key=predictKey,
            timeoutSecs=timeoutSecs)

        predictCMResult = h2o.nodes[0].predict_confusion_matrix(
            actual=parseKey,
            vactual='C54',
            predict=predictKey,
            vpredict='predict',
        )
        cm = predictCMResult['cm']

        # These will move into the h2o_gbm.py
        pctWrong = h2o_gbm.pp_cm_summary(cm)
        print "\nTest\n==========\n"
        print h2o_gbm.pp_cm(cm)
        print "Trial #", trial, "completed"
def test_GBM_cancel_model_reuse(self):
    h2o.beta_features = True
    importFolderPath = 'standard'
    timeoutSecs = 500
    csvFilenameAll = [
        # have to use col name for response?
        ("manyfiles-nflx-gz", "file_1.dat.gz", 378),
        # ("manyfiles-nflx-gz", "file_[1-9].dat.gz", 378),
        # ("standard", "covtype.data", 54),
        # ("standard", "covtype20x.data", 54),
    ]

    # csvFilenameList = random.sample(csvFilenameAll,1)
    csvFilenameList = csvFilenameAll

    # pop open a browser on the cloud
    # h2b.browseTheCloud()

    for (importFolderPath, csvFilename, response) in csvFilenameList:
        # creates csvFilename.hex from file in importFolder dir
        csvPathname = importFolderPath + "/" + csvFilename

        print "FIX! is this guy getting cancelled because he's reusing a key name? but it should be okay?"
        (importResult, importPattern) = h2i.import_only(bucket='home-0xdiag-datasets', path=csvPathname,
            schema='local', timeoutSecs=50)
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='local',
            hex_key='c.hex', timeoutSecs=500, noPoll=False, doSummary=False)
        # can't do summary until parse result is correct json

        h2o.check_sandbox_for_errors()

        # wait for it to show up in jobs?
        ## time.sleep(2)
        # no pattern waits for all
        ## h2o_jobs.pollWaitJobs(pattern=None, timeoutSecs=300, pollTimeoutSecs=10, retryDelaySecs=5)

        # print "\nparseResult", h2o.dump_json(parseResult)
        print "Parse result['destination_key']:", parseResult['destination_key']

        ## What's wrong here? too big?
        ### inspect = h2o_cmd.runInspect(key=parseResult['destination_key'], timeoutSecs=30, verbose=True)

        h2o.check_sandbox_for_errors()

        # have to avoid this on nflx data. colswap with exec
        # Exception: rjson error in gbm: Argument 'response' error:
        # Only integer or enum/factor columns can be classified
        if DO_CLASSIFICATION:
            # need to flip the right col! (R wise)
            execExpr = 'c.hex[,%s]=c.hex[,%s]>15' % (response + 1, response + 1)
            kwargs = {'str': execExpr}
            resultExec = h2o_cmd.runExec(**kwargs)

        # lets look at the response column now
        s = h2o_cmd.runSummary(key="c.hex", cols=response, max_ncols=1)

        # x = range(542)
        # remove the output too! (378)
        ignoreIndex = [3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 16, 17, 18, 19, 20, 424, 425, 426, 540, 541, response]
        # have to add 1 for col start with 1, now. plus the C
        xIgnore = ",".join(["C" + str(i + 1) for i in ignoreIndex])

        params = {
            'destination_key': None,
            'ignored_cols_by_name': xIgnore,
            'learn_rate': .1,
            'ntrees': 2,
            'max_depth': 8,
            'min_rows': 1,
            'response': "C" + str(response + 1),
            'classification': 1 if DO_CLASSIFICATION else 0,
            'grid_parallelism': 4,
        }

        kwargs = params.copy()
        timeoutSecs = 1800

        for i in range(5):
            # now issue a couple background GBM jobs that we'll kill
            jobids = []
            for j in range(5):
                # FIX! apparently we can't reuse a model key after a cancel
                kwargs['destination_key'] = 'GBMBad' + str(j)
                # rjson error in poll_url: Job was cancelled by user!
                GBMFirstResult = h2o_cmd.runGBM(parseResult=parseResult, noPoll=True, **kwargs)
                jobids.append(GBMFirstResult['job_key'])
                h2o.check_sandbox_for_errors()

            # have to pass the job id
            # for j in jobids:
            #     h2o.nodes[0].jobs_cancel(key=j)
            h2o_jobs.cancelAllJobs()
            # PUB-361. going to wait after cancel before reusing keys
            time.sleep(3)
            # am I getting a subsequent parse job cancelled?
            h2o_jobs.showAllJobs()

        if DELETE_KEYS:
            h2i.delete_keys_from_import_result(pattern=csvFilename, importResult=importResult)
def exec_expr(node=None, execExpr=None, resultKey=None, timeoutSecs=10, ignoreH2oError=False, doFuns=False):
    if not node:
        if len(h2o_nodes.nodes) == 0:
            raise Exception("You appear to have not h2o.init()'ed an h2o cloud? nodes is empty." + \
                " You may be misusing xl/rapids objects so they try to talk to h2o, before you have a cloud built." + \
                " Check if you're using .do() or Assign() with default do==True. h2o_nodes.nodes: %s" % h2o_nodes.nodes)
        node = h2o_nodes.nodes[0]

    if doFuns:
        kwargs = {'funs': execExpr}
    else:
        kwargs = {'ast': execExpr}

    start = time.time()
    if resultKey is not None:
        # doesn't like no key
        node.rapids_iseval(ast_key=resultKey)
    resultExec = h2o_cmd.runExec(node, timeoutSecs=timeoutSecs, ignoreH2oError=ignoreH2oError, **kwargs)
    verboseprint('exec took', time.time() - start, 'seconds')
    # print "exec:", dump_json(resultExec)

    shortenIt = resultExec
    if 'head' in shortenIt:
        shortenIt['head'] = 'chopped out by python exec_expr for print brevity'
    print "exec:", dump_json(shortenIt)

    # when do I get cols?

    # "result": "1.0351050710011848E-300",
    # "scalar": 1.0351050710011848e-300,
    # "funstr": null,
    # "key": null,
    # "col_names": null,
    # "num_cols": 0,
    # "num_rows": 0,
    # "exception": null,
    # echoing?
    # "string": null
    # "funs": null,
    # "ast": "(= !x (xorsum ([ $r1 \"null\" #0) $TRUE))",

    # can have zero rows and non-zero cols
    if (resultExec['num_rows'] != 0) and 'key' in resultExec and resultExec['key']:
        if 'name' not in resultExec['key']:
            raise Exception("'name' not in 'key': %s" % dump_json(resultExec))
        resultKey = resultExec['key']['name']

        if 'funstr' in resultExec and resultExec['funstr']:  # not null
            raise Exception("cols and funstr shouldn't both be in resultExec: %s" % dump_json(resultExec))
        else:
            print "Frame return"
            # No longer required...can be null
            # if resultKey is None:
            #     raise Exception("\nWhy is key.name null when it looks like a frame result? %s" % dump_json(resultExec))
            if resultKey is None:
                result = None
            # FIX! don't look for it if it starts with "_"..spencer deletes?
            elif resultKey == '_':
                print "WARNING: key/name in result, but leading '_' means it's deleted, so can't view. %s" % resultKey
                result = None
            else:
                # handles the 1x1 data frame result. Not really interesting if bigger than 1x1?
                inspect = h2o_cmd.runInspect(key=resultKey)
                # print "inspect key of result:", dump_json(inspect)
                # zero rows is possible in the inspect. But why would it have zero rows if the first resultExec didn't?
                rows = inspect['frames'][0]['rows']
                if rows == 0:
                    raise Exception("Inspect of resultKey %s has zero rows %s But resultExec didn't have zero rows %s" % \
                        (resultKey, resultExec['num_rows'], rows))
                result = inspect['frames'][0]['columns'][0]['mins'][0]
    else:
        if (resultExec['num_rows'] == 0) and 'key' in resultExec and resultExec['key']:
            print "zero row key return"
            result = None
        elif 'funstr' in resultExec and resultExec['funstr']:  # not null
            print "function return"
            result = resultExec['funstr']
        else:
            # empty num_rows=0 will come thru here?
            print "scalar return"
            result = resultExec['scalar']

    return resultExec, result
def exec_expr(node=None, execExpr=None, resultKey=None, timeoutSecs=10, ignoreH2oError=False, doFuns=False):
    if not node:
        if len(h2o_nodes.nodes) == 0:
            raise Exception("You appear to have not h2o.init()'ed an h2o cloud? nodes is empty." + \
                " You may be misusing xl/rapids objects so they try to talk to h2o, before you have a cloud built." + \
                " Check if you're using .do() or Assign() with default do==True. h2o_nodes.nodes: %s" % h2o_nodes.nodes)
        node = h2o_nodes.nodes[0]

    if doFuns:
        kwargs = {'funs': execExpr}
    else:
        kwargs = {'ast': execExpr}

    start = time.time()
    if resultKey is not None:
        # doesn't like no key
        node.rapids_iseval(ast_key=resultKey)
    resultExec = h2o_cmd.runExec(node, timeoutSecs=timeoutSecs, ignoreH2oError=ignoreH2oError, **kwargs)
    verboseprint('exec took', time.time() - start, 'seconds')
    # print "exec:", dump_json(resultExec)

    shortenIt = resultExec
    if 'head' in shortenIt:
        shortenIt['head'] = 'chopped out by python exec_expr for print brevity'
    # print "exec:", dump_json(shortenIt)

    # when do I get cols?

    # "result": "1.0351050710011848E-300",
    # "scalar": 1.0351050710011848e-300,
    # "funstr": null,
    # "key": null,
    # "col_names": null,
    # "num_cols": 0,
    # "num_rows": 0,
    # "exception": null,
    # echoing?
    # "string": null
    # "funs": null,
    # "ast": "(= !x (xorsum ([ $r1 \"null\" #0) $TRUE))",

    # can have zero rows and non-zero cols
    if (resultExec['num_rows'] != 0) and 'key' in resultExec and resultExec['key']:
        if 'name' not in resultExec['key']:
            raise Exception("'name' not in 'key': %s" % dump_json(resultExec))
        resultKey = resultExec['key']['name']

        if 'funstr' in resultExec and resultExec['funstr']:  # not null
            raise Exception("cols and funstr shouldn't both be in resultExec: %s" % dump_json(resultExec))
        else:
            print "Frame return"
            # No longer required...can be null
            # if resultKey is None:
            #     raise Exception("\nWhy is key.name null when it looks like a frame result? %s" % dump_json(resultExec))
            if resultKey is None:
                result = None
            # FIX! don't look for it if it starts with "_"..spencer deletes?
            elif resultKey == '_':
                print "WARNING: key/name in result, but leading '_' means it's deleted, so can't view. %s" % resultKey
                result = None
            else:
                # handles the 1x1 data frame result. Not really interesting if bigger than 1x1?
                inspect = h2o_cmd.runInspect(key=resultKey)
                # print "inspect key of result:", dump_json(inspect)
                # zero rows is possible in the inspect. But why would it have zero rows if the first resultExec didn't?
                rows = inspect['frames'][0]['rows']
                if rows == 0:
                    raise Exception("Inspect of resultKey %s has zero rows %s But resultExec didn't have zero rows %s" % \
                        (resultKey, resultExec['num_rows'], rows))
                result = inspect['frames'][0]['columns'][0]['mins']
    else:
        if (resultExec['num_rows'] == 0) and 'key' in resultExec and resultExec['key']:
            print "zero row key return"
            result = None
        elif 'funstr' in resultExec and resultExec['funstr']:  # not null
            print "function return"
            result = resultExec['funstr']
        else:
            # empty num_rows=0 will come thru here?
            print "scalar return"
            result = resultExec['scalar']

    return resultExec, result
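A hedged usage sketch of the Rapids variant above, reusing the AST echoed in its comments. It assumes a frame already parsed to the key r1 (as in the xorsum test later in this file); the result key name is illustrative.

execExpr = '(= !x (xorsum ([ $r1 "null" #0) $TRUE))'
(resultExec, result) = exec_expr(node=h2o_nodes.nodes[0], execExpr=execExpr,
    resultKey='x', timeoutSecs=300)
print 'xorsum result:', result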
def test_from_import_fvec(self):
    print "Sets h2o.beta_features like -bf at command line"
    print "this will redirect import and parse to the 2 variants"
    h2o.beta_features = True

    importFolderPath = 'standard'
    timeoutSecs = 500
    csvFilenameAll = [
        # have to use col name for response?
        ("manyfiles-nflx-gz", "file_1.dat.gz", 'C378'),
        # ("manyfiles-nflx-gz", "file_[1-9].dat.gz", 378),
        # ("standard", "covtype.data", 54),
        # ("standard", "covtype20x.data", 54),
    ]

    # csvFilenameList = random.sample(csvFilenameAll,1)
    csvFilenameList = csvFilenameAll

    # pop open a browser on the cloud
    # h2b.browseTheCloud()

    for (importFolderPath, csvFilename, response) in csvFilenameList:
        # creates csvFilename.hex from file in importFolder dir
        csvPathname = importFolderPath + "/" + csvFilename

        ### h2o.beta_features = False
        (importResult, importPattern) = h2i.import_only(bucket='home-0xdiag-datasets', path=csvPathname,
            schema='local', timeoutSecs=50)
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='local',
            hex_key='c.hex', timeoutSecs=500, noPoll=False, doSummary=False)
        # can't do summary until parse result is correct json

        h2o.check_sandbox_for_errors()

        # wait for it to show up in jobs?
        ## time.sleep(2)
        # no pattern waits for all
        ## h2o_jobs.pollWaitJobs(pattern=None, timeoutSecs=300, pollTimeoutSecs=10, retryDelaySecs=5)

        # hack it because no response from Parse2
        if h2o.beta_features:
            parseResult = {'destination_key': 'c.hex'}

        print "\nparseResult", h2o.dump_json(parseResult)
        print "Parse result['destination_key']:", parseResult['destination_key']

        ## What's wrong here? too big?
        ### inspect = h2o_cmd.runInspect(key=parseResult['destination_key'], timeoutSecs=30, verbose=True)

        h2o.check_sandbox_for_errors()

        # have to avoid this on nflx data. colswap with exec
        # Exception: rjson error in gbm: Argument 'response' error: Only integer or enum/factor columns can be classified
        if importFolderPath == 'manyfiles-nflx-gz':
            if EXEC_FVEC:
                execExpr = 'c.hex=colSwap(c.hex,378,(c.hex[378]>15 ? 1 : 0))'
                resultExec = h2o_cmd.runExec(expression=execExpr)

            x = range(542)  # don't include the output column
            # remove the output too! (378)
            xIgnore = []
            # BUG if you add unsorted 378 to end. remove for now
            for i in [4, 3, 5, 6, 7, 8, 9, 10, 11, 14, 16, 17, 18, 19, 20, 424, 425, 426, 540, 541, 378]:
                x.remove(i)
                xIgnore.append(i)

            x = ",".join(map(str, x))

            def colIt(x):
                return "C" + str(x)
            xIgnore = ",".join(map(colIt, xIgnore))
        else:
            # leave one col ignored, just to see?
            xIgnore = 0

        params = {
            'destination_key': "GBMKEY",
            'ignored_cols_by_name': xIgnore,
            'learn_rate': .1,
            'ntrees': 2,
            'max_depth': 8,
            'min_rows': 1,
            'response': response,
            'classification': 0,
        }

        kwargs = params.copy()
        h2o.beta_features = True
        timeoutSecs = 1800
        start = time.time()
        GBMResult = h2o_cmd.runGBM(parseResult=parseResult, noPoll=False, **kwargs)

        # wait for it to show up in jobs?
        time.sleep(2)
        # no pattern waits for all
        h2o_jobs.pollWaitJobs(pattern=None, timeoutSecs=300, pollTimeoutSecs=10, retryDelaySecs=5)

        elapsed = time.time() - start
        print "GBM training completed in", elapsed, "seconds.", "%f pct. of timeout" % (GBMResult['python_%timeout'])
        print "\nGBMResult:", GBMResult
        # print "\nGBMResult:", h2o.dump_json(GBMResult)

        h2o.check_sandbox_for_errors()

        if DELETE_KEYS:
            h2i.delete_keys_from_import_result(pattern=csvFilename, importResult=importResult)

        sys.stdout.write('.')
        sys.stdout.flush()
def test_many_fp_formats_libsvm_2_fvec(self):
    # h2b.browseTheCloud()
    SYNDATASETS_DIR = h2o.make_syn_dir()
    tryList = [
        (100, 10000, 'cA', 300, 'sparse50'),
        (100, 10000, 'cB', 300, 'sparse'),
        # (100, 40000, 'cC', 300, 'sparse50'),
        # (100, 40000, 'cD', 300, 'sparse'),
    ]

    # h2b.browseTheCloud()
    for (rowCount, colCount, hex_key, timeoutSecs, distribution) in tryList:
        NUM_CASES = h2o_util.fp_format()
        for sel in [random.randint(0, NUM_CASES - 1)]:  # len(caseList)
            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = "syn_%s_%s_%s_%s.csv" % (SEEDPERFILE, sel, rowCount, colCount)
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            # dict of col sums for comparison to exec col sums below
            (colNumberMax, synColSumDict) = write_syn_dataset(csvPathname, rowCount, colCount,
                SEEDPERFILE, sel, distribution)

            selKey2 = hex_key + "_" + str(sel)
            print "This dataset requires telling h2o to parse it as libsvm..it doesn't detect that automatically"
            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=selKey2,
                timeoutSecs=timeoutSecs, doSummary=False, parser_type='SVMLight')
            print "Parse result['destination_key']:", parseResult['destination_key']
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'],
                max_column_display=colNumberMax + 1, timeoutSecs=timeoutSecs)
            numCols = inspect['numCols']
            numRows = inspect['numRows']
            print "\n" + csvFilename

            # SUMMARY****************************************
            # gives us some reporting on missing values, constant values,
            # to see if we have x specified well
            # figures out everything from parseResult['destination_key']
            # needs y to avoid output column (which can be index or name)
            # assume all the configs have the same y..just check with the first one
            goodX = h2o_glm.goodXFromColumnInfo(y=0, key=parseResult['destination_key'],
                timeoutSecs=300, noPrint=True)

            if DO_SUMMARY:
                summaryResult = h2o_cmd.runSummary(key=selKey2,
                    max_column_display=colNumberMax + 1, timeoutSecs=timeoutSecs)
                h2o_cmd.infoFromSummary(summaryResult, noPrint=True)

            self.assertEqual(colNumberMax + 1, numCols,
                msg="generated %s cols (including output). parsed to %s cols" % (colNumberMax + 1, numCols))

            # Exec (column sums)*************************************************
            if DO_COMPARE_SUM:
                h2e.exec_zero_list(zeroList)
                colResultList = h2e.exec_expr_list_across_cols(None, exprList, selKey2,
                    maxCol=colNumberMax + 1, timeoutSecs=timeoutSecs, print_params=False)
                # print "\n*************"
                # print "colResultList", colResultList
                # print "*************"

            self.assertEqual(rowCount, numRows,
                msg="generated %s rows, parsed to %s rows" % (rowCount, numRows))

            # need to fix this for compare to expected
            # we should be able to keep the list of fp sums per col above
            # when we generate the dataset
            sortedColSumDict = OrderedDict(sorted(synColSumDict.items()))
            print sortedColSumDict
            for k, v in sortedColSumDict.iteritems():
                print k
                if DO_COMPARE_SUM:
                    # k should be integers that match the number of cols
                    self.assertTrue(k >= 0 and k < len(colResultList))
                    compare = colResultList[k]
                    print "\nComparing col sums:", v, compare
                    # Even though we're comparing floating point sums, the operations probably should have
                    # been done in same order, so maybe the comparison can be exact (or not!)
                    self.assertAlmostEqual(v, compare, places=0,
                        msg='%0.6f col sum is not equal to expected %0.6f' % (v, compare))

                synMean = (v + 0.0) / rowCount
                # enums don't have mean, but we're not enums
                mean = float(inspect['cols'][k]['mean'])
                # our fp formats in the syn generation sometimes only have two places?
                if not h2o_util.approxEqual(mean, synMean, tol=1e-3):
                    execExpr = 'sum(%s[,%s])' % (selKey2, k + 1)
                    resultExec = h2o_cmd.runExec(str=execExpr, timeoutSecs=300)
                    print "Result of exec sum on failing col:..:", k, h2o.dump_json(resultExec)
                    print "Result of remembered sum on failing col:..:", k, v
                    print "Result of inspect mean * rowCount on failing col..:", mean * rowCount
                    print "k: ", k, "mean: ", mean, "remembered sum/rowCount : ", synMean
                    sys.stdout.flush()
                    raise Exception('col %s mean %0.6f is not equal to generated mean %0.6f' % (k, mean, synMean))

                naCnt = inspect['cols'][k]['naCnt']
                self.assertEqual(0, naCnt, msg='col %s naCnt %d should be 0' % (k, naCnt))
def test_GLM_covtype20x(self):
    if localhost:
        csvFilenameList = [
            # 68 secs on my laptop?
            ('covtype20x.data', 480, 'cA'),
        ]
    else:
        # None is okay for hex_key
        csvFilenameList = [
            ('covtype20x.data', 480, 'cA'),
            # ('covtype200x.data', 1000, 'cE'),
        ]

    # a browser window too, just because we can
    ### h2b.browseTheCloud()

    importFolderPath = "standard"
    for csvFilename, timeoutSecs, hex_key in csvFilenameList:
        csvPathname = importFolderPath + "/" + csvFilename
        start = time.time()
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname,
            timeoutSecs=2000, hex_key=hex_key)
        print "parse end on ", csvPathname, 'took', time.time() - start, 'seconds'
        h2o.check_sandbox_for_errors()

        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        print "\n" + csvPathname, \
            " num_rows:", "{:,}".format(inspect['num_rows']), \
            " num_cols:", "{:,}".format(inspect['num_cols'])

        # this will make it fvec
        print "Touching %s with exec to make it fvec" % hex_key
        h2o_cmd.runExec(str='%s[0,]=%s[0,]' % (hex_key, hex_key))

        print "WARNING: max_iter set to 8 for benchmark comparisons"
        max_iter = 8
        y = "54"
        x = ""

        kwargs = {
            'x': x,
            'y': y,
            'family': 'binomial',
            'link': 'logit',
            'n_folds': 1,
            'case_mode': '=',
            'case': 1,
            'max_iter': max_iter,
            'beta_epsilon': 1e-3}

        # L2
        kwargs.update({'alpha': 0, 'lambda': 0})
        start = time.time()
        glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, noise=('JStack', None), **kwargs)
        print "glm (L2) end on ", csvPathname, 'took', time.time() - start, 'seconds'
        h2o_glm.simpleCheckGLM(self, glm, 'C14', **kwargs)
        h2o.check_sandbox_for_errors()

        # Elastic
        kwargs.update({'alpha': 0.5, 'lambda': 1e-4})
        start = time.time()
        glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, noise=('JStack', None), **kwargs)
        print "glm (Elastic) end on ", csvPathname, 'took', time.time() - start, 'seconds'
        h2o_glm.simpleCheckGLM(self, glm, 'C14', **kwargs)
        h2o.check_sandbox_for_errors()

        # L1
        kwargs.update({'alpha': 1.0, 'lambda': 1e-4})
        start = time.time()
        glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, noise=('JStack', None), **kwargs)
        print "glm (L1) end on ", csvPathname, 'took', time.time() - start, 'seconds'
        h2o_glm.simpleCheckGLM(self, glm, 'C14', **kwargs)
        h2o.check_sandbox_for_errors()
def test_exec2_plus_browse(self):
    h2o.beta_features = True
    lenNodes = len(h2o.nodes)
    csvPathname = 'standard/covtype.data'
    hex_key = 'c.hex'
    parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='put',
        hex_key=hex_key, timeoutSecs=20)
    print "\nParse key is:", parseResult['destination_key']
    ## h2b.browseTheCloud()

    # for trial in range(53):
    trial = 0
    while (trial < 100):
        for exprTemplate in exprList:
            trial = trial + 1
            n = trial
            colX = random.randint(1, 54)
            row = random.randint(1, 400000)

            execExpr = exprTemplate
            execExpr = re.sub('<col1>', str(colX), execExpr)
            execExpr = re.sub('<col2>', str(colX + 1), execExpr)
            execExpr = re.sub('<n>', str(n), execExpr)
            execExpr = re.sub('<row>', str(row), execExpr)
            execExpr = re.sub('<keyX>', str(hex_key), execExpr)

            # pick a random node to execute it on
            randNode = random.randint(0, lenNodes - 1)
            print "\nexecExpr:", execExpr, "on node", randNode

            start = time.time()
            kwargs = {'str': execExpr}
            if RAND_EXEC_NODE:
                resultExec = h2o_cmd.runExec(node=h2o.nodes[randNode], timeoutSecs=15, **kwargs)
            else:
                resultExec = h2o_cmd.runExec(timeoutSecs=15, **kwargs)
            h2o.verboseprint(h2o.dump_json(resultExec))
            # print(h2o.dump_json(resultExec))

            # FIX! race conditions. If json is done, does that mean you can inspect it??
            # wait until the 2nd iteration, which will guarantee both Result1 and Result2 exist
            if trial > 1:
                inspectMe = random.choice(inspectList)
                resultInspect = h2o.nodes[0].inspect(inspectMe)
                h2o.verboseprint(h2o.dump_json(resultInspect))

                resultInspect = h2o.nodes[1].inspect(inspectMe)
                h2o.verboseprint(h2o.dump_json(resultInspect))

                resultInspect = h2o.nodes[2].inspect(inspectMe)
                h2o.verboseprint(h2o.dump_json(resultInspect))

            # FIX! if we race the browser doing the exec too..it shouldn't be a problem?
            # might be a bug?

            # WARNING! we can't browse the Exec url history, since that will
            # cause the Exec to execute again thru the browser..i.e. it has side effects
            # just look at the last inspect, which should be the resultInspect!
            h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
            # h2b.browseJsonHistoryAsUrlLastMatch("Exec")

            h2o.check_sandbox_for_errors()
            print "exec end on ", "covtype.data", 'took', time.time() - start, 'seconds'
            print "Trial #", trial, "completed\n"
def test_GBM_with_cancels(self):
    print "do import/parse with VA"
    h2o.beta_features = False

    importFolderPath = 'standard'
    timeoutSecs = 500
    csvFilenameAll = [
        # have to use col name for response?
        # ("manyfiles-nflx-gz", "file_1.dat.gz", 378),
        # ("manyfiles-nflx-gz", "file_1.dat.gz", 378),
        # ("manyfiles-nflx-gz", "file_[1-9].dat.gz", 378),
        ("standard", "covtype.data", 54),
        # ("standard", "covtype20x.data", 54),
    ]

    # csvFilenameList = random.sample(csvFilenameAll,1)
    csvFilenameList = csvFilenameAll

    # pop open a browser on the cloud
    # h2b.browseTheCloud()

    for (importFolderPath, csvFilename, response) in csvFilenameList:
        # creates csvFilename.hex from file in importFolder dir
        csvPathname = importFolderPath + "/" + csvFilename

        ### h2o.beta_features = False
        (importResult, importPattern) = h2i.import_only(bucket='home-0xdiag-datasets', path=csvPathname,
            schema='local', timeoutSecs=50)
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='local',
            hex_key='c.hex', timeoutSecs=500, noPoll=False, doSummary=False)
        # can't do summary until parse result is correct json

        h2o.check_sandbox_for_errors()

        # wait for it to show up in jobs?
        ## time.sleep(2)
        # no pattern waits for all
        ## h2o_jobs.pollWaitJobs(pattern=None, timeoutSecs=300, pollTimeoutSecs=10, retryDelaySecs=5)

        # hack it because no response from Parse2
        if h2o.beta_features:
            parseResult = {'destination_key': 'c.hex'}

        print "\nparseResult", h2o.dump_json(parseResult)
        print "Parse result['destination_key']:", parseResult['destination_key']

        ## What's wrong here? too big?
        ### inspect = h2o_cmd.runInspect(key=parseResult['destination_key'], timeoutSecs=30, verbose=True)

        h2o.check_sandbox_for_errors()

        # have to avoid this on nflx data. colswap with exec
        # Exception: rjson error in gbm: Argument 'response' error: Only integer or enum/factor columns can be classified
        if importFolderPath == 'manyfiles-nflx-gz':
            if DO_CLASSIFICATION:
                # need to flip the right col! (R wise)
                execExpr = 'c.hex[,%s]=c.hex[,%s]>15' % (response + 1, response + 1)
                kwargs = {'str': execExpr}
                resultExec = h2o_cmd.runExec(**kwargs)

            # lets look at the response column now
            h2o.beta_features = True
            s = h2o_cmd.runSummary(key="c.hex", cols=response, max_ncols=1)

            # x = range(542)
            # remove the output too! (378)
            xIgnore = []
            # BUG if you add unsorted 378 to end. remove for now
            for i in [3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 16, 17, 18, 19, 20, 424, 425, 426, 540, 541, response]:
                # have to add 1 for col start with 1, now. plus the C
                xIgnore.append("C" + str(i + 1))
        else:
            # leave one col ignored, just to see?
            xIgnore = 'C1'

        modelKey = "GBMGood"
        params = {
            'destination_key': modelKey,
            'ignored_cols_by_name': xIgnore,
            'learn_rate': .1,
            'ntrees': 2,
            'max_depth': 8,
            'min_rows': 1,
            'response': "C" + str(response + 1),
            'classification': 1 if DO_CLASSIFICATION else 0,
            'grid_parallelism': 4,
        }

        kwargs = params.copy()
        timeoutSecs = 1800
        start = time.time()
        GBMFirstResult = h2o_cmd.runGBM(parseResult=parseResult, noPoll=True, **kwargs)
        print "\nGBMFirstResult:", h2o.dump_json(GBMFirstResult)
        # no pattern waits for all

        for i in range(15):
            # now issue a couple background GBM jobs that we'll kill
            jobids = []
            for j in range(5):
                # FIX! apparently we can't reuse a model key after a cancel
                kwargs['destination_key'] = 'GBMBad' + str(i) + str(j)
                GBMFirstResult = h2o_cmd.runGBM(parseResult=parseResult, noPoll=True, **kwargs)
                jobids.append(GBMFirstResult['job_key'])

            # have to pass the job id
            for j in jobids:
                h2o.nodes[0].jobs_cancel(key=j)

        h2o_jobs.pollWaitJobs(pattern='GBMGood', timeoutSecs=300, pollTimeoutSecs=10, retryDelaySecs=5)
        elapsed = time.time() - start
        print "GBM training completed in", elapsed, "seconds."

        gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey)
        # errrs from end of list? is that the last tree?
        errsLast = gbmTrainView['gbm_model']['errs'][-1]
        print "GBM 'errsLast'", errsLast

        if DO_CLASSIFICATION:
            cm = gbmTrainView['gbm_model']['cms'][-1]['_arr']  # use the last one
            pctWrongTrain = h2o_gbm.pp_cm_summary(cm)
            print "Last line of this cm might be NAs, not CM"
            print "\nTrain\n==========\n"
            print h2o_gbm.pp_cm(cm)
        else:
            print "GBMTrainView:", h2o.dump_json(gbmTrainView['gbm_model']['errs'])

        h2o.check_sandbox_for_errors()

        if DELETE_KEYS:
            h2i.delete_keys_from_import_result(pattern=csvFilename, importResult=importResult)
def test_GBM_manyfiles_train_test(self): bucket = 'home-0xdiag-datasets' modelKey = 'GBMModelKey' if localhost: files = [ # None forces num_cols to be used. assumes you set it from Inspect ('manyfiles-nflx-gz', 'file_1[0-9][0-9].dat.gz', 'file_100.hex', 1800, None, 'file_1.dat.gz', 'file_1_test.hex' ) ] else: files = [ # None forces num_cols to be used. assumes you set it from Inspect ('manyfiles-nflx-gz', 'file_[0-9].dat.gz', 'file_10.hex', 1800, None, 'file_1[0-9].dat.gz', 'file_10_test.hex') ] # if I got to hdfs, it's here # hdfs://192.168.1.176/datasets/manyfiles-nflx-gz/file_99.dat.gz # h2b.browseTheCloud() for (importFolderPath, trainFilename, trainKey, timeoutSecs, response, testFilename, testKey) in files: h2o.beta_features = False #turn off beta_features # PARSE train**************************************** start = time.time() xList = [] eList = [] fList = [] # Parse (train)**************************************** if h2o.beta_features: print "Parsing to fvec directly! Have to noPoll=true!, and doSummary=False!" csvPathname = importFolderPath + "/" + trainFilename parseTrainResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='s3n', hex_key=trainKey, timeoutSecs=timeoutSecs, noPoll=h2o.beta_features, doSummary=False) # hack if h2o.beta_features: h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs) print "Filling in the parseTrainResult['destination_key'] for h2o" parseTrainResult['destination_key'] = trainKey elapsed = time.time() - start print "train parse end on ", trainFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "train parse result:", parseTrainResult['destination_key'] ### h2o_cmd.runSummary(key=parsTraineResult['destination_key']) # if you set beta_features here, the fvec translate will happen with the Inspect not the GBM # h2o.beta_features = True inspect = h2o_cmd.runInspect( key=parseTrainResult['destination_key']) print "\n" + csvPathname, \ " num_rows:", "{:,}".format(inspect['num_rows']), \ " num_cols:", "{:,}".format(inspect['num_cols']) num_rows = inspect['num_rows'] num_cols = inspect['num_cols'] # Make col 378 it something we can do binomial regression on! execExpr = '%s[,378] = %s[,378]>15 ? 1 : 0' % (trainKey, trainKey) resultExec = h2o_cmd.runExec(str=execExpr, timeoutSecs=500) # Parse (test)**************************************** if h2o.beta_features: print "Parsing to fvec directly! Have to noPoll=true!, and doSummary=False!" parseTestResult = h2i.import_parse(bucket=bucket, path=importFolderPath + "/" + testFilename, schema='local', hex_key=testKey, timeoutSecs=timeoutSecs, noPoll=h2o.beta_features, doSummary=False) # hack if h2o.beta_features: h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs) print "Filling in the parseTestResult['destination_key'] for h2o" parseTestResult['destination_key'] = testKey elapsed = time.time() - start print "test parse end on ", testFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "test parse result:", parseTestResult['destination_key'] # Make col 378 it something we can do binomial regression on! print "Slow! exec is converting all imported keys?, not just what was parsed" execExpr = '%s[,378] = %s[,378]>15 ? 1 : 0' % (testKey, testKey, testKey) resultExec = h2o_cmd.runExec(str=execExpr, timeoutSecs=300) # Note ..no inspect of test data here..so translate happens later? 
# GBM (train iterate)**************************************** # if not response: # response = num_cols - 1 response = 378 print "Using the same response %s for train and test (which should have an output value too)" % response ntrees = 10 for max_depth in [5, 10, 20, 40]: params = { 'learn_rate': .2, 'nbins': 1024, 'ntrees': ntrees, 'max_depth': max_depth, 'min_rows': 10, 'response': response, # 'ignored_cols': } print "Using these parameters for GBM: ", params kwargs = params.copy() h2o.beta_features = True # GBM train**************************************** trainStart = time.time() gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult, noPoll=True, timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs) # hack if h2o.beta_features: h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs) trainElapsed = time.time() - trainStart print "GBM training completed in", trainElapsed, "seconds. On dataset: ", trainFilename gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey) # errs from end of list? is that the last tree? errsLast = gbmTrainView['gbm_model']['errs'][-1] print "GBM 'errsLast'", errsLast cm = gbmTrainView['gbm_model']['cm'] pctWrongTrain = h2o_gbm.pp_cm_summary(cm) print "Last line of this cm might be NAs, not CM" print "\nTrain\n==========\n" print h2o_gbm.pp_cm(cm) # GBM test**************************************** if doPredict: predictKey = 'Predict.hex' ### h2o_cmd.runInspect(key=parseTestResult['destination_key']) start = time.time() gbmTestResult = h2o_cmd.runPredict( data_key=parseTestResult['destination_key'], model_key=modelKey, destination_key=predictKey, timeoutSecs=timeoutSecs) # hack if h2o.beta_features: h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs) elapsed = time.time() - start print "GBM predict completed in", elapsed, "seconds. On dataset: ", testFilename print "This is crazy!" gbmPredictCMResult = h2o.nodes[0].predict_confusion_matrix( actual=parseTestResult['destination_key'], vactual=response, predict=predictKey, vpredict='predict', # choices are 0 and 'predict' ) # errs from end of list? is that the last tree? # all we get is cm cm = gbmPredictCMResult['cm'] # These will move into the h2o_gbm.py pctWrong = h2o_gbm.pp_cm_summary(cm) print "Last line of this cm is really NAs, not CM" print "\nTest\n==========\n" print h2o_gbm.pp_cm(cm) # pctWrong only exists when doPredict ran, so collect the plot points inside this block # xList.append(ntrees) xList.append(max_depth) eList.append(pctWrong) fList.append(trainElapsed) h2o.beta_features = False if doPredict: xLabel = 'max_depth' eLabel = 'pctWrong' fLabel = 'trainElapsed' eListTitle = "" fListTitle = "" h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
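# Aside: a minimal sketch of the error number h2o_gbm.pp_cm_summary reports above,
# assuming cm is a square list-of-lists confusion matrix. The real helper also has
# to cope with the trailing NA row mentioned in the prints; this sketch does not.
def pct_wrong_sketch(cm):
    total = sum(sum(row) for row in cm)
    # diagonal entries are the correctly classified counts
    right = sum(cm[i][i] for i in range(len(cm)))
    return 100.0 * (total - right) / total if total else 0.0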
def test_exec2_xorsum(self): SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ (ROWS, 1, 'r1', 0, 10, None), ] for trial in range(10): ullResultList = [] for (rowCount, colCount, hex_key, expectedMin, expectedMax, expected) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) # dynamic range of the data may be useful for estimating error maxDelta = expectedMax - expectedMin csvFilename = 'syn_real_' + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True) print "Creating random", csvPathname (expectedUllSum, expectedFpSum) = write_syn_dataset(csvPathname, rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE) expectedUllSumAsDouble = h2o_util.unsignedLongLongToDouble(expectedUllSum) expectedFpSumAsLongLong = h2o_util.doubleToUnsignedLongLong(expectedFpSum) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=3000, retryDelaySecs=2) numRows, numCols, parse_key = h2o_cmd.infoFromParse(parseResult) assert parse_key == hex_key assert numCols == colCount assert numRows == rowCount inspect = h2o_cmd.runInspect(key=hex_key) missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(inspect) assert len(missingList) == 0 # looking at the 8 bytes of bits for the h2o doubles # xorsum will zero out the sign and exponent for execExpr in exprList: for r in range(10): # disabled alternative: execResult = h2o_cmd.runExec(ast=execExpr, timeoutSecs=30); fpResult = execResult['scalar'] (execResult, fpResult) = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey='x', timeoutSecs=300) # print dump_json(h2o.n0.frames(key="h")) # (execResult, fpResult) = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey='h', timeoutSecs=300) # print dump_json(h2o.n0.frames(key="r1")) print r, "execResult:", h2o.dump_json(execResult) h2o_cmd.runStoreView() ullResult = h2o_util.doubleToUnsignedLongLong(fpResult) ullResultList.append((ullResult, fpResult)) print "%30s" % "ullResult (0.16x):", "0x%0.16x %s" % (ullResult, fpResult) print "%30s" % "expectedUllSum (0.16x):", "0x%0.16x %s" % (expectedUllSum, expectedUllSumAsDouble) print "%30s" % "expectedFpSum (0.16x):", "0x%0.16x %s" % (expectedFpSumAsLongLong, expectedFpSum) # allow diff of the lsb..either way # if ullResult!=expectedUllSum and abs((ullResult-expectedUllSum)>3): if ullResult!=expectedUllSum: raise Exception("h2o didn't get the same xorsum as python. 0x%0.16x 0x%0.16x" % \ (ullResult, expectedUllSum)) h2o.check_sandbox_for_errors() print "first result was from a sum. others are xorsum" print "ullResultList:" for ullResult, fpResult in ullResultList: print "%30s" % "ullResult (0.16x):", "0x%0.16x %s" % (ullResult, fpResult) print "%30s" % "expectedUllSum (0.16x):", "0x%0.16x %s" % (expectedUllSum, expectedUllSumAsDouble) print "%30s" % "expectedFpSum (0.16x):", "0x%0.16x %s" % (expectedFpSumAsLongLong, expectedFpSum)
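# Aside: a minimal sketch of the double<->bits helpers used above, assuming
# IEEE-754 doubles and the stdlib struct module; the real h2o_util
# implementations may differ in detail, but this is the usual trick.
import struct

def doubleToUnsignedLongLong_sketch(d):
    # reinterpret the 8 bytes of a double as an unsigned 64-bit integer
    return struct.unpack('<Q', struct.pack('<d', d))[0]

def unsignedLongLongToDouble_sketch(u):
    # reinterpret an unsigned 64-bit integer back into a double
    return struct.unpack('<d', struct.pack('<Q', u))[0]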
def test_GBM_manyfiles_train_test(self): bucket = 'home-0xdiag-datasets' modelKey = 'GBMModelKey' if localhost: files = [ # None forces num_cols to be used. assumes you set it from Inspect # problems with categoricals not in the train data set? (warnings in h2o stdout) ## ('manyfiles-nflx-gz', 'file_1.dat.gz', 'file_1.hex', 1800, None, 'file_11.dat.gz', 'test.hex') # just use matching ('manyfiles-nflx-gz', 'file_1.dat.gz', 'train.hex', 1800, None, 'file_1.dat.gz', 'test.hex') ] else: files = [ # None forces num_cols to be used. assumes you set it from Inspect ('manyfiles-nflx-gz', 'file_[0-9].dat.gz', 'train.hex', 1800, None, 'file_1[0-9].dat.gz', 'test.hex') ] # if I got to hdfs, it's here # hdfs://192.168.1.176/datasets/manyfiles-nflx-gz/file_99.dat.gz # h2b.browseTheCloud() for (importFolderPath, trainFilename, trainKey, timeoutSecs, response, testFilename, testKey) in files: h2o.beta_features = False #turn off beta_features # PARSE train**************************************** start = time.time() xList = [] eList = [] fList = [] # Parse (train)**************************************** if h2o.beta_features: print "Parsing to fvec directly! Have to use noPoll=True and doSummary=False!" csvPathname = importFolderPath + "/" + trainFilename parseTrainResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local', hex_key=trainKey, timeoutSecs=timeoutSecs, noPoll=h2o.beta_features, doSummary=False) # hack if h2o.beta_features: h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs) print "Filling in the parseTrainResult['destination_key'] for h2o" parseTrainResult['destination_key'] = trainKey elapsed = time.time() - start print "train parse end on ", trainFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "train parse result:", parseTrainResult['destination_key'] ### h2o_cmd.runSummary(key=parseTrainResult['destination_key']) # if you set beta_features here, the fvec translate will happen with the Inspect not the GBM # h2o.beta_features = True inspect = h2o_cmd.runInspect(key=parseTrainResult['destination_key']) print "\n" + csvPathname, \ " num_rows:", "{:,}".format(inspect['num_rows']), \ " num_cols:", "{:,}".format(inspect['num_cols']) num_rows = inspect['num_rows'] num_cols = inspect['num_cols'] # Make col 378 into something we can do binomial regression on! execExpr = '%s=colSwap(%s,378,(%s[378]>15 ? 1 : 0))' % (trainKey, trainKey, trainKey) resultExec = h2o_cmd.runExec(expression=execExpr, timeoutSecs=60) # Parse (test)**************************************** if h2o.beta_features: print "Parsing to fvec directly! Have to use noPoll=True and doSummary=False!" parseTestResult = h2i.import_parse(bucket=bucket, path=importFolderPath + "/" + testFilename, schema='local', hex_key=testKey, timeoutSecs=timeoutSecs, noPoll=h2o.beta_features, doSummary=False) # hack if h2o.beta_features: h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs) print "Filling in the parseTestResult['destination_key'] for h2o" parseTestResult['destination_key'] = testKey elapsed = time.time() - start print "test parse end on ", testFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "test parse result:", parseTestResult['destination_key'] # Make col 378 into something we can do binomial regression on! execExpr = '%s=colSwap(%s,378,(%s[378]>15 ? 
1 : 0))' % (testKey, testKey, testKey) resultExec = h2o_cmd.runExec(expression=execExpr, timeoutSecs=60) # Note ..no inspect of test data here..so translate happens later? # GBM (train iterate)**************************************** # if not response: # response = num_cols - 1 response = 378 # randomly ignore a bunch of cols, just to make it go faster x = range(num_cols) del x[response] ignored_cols_by_name = ",".join(map(str, random.sample(x, 300))) print "Using the same response %s for train and test (which should have an output value too)" % response ntrees = 10 trial = 0 # ignore 300 random cols (not the response) print "Kicking off multiple GBM jobs at once" for max_depth in [5, 10, 20, 40]: trial += 1 params = { 'learn_rate': .2, 'nbins': 1024, 'ntrees': ntrees, 'max_depth': max_depth, 'min_rows': 10, 'response': response, 'validation': parseTestResult['destination_key'], 'ignored_cols_by_name': ignored_cols_by_name, } ### print "Using these parameters for GBM: ", params kwargs = params.copy() h2o.beta_features = True # GBM train**************************************** trainStart = time.time() # can take 4 times as long with 4 jobs? gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult, noPoll=True, timeoutSecs=timeoutSecs * 4, destination_key=modelKey + "_" + str(trial), **kwargs) trainElapsed = time.time() - trainStart print "GBM dispatch completed in", trainElapsed, "seconds. On dataset: ", trainFilename h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs)
def exec_expr(node=None, execExpr=None, resultKey=None, timeoutSecs=10, ignoreH2oError=False, doFuns=False): if not node: if len(h2o_nodes.nodes)==0: raise Exception("You appear to have not h2o.init()'ed an h2o cloud? nodes is empty." + \ "You may be misusing xl/rapids objects so they try to talk to h2o, before you have a cloud built." + \ "Check if you're using .do() or Assign() with default do==True h2o_nodes.nodes: %s" % h2o_nodes.nodes) node = h2o_nodes.nodes[0] if doFuns: kwargs = {'funs': execExpr} else: kwargs = {'ast': execExpr} start = time.time() resultExec = h2o_cmd.runExec(node, timeoutSecs=timeoutSecs, ignoreH2oError=ignoreH2oError, **kwargs) verboseprint('exec took', time.time() - start, 'seconds') print "exec:", dump_json(resultExec) # when do I get cols? # "result": "1.0351050710011848E-300", # "scalar": 1.0351050710011848e-300, # "funstr": null, # "key": null, # "col_names": null, # "num_cols": 0, # "num_rows": 0, # "exception": null, # echoing? # "string": null # "funs": null, # "ast": "(= !x (xorsum ([ $r1 \"null\" #0) $TRUE))", if (resultExec['num_cols']!=0 or resultExec['num_rows']!=0) and 'key' in resultExec and resultExec['key']: if 'name' not in resultExec['key']: raise Exception("'name' not in 'key': %s" % dump_json(resultExec)) resultKey = resultExec['key']['name'] if 'funstr' in resultExec and resultExec['funstr']: # not null raise Exception("cols and funstr shouldn't both be in resultExec: %s" % dump_json(resultExec)) else: print "Frame return" if resultKey is None: raise Exception("\nWhy is key.name null when it looks like a frame result? %s" % dump_json(resultExec)) # if test said to look at a resultKey, it should be in the h2o k/v store # inspect a result key? # Should we get the key name from the exec return? if 1==0: kwargs = {'ast': resultKey} resultExec = h2o_cmd.runExec(node, timeoutSecs=timeoutSecs, ignoreH2oError=ignoreH2oError, **kwargs) print "exec key result:", dump_json(resultExec) # FIX! don't look for it if it starts with "_"..spencer deletes? if resultKey[0]=='_': print "WARNING: key/name in result, but leading '_' means it's deleted, so can't view. %s" % resultKey result = None else: # handles the 1x1 data frame result. Not really interesting if bigger than 1x1? inspect = h2o_cmd.runInspect(key=resultKey) # print "inspect key of result:", dump_json(inspect) result = inspect['frames'][0]['columns'][0]['mins'][0] else: if 'funstr' in resultExec and resultExec['funstr']: # not null print "function return" result = resultExec['funstr'] else: print "scalar return" result = resultExec['scalar'] return resultExec, result
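# Aside: hypothetical usage of the exec_expr above; assumes an h2o cloud is up
# and 'r1' is a parsed key. The ast string is the Rapids example echoed in the
# comments. Pass doFuns=True when execExpr defines a function (sent as 'funs')
# rather than an expression (sent as 'ast').
def exec_expr_usage_sketch():
    execExpr = '(= !x (xorsum ([ $r1 "null" #0) $TRUE))'
    resultExec, result = exec_expr(execExpr=execExpr, timeoutSecs=300)
    print "xorsum result:", result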
def test_GBM_sphere15_180GB(self): csvFilename = 'syn_sphere15_2711545732row_6col_180GB_from_7x.csv' totalBytes = 183538602156 importFolderPath = "datasets/kmeans_big" csvPathname = importFolderPath + '/' + csvFilename # FIX! put right values in # will there be different expected for random vs the other inits? expected = [ ([ 0.0, -113.00566692375459, -89.99595447985321, -455.9970643424373, 4732.0, 49791778.0, 36800.0 ], 248846122, 1308149283316.2988), ([ 0.0, 1.0, 1.0, -525.0093818313685, 2015.001629398412, 25654042.00592703, 28304.0 ], 276924291, 1800760152555.98), ([ 0.0, 5.0, 2.0, 340.0, 1817.995920197288, 33970406.992053084, 31319.99486705394 ], 235089554, 375419158808.3253), ([ 0.0, 10.0, -72.00113070337981, -171.0198611715457, 4430.00952228909, 37007399.0, 29894.0 ], 166180630, 525423632323.6474), ([ 0.0, 11.0, 3.0, 578.0043558141306, 1483.0163188052604, 22865824.99639042, 5335.0 ], 167234179, 1845362026223.1094), ([ 0.0, 12.0, 3.0, 168.0, -4066.995950679284, 41077063.00269915, -47537.998050740985 ], 195420925, 197941282992.43475), ([ 0.0, 19.00092954923767, -10.999565572612255, 90.00028669073289, 1928.0, 39967190.0, 27202.0 ], 214401768, 11868360232.658035), ([ 0.0, 20.0, 0.0, 141.0, -3263.0030236302937, 6163210.990273981, 30712.99115201907 ], 258853406, 598863991074.3276), ([ 0.0, 21.0, 114.01584574295777, 242.99690338815898, 1674.0029079209912, 33089556.0, 36415.0 ], 190979054, 1505088759456.314), ([ 0.0, 25.0, 1.0, 614.0032787274755, -2275.9931284021022, -48473733.04122273, 47343.0 ], 87794427, 1124697008162.3955), ([ 0.0, 39.0, 3.0, 470.0, -3337.9880599007597, 28768057.98852736, 16716.003410920028 ], 78226988, 1151439441529.0215), ([ 0.0, 40.0, 1.0, 145.0, 950.9990795199593, 14602680.991458317, -14930.007919032574 ], 167273589, 693036940951.0249), ([ 0.0, 42.0, 4.0, 479.0, -3678.0033024834297, 8209673.001421165, 11767.998552236539 ], 148426180, 35942838893.32379), ([ 0.0, 48.0, 4.0, 71.0, -951.0035145455234, 49882273.00063991, -23336.998167498707 ], 157533313, 88431531357.62982), ([ 0.0, 147.00394564757505, 122.98729664236723, 311.0047920137008, 2320.0, 46602185.0, 11212.0 ], 118361306, 1111537045743.7646), ] benchmarkLogging = ['cpu', 'disk', 'network', 'iostats', 'jstack'] benchmarkLogging = ['cpu', 'disk', 'network', 'iostats'] # IOStatus can hang? benchmarkLogging = ['cpu', 'disk', 'network'] benchmarkLogging = [] for trial in range(6): # IMPORT********************************************** # since H2O deletes the source key, re-import every iteration. # PARSE **************************************** print "Parse starting: " + csvFilename # hex_key = csvFilename + "_" + str(trial) + ".hex" hex_key = "C" + str(trial) start = time.time() timeoutSecs = 2 * 3600 kwargs = {} parseResult = h2i.import_parse(path=csvPathname, schema='hdfs', hex_key=hex_key, timeoutSecs=timeoutSecs, pollTimeoutSecs=60, retryDelaySecs=2, benchmarkLogging=benchmarkLogging, **kwargs) elapsed = time.time() - start fileMBS = (totalBytes / 1e6) / elapsed l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, 'Parse', csvPathname, fileMBS, elapsed) print "\n" + l h2o.cloudPerfH2O.message(l) # GBM **************************************** if not DO_GBM: continue # make col 2 a binomial (negative numbers in src col = 2 execExpr = "%s[,%s] = (%s[,%s]>-7 ? 
1 : 0)" % (hex_key, 2, hex_key, 2) # col 2, per the comment above; 'col' was never defined in this variant resultExec = h2o_cmd.runExec(str=execExpr) params = { 'destination_key': "GBMKEY", 'learn_rate': .1, 'ntrees': 2, 'max_depth': 8, 'min_rows': 1, 'response': 2 # col 2; should be binomial from above } kwargs = params.copy() timeoutSecs = 1800 start = time.time() GBMResult = h2o_cmd.runGBM(parseResult=parseResult, noPoll=True, **kwargs) # wait for it to show up in jobs? time.sleep(2) # no pattern waits for all h2o_jobs.pollWaitJobs(pattern=None, timeoutSecs=300, pollTimeoutSecs=10, retryDelaySecs=5) elapsed = time.time() - start print "GBM training completed in", elapsed, "seconds.", "%f pct. of timeout" % (GBMResult['python_%timeout']) print "\nGBMResult:", GBMResult # print "\nGBMResult:", h2o.dump_json(GBMResult) h2o.check_sandbox_for_errors() if DELETE_KEYS: h2i.delete_keys_at_all_nodes()
def exec_expr(node=None, execExpr=None, resultKey="Result.hex", timeoutSecs=10, ignoreH2oError=False): if not node: node = h2o.nodes[0] start = time.time() # FIX! Exec has 'escape_nan' arg now. should we test? # 5/14/13 removed escape_nan=0 if h2o.beta_features: kwargs = {'str': execExpr} resultExec = h2o_cmd.runExec(node, timeoutSecs=timeoutSecs, ignoreH2oError=ignoreH2oError, **kwargs) else: kwargs = {'expression': execExpr} resultExec = h2o_cmd.runExec(node, timeoutSecs=timeoutSecs, ignoreH2oError=ignoreH2oError, **kwargs) h2o.verboseprint(resultExec) h2o.verboseprint('exec took', time.time() - start, 'seconds') ### print 'exec took', time.time() - start, 'seconds' h2o.verboseprint("\nfirst look at the default Result key") # new offset=-1 to get the metadata? if h2o.beta_features: # default assign not present in v2? # constants don't create keys. # so the only way to see the results is to do another exec? kwargs = {'str': resultKey} resultExec2 = h2o_cmd.runExec(node, timeoutSecs=timeoutSecs, ignoreH2oError=ignoreH2oError, **kwargs) print "resultExec2:", h2o.dump_json(resultExec2) # maybe return 'scalar' in some cases? return resultExec2, resultExec2['cols'][0]['min'] # exec_query parameters: {'str': 'Result0 = c(0)'} # exec_query parameters: {'str': 'Result0'} # resultExec2: { # "Request2": 0, # "cols": [ # { # "max": 0.0, # "mean": 0.0, # "min": 0.0, # "naCnt": 0, # "name": "c", # "type": "Int" # } # ], # "error": null, # "funstr": null, # "key": null, # "num_cols": 1, # "num_rows": 1, # "result": "c \n0 \n", # "scalar": 0.0 # } else: defaultInspectM1 = h2o_cmd.runInspect(None, "Result.hex", offset=-1) checkScalarResult(defaultInspectM1, "Result.hex") h2o.verboseprint("\nNow look at the assigned " + resultKey + " key") resultInspectM1 = h2o_cmd.runInspect(None, resultKey, offset=-1) min_value = checkScalarResult(resultInspectM1, resultKey) return resultInspectM1, min_value
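# Aside: checkScalarResult isn't defined in this excerpt; a minimal sketch of
# the check implied by how it's used above (pull the scalar out of a 1x1
# result), assuming the VA Inspect JSON carries per-column 'min' the way the
# Exec results elsewhere in this file do. The real helper may check more.
def checkScalarResult_sketch(inspect, resultKey):
    if 'cols' not in inspect or not inspect['cols']:
        raise Exception("%s inspect has no cols: %s" % (resultKey, h2o.dump_json(inspect)))
    # a scalar exec result shows up as a 1x1 frame; col 0's min is the value
    return inspect['cols'][0]['min']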
def test_GLM_covtype(self): csvFilename = 'covtype.data' csvPathname = 'standard/' + csvFilename hex_key = 'covtype.hex' parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=10) inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print "\n" + csvPathname, \ " num_rows:", "{:,}".format(inspect['num_rows']), \ " num_cols:", "{:,}".format(inspect['num_cols']) print "WARNING: max_iter set to 8 for benchmark comparisons" max_iter = 8 y = "54" x = "" print "Touching it with exec to trigger va to fvec (covtype.hex) , and then fvec to va (covtype2.hex)" h2o_cmd.runExec(str='%s=%s' % ('covtype2.hex', hex_key)) # hack to use the new one parseResult['destination_key'] = 'covtype2.hex' # L2 kwargs = { 'x': x, 'y': y, 'family': 'binomial', 'link': 'logit', 'n_folds': 0, 'case_mode': '=', 'case': 1, 'max_iter': max_iter, 'beta_epsilon': 1e-3 } timeoutSecs = 120 start = time.time() kwargs.update({'alpha': 0, 'lambda': 0}) glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) print "glm (L2) end on ", csvPathname, 'took', time.time( ) - start, 'seconds' h2o_glm.simpleCheckGLM(self, glm, 'C14', **kwargs) # Elastic kwargs.update({'alpha': 0.5, 'lambda': 1e-4}) start = time.time() glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) print "glm (Elastic) end on ", csvPathname, 'took', time.time( ) - start, 'seconds' h2o_glm.simpleCheckGLM(self, glm, 'C14', **kwargs) # L1 kwargs.update({'alpha': 1, 'lambda': 1e-4}) start = time.time() glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) print "glm (L1) end on ", csvPathname, 'took', time.time( ) - start, 'seconds' h2o_glm.simpleCheckGLM(self, glm, 'C14', **kwargs)
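# Aside: the case_mode '=' / case 1 pair in the kwargs above appears to ask the
# VA GLM to binarize the response server-side, with (y == 1) as the positive
# class. A rough client-side picture of that recode on toy labels (illustrative
# only; h2o does this internally):
toy_labels = [1, 2, 7, 1, 3]
toy_binary = [1 if v == 1 else 0 for v in toy_labels]
assert toy_binary == [1, 0, 0, 1, 0]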
def test_GBM_manyfiles_train_test(self): bucket = 'home-0xdiag-datasets' modelKey = 'GBMModelKey' if localhost: files = [ # None forces num_cols to be used. assumes you set it from Inspect # problems with categoricals not in the train data set? (warnings in h2o stdout) ## ('manyfiles-nflx-gz', 'file_1.dat.gz', 'file_1.hex', 1800, None, 'file_11.dat.gz', 'test.hex') # just use matching ('manyfiles-nflx-gz', 'file_1.dat.gz', 'train.hex', 1800, None, 'file_1.dat.gz', 'test.hex') ] else: files = [ # None forces num_cols to be used. assumes you set it from Inspect ('manyfiles-nflx-gz', 'file_[0-9].dat.gz', 'train.hex', 1800, None, 'file_1[0-9].dat.gz', 'test.hex') ] # if I got to hdfs, it's here # hdfs://192.168.1.176/datasets/manyfiles-nflx-gz/file_99.dat.gz # h2b.browseTheCloud() for (importFolderPath, trainFilename, trainKey, timeoutSecs, response, testFilename, testKey) in files: h2o.beta_features = False #turn off beta_features # PARSE train**************************************** start = time.time() xList = [] eList = [] fList = [] # Parse (train)**************************************** if h2o.beta_features: print "Parsing to fvec directly! Have to use noPoll=True and doSummary=False!" csvPathname = importFolderPath + "/" + trainFilename parseTrainResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local', hex_key=trainKey, timeoutSecs=timeoutSecs, noPoll=h2o.beta_features, doSummary=False) # hack if h2o.beta_features: h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs) print "Filling in the parseTrainResult['destination_key'] for h2o" parseTrainResult['destination_key'] = trainKey elapsed = time.time() - start print "train parse end on ", trainFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "train parse result:", parseTrainResult['destination_key'] ### h2o_cmd.runSummary(key=parseTrainResult['destination_key']) # if you set beta_features here, the fvec translate will happen with the Inspect not the GBM # h2o.beta_features = True inspect = h2o_cmd.runInspect(key=parseTrainResult['destination_key']) print "\n" + csvPathname, \ " num_rows:", "{:,}".format(inspect['num_rows']), \ " num_cols:", "{:,}".format(inspect['num_cols']) num_rows = inspect['num_rows'] num_cols = inspect['num_cols'] # Make col 378 into something we can do binomial regression on! execExpr = '%s=colSwap(%s,378,(%s[378]>15 ? 1 : 0))' % (trainKey, trainKey, trainKey) resultExec = h2o_cmd.runExec(expression=execExpr, timeoutSecs=60) # Parse (test)**************************************** if h2o.beta_features: print "Parsing to fvec directly! Have to use noPoll=True and doSummary=False!" parseTestResult = h2i.import_parse(bucket=bucket, path=importFolderPath + "/" + testFilename, schema='local', hex_key=testKey, timeoutSecs=timeoutSecs, noPoll=h2o.beta_features, doSummary=False) # hack if h2o.beta_features: h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs) print "Filling in the parseTestResult['destination_key'] for h2o" parseTestResult['destination_key'] = testKey elapsed = time.time() - start print "test parse end on ", testFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "test parse result:", parseTestResult['destination_key'] # Make col 378 into something we can do binomial regression on! execExpr = '%s=colSwap(%s,378,(%s[378]>15 ? 
1 : 0))' % (testKey, testKey, testKey) resultExec = h2o_cmd.runExec(expression=execExpr, timeoutSecs=60) # Note ..no inspect of test data here..so translate happens later? # GBM (train iterate)**************************************** # if not response: # response = num_cols - 1 response = 378 # randomly ignore a bunch of cols, just to make it go faster x = range(num_cols) del x[response] ignored_cols_by_name = ",".join(map(str,random.sample(x, 300))) print "Using the same response %s for train and test (which should have a output value too)" % response ntrees = 10 # ignore 200 random cols (not the response) for max_depth in [5, 40]: params = { 'learn_rate': .2, 'nbins': 1024, 'ntrees': ntrees, 'max_depth': max_depth, 'min_rows': 10, 'response': response, 'ignored_cols_by_name': ignored_cols_by_name, } if FORCE_FAIL_CASE: params = {'learn_rate': 0.2, 'classification': None, 'min_rows': 10, 'ntrees': 10, 'response': 378, 'nbins': 1024, 'ignored_cols_by_name': '256, 382, 399, 50, 176, 407, 375, 113, 170, 313, 364, 33, 361, 426, 121, 371, 232, 327, 480, 75, 37, 312, 225, 195, 244, 406, 268, 230, 321, 257, 274, 197, 35, 501, 360, 72, 213, 79, 1, 466, 362, 160, 444, 437, 5, 59, 108, 454, 73, 374, 509, 337, 183, 252, 21, 314, 100, 200, 159, 379, 405, 367, 432, 181, 8, 420, 118, 284, 281, 465, 456, 359, 291, 330, 258, 523, 243, 487, 408, 392, 15, 231, 482, 481, 70, 171, 182, 31, 409, 492, 471, 53, 45, 448, 83, 527, 452, 350, 423, 93, 447, 130, 126, 54, 354, 169, 253, 49, 42, 431, 305, 498, 216, 189, 508, 122, 308, 228, 190, 293, 451, 63, 133, 304, 397, 425, 333, 19, 158, 391, 153, 282, 112, 64, 502, 7, 16, 469, 163, 136, 40, 99, 302, 264, 325, 434, 187, 311, 286, 278, 179, 109, 348, 287, 467, 400, 164, 384, 422, 43, 117, 91, 276, 211, 175, 329, 541, 438, 145, 534, 218, 177, 317, 222, 210, 162, 402, 98, 299, 245, 385, 233, 188, 516, 143, 13, 532, 429, 172, 455, 470, 518, 236, 296, 388, 468, 110, 395, 185, 25, 489, 196, 120, 435, 165, 168, 271, 74, 510, 36, 76, 208, 223, 270, 515, 421, 87, 66, 473, 220, 46, 486, 102, 38, 156, 48, 132, 331, 51, 403, 234, 23, 449, 341, 303, 410, 479, 203, 413, 512, 513, 9, 446, 511, 55, 6, 339, 418, 476, 178, 266, 22, 141, 259, 349, 86, 144, 34, 290, 326, 318, 519, 424, 127, 174, 472, 116, 17, 152, 280, 215, 514, 103, 377, 537, 373, 238, 47, 353, 428, 94, 214, 61, 123, 386, 351, 246, 411, 101, 249, 240, 520, 307, 288, 199, 147, 436, 77, 464, 414', 'source': u'test.hex', 'validation': u'test.hex', 'max_depth': 5} ### print "Using these parameters for GBM: ", params kwargs = params.copy() h2o.beta_features = True # GBM train**************************************** trainStart = time.time() gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult, noPoll=True, timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs) # hack if h2o.beta_features: h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs) trainElapsed = time.time() - trainStart print "GBM training completed in", trainElapsed, "seconds. On dataset: ", trainFilename gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey) # errrs from end of list? is that the last tree? 
errsLast = gbmTrainView['gbm_model']['errs'][-1] print "GBM 'errsLast'", errsLast cm = gbmTrainView['gbm_model']['cm'] pctWrongTrain = h2o_gbm.pp_cm_summary(cm) print "Last line of this cm might be NAs, not CM" print "\nTrain\n==========\n" print h2o_gbm.pp_cm(cm) # GBM test**************************************** predictKey = 'Predict.hex' ### h2o_cmd.runInspect(key=parseTestResult['destination_key']) start = time.time() gbmTestResult = h2o_cmd.runPredict( data_key=parseTestResult['destination_key'], model_key=modelKey, destination_key=predictKey, timeoutSecs=timeoutSecs) # hack if h2o.beta_features: h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs) elapsed = time.time() - start print "GBM predict completed in", elapsed, "seconds. On dataset: ", testFilename print "This is crazy!" gbmPredictCMResult = h2o.nodes[0].predict_confusion_matrix( actual=parseTestResult['destination_key'], vactual=response, predict=predictKey, vpredict='predict', # choices are 0 and 'predict' ) # errs from end of list? is that the last tree? # all we get is cm cm = gbmPredictCMResult['cm'] # These will move into the h2o_gbm.py pctWrong = h2o_gbm.pp_cm_summary(cm) print "Last line of this cm is really NAs, not CM" print "\nTest\n==========\n" print h2o_gbm.pp_cm(cm) # xList.append(ntrees) xList.append(max_depth) eList.append(pctWrong) fList.append(trainElapsed) h2o.beta_features = False xLabel = 'max_depth' eLabel = 'pctWrong' fLabel = 'trainElapsed' eListTitle = "" fListTitle = "" h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
def test_GLM2_covtype_exec(self): h2o.beta_features = True csvFilename = 'covtype.data' csvPathname = 'standard/' + csvFilename hex_key = 'covtype.hex' parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30) inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print "\n" + csvPathname, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) print "WARNING: max_iter set to 8 for benchmark comparisons" max_iter = 8 y = "54" h2o_cmd.runExec(str='%s[,55] = %s[,55]==1' % (hex_key, hex_key)) # L2 kwargs = { 'response': y, 'family': 'binomial', 'n_folds': 0, 'max_iter': max_iter, 'beta_epsilon': 1e-3 } timeoutSecs = 120 start = time.time() kwargs.update({'alpha': 0, 'lambda': 0}) glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) print "glm (L2) end on ", csvPathname, 'took', time.time( ) - start, 'seconds' h2o_glm.simpleCheckGLM(self, glm, 'C14', **kwargs) # Elastic kwargs.update({'alpha': 0.5, 'lambda': 1e-4}) start = time.time() glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) print "glm (Elastic) end on ", csvPathname, 'took', time.time( ) - start, 'seconds' h2o_glm.simpleCheckGLM(self, glm, 'C14', **kwargs) # L1 kwargs.update({'alpha': 1, 'lambda': 1e-4}) start = time.time() glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) print "glm (L1) end on ", csvPathname, 'took', time.time( ) - start, 'seconds' h2o_glm.simpleCheckGLM(self, glm, 'C14', **kwargs)
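# Aside: what the exec above does, in plain Python terms: recode column 55
# (the covtype class label, 1..7; index 54 zero-based) to a 0/1 indicator for
# class 1 so binomial GLM applies. A toy stand-in, illustrative only; h2o does
# this in-place on the parsed frame:
toy_rows = [[0] * 54 + [label] for label in (1, 2, 7, 1)]
for row in toy_rows:
    row[54] = 1 if row[54] == 1 else 0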
def test_GBM_manyfiles_train_test(self): bucket = 'home-0xdiag-datasets' modelKey = 'GBMModelKey' if h2o.localhost: files = [ # None forces num_cols to be used. assumes you set it from Inspect ('manyfiles-nflx-gz', 'file_1[0-9][0-9].dat.gz', 'file_100.hex', 1800, None, 'file_1.dat.gz', 'file_1_test.hex') ] else: files = [ # None forces num_cols to be used. assumes you set it from Inspect ('manyfiles-nflx-gz', 'file_[0-9].dat.gz', 'file_10.hex', 1800, None, 'file_1[0-9].dat.gz', 'file_10_test.hex') ] # if I got to hdfs, it's here # hdfs://172.16.2.176/datasets/manyfiles-nflx-gz/file_99.dat.gz # h2b.browseTheCloud() for (importFolderPath, trainFilename, trainKey, timeoutSecs, response, testFilename, testKey) in files: # PARSE train**************************************** start = time.time() xList = [] eList = [] fList = [] # Parse (train)**************************************** csvPathname = importFolderPath + "/" + trainFilename parseTrainResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='s3n', hex_key=trainKey, timeoutSecs=timeoutSecs, doSummary=False) elapsed = time.time() - start print "train parse end on ", trainFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "train parse result:", parseTrainResult['destination_key'] ### h2o_cmd.runSummary(key=parseTrainResult['destination_key']) inspect = h2o_cmd.runInspect(key=parseTrainResult['destination_key']) print "\n" + csvPathname, \ " num_rows:", "{:,}".format(inspect['num_rows']), \ " num_cols:", "{:,}".format(inspect['num_cols']) num_rows = inspect['num_rows'] num_cols = inspect['num_cols'] # Make col 378 into something we can do binomial regression on! execExpr = '%s[,378] = %s[,378]>15 ? 1 : 0' % (trainKey, trainKey) resultExec = h2o_cmd.runExec(str=execExpr, timeoutSecs=500) # Parse (test)**************************************** parseTestResult = h2i.import_parse(bucket=bucket, path=importFolderPath + "/" + testFilename, schema='local', hex_key=testKey, timeoutSecs=timeoutSecs, doSummary=False) elapsed = time.time() - start print "test parse end on ", testFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "test parse result:", parseTestResult['destination_key'] # Make col 378 into something we can do binomial regression on! print "Slow! exec is converting all imported keys?, not just what was parsed" execExpr = '%s[,378] = %s[,378]>15 ? 1 : 0' % (testKey, testKey) resultExec = h2o_cmd.runExec(str=execExpr, timeoutSecs=300) # Note ..no inspect of test data here..so translate happens later? # GBM (train iterate)**************************************** # if not response: # response = num_cols - 1 response = 378 print "Using the same response %s for train and test (which should have an output value too)" % response ntrees = 10 for max_depth in [5, 10, 20, 40]: params = { 'learn_rate': .2, 'nbins': 1024, 'ntrees': ntrees, 'max_depth': max_depth, 'min_rows': 10, 'response': response, # 'ignored_cols': } print "Using these parameters for GBM: ", params kwargs = params.copy() # GBM train**************************************** trainStart = time.time() gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult, timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs) trainElapsed = time.time() - trainStart print "GBM training completed in", trainElapsed, "seconds. On dataset: ", trainFilename gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey) # errs from end of list? is that the last tree? 
errsLast = gbmTrainView['gbm_model']['errs'][-1] print "GBM 'errsLast'", errsLast cm = gbmTrainView['gbm_model']['cm'] pctWrongTrain = h2o_gbm.pp_cm_summary(cm) print "Last line of this cm might be NAs, not CM" print "\nTrain\n==========\n" print h2o_gbm.pp_cm(cm) # GBM test**************************************** if doPredict: predictKey = 'Predict.hex' ### h2o_cmd.runInspect(key=parseTestResult['destination_key']) start = time.time() gbmTestResult = h2o_cmd.runPredict( data_key=parseTestResult['destination_key'], model_key=modelKey, destination_key=predictKey, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "GBM predict completed in", elapsed, "seconds. On dataset: ", testFilename print "This is crazy!" gbmPredictCMResult = h2o.nodes[0].predict_confusion_matrix( actual=parseTestResult['destination_key'], vactual=response, predict=predictKey, vpredict='predict', # choices are 0 and 'predict' ) # errs from end of list? is that the last tree? # all we get is cm cm = gbmPredictCMResult['cm'] # These will move into the h2o_gbm.py pctWrong = h2o_gbm.pp_cm_summary(cm) print "Last line of this cm is really NAs, not CM" print "\nTest\n==========\n" print h2o_gbm.pp_cm(cm) # pctWrong only exists when doPredict ran, so collect the plot points inside this block # xList.append(ntrees) xList.append(max_depth) eList.append(pctWrong) fList.append(trainElapsed) if doPredict: xLabel = 'max_depth' eLabel = 'pctWrong' fLabel = 'trainElapsed' eListTitle = "" fListTitle = "" h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
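# Aside: a client-side sketch of the tally predict_confusion_matrix does
# server-side above, for two equal-length integer class vectors (toy code,
# not the h2o implementation):
def confusion_matrix_sketch(actual, predicted, nclasses=2):
    cm = [[0] * nclasses for _ in range(nclasses)]
    for a, p in zip(actual, predicted):
        cm[a][p] += 1  # rows are actual class, cols are predicted class
    return cm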
def test_GBM_manyfiles_multijob(self): h2o.beta_features = True bucket = 'home-0xdiag-datasets' modelKey = 'GBMModelKey' if localhost: files = [ # None forces numCols to be used. assumes you set it from Inspect # problems with categoricals not in the train data set? (warnings in h2o stdout) ## ('manyfiles-nflx-gz', 'file_1.dat.gz', 'file_1.hex', 1800, None, 'file_11.dat.gz', 'test.hex') # just use matching ('manyfiles-nflx-gz', 'file_1.dat.gz', 'train.hex', 1800, None, 'file_1.dat.gz', 'test.hex') ] else: files = [ # None forces numCols to be used. assumes you set it from Inspect ('manyfiles-nflx-gz', 'file_[0-9].dat.gz', 'train.hex', 1800, None, 'file_1[0-9].dat.gz', 'test.hex') ] # if I got to hdfs, it's here # hdfs://192.168.1.176/datasets/manyfiles-nflx-gz/file_99.dat.gz # h2b.browseTheCloud() for (importFolderPath, trainFilename, trainKey, timeoutSecs, response, testFilename, testKey) in files: # PARSE train**************************************** start = time.time() xList = [] eList = [] fList = [] # Parse (train)**************************************** csvPathname = importFolderPath + "/" + trainFilename parseTrainResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local', hex_key=trainKey, timeoutSecs=timeoutSecs, doSummary=False) elapsed = time.time() - start print "train parse end on ", trainFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "train parse result:", parseTrainResult['destination_key'] ### h2o_cmd.runSummary(key=parseTrainResult['destination_key']) inspect = h2o_cmd.runInspect(key=parseTrainResult['destination_key']) print "\n" + csvPathname, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) numRows = inspect['numRows'] numCols = inspect['numCols'] # Make col 378 into something we can do binomial regression on! # execExpr = '%s=colSwap(%s,378,(%s[378]>15 ? 1 : 0))' % (trainKey, trainKey, trainKey) # inc by 1 for R col # BUG: if left as integer..GBM changes to Enum. multiple jobs collide on this translate # only a problem if they share the dataset, do classification with integers. # change to factor here, to avoid the problem execExpr = '%s[,378+1]=%s[,378+1]>15' % (trainKey, trainKey) if not DO_FAIL: execExpr += "; factor(%s[, 378+1]);" % (trainKey) resultExec = h2o_cmd.runExec(str=execExpr, timeoutSecs=60) # Parse (test)**************************************** csvPathname = importFolderPath + "/" + testFilename parseTestResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local', hex_key=testKey, timeoutSecs=timeoutSecs, doSummary=False) elapsed = time.time() - start print "test parse end on ", testFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "test parse result:", parseTestResult['destination_key'] # Make col 378 into something we can do binomial regression on! # plus 1 for R indexing execExpr = '%s[,378+1]=%s[,378+1]>15' % (testKey, testKey) if not DO_FAIL: execExpr += "; factor(%s[, 378+1]);" % (testKey) resultExec = h2o_cmd.runExec(str=execExpr, timeoutSecs=60) # Note ..no inspect of test data here..so translate happens later? 
# GBM (train iterate)**************************************** # if not response: # response = numCols - 1 response = 378 # randomly ignore a bunch of cols, just to make it go faster x = range(numCols) del x[response] ignored_cols_by_name = ",".join(map(lambda x: "C" + str(x), random.sample(x, 300))) print "Using the same response %s for train and test (which should have an output value too)" % response ntrees = 10 trial = 0 # ignore 300 random cols (not the response) print "Kicking off multiple GBM jobs at once" # GBM train**************************************** if DO_FAIL: cases = [5, 10, 20, 40] else: cases = [5, 10, 20] for max_depth in cases: trial += 1 params = { 'response': "C" + str(response), 'learn_rate': .2, 'nbins': 1024, 'ntrees': ntrees, 'max_depth': max_depth, 'min_rows': 10, 'validation': parseTestResult['destination_key'], 'ignored_cols_by_name': ignored_cols_by_name, 'grid_parallelism': 1, 'classification': 1 if DO_CLASSIFICATION else 0, } ### print "Using these parameters for GBM: ", params kwargs = params.copy() trainStart = time.time() # can take 4 times as long with 4 jobs? gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult, noPoll=True, timeoutSecs=timeoutSecs * 4, destination_key=modelKey + "_" + str(trial), **kwargs) trainElapsed = time.time() - trainStart print "GBM dispatch completed in", trainElapsed, "seconds. On dataset: ", trainFilename statMean = h2j.pollStatsWhileBusy(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs, retryDelaySecs=5) num_cpus = statMean['num_cpus'] my_cpu_pct = statMean['my_cpu_%'] sys_cpu_pct = statMean['sys_cpu_%'] system_load = statMean['system_load'] h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs)
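# Aside: an assumed follow-up for the stats gathered above; the keys mirror the
# statMean dict this test pulls from pollStatsWhileBusy, and the printing is
# illustrative only (the original just binds the values and drops them).
print "mean cluster stats while the GBM jobs ran:"
print "  num_cpus:", num_cpus, " my_cpu_%:", my_cpu_pct
print "  sys_cpu_%:", sys_cpu_pct, " system_load:", system_load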
def test_GBM_sphere15_180GB(self): csvFilename = 'syn_sphere15_2711545732row_6col_180GB_from_7x.csv' totalBytes = 183538602156 if FROM_HDFS: importFolderPath = "datasets/kmeans_big" csvPathname = importFolderPath + '/' + csvFilename else: importFolderPath = "/home3/0xdiag/datasets/kmeans_big" csvPathname = importFolderPath + '/' + csvFilename # FIX! put right values in # will there be different expected for random vs the other inits? expected = [ ([0.0, -113.00566692375459, -89.99595447985321, -455.9970643424373, 4732.0, 49791778.0, 36800.0], 248846122, 1308149283316.2988) , ([0.0, 1.0, 1.0, -525.0093818313685, 2015.001629398412, 25654042.00592703, 28304.0], 276924291, 1800760152555.98) , ([0.0, 5.0, 2.0, 340.0, 1817.995920197288, 33970406.992053084, 31319.99486705394], 235089554, 375419158808.3253) , ([0.0, 10.0, -72.00113070337981, -171.0198611715457, 4430.00952228909, 37007399.0, 29894.0], 166180630, 525423632323.6474) , ([0.0, 11.0, 3.0, 578.0043558141306, 1483.0163188052604, 22865824.99639042, 5335.0], 167234179, 1845362026223.1094) , ([0.0, 12.0, 3.0, 168.0, -4066.995950679284, 41077063.00269915, -47537.998050740985], 195420925, 197941282992.43475) , ([0.0, 19.00092954923767, -10.999565572612255, 90.00028669073289, 1928.0, 39967190.0, 27202.0], 214401768, 11868360232.658035) , ([0.0, 20.0, 0.0, 141.0, -3263.0030236302937, 6163210.990273981, 30712.99115201907], 258853406, 598863991074.3276) , ([0.0, 21.0, 114.01584574295777, 242.99690338815898, 1674.0029079209912, 33089556.0, 36415.0], 190979054, 1505088759456.314) , ([0.0, 25.0, 1.0, 614.0032787274755, -2275.9931284021022, -48473733.04122273, 47343.0], 87794427, 1124697008162.3955) , ([0.0, 39.0, 3.0, 470.0, -3337.9880599007597, 28768057.98852736, 16716.003410920028], 78226988, 1151439441529.0215) , ([0.0, 40.0, 1.0, 145.0, 950.9990795199593, 14602680.991458317, -14930.007919032574], 167273589, 693036940951.0249) , ([0.0, 42.0, 4.0, 479.0, -3678.0033024834297, 8209673.001421165, 11767.998552236539], 148426180, 35942838893.32379) , ([0.0, 48.0, 4.0, 71.0, -951.0035145455234, 49882273.00063991, -23336.998167498707], 157533313, 88431531357.62982) , ([0.0, 147.00394564757505, 122.98729664236723, 311.0047920137008, 2320.0, 46602185.0, 11212.0], 118361306, 1111537045743.7646) , ] benchmarkLogging = ['cpu','disk', 'network', 'iostats', 'jstack'] benchmarkLogging = ['cpu','disk', 'network', 'iostats'] # IOStatus can hang? benchmarkLogging = ['cpu', 'disk', 'network'] benchmarkLogging = [] for trial in range(6): # IMPORT********************************************** # since H2O deletes the source key, re-import every iteration. 
# PARSE **************************************** print "Parse starting: " + csvFilename # hex_key = csvFilename + "_" + str(trial) + ".hex" hex_key = "C" + str(trial) start = time.time() timeoutSecs = 2 * 3600 kwargs = {} if FROM_HDFS: parseResult = h2i.import_parse(path=csvPathname, schema='hdfs', hex_key=hex_key, timeoutSecs=timeoutSecs, pollTimeoutSecs=60, retryDelaySecs=2, benchmarkLogging=benchmarkLogging, **kwargs) else: parseResult = h2i.import_parse(path=csvPathname, schema='local', hex_key=hex_key, timeoutSecs=timeoutSecs, pollTimeoutSecs=60, retryDelaySecs=2, benchmarkLogging=benchmarkLogging, **kwargs) elapsed = time.time() - start fileMBS = (totalBytes/1e6)/elapsed l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, 'Parse', csvPathname, fileMBS, elapsed) print "\n"+l h2o.cloudPerfH2O.message(l) # GBM **************************************** if not DO_GBM: continue # make col 2 a binomial (negative numbers in src col 2) col = 2 execExpr = "%s[,%s] = (%s[,%s]>-7 ? 1 : 0)" % (hex_key, col, hex_key, col) resultExec = h2o_cmd.runExec(str=execExpr) params = { 'destination_key': "GBMKEY", 'learn_rate': .1, 'ntrees': 2, 'max_depth': 8, 'min_rows': 1, 'response': col # should be binomial from above } kwargs = params.copy() h2o.beta_features = True timeoutSecs = 1800 start = time.time() GBMResult = h2o_cmd.runGBM(parseResult=parseResult, noPoll=True, **kwargs) # wait for it to show up in jobs? time.sleep(2) # no pattern waits for all h2o_jobs.pollWaitJobs(pattern=None, timeoutSecs=300, pollTimeoutSecs=10, retryDelaySecs=5) elapsed = time.time() - start print "GBM training completed in", elapsed, "seconds.", "%f pct. of timeout" % (GBMResult['python_%timeout']) print "\nGBMResult:", GBMResult # print "\nGBMResult:", h2o.dump_json(GBMResult) h2o.beta_features = False h2o.check_sandbox_for_errors() if DELETE_KEYS: h2i.delete_keys_at_all_nodes()
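# Aside: 'python_%timeout' above is stamped onto results by this test harness,
# not by h2o itself; presumably elapsed time as a percent of the allowed
# timeout, i.e. something like 100.0 * elapsed / timeoutSecs.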
def test_GLM_covtype20x(self): if localhost: csvFilenameList = [ # 68 secs on my laptop? ('covtype20x.data', 480, 'cA'), ] else: # None is okay for hex_key csvFilenameList = [ ('covtype20x.data', 480, 'cA'), # ('covtype200x.data', 1000,'cE'), ] # a browser window too, just because we can ### h2b.browseTheCloud() importFolderPath = "standard" for csvFilename, timeoutSecs, hex_key in csvFilenameList: csvPathname = importFolderPath + "/" + csvFilename start = time.time() parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, timeoutSecs=2000, hex_key=hex_key) print "parse end on ", csvPathname, 'took', time.time( ) - start, 'seconds' h2o.check_sandbox_for_errors() inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print "\n" + csvPathname, \ " num_rows:", "{:,}".format(inspect['num_rows']), \ " num_cols:", "{:,}".format(inspect['num_cols']) # this will make it fvec print "Touching %s with exec to make it fvec" % hex_key h2o_cmd.runExec(str='%s[0,]=%s[0,]' % (hex_key, hex_key)) print "WARNING: max_iter set to 8 for benchmark comparisons" max_iter = 8 y = "54" x = "" kwargs = { 'x': x, 'y': y, 'family': 'binomial', 'link': 'logit', 'n_folds': 1, 'case_mode': '=', 'case': 1, 'max_iter': max_iter, 'beta_epsilon': 1e-3 } # L2 kwargs.update({'alpha': 0, 'lambda': 0}) start = time.time() glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, noise=('JStack', None), **kwargs) print "glm (L2) end on ", csvPathname, 'took', time.time( ) - start, 'seconds' h2o_glm.simpleCheckGLM(self, glm, 'C14', **kwargs) h2o.check_sandbox_for_errors() # Elastic kwargs.update({'alpha': 0.5, 'lambda': 1e-4}) start = time.time() glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, noise=('JStack', None), **kwargs) print "glm (Elastic) end on ", csvPathname, 'took', time.time( ) - start, 'seconds' h2o_glm.simpleCheckGLM(self, glm, 'C14', **kwargs) h2o.check_sandbox_for_errors() # L1 kwargs.update({'alpha': 1.0, 'lambda': 1e-4}) start = time.time() glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, noise=('JStack', None), **kwargs) print "glm (L1) end on ", csvPathname, 'took', time.time( ) - start, 'seconds' h2o_glm.simpleCheckGLM(self, glm, 'C14', **kwargs) h2o.check_sandbox_for_errors()
def test_GBM_with_cancels(self): print "Sets h2o.beta_features like -bf at command line" print "this will redirect import and parse to the 2 variants" h2o.beta_features = True importFolderPath = 'standard' timeoutSecs = 500 csvFilenameAll = [ # have to use col name for response? ("manyfiles-nflx-gz", "file_1.dat.gz", 378), # ("manyfiles-nflx-gz", "file_[1-9].dat.gz", 378), # ("standard", "covtype.data", 54), # ("standard", "covtype20x.data", 54), ] # csvFilenameList = random.sample(csvFilenameAll,1) csvFilenameList = csvFilenameAll # pop open a browser on the cloud # h2b.browseTheCloud() for (importFolderPath, csvFilename, response) in csvFilenameList: # creates csvFilename.hex from file in importFolder dir csvPathname = importFolderPath + "/" + csvFilename ### h2o.beta_features = False (importResult, importPattern) = h2i.import_only(bucket='home-0xdiag-datasets', path=csvPathname, schema='local', timeoutSecs=50) parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='local', hex_key='c.hex', timeoutSecs=500, noPoll=False, doSummary=False) # can't do summary until parse result is correct json h2o.check_sandbox_for_errors() # wait for it to show up in jobs? ## time.sleep(2) # no pattern waits for all ## h2o_jobs.pollWaitJobs(pattern=None, timeoutSecs=300, pollTimeoutSecs=10, retryDelaySecs=5) # hack it because no response from Parse2 if h2o.beta_features: parseResult = {'destination_key': 'c.hex'} print "\nparseResult", h2o.dump_json(parseResult) print "Parse result['destination_key']:", parseResult['destination_key'] ## What's wrong here? too big? ### inspect = h2o_cmd.runInspect(key=parseResult['destination_key'], timeoutSecs=30, verbose=True) h2o.check_sandbox_for_errors() # have to avoid this on nflx data. colswap with exec # Exception: rjson error in gbm: Argument 'response' error: Only integer or enum/factor columns can be classified if importFolderPath=='manyfiles-nflx-gz': if DO_CLASSIFICATION: # need to flip the right col! (R wise) execExpr = 'c.hex[,%s]=c.hex[,%s]>15' % (response+1, response+1) kwargs = { 'str': execExpr } resultExec = h2o_cmd.runExec(**kwargs) # let's look at the response column now s = h2o_cmd.runSummary(key="c.hex", cols=response, max_ncols=1) x = range(542) # remove the output too! (378) xIgnore = [] # BUG if you add unsorted 378 to end. remove for now for i in [3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 16, 17, 18, 19, 20, 424, 425, 426, 540, 541, response]: if i not in x: print "x:", x print 'missing?', i continue # can't remove what isn't there x.remove(i) xIgnore.append(i) x = ",".join(map(str,x)) def colIt(x): return "C" + str(x) xIgnore = ",".join(map(colIt, xIgnore)) else: # leave one col ignored, just to see? 
xIgnore = 0 modelKey = "GBMGood" params = { 'destination_key': modelKey, 'ignored_cols_by_name': xIgnore, 'learn_rate': .1, 'ntrees': 2, 'max_depth': 8, 'min_rows': 1, 'response': "C" + str(response), 'classification': 1 if DO_CLASSIFICATION else 0, 'grid_parallelism': 4, } kwargs = params.copy() timeoutSecs = 1800 start = time.time() GBMFirstResult = h2o_cmd.runGBM(parseResult=parseResult, noPoll=True, **kwargs) print "\nGBMFirstResult:", h2o.dump_json(GBMFirstResult) # no pattern waits for all for i in range(20): # now issue a couple background GBM jobs that we'll kill jobids = [] for j in range(5): # apparently we can't reuse a model key after a cancel, so make it unique per trial kwargs['destination_key'] = 'GBMBad' + str(i) + str(j) GBMFirstResult = h2o_cmd.runGBM(parseResult=parseResult, noPoll=True, **kwargs) jobids.append(GBMFirstResult['job_key']) # have to pass the job id for j in jobids: h2o.nodes[0].jobs_cancel(key=j) h2o_jobs.pollWaitJobs(pattern='GBMGood', timeoutSecs=300, pollTimeoutSecs=10, retryDelaySecs=5) elapsed = time.time() - start print "GBM training completed in", elapsed, "seconds." gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey) # errs from end of list? is that the last tree? errsLast = gbmTrainView['gbm_model']['errs'][-1] print "GBM 'errsLast'", errsLast if DO_CLASSIFICATION: cm = gbmTrainView['gbm_model']['cm'] pctWrongTrain = h2o_gbm.pp_cm_summary(cm) print "Last line of this cm might be NAs, not CM" print "\nTrain\n==========\n" print h2o_gbm.pp_cm(cm) else: print "GBMTrainView:", h2o.dump_json(gbmTrainView['gbm_model']['errs']) h2o.check_sandbox_for_errors() if DELETE_KEYS: h2i.delete_keys_from_import_result(pattern=csvFilename, importResult=importResult)
def test_GBM_with_cancels(self): print "do import/parse with VA" h2o.beta_features = False importFolderPath = "standard" timeoutSecs = 500 csvFilenameAll = [ # have to use col name for response? # ("manyfiles-nflx-gz", "file_1.dat.gz", 378), # ("manyfiles-nflx-gz", "file_1.dat.gz", 378), # ("manyfiles-nflx-gz", "file_[1-9].dat.gz", 378), ("standard", "covtype.data", 54), # ("standard", "covtype20x.data", 54), ] # csvFilenameList = random.sample(csvFilenameAll,1) csvFilenameList = csvFilenameAll # pop open a browser on the cloud # h2b.browseTheCloud() for (importFolderPath, csvFilename, response) in csvFilenameList: # creates csvFilename.hex from file in importFolder dir csvPathname = importFolderPath + "/" + csvFilename ### h2o.beta_features = False (importResult, importPattern) = h2i.import_only( bucket="home-0xdiag-datasets", path=csvPathname, schema="local", timeoutSecs=50 ) parseResult = h2i.import_parse( bucket="home-0xdiag-datasets", path=csvPathname, schema="local", hex_key="c.hex", timeoutSecs=500, noPoll=False, doSummary=False, ) # can't do summary until parse result is correct json h2o.check_sandbox_for_errors() # wait for it to show up in jobs? ## time.sleep(2) # no pattern waits for all ## h2o_jobs.pollWaitJobs(pattern=None, timeoutSecs=300, pollTimeoutSecs=10, retryDelaySecs=5) # hack it because no response from Parse2 if h2o.beta_features: parseResult = {"destination_key": "c.hex"} print "\nparseResult", h2o.dump_json(parseResult) print "Parse result['destination_key']:", parseResult["destination_key"] ## What's wrong here? too big? ### inspect = h2o_cmd.runInspect(key=parseResult['destination_key'], timeoutSecs=30, verbose=True) h2o.check_sandbox_for_errors() # have to avoid this on nflx data. colswap with exec # Exception: rjson error in gbm: Argument 'response' error: Only integer or enum/factor columns can be classified if importFolderPath == "manyfiles-nflx-gz": if DO_CLASSIFICATION: # need to flip the right col! (R wise) execExpr = "c.hex[,%s]=c.hex[,%s]>15" % (response + 1, response + 1) kwargs = {"str": execExpr} resultExec = h2o_cmd.runExec(**kwargs) # lets look at the response column now h2o.beta_features = True s = h2o_cmd.runSummary(key="c.hex", cols=response, max_ncols=1) # x = range(542) # remove the output too! (378) xIgnore = [] # BUG if you add unsorted 378 to end. remove for now for i in [3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 16, 17, 18, 19, 20, 424, 425, 426, 540, 541, response]: # have to add 1 for col start with 1, now. plus the C xIgnore.append("C" + str(i + 1)) else: # leave one col ignored, just to see? xIgnore = "C1" modelKey = "GBMGood" params = { "destination_key": modelKey, "ignored_cols_by_name": xIgnore, "learn_rate": 0.1, "ntrees": 2, "max_depth": 8, "min_rows": 1, "response": "C" + str(response + 1), "classification": 1 if DO_CLASSIFICATION else 0, "grid_parallelism": 4, } kwargs = params.copy() timeoutSecs = 1800 start = time.time() GBMFirstResult = h2o_cmd.runGBM(parseResult=parseResult, noPoll=True, **kwargs) print "\nGBMFirstResult:", h2o.dump_json(GBMFirstResult) # no pattern waits for all for i in range(20): # now issue a couple background GBM jobs that we'll kill jobids = [] for j in range(5): # FIX! 
apparently we can't reuse a model key after a cancel kwargs["destination_key"] = "GBMBad" + str(i) + str(j) GBMFirstResult = h2o_cmd.runGBM(parseResult=parseResult, noPoll=True, **kwargs) jobids.append(GBMFirstResult["job_key"]) # have to pass the job id for j in jobids: h2o.nodes[0].jobs_cancel(key=j) h2o_jobs.pollWaitJobs(pattern="GBMGood", timeoutSecs=300, pollTimeoutSecs=10, retryDelaySecs=5) elapsed = time.time() - start print "GBM training completed in", elapsed, "seconds." gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey) # errs from end of list? is that the last tree? errsLast = gbmTrainView["gbm_model"]["errs"][-1] print "GBM 'errsLast'", errsLast if DO_CLASSIFICATION: cm = gbmTrainView["gbm_model"]["cms"][-1]["_arr"] # use the last one pctWrongTrain = h2o_gbm.pp_cm_summary(cm) print "Last line of this cm might be NAs, not CM" print "\nTrain\n==========\n" print h2o_gbm.pp_cm(cm) else: print "GBMTrainView:", h2o.dump_json(gbmTrainView["gbm_model"]["errs"]) h2o.check_sandbox_for_errors() if DELETE_KEYS: h2i.delete_keys_from_import_result(pattern=csvFilename, importResult=importResult)
def test_exec2_xorsum(self): SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ (ROWS, 1, 'r1', 0, 10, None), ] for trial in range(10): ullResultList = [] for (rowCount, colCount, hex_key, expectedMin, expectedMax, expected) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) # dynamic range of the data may be useful for estimating error maxDelta = expectedMax - expectedMin csvFilename = 'syn_real_' + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True) print "Creating random", csvPathname (expectedUllSum, expectedFpSum) = write_syn_dataset(csvPathname, rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE) expectedUllSumAsDouble = h2o_util.unsignedLongLongToDouble(expectedUllSum) expectedFpSumAsLongLong = h2o_util.doubleToUnsignedLongLong(expectedFpSum) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=3000, retryDelaySecs=2) numRows, numCols, parse_key = h2o_cmd.infoFromParse(parseResult) assert parse_key == hex_key assert numCols == colCount assert numRows == rowCount inspect = h2o_cmd.runInspect(key=hex_key) missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(inspect) assert len(missingList) == 0 # looking at the 8 bytes of bits for the h2o doubles # xorsum will zero out the sign and exponent for execExpr in exprList: for r in range(10): start = time.time() execResult = h2o_cmd.runExec(ast=execExpr, timeoutSecs=30) fpResult = execResult['scalar'] # (execResult, fpResult) = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey='h', timeoutSecs=300) print r, 'exec took', time.time() - start, 'seconds' print r, "execResult:", h2o.dump_json(execResult) h2o_cmd.runStoreView() ullResult = h2o_util.doubleToUnsignedLongLong(fpResult) ullResultList.append((ullResult, fpResult)) print "%30s" % "ullResult (0.16x):", "0x%0.16x %s" % (ullResult, fpResult) print "%30s" % "expectedUllSum (0.16x):", "0x%0.16x %s" % (expectedUllSum, expectedUllSumAsDouble) print "%30s" % "expectedFpSum (0.16x):", "0x%0.16x %s" % (expectedFpSumAsLongLong, expectedFpSum) # allow diff of the lsb..either way # if ullResult!=expectedUllSum and abs((ullResult-expectedUllSum)>3): if ullResult!=expectedUllSum: raise Exception("h2o didn't get the same xorsum as python. 0x%0.16x 0x%0.16x" % (ullResult, expectedUllSum)) h2o.check_sandbox_for_errors() print "first result was from a sum. others are xorsum" print "ullResultList:" for ullResult, fpResult in ullResultList: print "%30s" % "ullResult (0.16x):", "0x%0.16x %s" % (ullResult, fpResult) print "%30s" % "expectedUllSum (0.16x):", "0x%0.16x %s" % (expectedUllSum, expectedUllSumAsDouble) print "%30s" % "expectedFpSum (0.16x):", "0x%0.16x %s" % (expectedFpSumAsLongLong, expectedFpSum)
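# Aside: the commented-out lsb tolerance above, written out. Treat two raw
# IEEE-754 bit patterns as equal if they differ only by a few low bits; a
# sketch of the comparison the comment contemplates (names are hypothetical):
def ull_close_enough(a, b, lsbSlop=3):
    return abs(a - b) <= lsbSlop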
def test_GBM_manyfiles_train_test(self):
    h2o.beta_features = True
    bucket = 'home-0xdiag-datasets'
    modelKey = 'GBMModelKey'
    if localhost:
        files = [
            # None forces numCols to be used. assumes you set it from Inspect
            # problems with categoricals not in the train data set? (warnings in h2o stdout)
            ## ('manyfiles-nflx-gz', 'file_1.dat.gz', 'file_1.hex', 1800, None, 'file_11.dat.gz', 'test.hex')
            # just use matching
            ('manyfiles-nflx-gz', 'file_1.dat.gz', 'train.hex', 1800, None, 'file_1.dat.gz', 'test.hex')
        ]
    else:
        files = [
            # None forces numCols to be used. assumes you set it from Inspect
            ('manyfiles-nflx-gz', 'file_[0-9].dat.gz', 'train.hex', 1800, None, 'file_1[0-9].dat.gz', 'test.hex')
        ]

    # if I go to hdfs, it's here
    # hdfs://172.16.2.176/datasets/manyfiles-nflx-gz/file_99.dat.gz

    h2b.browseTheCloud()
    for (importFolderPath, trainFilename, trainKey, timeoutSecs, response, testFilename, testKey) in files:
        xList = []
        eList = []
        fList = []

        # Parse (train)****************************************
        start = time.time()
        csvPathname = importFolderPath + "/" + trainFilename
        parseTrainResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local',
            hex_key=trainKey, timeoutSecs=timeoutSecs, doSummary=False)
        elapsed = time.time() - start
        print "train parse end on ", trainFilename, 'took', elapsed, 'seconds',\
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
        print "train parse result:", parseTrainResult['destination_key']

        ### h2o_cmd.runSummary(key=parseTrainResult['destination_key'])
        inspect = h2o_cmd.runInspect(key=parseTrainResult['destination_key'])
        print "\n" + csvPathname, \
            " numRows:", "{:,}".format(inspect['numRows']), \
            " numCols:", "{:,}".format(inspect['numCols'])
        numRows = inspect['numRows']
        numCols = inspect['numCols']

        # Make col 378 something we can do binomial regression on!
        execExpr = '%s[,378+1]=%s[,378+1]>15' % (trainKey, trainKey)
        resultExec = h2o_cmd.runExec(str=execExpr, timeoutSecs=300)

        # Parse (test)****************************************
        start = time.time()  # reset the timer so the test parse isn't charged for the train parse
        parseTestResult = h2i.import_parse(bucket=bucket, path=importFolderPath + "/" + testFilename,
            schema='local', hex_key=testKey, timeoutSecs=timeoutSecs, doSummary=False)
        elapsed = time.time() - start
        print "test parse end on ", testFilename, 'took', elapsed, 'seconds',\
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
        print "test parse result:", parseTestResult['destination_key']

        # Make col 378 something we can do binomial regression on!
        execExpr = '%s[,378+1]=%s[,378+1]>15' % (testKey, testKey)
        resultExec = h2o_cmd.runExec(str=execExpr, timeoutSecs=300)

        # Note: no inspect of test data here..so translate happens later?
        # GBM (train iterate)****************************************
        # if not response:
        #     response = numCols - 1
        response = 378

        # randomly ignore a bunch of cols, just to make it go faster
        x = range(numCols)
        del x[response]
        ignored_cols_by_name = ",".join(map(lambda x: 'C' + str(x+1), random.sample(x, 300)))

        print "Using the same response %s for train and test (which should have an output value too)" % \
            ("C" + str(response+1))

        ntrees = 10
        # ignore 300 random cols (not the response)
        for max_depth in [5, 40]:
            params = {
                'learn_rate': .2,
                'nbins': 1024,
                'ntrees': ntrees,
                'max_depth': max_depth,
                'min_rows': 10,
                'response': 'C' + str(response+1),
                'ignored_cols_by_name': ignored_cols_by_name,
            }
            ### print "Using these parameters for GBM: ", params
            kwargs = params.copy()

            # GBM train****************************************
            trainStart = time.time()
            gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult,
                timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs)
            # hack
            trainElapsed = time.time() - trainStart
            print "GBM training completed in", trainElapsed, "seconds. On dataset: ", trainFilename

            gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey)
            # errs from the end of the list? is that the last tree?
            errsLast = gbmTrainView['gbm_model']['errs'][-1]
            print "GBM 'errsLast'", errsLast

            cm = gbmTrainView['gbm_model']['cms'][-1]['_arr']  # use the last one
            pctWrongTrain = h2o_gbm.pp_cm_summary(cm)
            print "Last line of this cm might be NAs, not CM"
            print "\nTrain\n==========\n"
            print h2o_gbm.pp_cm(cm)

            # GBM test****************************************
            predictKey = 'Predict.hex'
            ### h2o_cmd.runInspect(key=parseTestResult['destination_key'])
            start = time.time()
            gbmTestResult = h2o_cmd.runPredict(
                data_key=parseTestResult['destination_key'],
                model_key=modelKey,
                destination_key=predictKey,
                timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "GBM predict completed in", elapsed, "seconds. On dataset: ", testFilename

            gbmPredictCMResult = h2o.nodes[0].predict_confusion_matrix(
                actual=parseTestResult['destination_key'],
                vactual='C' + str(response+1),
                predict=predictKey,
                vpredict='predict',  # choices are 0 and 'predict'
            )

            # errs from the end of the list? is that the last tree?
            # all we get is cm
            cm = gbmPredictCMResult['cm']

            # These will move into h2o_gbm.py
            pctWrong = h2o_gbm.pp_cm_summary(cm)
            print "Last line of this cm is really NAs, not CM"
            print "\nTest\n==========\n"
            print h2o_gbm.pp_cm(cm)

            # xList.append(ntrees)
            xList.append(max_depth)
            eList.append(pctWrong)
            fList.append(trainElapsed)

        xLabel = 'max_depth'
        eLabel = 'pctWrong'
        fLabel = 'trainElapsed'
        eListTitle = ""
        fListTitle = ""
        h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
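# The ignored_cols_by_name construction above is: take every column index except
# the response, sample 300 of them, and render 1-based 'C' names. A standalone
# sketch of the same idea (sample_ignored_cols is illustrative, not framework API):
import random

def sample_ignored_cols(numCols, responseIdx, n=300):
    candidates = [i for i in range(numCols) if i != responseIdx]
    # 1-based column names (C1..CnumCols), matching the 'C379' response naming
    return ",".join('C' + str(i + 1) for i in sorted(random.sample(candidates, n)))

# e.g. sample_ignored_cols(numCols, 378) never emits 'C379', the response column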
def test_GBM_manyfiles_train_test(self):
    h2o.beta_features = True
    bucket = 'home-0xdiag-datasets'
    modelKey = 'GBMModelKey'
    if localhost:
        files = [
            # None forces numCols to be used. assumes you set it from Inspect
            # problems with categoricals not in the train data set? (warnings in h2o stdout)
            ## ('manyfiles-nflx-gz', 'file_1.dat.gz', 'file_1.hex', 1800, None, 'file_11.dat.gz', 'test.hex')
            # just use matching
            ('manyfiles-nflx-gz', 'file_1.dat.gz', 'train.hex', 1800, None, 'file_1.dat.gz', 'test.hex')
        ]
    else:
        files = [
            # None forces numCols to be used. assumes you set it from Inspect
            ('manyfiles-nflx-gz', 'file_[0-9].dat.gz', 'train.hex', 1800, None, 'file_1[0-9].dat.gz', 'test.hex')
        ]

    # if I go to hdfs, it's here
    # hdfs://192.168.1.176/datasets/manyfiles-nflx-gz/file_99.dat.gz

    h2b.browseTheCloud()
    for (importFolderPath, trainFilename, trainKey, timeoutSecs, response, testFilename, testKey) in files:
        xList = []
        eList = []
        fList = []

        # Parse (train)****************************************
        start = time.time()
        csvPathname = importFolderPath + "/" + trainFilename
        parseTrainResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local',
            hex_key=trainKey, timeoutSecs=timeoutSecs, doSummary=False)
        elapsed = time.time() - start
        print "train parse end on ", trainFilename, 'took', elapsed, 'seconds',\
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
        print "train parse result:", parseTrainResult['destination_key']

        ### h2o_cmd.runSummary(key=parseTrainResult['destination_key'])
        inspect = h2o_cmd.runInspect(key=parseTrainResult['destination_key'])
        print "\n" + csvPathname, \
            " numRows:", "{:,}".format(inspect['numRows']), \
            " numCols:", "{:,}".format(inspect['numCols'])
        numRows = inspect['numRows']
        numCols = inspect['numCols']

        # Make col 378 something we can do binomial regression on!
        execExpr = '%s[,378+1]=%s[,378+1]>15' % (trainKey, trainKey)
        resultExec = h2o_cmd.runExec(str=execExpr, timeoutSecs=60)

        # Parse (test)****************************************
        start = time.time()  # reset the timer so the test parse isn't charged for the train parse
        parseTestResult = h2i.import_parse(bucket=bucket, path=importFolderPath + "/" + testFilename,
            schema='local', hex_key=testKey, timeoutSecs=timeoutSecs, doSummary=False)
        elapsed = time.time() - start
        print "test parse end on ", testFilename, 'took', elapsed, 'seconds',\
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
        print "test parse result:", parseTestResult['destination_key']

        # Make col 378 something we can do binomial regression on!
        execExpr = '%s[,378+1]=%s[,378+1]>15' % (testKey, testKey)
        resultExec = h2o_cmd.runExec(str=execExpr, timeoutSecs=60)

        # Note: no inspect of test data here..so translate happens later?
        # GBM (train iterate)****************************************
        # if not response:
        #     response = numCols - 1
        # response = 378
        response = 'C379'  # 1-based column name for the 0-based column index 378

        # randomly ignore a bunch of cols, just to make it go faster
        x = range(numCols)
        del x[378]  # response is a name now, so remove its 0-based index explicitly
        # index i maps to the 1-based name 'C' + str(i+1), consistent with 'C379'
        ignored_cols_by_name = ",".join(map(lambda x: 'C' + str(x+1), random.sample(x, 300)))

        print "Using the same response %s for train and test (which should have an output value too)" % response

        ntrees = 10
        # ignore 300 random cols (not the response)
        for max_depth in [5, 40]:
            params = {
                'learn_rate': .2,
                'nbins': 1024,
                'ntrees': ntrees,
                'max_depth': max_depth,
                'min_rows': 10,
                'response': response,
                'ignored_cols_by_name': ignored_cols_by_name,
            }
            ### print "Using these parameters for GBM: ", params
            kwargs = params.copy()

            # GBM train****************************************
            trainStart = time.time()
            gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult,
                timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs)
            # hack
            trainElapsed = time.time() - trainStart
            print "GBM training completed in", trainElapsed, "seconds. On dataset: ", trainFilename

            gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey)
            # errs from the end of the list? is that the last tree?
            errsLast = gbmTrainView['gbm_model']['errs'][-1]
            print "GBM 'errsLast'", errsLast

            cm = gbmTrainView['gbm_model']['cms'][-1]['_arr']  # use the last one
            pctWrongTrain = h2o_gbm.pp_cm_summary(cm)
            print "Last line of this cm might be NAs, not CM"
            print "\nTrain\n==========\n"
            print h2o_gbm.pp_cm(cm)

            # GBM test****************************************
            predictKey = 'Predict.hex'
            ### h2o_cmd.runInspect(key=parseTestResult['destination_key'])
            start = time.time()
            gbmTestResult = h2o_cmd.runPredict(
                data_key=parseTestResult['destination_key'],
                model_key=modelKey,
                destination_key=predictKey,
                timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "GBM predict completed in", elapsed, "seconds. On dataset: ", testFilename

            gbmPredictCMResult = h2o.nodes[0].predict_confusion_matrix(
                actual=parseTestResult['destination_key'],
                vactual=response,
                predict=predictKey,
                vpredict='predict',  # choices are 0 and 'predict'
            )

            # errs from the end of the list? is that the last tree?
            # all we get is cm
            cm = gbmPredictCMResult['cm']

            # These will move into h2o_gbm.py
            pctWrong = h2o_gbm.pp_cm_summary(cm)
            print "Last line of this cm is really NAs, not CM"
            print "\nTest\n==========\n"
            print h2o_gbm.pp_cm(cm)

            # xList.append(ntrees)
            xList.append(max_depth)
            eList.append(pctWrong)
            fList.append(trainElapsed)

        xLabel = 'max_depth'
        eLabel = 'pctWrong'
        fLabel = 'trainElapsed'
        eListTitle = ""
        fListTitle = ""
        h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
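# This second version of the test keys the response by name ('C379') where the
# first keyed it by 0-based index (378). Two tiny helpers make that mapping
# explicit (illustrative, not framework API; assumes C1-based naming as above):

def col_name(idx0):
    # 0-based column index -> 1-based 'C' name
    return 'C' + str(idx0 + 1)

def col_idx(name):
    # 1-based 'C' name -> 0-based column index
    return int(name[1:]) - 1

assert col_name(378) == 'C379'
assert col_idx('C379') == 378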
def test_many_fp_formats_libsvm_2_fvec(self):
    # h2b.browseTheCloud()
    SYNDATASETS_DIR = h2o.make_syn_dir()
    tryList = [
        (100, 10000, 'cA', 300, 'sparse50'),
        (100, 10000, 'cB', 300, 'sparse'),
        # (100, 40000, 'cC', 300, 'sparse50'),
        # (100, 40000, 'cD', 300, 'sparse'),
    ]

    for (rowCount, colCount, hex_key, timeoutSecs, distribution) in tryList:
        NUM_CASES = h2o_util.fp_format()
        for sel in [random.randint(0, NUM_CASES-1)]:  # len(caseList)
            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = "syn_%s_%s_%s_%s.csv" % (SEEDPERFILE, sel, rowCount, colCount)
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            # dict of col sums for comparison to exec col sums below
            (colNumberMax, synColSumDict) = write_syn_dataset(csvPathname, rowCount, colCount,
                SEEDPERFILE, sel, distribution)

            selKey2 = hex_key + "_" + str(sel)
            print "This dataset requires telling h2o to parse it as libsvm..it isn't detected automatically"
            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=selKey2,
                timeoutSecs=timeoutSecs, doSummary=False, parser_type='SVMLight')
            print "Parse result['destination_key']:", parseResult['destination_key']
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'],
                max_column_display=colNumberMax+1, timeoutSecs=timeoutSecs)
            numCols = inspect['numCols']
            numRows = inspect['numRows']
            print "\n" + csvFilename

            # SUMMARY****************************************
            # gives us some reporting on missing values, constant values,
            # to see if we have x specified well
            # figures out everything from parseResult['destination_key']
            # needs y to avoid the output column (which can be index or name)
            # assume all the configs have the same y..just check with the first one
            goodX = h2o_glm.goodXFromColumnInfo(y=0,
                key=parseResult['destination_key'], timeoutSecs=300, noPrint=True)

            if DO_SUMMARY:
                summaryResult = h2o_cmd.runSummary(key=selKey2,
                    max_column_display=colNumberMax+1, timeoutSecs=timeoutSecs)
                h2o_cmd.infoFromSummary(summaryResult, noPrint=True)

            self.assertEqual(colNumberMax+1, numCols,
                msg="generated %s cols (including output). parsed to %s cols" % (colNumberMax+1, numCols))

            # Exec (column sums)*************************************************
            if DO_COMPARE_SUM:
                h2e.exec_zero_list(zeroList)
                colResultList = h2e.exec_expr_list_across_cols(None, exprList, selKey2,
                    maxCol=colNumberMax+1, timeoutSecs=timeoutSecs, print_params=False)
                # print "\n*************"
                # print "colResultList", colResultList
                # print "*************"

            self.assertEqual(rowCount, numRows,
                msg="generated %s rows, parsed to %s rows" % (rowCount, numRows))

            # need to fix this for compare to expected
            # we should be able to keep the list of fp sums per col above
            # when we generate the dataset
            sortedColSumDict = OrderedDict(sorted(synColSumDict.items()))
            print sortedColSumDict
            for k, v in sortedColSumDict.iteritems():
                print k
                if DO_COMPARE_SUM:
                    # k should be integers that match the number of cols
                    self.assertTrue(k >= 0 and k < len(colResultList))
                    compare = colResultList[k]
                    print "\nComparing col sums:", v, compare
                    # Even though we're comparing floating point sums, the operations probably
                    # were done in the same order, so maybe the comparison can be exact (or not!)
                    self.assertAlmostEqual(v, compare, places=0,
                        msg='%0.6f col sum is not equal to expected %0.6f' % (v, compare))

                synMean = (v + 0.0) / rowCount
                # enums don't have a mean, but we're not enums
                mean = float(inspect['cols'][k]['mean'])
                # our fp formats in the syn generation sometimes only have two places?
                if not h2o_util.approxEqual(mean, synMean, tol=1e-3):
                    execExpr = 'sum(%s[,%s])' % (selKey2, k+1)
                    resultExec = h2o_cmd.runExec(str=execExpr, timeoutSecs=300)
                    print "Result of exec sum on failing col:", k, h2o.dump_json(resultExec)
                    print "Result of remembered sum on failing col:", k, v
                    print "Result of inspect mean * rowCount on failing col:", mean * rowCount
                    print "k:", k, "mean:", mean, "remembered sum/rowCount:", synMean
                    sys.stdout.flush()
                    raise Exception('col %s mean %0.6f is not equal to generated mean %0.6f' % (k, mean, synMean))

                naCnt = inspect['cols'][k]['naCnt']
                self.assertEqual(0, naCnt, msg='col %s naCnt %d should be 0' % (k, naCnt))
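# h2o_util.approxEqual(mean, synMean, tol=1e-3) above guards the hard failure.
# A plausible relative-tolerance sketch of its semantics (assumed -- the real
# helper may differ in edge-case handling):

def approx_equal_sketch(a, b, tol=1e-3):
    if a == b:
        return True  # also covers the both-zero case, avoiding a zero divide
    denom = max(abs(a), abs(b))
    return abs(a - b) / denom <= tol

# e.g. approx_equal_sketch(1.0005, 1.0) -> True; approx_equal_sketch(1.1, 1.0) -> False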