def test_RF(self):
    trainKey1 = self.loadData(trainDS1)
    kwargs = paramsTrainRF.copy()
    trainResult1 = h2o_rf.trainRF(trainKey1, **kwargs)

    scoreKey1 = self.loadData(scoreDS1)
    kwargs = paramsScoreRF.copy()
    scoreResult1 = h2o_rf.scoreRF(scoreKey1, trainResult1, **kwargs)

    print "\nTrain1\n=========={0}".format(h2o_rf.pp_rf_result(trainResult1))
    print "\nScore1\n=========={0}".format(h2o_rf.pp_rf_result(scoreResult1))

    trainKey2 = self.loadData(trainDS2)
    kwargs = paramsTrainRF.copy()
    trainResult2 = h2o_rf.trainRF(trainKey2, **kwargs)

    scoreKey2 = self.loadData(scoreDS2)
    kwargs = paramsScoreRF.copy()
    scoreResult2 = h2o_rf.scoreRF(scoreKey2, trainResult2, **kwargs)

    print "\nTrain2\n=========={0}".format(h2o_rf.pp_rf_result(trainResult2))
    print "\nScore2\n=========={0}".format(h2o_rf.pp_rf_result(scoreResult2))

    print "\nTraining: JsonDiff sorted data results, to non-sorted results (json responses)"
    df = h2o_util.JsonDiff(trainResult1, trainResult2, with_values=True)
    print "df.difference:", h2o.dump_json(df.difference)

    print "\nScoring: JsonDiff sorted data results, to non-sorted results (json responses)"
    df = h2o_util.JsonDiff(scoreResult1, scoreResult2, with_values=True)
    print "df.difference:", h2o.dump_json(df.difference)
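# The JsonDiff comparisons in these tests treat the two json responses as nested dicts
# and report the key paths that differ. For reference, a minimal sketch of that idea
# (an illustration only, not h2o_util's actual implementation; jsonDiffPaths is a
# hypothetical name):
def jsonDiffPaths(a, b, path="", with_values=False):
    """Recursively collect the key paths where two parsed-json structures disagree."""
    diffs = []
    if isinstance(a, dict) and isinstance(b, dict):
        for k in sorted(set(a.keys()) | set(b.keys())):
            if k not in a or k not in b:
                diffs.append(path + "/" + str(k) + " (missing on one side)")
            else:
                diffs += jsonDiffPaths(a[k], b[k], path + "/" + str(k), with_values)
    elif isinstance(a, list) and isinstance(b, list):
        if len(a) != len(b):
            diffs.append(path + " (length %d vs %d)" % (len(a), len(b)))
        else:
            for i, (ai, bi) in enumerate(zip(a, b)):
                diffs += jsonDiffPaths(ai, bi, path + "[%d]" % i, with_values)
    elif a != b:
        # with_values mirrors the JsonDiff(..., with_values=True) usage above
        diffs.append(path + (": %r != %r" % (a, b) if with_values else ""))
    return diffs
# e.g. jsonDiffPaths(trainResult1, trainResult2, with_values=True) would list paths
# like "/confusion_matrix/classification_error: 4.4 != 4.29"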
def test_exec2_runif(self):
    print "h2o syntax is not full R. Doesn't take min/max interval params; a 0/1 interval is assumed."
    print "  Just one param; it must be a column or row vector. The result has the same length."
    print "  R allows a scalar as the param."
    bucket = 'home-0xdiag-datasets'
    csvPathname = 'standard/covtype.data'
    hexKey = 'r.hex'
    parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey)

    # work up to the failing case incrementally
    execExprList = [
        # hack to make them keys? (not really needed but interesting)
        'r0.hex = r.hex[,1]',
        's0.hex = runif(r.hex[,1],-1)',
        's1.hex = runif(r.hex[,1],-1)',
        's2.hex = runif(r.hex[,1],-1)',
        # error. this causes an exception
        # 's3.hex = runif(nrow(r.hex), -1)',
    ]

    results = []
    for execExpr in execExprList:
        start = time.time()
        # resultExec is unneeded but interesting
        (resultExec, result) = h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)
        results.append(result)
        print "exec end on", "operators", 'took', time.time() - start, 'seconds'
        print "exec result:", result
        print "exec result (full):", h2o.dump_json(resultExec)

    h2o.check_sandbox_for_errors()

    rSummary = h2o_cmd.runSummary(key='r0.hex', cols='0')
    # h2o_cmd.infoFromSummary(rSummary)
    rSummary = h2o_cmd.runSummary(key='s0.hex', cols='0')
    # h2o_cmd.infoFromSummary(rSummary)
    sSummary = h2o_cmd.runSummary(key='s1.hex', cols='0')
    # h2o_cmd.infoFromSummary(sSummary)
    sSummary = h2o_cmd.runSummary(key='s2.hex', cols='0')
    # h2o_cmd.infoFromSummary(sSummary)

    # since there are no NAs in covtype, r.hex and s.hex should be identical?
    if 1 == 0:
        print "Comparing summary of r.hex to summary of s.hex"
        df = h2o_util.JsonDiff(rSummary, sSummary, with_values=True)
        # times can differ
        print "df.difference:", h2o.dump_json(df.difference)
        self.assertLess(len(df.difference), 2)

    print "results from the individual exec expressions (ignore the last, which was an apply)"
    print "results:", results
    self.assertEqual(results, [
        0.0, 0.0, 0.0, 1859.0, 581012.0, 581012.0,
        2959.365300544567, 1859.0, 1859.0
    ])
def test_rf_covtype_train_oobe(self):
    print "\nRun test iterations/compare with covtype.data"
    rfv1 = self.rf_covtype_train_oobe('covtype.data', checkExpectedResults=False)

    print "\nRun test iterations/compare with covtype.shuffled.data"
    rfv2 = self.rf_covtype_train_oobe('covtype.shuffled.data', checkExpectedResults=True)

    print "\nRun test iterations/compare with covtype.sorted.data"
    rfv3 = self.rf_covtype_train_oobe('covtype.sorted.data', checkExpectedResults=False)

    print "rfv3, from covtype.sorted.data"
    print h2o.dump_json(rfv3)

    print "\nJsonDiff covtype.data rfv, to covtype.sorted.data rfv"
    df = h2o_util.JsonDiff(rfv1, rfv3, with_values=True)
    print "df.difference:", h2o.dump_json(df.difference)

    ## self.assertEqual(len(df.difference), 0,
    ##     msg="Want 0, not %d differences between the two rfView json responses. %s" % \
    ##     (len(df.difference), h2o.dump_json(df.difference)))

    ce1 = rfv1['confusion_matrix']['classification_error']
    ce3 = rfv3['confusion_matrix']['classification_error']
    self.assertAlmostEqual(ce1, ce3, places=3,
        msg="classification error %s isn't close to that when sorted %s" % (ce1, ce3))
def test_exec2_runif(self):
    print "in h2o-dev, params are column, min, max, seed"
    bucket = 'home-0xdiag-datasets'
    csvPathname = 'standard/covtype.data'
    hexKey = 'r.hex'
    parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey)

    # work up to the failing case incrementally
    execExprList = [
        # hack to make them keys? (not really needed but interesting)
        # params for h2o-dev runif are: column, min, max, seed
        AssignObj('r0.hex', KeyIndexed('r.hex', col=0)),
        AssignObj('s0.hex', Fcn("h2o.runif", KeyIndexed('r.hex', col=0), 1)),
        AssignObj('s1.hex', Fcn("h2o.runif", KeyIndexed('r.hex', col=1), -1)),
        AssignObj('s2.hex', Fcn("h2o.runif", KeyIndexed('r.hex', col=54), -1)),
    ]

    results = []
    for execExpr in execExprList:
        start = time.time()
        result = execExpr.do(timeoutSecs=30)
        results.append(result)
        execResult = execExpr.execResult
        print "exec took", time.time() - start, "seconds"
        print "exec result:", result
        print "exec result (full):", h2o.dump_json(execResult)

    h2o.check_sandbox_for_errors()

    rSummary = h2o_cmd.runSummary(key='r0.hex', cols='0')
    # h2o_cmd.infoFromSummary(rSummary)
    rSummary = h2o_cmd.runSummary(key='s0.hex', cols='0')
    # h2o_cmd.infoFromSummary(rSummary)
    sSummary = h2o_cmd.runSummary(key='s1.hex', cols='0')
    # h2o_cmd.infoFromSummary(sSummary)
    sSummary = h2o_cmd.runSummary(key='s2.hex', cols='0')
    # h2o_cmd.infoFromSummary(sSummary)

    # since there are no NAs in covtype, r.hex and s.hex should be identical?
    if 1 == 0:
        print "Comparing summary of r.hex to summary of s.hex"
        df = h2o_util.JsonDiff(rSummary, sSummary, with_values=True)
        # times can differ
        print "df.difference:", h2o.dump_json(df.difference)
        self.assertLess(len(df.difference), 2)

    print "results from the individual exec expressions (ignore the last, which was an apply)"
    print "results:", results
    self.assertEqual(results, [
        0.0, 0.0, 0.0, 1859.0, 581012.0, 581012.0,
        2959.365300544567, 1859.0, 1859.0
    ])
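# What the runif tests above rely on: given a column of length n and a seed, runif
# returns a new column of n uniform draws (the classic-h2o test assumes a [0,1)
# interval; h2o-dev adds explicit min/max params). A pure-Python sketch of that
# contract, for reference only (pureRunif is a hypothetical local helper, not an
# h2o call):
import random as _random

def pureRunif(column, seed):
    """Return a same-length list of uniform [0,1) draws; seed=-1 means 'pick one'."""
    rng = _random.Random(None if seed == -1 else seed)
    return [rng.random() for _ in column]
# e.g. pureRunif(range(581012), -1) has the same length as covtype's 581012 rows,
# which is why the s0.hex/s1.hex/s2.hex summaries can be compared against r0.hex.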
def test_rf_covtype_train_oobe_fvec(self):
    h2o.beta_features = True
    print "\nRun test iterations/compare with covtype.data"
    rfv1 = self.rf_covtype_train_oobe('covtype.data', checkExpectedResults=False)
    (ce1, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfv1)
    # since we created a binomial output class, look at the error rate for class 1
    ce1pct1 = classErrorPctList[1]

    print "\nRun test iterations/compare with covtype.shuffled.data"
    rfv2 = self.rf_covtype_train_oobe('covtype.shuffled.data', checkExpectedResults=True)
    (ce2, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfv2)
    ce2pct1 = classErrorPctList[1]

    print "\nRun test iterations/compare with covtype.sorted.data"
    rfv3 = self.rf_covtype_train_oobe('covtype.sorted.data', checkExpectedResults=False)
    (ce3, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfv3)
    ce3pct1 = classErrorPctList[1]

    print "rfv3, from covtype.sorted.data"
    print "\nJsonDiff covtype.data rfv, to covtype.sorted.data rfv"
    print "rfv1:", h2o.dump_json(rfv1)
    print "rfv3:", h2o.dump_json(rfv3)
    # df = h2o_util.JsonDiff(rfv1, rfv3, with_values=True)
    df = h2o_util.JsonDiff(rfv1, rfv3)
    print "df.difference:", h2o.dump_json(df.difference)

    self.assertAlmostEqual(ce1, ce2, delta=0.5,
        msg="classification error %s isn't close to that when sorted %s" % (ce1, ce2))
    self.assertAlmostEqual(ce1, ce3, delta=0.5,
        msg="classification error %s isn't close to that when sorted %s" % (ce1, ce3))
    self.assertAlmostEqual(ce1pct1, ce2pct1, delta=1.0,
        msg="classErrorPctList[1] %s isn't close to that when sorted %s" % (ce1pct1, ce2pct1))
    self.assertAlmostEqual(ce1pct1, ce3pct1, delta=1.0,
        msg="classErrorPctList[1] %s isn't close to that when sorted %s" % (ce1pct1, ce3pct1))
def test_rf_big1_overwrite_model_fvec(self):
    h2o.beta_features = True
    csvFilename = 'hhp_107_01.data.gz'
    hex_key = csvFilename + ".hex"
    print "\n" + csvFilename
    parseResult = h2i.import_parse(bucket='smalldata', path=csvFilename, hex_key=hex_key,
        timeoutSecs=15, schema='put')
    firstRfView = None

    # dispatch multiple jobs back to back
    for jobDispatch in range(3):
        start = time.time()
        kwargs = {}
        if OVERWRITE_RF_MODEL:
            print "Since we're overwriting here, we have to wait for each to complete: noPoll=False"
            model_key = 'RF_model'
        else:
            model_key = 'RF_model' + str(jobDispatch)

        print "Change the number of trees, while keeping the rf model key name the same"
        print "Checks that we correctly overwrite the previous rf model"
        if OVERWRITE_RF_MODEL:
            kwargs['ntrees'] = 1 + jobDispatch
        else:
            kwargs['ntrees'] = 1
        # don't change the seed if we're overwriting the model. It should get
        # different results just from changing the tree count
        kwargs['seed'] = random.randint(0, sys.maxint)

        # FIX! what model keys do these get?
        randomNode = h2o.nodes[random.randint(0, len(h2o.nodes) - 1)]
        h2o_cmd.runRF(node=randomNode, parseResult=parseResult, destination_key=model_key,
            timeoutSecs=300, noPoll=True, **kwargs)

        # FIX! are these already in there?
        rfView = {}
        rfView['_dataKey'] = hex_key
        rfView['_key'] = model_key

        print "rf job dispatch end on", csvFilename, 'took', time.time() - start, 'seconds'
        print "\njobDispatch #", jobDispatch

        # we're going to compare rf results to previous as we go along (so we save the rf view results)
        h2o_jobs.pollWaitJobs(pattern='RF_model', timeoutSecs=300, pollTimeoutSecs=10, retryDelaySecs=5)

        # In this test we're waiting after each one, so we can save the RFView results
        # for comparison to future ones
        print "Checking completed job:", rfView
        print "rfView", h2o.dump_json(rfView)
        data_key = rfView['_dataKey']
        model_key = rfView['_key']
        print "Temporary hack: need to do two rf views minimum, to complete a RF (confusion matrix creation)"
        # allow it to poll to complete
        rfViewResult = h2o_cmd.runRFView(None, data_key, model_key, timeoutSecs=60, noPoll=False)
        if firstRfView is None:  # we'll use this to compare the others
            firstRfView = rfViewResult.copy()
            firstModelKey = model_key
            print "firstRfView", h2o.dump_json(firstRfView)
        else:
            print "Comparing", model_key, "to", firstModelKey
            df = h2o_util.JsonDiff(rfViewResult, firstRfView, vice_versa=True, with_values=True)
            print "df.difference:", h2o.dump_json(df.difference)
            self.assertGreater(len(df.difference), 29,
                msg="Want >= 30, not %d differences between the two rfView json responses. %s" %
                (len(df.difference), h2o.dump_json(df.difference)))
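# The overwrite tests rely on one dispatch pattern: fire the model build with
# noPoll=True (the call returns before training ends), then block on pollWaitJobs
# before reading the model. Distilled into a sketch (dispatchAndWait is a
# hypothetical wrapper shown for the shape of the protocol, not a library API):
def dispatchAndWait(node, parseResult, model_key, **kwargs):
    # returns immediately; the RF job keeps running on the cloud
    h2o_cmd.runRF(node=node, parseResult=parseResult, destination_key=model_key,
                  timeoutSecs=300, noPoll=True, **kwargs)
    # block until all jobs whose key matches the pattern have finished
    h2o_jobs.pollWaitJobs(pattern=model_key, timeoutSecs=300,
                          pollTimeoutSecs=10, retryDelaySecs=5)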
def test_exec2_na2mean(self):
    h2o.beta_features = True
    print "https://0xdata.atlassian.net/browse/PUB-228"
    bucket = 'home-0xdiag-datasets'
    csvPathname = 'standard/covtype.data'
    hexKey = 'r.hex'
    parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey)

    # work up to the failing case incrementally
    execExprList = [
        # hack to make them keys? (not really needed but interesting)
        'rcnt = c(0)',
        'total = c(0)',
        'mean = c(0)',
        's.hex = r.hex',
        "x=r.hex[,1]; rcnt=nrow(x)-sum(is.na(x))",
        "x=r.hex[,1]; total=sum(ifelse(is.na(x),0,x)); rcnt=nrow(x)-sum(is.na(x))",
        "x=r.hex[,1]; total=sum(ifelse(is.na(x),0,x)); rcnt=nrow(x)-sum(is.na(x)); mean=total / rcnt",
        "x=r.hex[,1]; total=sum(ifelse(is.na(x),0,x)); rcnt=nrow(x)-sum(is.na(x)); mean=total / rcnt; x=ifelse(is.na(x),mean,x)",
    ]
    execExprList2 = [
        "s.hex = apply(r.hex,2," +
            "function(x){total=sum(ifelse(is.na(x),0,x)); " +
            "rcnt=nrow(x)-sum(is.na(x)); " +
            "mean=total / rcnt; " +
            "ifelse(is.na(x),mean,x)} " +
        ")",
        # this got an exception. note I forgot to assign to x here
        "s=r.hex[,1]; s.hex[,1]=ifelse(is.na(x),0,x)",
        # throw in a na flush to 0
        "x=r.hex[,1]; s.hex[,1]=ifelse(is.na(x),0,x)",
    ]
    execExprList += execExprList2

    results = []
    for execExpr in execExprList:
        start = time.time()
        # resultExec is unneeded but interesting
        (resultExec, result) = h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)
        results.append(result)
        print "exec end on", "operators", 'took', time.time() - start, 'seconds'
        print "exec result:", result
        print "exec result (full):", h2o.dump_json(resultExec)

    h2o.check_sandbox_for_errors()

    # compare it to summary
    rSummary = h2o_cmd.runSummary(key='r.hex', cols='0')
    h2o_cmd.infoFromSummary(rSummary)
    sSummary = h2o_cmd.runSummary(key='s.hex', cols='0')
    h2o_cmd.infoFromSummary(sSummary)

    # since there are no NAs in covtype, r.hex and s.hex should be identical?
    print "Comparing summary of r.hex to summary of s.hex"
    df = h2o_util.JsonDiff(rSummary, sSummary, with_values=True)
    # times can differ
    print "df.difference:", h2o.dump_json(df.difference)
    self.assertLess(len(df.difference), 2)

    print "results from the individual exec expressions (ignore the last, which was an apply)"
    print "results:", results
    self.assertEqual(results, [0.0, 0.0, 0.0, 1859.0, 581012.0, 581012.0,
        2959.365300544567, 1859.0, 1859.0, 1859.0, 1859.0])
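# The exec chain above computes, per column: rcnt = non-NA count, total = sum with NAs
# flushed to 0, mean = total/rcnt, then replaces NAs with the mean. A pure-Python
# sketch of the same arithmetic (naToMean is a hypothetical local helper, shown only
# to make the exec strings easier to follow):
def naToMean(column):
    """Replace None ('NA') entries with the mean of the non-NA entries."""
    total = sum(x for x in column if x is not None)            # sum(ifelse(is.na(x),0,x))
    rcnt = len(column) - sum(1 for x in column if x is None)   # nrow(x)-sum(is.na(x))
    mean = total / float(rcnt)                                 # total / rcnt
    return [mean if x is None else x for x in column]          # ifelse(is.na(x),mean,x)
# covtype has no NAs, so naToMean is the identity there; that's why the test expects
# the r.hex and s.hex summaries to match.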
def test_rf_big1_nopoll(self):
    csvFilename = 'hhp_107_01.data.gz'
    csvPathname = h2o.find_file("smalldata/" + csvFilename)
    key2 = csvFilename + ".hex"
    print "\n" + csvPathname
    parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=key2, timeoutSecs=15)
    rfViewInitial = []

    # dispatch multiple jobs back to back
    for jobDispatch in range(3):
        start = time.time()
        kwargs = {}
        if OVERWRITE_RF_MODEL:
            print "Since we're overwriting here, we have to wait for each to complete: nopoll=False"
            model_key = 'RF_model'
        else:
            model_key = 'RF_model' + str(jobDispatch)
        kwargs['ntree'] = 7

        if OVERWRITE_RF_MODEL:
            print "Change the number of trees, while keeping the rf model key name the same"
            print "Checks that we correctly overwrite the previous rf model"
            kwargs['ntree'] += 1

        kwargs['seed'] = random.randint(0, sys.maxint)

        # FIX! what model keys do these get?
        randomNode = h2o.nodes[random.randint(0, len(h2o.nodes) - 1)]
        h2o_cmd.runRFOnly(node=randomNode, parseKey=parseKey, model_key=model_key,
            timeoutSecs=300, noPoll=False if OVERWRITE_RF_MODEL else True, **kwargs)

        # FIX! are these already in there?
        rfView = {}
        rfView['data_key'] = key2
        rfView['model_key'] = model_key
        rfView['ntree'] = kwargs['ntree']
        rfViewInitial.append(rfView)

        print "rf job dispatch end on", csvPathname, 'took', time.time() - start, 'seconds'
        print "\njobDispatch #", jobDispatch

    h2o_jobs.pollWaitJobs(pattern='RF_model', timeoutSecs=300, pollTimeoutSecs=10, retryDelaySecs=5)

    # we saved the initial response.
    # If we do another poll, they should be done now, and it's better to get it that
    # way rather than from the inspect (to match what simpleCheckGLM expects)
    first = None
    print "rfViewInitial", rfViewInitial
    for rfView in rfViewInitial:
        print "Checking completed job:", rfView
        print "rfView", h2o.dump_json(rfView)
        data_key = rfView['data_key']
        model_key = rfView['model_key']
        ntree = rfView['ntree']
        # a = h2o.nodes[0].random_forest_view(data_key, model_key, noPoll=True)
        print "Temporary hack: need to do two rf views minimum, to complete a RF (confusion matrix creation)"
        # allow it to poll to complete
        rfViewResult = h2o_cmd.runRFView(None, data_key, model_key, ntree=ntree,
            timeoutSecs=60, noPoll=False)
        if first is None:  # we'll use this to compare the others
            first = rfViewResult.copy()
            firstModelKey = model_key
            print "first", h2o.dump_json(first)
        else:
            print "Comparing", model_key, "to", firstModelKey
            df = h2o_util.JsonDiff(rfViewResult, first, vice_versa=True, with_values=True)
            print "df.difference:", h2o.dump_json(df.difference)
def test_RF(self):
    h2o.beta_features = True

    if h2o.beta_features:
        paramsTrainRF = {
            'ntrees': 10,
            'max_depth': 300,
            'nbins': 200,
            'timeoutSecs': 600,
            'response': 'C55',
        }
        paramsScoreRF = {
            'vactual': 'C55',
            'timeoutSecs': 600,
        }
    else:
        paramsTrainRF = {
            'use_non_local_data': 1,
            'ntree': 10,
            'depth': 300,
            'bin_limit': 20000,
            'stat_type': 'ENTROPY',
            'out_of_bag_error_estimate': 1,
            'exclusive_split_limit': 0,
            'timeoutSecs': 60,
        }
        paramsScoreRF = {
            # scoring requires the response_variable. it defaults to last, so normally
            # we don't need to specify. But put this here and (above if used)
            # in case a dataset doesn't use last col
            'response_variable': None,
            'timeoutSecs': 60,
            'out_of_bag_error_estimate': 0,
        }

    trainKey1 = self.loadData(trainDS1)
    kwargs = paramsTrainRF.copy()
    trainResult1 = h2o_rf.trainRF(trainKey1, **kwargs)

    scoreKey1 = self.loadData(scoreDS1)
    kwargs = paramsScoreRF.copy()
    scoreResult1 = h2o_rf.scoreRF(scoreKey1, trainResult1, **kwargs)

    trainKey2 = self.loadData(trainDS2)
    kwargs = paramsTrainRF.copy()
    trainResult2 = h2o_rf.trainRF(trainKey2, **kwargs)

    scoreKey2 = self.loadData(scoreDS2)
    kwargs = paramsScoreRF.copy()
    scoreResult2 = h2o_rf.scoreRF(scoreKey2, trainResult2, **kwargs)

    print "\nTraining: JsonDiff sorted data results, to non-sorted results (json responses)"
    df = h2o_util.JsonDiff(trainResult1, trainResult2, with_values=True)
    print "df.difference:", h2o.dump_json(df.difference)

    print "\nScoring: JsonDiff sorted data results, to non-sorted results (json responses)"
    df = h2o_util.JsonDiff(scoreResult1, scoreResult2, with_values=True)
    print "df.difference:", h2o.dump_json(df.difference)
def test_RF(self):
    h2o.beta_features = True
    paramsTrainRF = {
        'seed': '1234567890',
        # if I use 100% sample rate and just one tree, I should get the same results
        # for sorted/shuffled? i.e. the bagging always sees everything. Means oobe will
        # be messed up, so will specify validation = the 10pct holdout data
        # (could reuse the training data?)
        'sample_rate': 1.0,
        'ntrees': 3,
        'max_depth': 300,
        'nbins': 200,
        'timeoutSecs': 600,
        'response': 'C55',
    }
    paramsScoreRF = {
        'vactual': 'C55',
        'timeoutSecs': 600,
    }

    # 90% data
    trainKey1 = self.loadData(trainDS1)
    scoreKey1 = self.loadData(scoreDS1)
    kwargs = paramsTrainRF.copy()
    trainResult1 = h2o_rf.trainRF(trainKey1, scoreKey1, **kwargs)
    (classification_error1, classErrorPctList1, totalScores1) = h2o_rf.simpleCheckRFView(rfv=trainResult1)
    # self.assertEqual(4.29, classification_error1)
    # self.assertEqual([4.17, 2.98, 4.09, 14.91, 21.12, 15.38, 5.22], classErrorPctList1)
    # with new RNG 9/26/14
    self.assertEqual(4.4, classification_error1)
    self.assertEqual([3.71, 3.56, 4.32, 18.55, 21.22, 13.51, 5.82], classErrorPctList1)
    self.assertEqual(58101, totalScores1)

    kwargs = paramsScoreRF.copy()
    scoreResult1 = h2o_rf.scoreRF(scoreKey1, trainResult1, **kwargs)

    # 10% data
    trainKey2 = self.loadData(trainDS2)
    scoreKey2 = self.loadData(scoreDS2)
    kwargs = paramsTrainRF.copy()
    trainResult2 = h2o_rf.trainRF(trainKey2, scoreKey2, **kwargs)
    (classification_error2, classErrorPctList2, totalScores2) = h2o_rf.simpleCheckRFView(rfv=trainResult2)
    # self.assertEqual(4.29, classification_error2)
    # self.assertEqual([4.17, 2.98, 4.09, 14.91, 21.12, 15.38, 5.22], classErrorPctList2)
    # with new RNG 9/26/14
    self.assertEqual(4.4, classification_error2)
    self.assertEqual([3.71, 3.56, 4.32, 18.55, 21.22, 13.51, 5.82], classErrorPctList2)
    self.assertEqual(58101, totalScores2)

    kwargs = paramsScoreRF.copy()
    scoreResult2 = h2o_rf.scoreRF(scoreKey2, trainResult2, **kwargs)

    print "\nTraining: JsonDiff sorted data results, to non-sorted results (json responses)"
    df = h2o_util.JsonDiff(trainResult1, trainResult2, with_values=True)
    print "df.difference:", h2o.dump_json(df.difference)

    print "\nScoring: JsonDiff sorted data results, to non-sorted results (json responses)"
    df = h2o_util.JsonDiff(scoreResult1, scoreResult2, with_values=True)
    print "df.difference:", h2o.dump_json(df.difference)

    # should only be two diffs
    if len(df.difference) > 2:
        raise Exception("Too many diffs in JsonDiff sorted vs non-sorted: %s" % len(df.difference))
def test_RF(self):
    paramsTrainRF = {
        'seed': '1234567890',
        'ntrees': 1,
        'max_depth': 10,
        'sample_rate': 1.0,
        'nbins': 50,
        'timeoutSecs': 600,
        'response': 'C55',
        'classification': 1,
    }
    paramsScoreRF = {
        'vactual': 'C55',
        'timeoutSecs': 600,
    }

    # train1
    trainKey1 = self.loadData(trainDS1)
    scoreKey1 = self.loadData(scoreDS1)
    kwargs = paramsTrainRF.copy()
    trainResult1 = h2o_rf.trainRF(trainKey1, scoreKey1, **kwargs)

    kwargs = paramsScoreRF.copy()
    h2o_cmd.runInspect(key='scoreDS1.hex', verbose=True)
    scoreResult1 = h2o_rf.scoreRF(scoreKey1, trainResult1, **kwargs)
    h2o_cmd.runInspect(key='Predict.hex', verbose=True)

    print "\nTrain1\n=========="
    h2o_rf.simpleCheckRFScore(node=None, rfv=trainResult1, noPrint=False, **kwargs)
    print "\nScore1\n=========="
    print h2o.dump_json(scoreResult1)
    h2o_rf.simpleCheckRFScore(node=None, rfv=scoreResult1, noPrint=False, **kwargs)

    # train2
    trainKey2 = self.loadData(trainDS2)
    scoreKey2 = self.loadData(scoreDS2)
    kwargs = paramsTrainRF.copy()
    trainResult2 = h2o_rf.trainRF(trainKey2, scoreKey2, **kwargs)

    kwargs = paramsScoreRF.copy()
    h2o_cmd.runInspect(key='scoreDS2.hex', verbose=True)
    scoreResult2 = h2o_rf.scoreRF(scoreKey2, trainResult2, **kwargs)
    h2o_cmd.runInspect(key='Predict.hex', verbose=True)

    print "\nTrain2\n=========="
    h2o_rf.simpleCheckRFScore(node=None, rfv=trainResult2, noPrint=False, **kwargs)
    print "\nScore2\n=========="
    h2o_rf.simpleCheckRFScore(node=None, rfv=scoreResult2, noPrint=False, **kwargs)

    print "\nTraining: JsonDiff sorted data results, to non-sorted results (json responses)"
    df = h2o_util.JsonDiff(trainResult1, trainResult2, with_values=True)
    print "df.difference:", h2o.dump_json(df.difference)

    print "\nScoring: JsonDiff sorted data results, to non-sorted results (json responses)"
    df = h2o_util.JsonDiff(scoreResult1, scoreResult2, with_values=True)
    print "df.difference:", h2o.dump_json(df.difference)

    # should only be two diffs
    if len(df.difference) > 2:
        raise Exception("Too many diffs in JsonDiff sorted vs non-sorted: %s" % len(df.difference))
def test_RF(self):
    h2o.beta_features = True

    if h2o.beta_features:
        paramsTrainRF = {
            'ntrees': 3,
            'max_depth': 10,
            'nbins': 50,
            'timeoutSecs': 600,
            'response': 'C54',
            'classification': 1,
        }
        paramsScoreRF = {
            'vactual': 'C54',
            'timeoutSecs': 600,
        }
    else:
        paramsTrainRF = {
            'use_non_local_data': 1,
            'ntree': 10,
            'depth': 300,
            'bin_limit': 20000,
            'stat_type': 'ENTROPY',
            'out_of_bag_error_estimate': 1,
            'exclusive_split_limit': 0,
            'timeoutSecs': 60,
        }
        paramsScoreRF = {
            # scoring requires the response_variable. it defaults to last, so normally
            # we don't need to specify. But put this here and (above if used)
            # in case a dataset doesn't use last col
            'response_variable': None,
            'timeoutSecs': 60,
            'out_of_bag_error_estimate': 0,
        }

    # train1
    trainKey1 = self.loadData(trainDS1)
    kwargs = paramsTrainRF.copy()
    trainResult1 = h2o_rf.trainRF(trainKey1, **kwargs)

    scoreKey1 = self.loadData(scoreDS1)
    kwargs = paramsScoreRF.copy()
    h2o_cmd.runInspect(key='scoreDS1.hex', verbose=True)
    scoreResult1 = h2o_rf.scoreRF(scoreKey1, trainResult1, **kwargs)
    h2o_cmd.runInspect(key='Predict.hex', verbose=True)

    print "\nTrain1\n=========="
    h2o_rf.simpleCheckRFScore(node=None, rfv=trainResult1, noPrint=False, **kwargs)
    print "\nScore1\n=========="
    print h2o.dump_json(scoreResult1)
    h2o_rf.simpleCheckRFScore(node=None, rfv=scoreResult1, noPrint=False, **kwargs)

    # train2
    trainKey2 = self.loadData(trainDS2)
    kwargs = paramsTrainRF.copy()
    trainResult2 = h2o_rf.trainRF(trainKey2, **kwargs)

    scoreKey2 = self.loadData(scoreDS2)
    kwargs = paramsScoreRF.copy()
    h2o_cmd.runInspect(key='scoreDS2.hex', verbose=True)
    scoreResult2 = h2o_rf.scoreRF(scoreKey2, trainResult2, **kwargs)
    h2o_cmd.runInspect(key='Predict.hex', verbose=True)

    print "\nTrain2\n=========="
    h2o_rf.simpleCheckRFScore(node=None, rfv=trainResult2, noPrint=False, **kwargs)
    print "\nScore2\n=========="
    h2o_rf.simpleCheckRFScore(node=None, rfv=scoreResult2, noPrint=False, **kwargs)

    if 1 == 0:
        print "\nTraining: JsonDiff sorted data results, to non-sorted results (json responses)"
        df = h2o_util.JsonDiff(trainResult1, trainResult2, with_values=True)
        print "df.difference:", h2o.dump_json(df.difference)

        print "\nScoring: JsonDiff sorted data results, to non-sorted results (json responses)"
        df = h2o_util.JsonDiff(scoreResult1, scoreResult2, with_values=True)
        print "df.difference:", h2o.dump_json(df.difference)
def test_parse_libsvm(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()
    # just do the import folder once
    importFolderPath = "libsvm"

    # make the timeout variable per dataset. it can be 10 secs for covtype 20x (col key creation)
    # so probably 10x that for covtype200
    csvFilenameList = [
        ("mnist_train.svm", "cM", 30, 0, 9.0, False, False),
        ("covtype.binary.svm", "cC", 30, 1, 2.0, True, True),
        # multi-label target like 1,2,5 ..not sure what that means
        # ("tmc2007_train.svm", "cJ", 30, 0, 21.0, False, False),
        # illegal non-ascending cols
        # ("syn_6_1000_10.svm", "cK", 30, -36, 36, True, False),
        # ("syn_0_100_1000.svm", "cL", 30, -36, 36, True, False),
        # fails csvDownload
        ("duke.svm", "cD", 30, -1.000000, 1.000000, False, False),
        ("colon-cancer.svm", "cA", 30, -1.000000, 1.000000, False, False),
        ("news20.svm", "cH", 30, 1, 20.0, False, False),
        ("connect4.svm", "cB", 30, -1, 1.0, False, False),
        # too many features? 150K inspect timeout?
        # ("E2006.train.svm", "cE", 30, 1, -7.89957807346873 -0.519409526940154, False, False)
        ("gisette_scale.svm", "cF", 30, -1, 1.0, False, False),
        ("mushrooms.svm", "cG", 30, 1, 2.0, False, False),
    ]

    ### csvFilenameList = random.sample(csvFilenameAll, 1)
    ### h2b.browseTheCloud()
    lenNodes = len(h2o.nodes)

    firstDone = False
    for (csvFilename, hex_key, timeoutSecs, expectedCol0Min, expectedCol0Max,
            enableDownloadReparse, enableSizeChecks) in csvFilenameList:
        # have to import each time, because h2o deletes the source after parse
        csvPathname = importFolderPath + "/" + csvFilename
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname,
            hex_key=hex_key, timeoutSecs=2000)
        print csvPathname, 'parse time:', parseResult['response']['time']
        print "Parse result['destination_key']:", parseResult['destination_key']

        # INSPECT******************************************
        start = time.time()
        inspectFirst = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=360)
        print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds"
        h2o_cmd.infoFromInspect(inspectFirst, csvFilename)

        # look at the min/max for the target col (0) and compare to expected for the dataset
        imin = float(inspectFirst['cols'][0]['min'])
        # print h2o.dump_json(inspectFirst['cols'][0])
        imax = float(inspectFirst['cols'][0]['max'])

        if expectedCol0Min:
            self.assertEqual(imin, expectedCol0Min,
                msg='col %s min %s is not equal to expected min %s' % (0, imin, expectedCol0Min))
        if expectedCol0Max:
            h2o_util.assertApproxEqual(imax, expectedCol0Max, tol=0.00000001,
                msg='col %s max %s is not equal to expected max %s' % (0, imax, expectedCol0Max))
        print "\nmin/max for col0:", imin, imax

        # SUMMARY****************************************
        # gives us some reporting on missing values, constant values,
        # to see if we have x specified well
        # figures out everything from parseResult['destination_key']
        # needs y to avoid the output column (which can be index or name)
        # assume all the configs have the same y; just check with the first one
        if DO_SUMMARY:
            goodX = h2o_glm.goodXFromColumnInfo(y=0, key=parseResult['destination_key'],
                timeoutSecs=300, noPrint=True)
            summaryResult = h2o_cmd.runSummary(key=hex_key, timeoutSecs=360)
            h2o_cmd.infoFromSummary(summaryResult, noPrint=True)

        if DO_DOWNLOAD_REPARSE and enableDownloadReparse:
            missingValuesListA = h2o_cmd.infoFromInspect(inspectFirst, csvPathname)
            num_colsA = inspectFirst['num_cols']
            num_rowsA = inspectFirst['num_rows']
            row_sizeA = inspectFirst['row_size']
            value_size_bytesA = inspectFirst['value_size_bytes']

            # do a little testing of saving the key as a csv
            csvDownloadPathname = SYNDATASETS_DIR + "/" + csvFilename + "_csvDownload.csv"
            print "Trying csvDownload of", csvDownloadPathname
            h2o.nodes[0].csv_download(src_key=hex_key, csvPathname=csvDownloadPathname)

            # remove the original parsed key. source was already removed by h2o.
            # don't have to now; we use a new name for hex_keyB
            # h2o.nodes[0].remove_key(hex_key)
            start = time.time()
            hex_keyB = hex_key + "_B"
            parseResultB = h2o_cmd.parseResult = h2i.import_parse(
                path=csvDownloadPathname, schema='put', hex_key=hex_keyB)
            print csvDownloadPathname, "download/reparse (B) parse end. Original data from", \
                csvFilename, 'took', time.time() - start, 'seconds'

            inspect = h2o_cmd.runInspect(key=hex_keyB)
            missingValuesListB = h2o_cmd.infoFromInspect(inspect, csvPathname)
            num_colsB = inspect['num_cols']
            num_rowsB = inspect['num_rows']
            row_sizeB = inspect['row_size']
            value_size_bytesB = inspect['value_size_bytes']

            df = h2o_util.JsonDiff(inspectFirst, inspect, with_values=True)
            print "df.difference:", h2o.dump_json(df.difference)

            for i, d in enumerate(df.difference):
                # ignore mismatches in these:
                #   "variance", "response.time", "key"
                if ("variance" in d or "response.time" in d or "key" in d
                        or "value_size_bytes" in d or "row_size" in d):
                    pass
                else:
                    raise Exception(
                        "testing %s, found unexpected mismatch in df.difference[%d]: %s" %
                        (csvPathname, i, d))

            if DO_SIZE_CHECKS and enableSizeChecks:
                # if we're allowed to do size checks, compare the full json response!
                print "Comparing original inspect to the inspect after parsing the downloaded csv"
                # vice_versa=True
                # ignore the variance diffs. reals mismatch when they're not?
                filtered = [v for v in df.difference if not 'variance' in v]
                self.assertLess(len(filtered), 3,
                    msg="Want < 3, not %d differences between the two rfView json responses. %s" %
                    (len(filtered), h2o.dump_json(filtered)))

                # this fails because h2o writes out zeroes as 0.0000* which gets loaded as fp
                # even if the col is all zeroes. Only in the case where the libsvm dataset
                # specified vals = 0, which shouldn't happen.
                # make the check conditional based on the dataset
                self.assertEqual(row_sizeA, row_sizeB,
                    "row_size mismatches after re-parse of downloadCsv result %d %d" %
                    (row_sizeA, row_sizeB))
                h2o_util.assertApproxEqual(value_size_bytesA, value_size_bytesB, tol=0.00000001,
                    msg="value_size_bytes mismatches after re-parse of downloadCsv result %d %d" %
                    (value_size_bytesA, value_size_bytesB))

                print "missingValuesListA:", missingValuesListA
                print "missingValuesListB:", missingValuesListB
                self.assertEqual(missingValuesListA, missingValuesListB,
                    "missingValuesList mismatches after re-parse of downloadCsv result")
                self.assertEqual(num_colsA, num_colsB,
                    "num_cols mismatches after re-parse of downloadCsv result %d %d" %
                    (num_colsA, num_colsB))
                self.assertEqual(num_rowsA, num_rowsB,
                    "num_rows mismatches after re-parse of downloadCsv result %d %d" %
                    (num_rowsA, num_rowsB))

        ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
        h2o.check_sandbox_for_errors()
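# Several of these tests repeat the same pattern: JsonDiff two responses, drop the
# paths that are expected to differ (timings, keys, variance), and assert on what's
# left. A small sketch of that filtering step (expectedDiff is a hypothetical helper,
# generalizing the ignore-list loop in test_parse_libsvm above):
def expectedDiff(difference, ignore=("variance", "response.time", "key",
                                     "value_size_bytes", "row_size")):
    """Return only the diff entries that don't mention an expected-mismatch token."""
    return [d for d in difference if not any(tok in d for tok in ignore)]
# e.g. self.assertEqual(expectedDiff(df.difference), []) would flag only the
# unexpected mismatches.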
def test_rf_big1_nopoll_fvec(self):
    h2o.beta_features = True
    csvFilename = 'hhp_107_01.data.gz'
    hex_key = csvFilename + ".hex"
    print "\n" + csvFilename
    parseResult = h2i.import_parse(bucket='smalldata', path=csvFilename, hex_key=hex_key,
        timeoutSecs=15, schema='put')
    rfViewInitial = []

    # dispatch multiple jobs back to back
    for jobDispatch in range(3):
        start = time.time()
        kwargs = {}
        model_key = ""
        if OVERWRITE_RF_MODEL:
            print "Since we're overwriting here, we have to wait for each to complete: noPoll=False"
            model_key = 'SRF_model'
        else:
            model_key = 'SRF_model' + str(jobDispatch)

        kwargs['ntrees'] = 1
        if OVERWRITE_RF_MODEL:
            print "Change the number of trees, while keeping the rf model key name the same"
            print "Checks that we correctly overwrite the previous rf model"
            kwargs['ntrees'] += 1

        kwargs['seed'] = random.randint(0, sys.maxint)
        kwargs['response'] = "C107"

        # FIX! what model keys do these get?
        randomNode = h2o.nodes[random.randint(0, len(h2o.nodes) - 1)]
        h2o_cmd.runSpeeDRF(node=randomNode, parseResult=parseResult, destination_key=model_key,
            timeoutSecs=300, noPoll=False, **kwargs)
        print "rf job dispatch end on", csvFilename, 'took', time.time() - start, 'seconds'
        print "\njobDispatch #", jobDispatch
        print "\n MODEL KEY: ", model_key
        rfViewInitial.append(h2o_cmd.runSpeeDRFView(None, model_key, timeoutSecs=60))

    # h2o_jobs.pollWaitJobs(pattern='SRF_model', timeoutSecs=300, pollTimeoutSecs=10, retryDelaySecs=5)

    # we saved the initial response.
    # If we do another poll, they should be done now, and it's better to get it that
    # way rather than from the inspect (to match what simpleCheckGLM expects)
    first = None
    print "rfViewInitial", rfViewInitial
    for rfView in rfViewInitial:
        print "Checking completed job:", rfView
        print "rfView", h2o.dump_json(rfView)
        model_key = rfView["speedrf_model"]['_key']
        ntree = rfView["speedrf_model"]["parameters"]['ntrees']
        print "Temporary hack: need to do two rf views minimum, to complete a RF (confusion matrix creation)"
        # allow it to poll to complete
        rfViewResult = h2o_cmd.runSpeeDRFView(None, model_key, timeoutSecs=60)
        if first is None:  # we'll use this to compare the others
            first = rfViewResult.copy()
            firstModelKey = model_key
            print "first", h2o.dump_json(first)
        else:
            print "Comparing", model_key, "to", firstModelKey
            df = h2o_util.JsonDiff(rfViewResult, first, vice_versa=True, with_values=True)
            print "df.difference:", h2o.dump_json(df.difference)
def test_rf_covtype_train_oobe_fvec(self):
    print "\nRun test iterations/compare with covtype.data"
    rfv1 = self.rf_covtype_train_oobe('covtype.data', checkExpectedResults=False, expectedAuc=0.95)
    (ce1, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFScore(rfv=rfv1)
    # since we created a binomial output class, look at the error rate for class 1
    ce1pct1 = classErrorPctList[1]

    print "\nRun test iterations/compare with covtype.shuffled.data"
    rfv2 = self.rf_covtype_train_oobe('covtype.shuffled.data', checkExpectedResults=True, expectedAuc=0.95)
    (ce2, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFScore(rfv=rfv2)
    ce2pct1 = classErrorPctList[1]

    print "\nRun test iterations/compare with covtype.sorted.data"
    rfv3 = self.rf_covtype_train_oobe('covtype.sorted.data', checkExpectedResults=False, expectedAuc=0.95)
    (ce3, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFScore(rfv=rfv3)
    ce3pct1 = classErrorPctList[1]

    print "rfv3, from covtype.sorted.data"
    print "\nJsonDiff covtype.data rfv, to covtype.sorted.data rfv"
    print "rfv1:", h2o.dump_json(rfv1)
    print "rfv3:", h2o.dump_json(rfv3)
    # df = h2o_util.JsonDiff(rfv1, rfv3, with_values=True)
    df = h2o_util.JsonDiff(rfv1, rfv3)
    print "df.difference:", h2o.dump_json(df.difference)

    self.assertAlmostEqual(ce1, ce2, delta=0.5,
        msg="classification error %s isn't close to that when sorted %s" % (ce1, ce2))
    self.assertAlmostEqual(ce1, ce3, delta=0.5,
        msg="classification error %s isn't close to that when sorted %s" % (ce1, ce3))

    # we're doing separate test/train splits, so we're going to get variance.
    # really we should not do a test/train split and should use all the data, if we're
    # comparing sorted vs not? but then the splits need to be sorted or not.
    # I think I have those files
    self.assertAlmostEqual(ce1pct1, ce2pct1, delta=10.0,
        msg="classErrorPctList[1] %s isn't close to that when sorted %s" % (ce1pct1, ce2pct1))
    self.assertAlmostEqual(ce1pct1, ce3pct1, delta=10.0,
        msg="classErrorPctList[1] %s isn't close to that when sorted %s" % (ce1pct1, ce3pct1))