def test_RF(self):
    h2o.beta_features = True
    paramsTrainRF = {"ntrees": 2, "max_depth": 300, "nbins": 200, "timeoutSecs": 600, "response": "C55"}
    paramsScoreRF = {"vactual": "C55", "timeoutSecs": 600}

    trainKey1 = self.loadData(trainDS1)
    kwargs = paramsTrainRF.copy()
    trainResult1 = h2o_rf.trainRF(trainKey1, **kwargs)

    scoreKey1 = self.loadData(scoreDS1)
    kwargs = paramsScoreRF.copy()
    scoreResult1 = h2o_rf.scoreRF(scoreKey1, trainResult1, **kwargs)

    trainKey2 = self.loadData(trainDS2)
    kwargs = paramsTrainRF.copy()
    trainResult2 = h2o_rf.trainRF(trainKey2, **kwargs)

    scoreKey2 = self.loadData(scoreDS2)
    kwargs = paramsScoreRF.copy()
    scoreResult2 = h2o_rf.scoreRF(scoreKey2, trainResult2, **kwargs)

    print "\nTraining: JsonDiff sorted data results, to non-sorted results (json responses)"
    df = h2o_util.JsonDiff(trainResult1, trainResult2, with_values=True)
    print "df.difference:", h2o.dump_json(df.difference)

    print "\nScoring: JsonDiff sorted data results, to non-sorted results (json responses)"
    df = h2o_util.JsonDiff(scoreResult1, scoreResult2, with_values=True)
    print "df.difference:", h2o.dump_json(df.difference)

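# The JsonDiff checks above assume h2o_util.JsonDiff walks two parsed JSON responses and
# exposes the mismatched fields as a .difference list (the tests later check
# len(df.difference)). A minimal sketch of that kind of recursive comparison, written here
# only as a hypothetical stand-in for illustration and NOT the actual h2o_util
# implementation, could look like this:
def json_diff(a, b, path="", with_values=True):
    """Recursively collect the paths (and optionally values) where two parsed-JSON objects disagree."""
    diffs = []
    if isinstance(a, dict) and isinstance(b, dict):
        for k in sorted(set(a.keys()) | set(b.keys())):
            if k not in a or k not in b:
                diffs.append("%s/%s (missing on one side)" % (path, k))
            else:
                diffs += json_diff(a[k], b[k], "%s/%s" % (path, k), with_values)
    elif isinstance(a, list) and isinstance(b, list) and len(a) == len(b):
        for i in range(len(a)):
            diffs += json_diff(a[i], b[i], "%s[%d]" % (path, i), with_values)
    elif a != b:
        diffs.append("%s: %r != %r" % (path, a, b) if with_values else path)
    return diffs
# e.g. json_diff(trainResult1, trainResult2) would return [] when the sorted and
# non-sorted runs produce identical JSON, and a short list of differing paths otherwise.
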
def test_RF(self):
    trainKey1 = self.loadData(trainDS1)
    kwargs = paramsTrainRF.copy()
    trainResult1 = h2o_rf.trainRF(trainKey1, **kwargs)

    scoreKey1 = self.loadData(scoreDS1)
    kwargs = paramsScoreRF.copy()
    scoreResult1 = h2o_rf.scoreRF(scoreKey1, trainResult1, **kwargs)

    print "\nTrain1\n=========={0}".format(h2o_rf.pp_rf_result(trainResult1))
    print "\nScore1\n========={0}".format(h2o_rf.pp_rf_result(scoreResult1))

    trainKey2 = self.loadData(trainDS2)
    kwargs = paramsTrainRF.copy()
    trainResult2 = h2o_rf.trainRF(trainKey2, **kwargs)

    scoreKey2 = self.loadData(scoreDS2)
    kwargs = paramsScoreRF.copy()
    scoreResult2 = h2o_rf.scoreRF(scoreKey2, trainResult2, **kwargs)

    print "\nTrain2\n=========={0}".format(h2o_rf.pp_rf_result(trainResult2))
    print "\nScore2\n========={0}".format(h2o_rf.pp_rf_result(scoreResult2))

    print "\nTraining: JsonDiff sorted data results, to non-sorted results (json responses)"
    df = h2o_util.JsonDiff(trainResult1, trainResult2, with_values=True)
    print "df.difference:", h2o.dump_json(df.difference)

    print "\nScoring: JsonDiff sorted data results, to non-sorted results (json responses)"
    df = h2o_util.JsonDiff(scoreResult1, scoreResult2, with_values=True)
    print "df.difference:", h2o.dump_json(df.difference)

def test_RF(self):
    trainKey1 = self.loadData(trainDS1)
    kwargs = paramsTrainRF.copy()
    trainResult1 = h2o_rf.trainRF(trainKey1, **kwargs)

    scoreKey1 = self.loadData(scoreDS1)
    kwargs = paramsScoreRF.copy()
    scoreResult1 = h2o_rf.scoreRF(scoreKey1, trainResult1, **kwargs)

    print "\nTrain1\n=========={0}".format(h2o_rf.pp_rf_result(trainResult1))
    print "\nScore1\n========={0}".format(h2o_rf.pp_rf_result(scoreResult1))

    trainKey2 = self.loadData(trainDS2)
    kwargs = paramsTrainRF.copy()
    trainResult2 = h2o_rf.trainRF(trainKey2, **kwargs)

    scoreKey2 = self.loadData(scoreDS2)
    kwargs = paramsScoreRF.copy()
    scoreResult2 = h2o_rf.scoreRF(scoreKey2, trainResult2, **kwargs)

    print "\nTrain2\n=========={0}".format(h2o_rf.pp_rf_result(trainResult2))
    print "\nScore2\n========={0}".format(h2o_rf.pp_rf_result(scoreResult2))

    print "\nTraining: JsonDiff sorted data results, to non-sorted results (json responses)"
    df = h2o_util.JsonDiff(trainResult1, trainResult2, with_values=True)
    print "df.difference:", h2o.dump_json(df.difference)

    print "\nScoring: JsonDiff sorted data results, to non-sorted results (json responses)"
    df = h2o_util.JsonDiff(scoreResult1, scoreResult2, with_values=True)
    print "df.difference:", h2o.dump_json(df.difference)

def test_RF(self):
    normalRF = False
    # normalRF = True
    print """
        Normal RF : {0}
        Train data: {1}
        Test data : {2}""".format(normalRF, trainDS['filename'], scoreDS['filename'])

    print "Loading data...."
    trainKey = self.loadTrainData()
    kwargs = paramsTrainRF.copy()

    print "Running normal RF: {0}".format(normalRF)
    if normalRF:
        trainResult = h2o_rf.trainRF(trainKey, model_key="rfm_normal", **kwargs)
    else:
        trainResult = h2o_rf.trainRF(trainKey, refine=1, model_key="rfm_refined", **kwargs)

    scoreKey = self.loadScoreData()
    kwargs = paramsScoreRF.copy()
    scoreResult = h2o_rf.scoreRF(scoreKey, trainResult, **kwargs)

    print """
        Normal RF : {0}
        Train data: {1}
        Test data : {2}""".format(normalRF, trainDS['filename'], scoreDS['filename'])

    print "\nTrain\n=========={0}".format(h2o_rf.pp_rf_result(trainResult))
    print "\nScoring\n========={0}".format(h2o_rf.pp_rf_result(scoreResult))

def test_RF(self):
    h2o.beta_features = True
    paramsTrainRF = {
        'seed': '1234567890',
        'ntrees': 1,
        'max_depth': 10,
        # 'sample_rate': 1.0,
        'sample_rate': 1.0,
        'nbins': 50,
        'timeoutSecs': 600,
        'response': 'C55',
        'classification': 1,
    }

    paramsScoreRF = {
        'vactual': 'C55',
        'timeoutSecs': 600,
    }

    # train1
    trainKey1 = self.loadData(trainDS1)
    scoreKey1 = self.loadData(scoreDS1)
    kwargs = paramsTrainRF.copy()
    trainResult1 = h2o_rf.trainRF(trainKey1, scoreKey1, **kwargs)

    kwargs = paramsScoreRF.copy()
    h2o_cmd.runInspect(key='scoreDS1.hex', verbose=True)
    scoreResult1 = h2o_rf.scoreRF(scoreKey1, trainResult1, **kwargs)
    h2o_cmd.runInspect(key='Predict.hex', verbose=True)

    print "\nTrain1\n=========="
    h2o_rf.simpleCheckRFScore(node=None, rfv=trainResult1, noPrint=False, **kwargs)
    print "\nScore1\n=========+"
    print h2o.dump_json(scoreResult1)
    h2o_rf.simpleCheckRFScore(node=None, rfv=scoreResult1, noPrint=False, **kwargs)

    # train2
    trainKey2 = self.loadData(trainDS2)
    scoreKey2 = self.loadData(scoreDS2)
    kwargs = paramsTrainRF.copy()
    trainResult2 = h2o_rf.trainRF(trainKey2, scoreKey2, **kwargs)

    kwargs = paramsScoreRF.copy()
    h2o_cmd.runInspect(key='scoreDS2.hex', verbose=True)
    scoreResult2 = h2o_rf.scoreRF(scoreKey2, trainResult2, **kwargs)
    h2o_cmd.runInspect(key='Predict.hex', verbose=True)

    print "\nTrain2\n=========="
    h2o_rf.simpleCheckRFScore(node=None, rfv=trainResult2, noPrint=False, **kwargs)
    print "\nScore2\n=========="
    h2o_rf.simpleCheckRFScore(node=None, rfv=scoreResult2, noPrint=False, **kwargs)

    print "\nTraining: JsonDiff sorted data results, to non-sorted results (json responses)"
    df = h2o_util.JsonDiff(trainResult1, trainResult2, with_values=True)
    print "df.difference:", h2o.dump_json(df.difference)

    print "\nScoring: JsonDiff sorted data results, to non-sorted results (json responses)"
    df = h2o_util.JsonDiff(scoreResult1, scoreResult2, with_values=True)
    print "df.difference:", h2o.dump_json(df.difference)

    # should only be two diffs
    if len(df.difference) > 2:
        raise Exception("Too many diffs in JsonDiff sorted vs non-sorted %s" % len(df.difference))

def test_RF(self):
    h2o.beta_features = True
    paramsTrainRF = {
        'seed': '1234567890',
        # if I use 100, and just one tree, I should get same results for sorted/shuffled?
        # i.e. the bagging always sees everything. Means oobe will be messed up
        # so will specify validation = the 10pct holdout data (could reuse the training data?)
        'sample_rate': 1.0,
        'ntrees': 3,
        'max_depth': 300,
        'nbins': 200,
        'timeoutSecs': 600,
        'response': 'C55',
    }

    paramsScoreRF = {
        'vactual': 'C55',
        'timeoutSecs': 600,
    }

    # 90% data
    trainKey1 = self.loadData(trainDS1)
    scoreKey1 = self.loadData(scoreDS1)
    kwargs = paramsTrainRF.copy()
    trainResult1 = h2o_rf.trainRF(trainKey1, scoreKey1, **kwargs)
    (classification_error1, classErrorPctList1, totalScores1) = h2o_rf.simpleCheckRFView(rfv=trainResult1)
    self.assertEqual(4.29, classification_error1)
    self.assertEqual([4.17, 2.98, 4.09, 14.91, 21.12, 15.38, 5.22], classErrorPctList1)
    self.assertEqual(58101, totalScores1)

    kwargs = paramsScoreRF.copy()
    scoreResult1 = h2o_rf.scoreRF(scoreKey1, trainResult1, **kwargs)

    # 10% data
    trainKey2 = self.loadData(trainDS2)
    scoreKey2 = self.loadData(scoreDS2)
    kwargs = paramsTrainRF.copy()
    trainResult2 = h2o_rf.trainRF(trainKey2, scoreKey2, **kwargs)
    (classification_error2, classErrorPctList2, totalScores2) = h2o_rf.simpleCheckRFView(rfv=trainResult2)
    self.assertEqual(4.29, classification_error2)
    self.assertEqual([4.17, 2.98, 4.09, 14.91, 21.12, 15.38, 5.22], classErrorPctList2)
    self.assertEqual(58101, totalScores2)

    kwargs = paramsScoreRF.copy()
    scoreResult2 = h2o_rf.scoreRF(scoreKey2, trainResult2, **kwargs)

    print "\nTraining: JsonDiff sorted data results, to non-sorted results (json responses)"
    df = h2o_util.JsonDiff(trainResult1, trainResult2, with_values=True)
    print "df.difference:", h2o.dump_json(df.difference)

    print "\nScoring: JsonDiff sorted data results, to non-sorted results (json responses)"
    df = h2o_util.JsonDiff(scoreResult1, scoreResult2, with_values=True)
    print "df.difference:", h2o.dump_json(df.difference)

    # should only be two diffs
    if len(df.difference) > 2:
        raise Exception("Too many diffs in JsonDiff sorted vs non-sorted %s" % len(df.difference))

def test_c8_rf_airlines_hdfs(self):
    h2o.beta_features = True
    trainParseResult = self.loadTrainData()
    kwargs = paramsTrainRF.copy()
    trainResult = h2o_rf.trainRF(trainParseResult, **kwargs)

    scoreParseResult = self.loadScoreData()
    kwargs = paramsScoreRF.copy()
    scoreResult = h2o_rf.scoreRF(scoreParseResult, trainResult, **kwargs)

def test_c8_rf_airlines_hdfs(self):
    trainParseResult = self.loadTrainData()
    kwargs = paramsTrainRF.copy()
    trainResult = h2o_rf.trainRF(trainParseResult, **kwargs)

    scoreParseResult = self.loadScoreData()
    kwargs = paramsScoreRF.copy()
    scoreResult = h2o_rf.scoreRF(scoreParseResult, trainResult, **kwargs)

    h2i.delete_keys_at_all_nodes(timeoutSecs=600)

def test_RF(self):
    trainKey = self.loadTrainData()
    kwargs = paramsTrainRF.copy()
    trainResult = h2o_rf.trainRF(trainKey, **kwargs)

    scoreKey = self.loadScoreData()
    kwargs = paramsScoreRF.copy()
    scoreResult = h2o_rf.scoreRF(scoreKey, trainResult, **kwargs)

    print "\nTrain\n=========={0}".format(h2o_rf.pp_rf_result(trainResult))
    print "\nScoring\n========={0}".format(h2o_rf.pp_rf_result(scoreResult))

def test_c8_rf_airlines_hdfs(self):
    trainParseResult = self.loadTrainData()
    kwargs = paramsTrainRF.copy()
    trainResult = h2o_rf.trainRF(trainParseResult, **kwargs)

    scoreParseResult = self.loadScoreData()
    kwargs = paramsScoreRF.copy()
    scoreResult = h2o_rf.scoreRF(scoreParseResult, trainResult, **kwargs)

    print "\nTrain\n=========={0}".format(h2o_rf.pp_rf_result(trainResult))
    print "\nScoring\n========={0}".format(h2o_rf.pp_rf_result(scoreResult))

def test_rf_iris(self):
    # Train RF
    trainParseResult = h2i.import_parse(bucket='smalldata', path='iris/iris2.csv',
        hex_key='train_iris2.hex', schema='put')
    kwargs = paramsTrainRF.copy()
    trainResult = h2o_rf.trainRF(trainParseResult, **kwargs)

    scoreParseResult = h2i.import_parse(bucket='smalldata', path='iris/iris2.csv',
        hex_key='score_iris2.hex', schema='put')
    kwargs = paramsTestRF.copy()
    scoreResult = h2o_rf.scoreRF(scoreParseResult, trainResult, **kwargs)

    print "\nTrain\n=========={0}".format(h2o_rf.pp_rf_result(trainResult))
    print "\nScoring\n========={0}".format(h2o_rf.pp_rf_result(scoreResult))

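# Several of the other tests call self.loadData(...) / self.loadTrainData() without showing
# those helpers. A minimal sketch of what such a helper might do, assuming each dataset
# descriptor is a dict carrying the bucket, csv path and destination key (the 'bucket' and
# 'hex_key' names are assumptions for illustration; only 'filename' and h2i.import_parse
# appear in these tests), is:
def loadData(self, ds):
    # import and parse the csv, returning the parse result that trainRF/scoreRF consume
    return h2i.import_parse(bucket=ds['bucket'], path=ds['filename'],
        hex_key=ds['hex_key'], schema='put')
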
def test_RF(self):
    trainKey = self.loadTrainData()
    scoreKey = self.loadScoreData()
    # time.sleep(3600)

    executeNormalRF = True
    executeNormalRF = False
    if executeNormalRF:
        kwargs = paramsTrainRF.copy()
        trainResultNormal = h2o_rf.trainRF(trainKey, model_key="rfm_normal", **kwargs)
        # print h2o_rf.pp_rf_result(trainResultNormal)

        kwargs = paramsScoreRF.copy()
        scoreResultNormal = h2o_rf.scoreRF(scoreKey, trainResultNormal, **kwargs)
        print "\nScoring normal forest\n========={0}".format(h2o_rf.pp_rf_result(scoreResultNormal))

    kwargs = paramsTrainRF.copy()
    trainResultRefined = h2o_rf.trainRF(trainKey, refine=1, model_key="rfm_refined", **kwargs)
    # print h2o_rf.pp_rf_result(trainResultRefined)

    kwargs = paramsScoreRF.copy()
    scoreResultRefined = h2o_rf.scoreRF(scoreKey, trainResultRefined, **kwargs)
    print "\nScoring refined forest\n========={0}".format(h2o_rf.pp_rf_result(scoreResultRefined))

    time.sleep(3600)

def test_rf_iris(self):
    # Train RF
    trainParseResult = h2i.import_parse(bucket='smalldata', path='iris/iris2.csv',
        hex_key='train_iris2.hex', schema='put')
    kwargs = paramsTrainRF.copy()
    trainResult = h2o_rf.trainRF(trainParseResult, **kwargs)

    scoreParseResult = h2i.import_parse(bucket='smalldata', path='iris/iris2.csv',
        hex_key='score_iris2.hex', schema='put')
    kwargs = paramsTestRF.copy()
    scoreResult = h2o_rf.scoreRF(scoreParseResult, trainResult, **kwargs)

    print "\nTrain\n=========={0}".format(h2o_rf.pp_rf_result(trainResult))
    print "\nScoring\n========={0}".format(h2o_rf.pp_rf_result(scoreResult))

def test_RF(self):
    h2o.beta_features = True
    if h2o.beta_features:
        paramsTrainRF = {
            'ntrees': 10,
            'max_depth': 300,
            'nbins': 200,
            'timeoutSecs': 600,
            'response': 'C55',
        }

        paramsScoreRF = {
            'vactual': 'C55',
            'timeoutSecs': 600,
        }
    else:
        paramsTrainRF = {
            'use_non_local_data': 1,
            'ntree': 10,
            'depth': 300,
            'bin_limit': 20000,
            'stat_type': 'ENTROPY',
            'out_of_bag_error_estimate': 1,
            'exclusive_split_limit': 0,
            'timeoutSecs': 60,
        }

        paramsScoreRF = {
            # scoring requires the response_variable. it defaults to last, so normally
            # we don't need to specify. But put this here and (above if used)
            # in case a dataset doesn't use last col
            'response_variable': None,
            'timeoutSecs': 60,
            'out_of_bag_error_estimate': 0,
        }

    trainKey1 = self.loadData(trainDS1)
    kwargs = paramsTrainRF.copy()
    trainResult1 = h2o_rf.trainRF(trainKey1, **kwargs)

    scoreKey1 = self.loadData(scoreDS1)
    kwargs = paramsScoreRF.copy()
    scoreResult1 = h2o_rf.scoreRF(scoreKey1, trainResult1, **kwargs)

    trainKey2 = self.loadData(trainDS2)
    kwargs = paramsTrainRF.copy()
    trainResult2 = h2o_rf.trainRF(trainKey2, **kwargs)

    scoreKey2 = self.loadData(scoreDS2)
    kwargs = paramsScoreRF.copy()
    scoreResult2 = h2o_rf.scoreRF(scoreKey2, trainResult2, **kwargs)

    print "\nTraining: JsonDiff sorted data results, to non-sorted results (json responses)"
    df = h2o_util.JsonDiff(trainResult1, trainResult2, with_values=True)
    print "df.difference:", h2o.dump_json(df.difference)

    print "\nScoring: JsonDiff sorted data results, to non-sorted results (json responses)"
    df = h2o_util.JsonDiff(scoreResult1, scoreResult2, with_values=True)
    print "df.difference:", h2o.dump_json(df.difference)

def test_RF(self):
    h2o.beta_features = True
    if h2o.beta_features:
        paramsTrainRF = {
            'ntrees': 10,
            'max_depth': 300,
            'nbins': 200,
            'timeoutSecs': 600,
            'response': 'C55',
        }

        paramsScoreRF = {
            'vactual': 'C55',
            'timeoutSecs': 600,
        }
    else:
        paramsTrainRF = {
            'use_non_local_data': 1,
            'ntree': 10,
            'depth': 300,
            'bin_limit': 20000,
            'stat_type': 'ENTROPY',
            'out_of_bag_error_estimate': 1,
            'exclusive_split_limit': 0,
            'timeoutSecs': 60,
        }

        paramsScoreRF = {
            # scoring requires the response_variable. it defaults to last, so normally
            # we don't need to specify. But put this here and (above if used)
            # in case a dataset doesn't use last col
            'response_variable': None,
            'timeoutSecs': 60,
            'out_of_bag_error_estimate': 0,
        }

    trainKey1 = self.loadData(trainDS1)
    kwargs = paramsTrainRF.copy()
    trainResult1 = h2o_rf.trainRF(trainKey1, **kwargs)

    scoreKey1 = self.loadData(scoreDS1)
    kwargs = paramsScoreRF.copy()
    scoreResult1 = h2o_rf.scoreRF(scoreKey1, trainResult1, **kwargs)

    trainKey2 = self.loadData(trainDS2)
    kwargs = paramsTrainRF.copy()
    trainResult2 = h2o_rf.trainRF(trainKey2, **kwargs)

    scoreKey2 = self.loadData(scoreDS2)
    kwargs = paramsScoreRF.copy()
    scoreResult2 = h2o_rf.scoreRF(scoreKey2, trainResult2, **kwargs)

    print "\nTraining: JsonDiff sorted data results, to non-sorted results (json responses)"
    df = h2o_util.JsonDiff(trainResult1, trainResult2, with_values=True)
    print "df.difference:", h2o.dump_json(df.difference)

    print "\nScoring: JsonDiff sorted data results, to non-sorted results (json responses)"
    df = h2o_util.JsonDiff(scoreResult1, scoreResult2, with_values=True)
    print "df.difference:", h2o.dump_json(df.difference)

def test_RF(self):
    h2o.beta_features = True
    paramsTrainRF = {
        'seed': '1234567890',
        # if I use 100, and just one tree, I should get same results for sorted/shuffled?
        # i.e. the bagging always sees everything. Means oobe will be messed up
        # so will specify validation = the 10pct holdout data (could reuse the training data?)
        'sample_rate': 1.0,
        'ntrees': 3,
        'max_depth': 300,
        'nbins': 200,
        'timeoutSecs': 600,
        'response': 'C55',
    }

    paramsScoreRF = {
        'vactual': 'C55',
        'timeoutSecs': 600,
    }

    # 90% data
    trainKey1 = self.loadData(trainDS1)
    scoreKey1 = self.loadData(scoreDS1)
    kwargs = paramsTrainRF.copy()
    trainResult1 = h2o_rf.trainRF(trainKey1, scoreKey1, **kwargs)
    (classification_error1, classErrorPctList1, totalScores1) = h2o_rf.simpleCheckRFView(rfv=trainResult1)
    # self.assertEqual(4.29, classification_error1)
    # self.assertEqual([4.17, 2.98, 4.09, 14.91, 21.12, 15.38, 5.22], classErrorPctList1)
    # with new RNG 9/26/14
    self.assertEqual(4.4, classification_error1)
    self.assertEqual([3.71, 3.56, 4.32, 18.55, 21.22, 13.51, 5.82], classErrorPctList1)
    self.assertEqual(58101, totalScores1)

    kwargs = paramsScoreRF.copy()
    scoreResult1 = h2o_rf.scoreRF(scoreKey1, trainResult1, **kwargs)

    # 10% data
    trainKey2 = self.loadData(trainDS2)
    scoreKey2 = self.loadData(scoreDS2)
    kwargs = paramsTrainRF.copy()
    trainResult2 = h2o_rf.trainRF(trainKey2, scoreKey2, **kwargs)
    (classification_error2, classErrorPctList2, totalScores2) = h2o_rf.simpleCheckRFView(rfv=trainResult2)
    # self.assertEqual(4.29, classification_error2)
    # self.assertEqual([4.17, 2.98, 4.09, 14.91, 21.12, 15.38, 5.22], classErrorPctList2)
    # with new RNG 9/26/14
    self.assertEqual(4.4, classification_error2)
    self.assertEqual([3.71, 3.56, 4.32, 18.55, 21.22, 13.51, 5.82], classErrorPctList2)
    self.assertEqual(58101, totalScores2)

    kwargs = paramsScoreRF.copy()
    scoreResult2 = h2o_rf.scoreRF(scoreKey2, trainResult2, **kwargs)

    print "\nTraining: JsonDiff sorted data results, to non-sorted results (json responses)"
    df = h2o_util.JsonDiff(trainResult1, trainResult2, with_values=True)
    print "df.difference:", h2o.dump_json(df.difference)

    print "\nScoring: JsonDiff sorted data results, to non-sorted results (json responses)"
    df = h2o_util.JsonDiff(scoreResult1, scoreResult2, with_values=True)
    print "df.difference:", h2o.dump_json(df.difference)

    # should only be two diffs
    if len(df.difference) > 2:
        raise Exception("Too many diffs in JsonDiff sorted vs non-sorted %s" % len(df.difference))

def test_RF(self):
    h2o.beta_features = True
    if h2o.beta_features:
        paramsTrainRF = {
            'ntrees': 3,
            'max_depth': 10,
            'nbins': 50,
            'timeoutSecs': 600,
            'response': 'C54',
            'classification': 1,
        }

        paramsScoreRF = {
            'vactual': 'C54',
            'timeoutSecs': 600,
        }
    else:
        paramsTrainRF = {
            'use_non_local_data': 1,
            'ntree': 10,
            'depth': 300,
            'bin_limit': 20000,
            'stat_type': 'ENTROPY',
            'out_of_bag_error_estimate': 1,
            'exclusive_split_limit': 0,
            'timeoutSecs': 60,
        }

        paramsScoreRF = {
            # scoring requires the response_variable. it defaults to last, so normally
            # we don't need to specify. But put this here and (above if used)
            # in case a dataset doesn't use last col
            'response_variable': None,
            'timeoutSecs': 60,
            'out_of_bag_error_estimate': 0,
        }

    # train1
    trainKey1 = self.loadData(trainDS1)
    kwargs = paramsTrainRF.copy()
    trainResult1 = h2o_rf.trainRF(trainKey1, **kwargs)

    scoreKey1 = self.loadData(scoreDS1)
    kwargs = paramsScoreRF.copy()
    h2o_cmd.runInspect(key='scoreDS1.hex', verbose=True)
    scoreResult1 = h2o_rf.scoreRF(scoreKey1, trainResult1, **kwargs)
    h2o_cmd.runInspect(key='Predict.hex', verbose=True)

    print "\nTrain1\n=========="
    h2o_rf.simpleCheckRFScore(node=None, rfv=trainResult1, noPrint=False, **kwargs)
    print "\nScore1\n=========+"
    h2o_rf.simpleCheckRFScore(node=None, rfv=scoreResult1, noPrint=False, **kwargs)

    # train2
    trainKey2 = self.loadData(trainDS2)
    kwargs = paramsTrainRF.copy()
    trainResult2 = h2o_rf.trainRF(trainKey2, **kwargs)

    scoreKey2 = self.loadData(scoreDS2)
    kwargs = paramsScoreRF.copy()
    h2o_cmd.runInspect(key='scoreDS2.hex', verbose=True)
    scoreResult2 = h2o_rf.scoreRF(scoreKey2, trainResult2, **kwargs)
    h2o_cmd.runInspect(key='Predict.hex', verbose=True)

    print "\nTrain2\n=========="
    h2o_rf.simpleCheckRFScore(node=None, rfv=trainResult2, noPrint=False, **kwargs)
    print "\nScore2\n=========="
    h2o_rf.simpleCheckRFScore(node=None, rfv=scoreResult2, noPrint=False, **kwargs)

    print "\nTraining: JsonDiff sorted data results, to non-sorted results (json responses)"
    df = h2o_util.JsonDiff(trainResult1, trainResult2, with_values=True)
    print "df.difference:", h2o.dump_json(df.difference)

    print "\nScoring: JsonDiff sorted data results, to non-sorted results (json responses)"
    df = h2o_util.JsonDiff(scoreResult1, scoreResult2, with_values=True)
    print "df.difference:", h2o.dump_json(df.difference)

def test_RF(self):
    h2o.beta_features = True
    if h2o.beta_features:
        paramsTrainRF = {
            'ntrees': 3,
            'max_depth': 10,
            'nbins': 50,
            'timeoutSecs': 600,
            'response': 'C54',
            'classification': 1,
        }

        paramsScoreRF = {
            'vactual': 'C54',
            'timeoutSecs': 600,
        }
    else:
        paramsTrainRF = {
            'use_non_local_data': 1,
            'ntree': 10,
            'depth': 300,
            'bin_limit': 20000,
            'stat_type': 'ENTROPY',
            'out_of_bag_error_estimate': 1,
            'exclusive_split_limit': 0,
            'timeoutSecs': 60,
        }

        paramsScoreRF = {
            # scoring requires the response_variable. it defaults to last, so normally
            # we don't need to specify. But put this here and (above if used)
            # in case a dataset doesn't use last col
            'response_variable': None,
            'timeoutSecs': 60,
            'out_of_bag_error_estimate': 0,
        }

    # train1
    trainKey1 = self.loadData(trainDS1)
    kwargs = paramsTrainRF.copy()
    trainResult1 = h2o_rf.trainRF(trainKey1, **kwargs)

    scoreKey1 = self.loadData(scoreDS1)
    kwargs = paramsScoreRF.copy()
    h2o_cmd.runInspect(key='scoreDS1.hex', verbose=True)
    scoreResult1 = h2o_rf.scoreRF(scoreKey1, trainResult1, **kwargs)
    h2o_cmd.runInspect(key='Predict.hex', verbose=True)

    print "\nTrain1\n=========="
    h2o_rf.simpleCheckRFScore(node=None, rfv=trainResult1, noPrint=False, **kwargs)
    print "\nScore1\n=========+"
    print h2o.dump_json(scoreResult1)
    h2o_rf.simpleCheckRFScore(node=None, rfv=scoreResult1, noPrint=False, **kwargs)

    # train2
    trainKey2 = self.loadData(trainDS2)
    kwargs = paramsTrainRF.copy()
    trainResult2 = h2o_rf.trainRF(trainKey2, **kwargs)

    scoreKey2 = self.loadData(scoreDS2)
    kwargs = paramsScoreRF.copy()
    h2o_cmd.runInspect(key='scoreDS2.hex', verbose=True)
    scoreResult2 = h2o_rf.scoreRF(scoreKey2, trainResult2, **kwargs)
    h2o_cmd.runInspect(key='Predict.hex', verbose=True)

    print "\nTrain2\n=========="
    h2o_rf.simpleCheckRFScore(node=None, rfv=trainResult2, noPrint=False, **kwargs)
    print "\nScore2\n=========="
    h2o_rf.simpleCheckRFScore(node=None, rfv=scoreResult2, noPrint=False, **kwargs)

    if 1 == 0:
        print "\nTraining: JsonDiff sorted data results, to non-sorted results (json responses)"
        df = h2o_util.JsonDiff(trainResult1, trainResult2, with_values=True)
        print "df.difference:", h2o.dump_json(df.difference)

        print "\nScoring: JsonDiff sorted data results, to non-sorted results (json responses)"
        df = h2o_util.JsonDiff(scoreResult1, scoreResult2, with_values=True)
        print "df.difference:", h2o.dump_json(df.difference)

def test_RF(self):
    h2o.beta_features = True
    if h2o.beta_features:
        paramsTrainRF = {
            "ntrees": 3,
            "max_depth": 10,
            "nbins": 50,
            "timeoutSecs": 600,
            "response": "C55",
            "classification": 1,
        }

        paramsScoreRF = {"vactual": "C55", "timeoutSecs": 600}
    else:
        paramsTrainRF = {
            "use_non_local_data": 1,
            "ntree": 10,
            "depth": 300,
            "bin_limit": 20000,
            "stat_type": "ENTROPY",
            "out_of_bag_error_estimate": 1,
            "exclusive_split_limit": 0,
            "timeoutSecs": 60,
        }

        paramsScoreRF = {
            # scoring requires the response_variable. it defaults to last, so normally
            # we don't need to specify. But put this here and (above if used)
            # in case a dataset doesn't use last col
            "response_variable": None,
            "timeoutSecs": 60,
            "out_of_bag_error_estimate": 0,
        }

    # train1
    trainKey1 = self.loadData(trainDS1)
    kwargs = paramsTrainRF.copy()
    trainResult1 = h2o_rf.trainRF(trainKey1, **kwargs)

    scoreKey1 = self.loadData(scoreDS1)
    kwargs = paramsScoreRF.copy()
    h2o_cmd.runInspect(key="scoreDS1.hex", verbose=True)
    scoreResult1 = h2o_rf.scoreRF(scoreKey1, trainResult1, **kwargs)
    h2o_cmd.runInspect(key="Predict.hex", verbose=True)

    print "\nTrain1\n=========="
    h2o_rf.simpleCheckRFScore(node=None, rfv=trainResult1, noPrint=False, **kwargs)
    print "\nScore1\n=========+"
    print h2o.dump_json(scoreResult1)
    h2o_rf.simpleCheckRFScore(node=None, rfv=scoreResult1, noPrint=False, **kwargs)

    # train2
    trainKey2 = self.loadData(trainDS2)
    kwargs = paramsTrainRF.copy()
    trainResult2 = h2o_rf.trainRF(trainKey2, **kwargs)

    scoreKey2 = self.loadData(scoreDS2)
    kwargs = paramsScoreRF.copy()
    h2o_cmd.runInspect(key="scoreDS2.hex", verbose=True)
    scoreResult2 = h2o_rf.scoreRF(scoreKey2, trainResult2, **kwargs)
    h2o_cmd.runInspect(key="Predict.hex", verbose=True)

    print "\nTrain2\n=========="
    h2o_rf.simpleCheckRFScore(node=None, rfv=trainResult2, noPrint=False, **kwargs)
    print "\nScore2\n=========="
    h2o_rf.simpleCheckRFScore(node=None, rfv=scoreResult2, noPrint=False, **kwargs)

    if 1 == 0:
        print "\nTraining: JsonDiff sorted data results, to non-sorted results (json responses)"
        df = h2o_util.JsonDiff(trainResult1, trainResult2, with_values=True)
        print "df.difference:", h2o.dump_json(df.difference)

        print "\nScoring: JsonDiff sorted data results, to non-sorted results (json responses)"
        df = h2o_util.JsonDiff(scoreResult1, scoreResult2, with_values=True)
        print "df.difference:", h2o.dump_json(df.difference)

def test_RF(self):
    paramsTrainRF = {
        'seed': '1234567890',
        'ntrees': 1,
        'max_depth': 10,
        # 'sample_rate': 1.0,
        'sample_rate': 1.0,
        'nbins': 50,
        'timeoutSecs': 600,
        'response': 'C55',
        'classification': 1,
    }

    paramsScoreRF = {
        'vactual': 'C55',
        'timeoutSecs': 600,
    }

    # train1
    trainKey1 = self.loadData(trainDS1)
    scoreKey1 = self.loadData(scoreDS1)
    kwargs = paramsTrainRF.copy()
    trainResult1 = h2o_rf.trainRF(trainKey1, scoreKey1, **kwargs)

    kwargs = paramsScoreRF.copy()
    h2o_cmd.runInspect(key='scoreDS1.hex', verbose=True)
    scoreResult1 = h2o_rf.scoreRF(scoreKey1, trainResult1, **kwargs)
    h2o_cmd.runInspect(key='Predict.hex', verbose=True)

    print "\nTrain1\n=========="
    h2o_rf.simpleCheckRFScore(node=None, rfv=trainResult1, noPrint=False, **kwargs)
    print "\nScore1\n=========+"
    print h2o.dump_json(scoreResult1)
    h2o_rf.simpleCheckRFScore(node=None, rfv=scoreResult1, noPrint=False, **kwargs)

    # train2
    trainKey2 = self.loadData(trainDS2)
    scoreKey2 = self.loadData(scoreDS2)
    kwargs = paramsTrainRF.copy()
    trainResult2 = h2o_rf.trainRF(trainKey2, scoreKey2, **kwargs)

    kwargs = paramsScoreRF.copy()
    h2o_cmd.runInspect(key='scoreDS2.hex', verbose=True)
    scoreResult2 = h2o_rf.scoreRF(scoreKey2, trainResult2, **kwargs)
    h2o_cmd.runInspect(key='Predict.hex', verbose=True)

    print "\nTrain2\n=========="
    h2o_rf.simpleCheckRFScore(node=None, rfv=trainResult2, noPrint=False, **kwargs)
    print "\nScore2\n=========="
    h2o_rf.simpleCheckRFScore(node=None, rfv=scoreResult2, noPrint=False, **kwargs)

    print "\nTraining: JsonDiff sorted data results, to non-sorted results (json responses)"
    df = h2o_util.JsonDiff(trainResult1, trainResult2, with_values=True)
    print "df.difference:", h2o.dump_json(df.difference)

    print "\nScoring: JsonDiff sorted data results, to non-sorted results (json responses)"
    df = h2o_util.JsonDiff(scoreResult1, scoreResult2, with_values=True)
    print "df.difference:", h2o.dump_json(df.difference)

    # should only be two diffs
    if len(df.difference) > 2:
        raise Exception(
            "Too many diffs in JsonDiff sorted vs non-sorted %s" % len(df.difference))