def scoreRF(scoreParseResult, trainResult, vactual=None, timeoutSecs=120, **kwargs): # Run validation on dataset parseKey = scoreParseResult['destination_key'] if h2o.beta_features: # this is how we're supposed to do scorin? rfModelKey = trainResult['drf_model']['_key'] predictKey = 'Predict.hex' start = time.time() predictResult = h2o_cmd.runPredict(data_key=parseKey, model_key=rfModelKey, destination_key=predictKey, timeoutSecs=timeoutSecs, **kwargs) h2o_cmd.runInspect(key='Predict.hex', verbose=True) predictCMResult = h2o.nodes[0].predict_confusion_matrix( actual=parseKey, vactual=vactual, predict=predictKey, vpredict='predict', timeoutSecs=timeoutSecs, **kwargs) rftime = time.time() - start cm = predictCMResult['cm'] # These will move into the h2o_gbm.py pctWrong = h2o_gbm.pp_cm_summary(cm) print "\nTest\n==========\n" print h2o_gbm.pp_cm(cm) scoreResult = predictCMResult else: ntree = trainResult['ntree'] rfModelKey = trainResult['model_key'] start = time.time() # NOTE: response_variable is required, and passed from kwargs here # out_of_bag_error_estimate=0 is required for scoring. H2O will assert if 1 and different data set # compared to training kwargs['out_of_bag_error_estimate'] = 0 scoreResult = h2o_cmd.runRFView(None, parseKey, rfModelKey, ntree=ntree, timeoutSecs=timeoutSecs, **kwargs) rftime = time.time() - start h2o.verboseprint("RF score results: ", scoreResult) h2o.verboseprint("RF computation took {0} sec".format(rftime)) scoreResult['python_call_timer'] = rftime return scoreResult
def predict_and_compare_csvs(model_key): start = time.time() predict = h2o_cmd.runPredict(model_key=model_key, data_key=hexKey, destination_key=predictHexKey) print "runPredict end on ", hexKey, " took", time.time() - start, 'seconds' h2o.check_sandbox_for_errors() inspect = h2o_cmd.runInspect(key=predictHexKey) h2o_cmd.infoFromInspect(inspect, 'predict.hex') h2o.nodes[0].csv_download(src_key=predictHexKey, csvPathname=csvPredictPathname) h2o.nodes[0].csv_download(src_key=execHexKey, csvPathname=csvExecPathname) h2o.check_sandbox_for_errors() print "Do a check of the original output col against predicted output" translate = {1: 0.0, 2: 1.0, 3: 1.0, 4: 1.0, 5: 1.0, 6: 1.0, 7: 1.0} (rowNum1, originalOutput) = compare_csv_last_col(csvExecPathname, msg="Original, after being exec'ed", skipHeader=True) (rowNum2, predictOutput) = compare_csv_last_col(csvPredictPathname, msg="Predicted", skipHeader=True) # no header on source if (rowNum1 != rowNum2): raise Exception("original rowNum1: %s not same as downloaded predict (w/header) rowNum2: \ %s" % (rowNum1, rowNum2)) wrong = 0 wrong0 = 0 wrong1 = 0 for rowNum,(o,p) in enumerate(zip(originalOutput, predictOutput)): o = float(o) p = float(p) if o!=p: msg = "Comparing original output col vs predicted. row %s differs. \ original: %s predicted: %s" % (rowNum, o, p) if p==0.0 and wrong0==10: print "Not printing any more predicted=0 mismatches" elif p==0.0 and wrong0<10: print msg if p==1.0 and wrong1==10: print "Not printing any more predicted=1 mismatches" elif p==1.0 and wrong1<10: print msg if p==0.0: wrong0 += 1 elif p==1.0: wrong1 += 1 wrong += 1 print "wrong0:", wrong0 print "wrong1:", wrong1 print "\nTotal wrong:", wrong print "Total:", len(originalOutput) pctWrong = (100.0 * wrong)/len(originalOutput) print "wrong/Total * 100 ", pctWrong # I looked at what h2o can do for modelling with binomial and it should get better than 25% error? if pctWrong > 16.0: raise Exception("pct wrong: %s too high. 
Expect < 16 pct error" % pctWrong)
def predict_and_compare_csvs(model_key): start = time.time() predict = h2o_cmd.runPredict(model_key=model_key, data_key=hexKey, destination_key=predictHexKey) print "runPredict end on ", hexKey, " took", time.time() - start, 'seconds' h2o.check_sandbox_for_errors() inspect = h2o_cmd.runInspect(key=predictHexKey) h2o_cmd.infoFromInspect(inspect, 'predict.hex') h2o.nodes[0].csv_download(src_key=predictHexKey, csvPathname=csvPredictPathname) h2o.nodes[0].csv_download(src_key=execHexKey, csvPathname=csvExecPathname) h2o.check_sandbox_for_errors() print "Do a check of the original output col against predicted output" translate = {1: 0.0, 2: 1.0, 3: 1.0, 4: 1.0, 5: 1.0, 6: 1.0, 7: 1.0} (rowNum1, originalOutput) = compare_csv_last_col(csvExecPathname, msg="Original, after being exec'ed", skipHeader=True) (rowNum2, predictOutput) = compare_csv_last_col(csvPredictPathname, msg="Predicted", skipHeader=True) # no header on source if (rowNum1 != rowNum2): raise Exception("original rowNum1: %s not same as downloaded predict (w/header) rowNum2: \ %s" % (rowNum1, rowNum2)) wrong = 0 wrong0 = 0 wrong1 = 0 for rowNum,(o,p) in enumerate(zip(originalOutput, predictOutput)): o = float(o) p = float(p) if o!=p: msg = "Comparing original output col vs predicted. row %s differs. \ original: %s predicted: %s" % (rowNum, o, p) if p==0.0 and wrong0==10: print "Not printing any more predicted=0 mismatches" elif p==0.0 and wrong0<10: print msg if p==1.0 and wrong1==10: print "Not printing any more predicted=1 mismatches" elif p==1.0 and wrong1<10: print msg if p==0.0: wrong0 += 1 elif p==1.0: wrong1 += 1 wrong += 1 print "wrong0:", wrong0 print "wrong1:", wrong1 print "\nTotal wrong:", wrong print "Total:", len(originalOutput) pctWrong = (100.0 * wrong)/len(originalOutput) print "wrong/Total * 100 ", pctWrong # I looked at what h2o can do for modelling with binomial and it should get better than 25% error? if pctWrong > 10.0: raise Exception("pct wrong too high. Expect < 10% error")
def scoreRF(scoreParseResult, trainResult, vactual=None, timeoutSecs=120, **kwargs): # Run validation on dataset parseKey = scoreParseResult['destination_key'] if h2o.beta_features: # this is how we're supposed to do scorin? rfModelKey = trainResult['drf_model']['_key'] predictKey = 'Predict.hex' start = time.time() predictResult = h2o_cmd.runPredict( data_key=parseKey, model_key=rfModelKey, destination_key=predictKey, timeoutSecs=timeoutSecs, **kwargs) h2o_cmd.runInspect(key='Predict.hex', verbose=True) predictCMResult = h2o.nodes[0].predict_confusion_matrix( actual=parseKey, vactual=vactual, predict=predictKey, vpredict='predict', timeoutSecs=timeoutSecs, **kwargs) rftime = time.time()-start cm = predictCMResult['cm'] # These will move into the h2o_gbm.py pctWrong = h2o_gbm.pp_cm_summary(cm); print "\nTest\n==========\n" print h2o_gbm.pp_cm(cm) scoreResult = predictCMResult else: ntree = trainResult['ntree'] rfModelKey = trainResult['model_key'] start = time.time() # NOTE: response_variable is required, and passed from kwargs here # out_of_bag_error_estimate=0 is required for scoring. H2O will assert if 1 and different data set # compared to training kwargs['out_of_bag_error_estimate'] = 0 scoreResult = h2o_cmd.runRFView(None, parseKey, rfModelKey, ntree=ntree, timeoutSecs=timeoutSecs, **kwargs) rftime = time.time()-start h2o.verboseprint("RF score results: ", scoreResult) h2o.verboseprint("RF computation took {0} sec".format(rftime)) scoreResult['python_call_timer'] = rftime return scoreResult
def scoreRF(scoreParseResult, trainResult, vactual=None, timeoutSecs=120, **kwargs): # Run validation on dataset parseKey = scoreParseResult['destination_key'] # this is how we're supposed to do scorin? rfModelKey = trainResult['drf_model']['_key'] predictKey = 'Predict.hex' start = time.time() predictResult = h2o_cmd.runPredict(data_key=parseKey, model_key=rfModelKey, destination_key=predictKey, timeoutSecs=timeoutSecs, **kwargs) h2o_cmd.runInspect(key='Predict.hex', verbose=True) predictCMResult = h2o.nodes[0].predict_confusion_matrix( actual=parseKey, vactual=vactual, predict=predictKey, vpredict='predict', timeoutSecs=timeoutSecs, **kwargs) rftime = time.time() - start cm = predictCMResult['cm'] # These will move into the h2o_gbm.py pctWrong = h2o_gbm.pp_cm_summary(cm) print "\nTest\n==========\n" print h2o_gbm.pp_cm(cm) scoreResult = predictCMResult rftime = time.time() - start h2o.verboseprint("RF score results: ", scoreResult) h2o.verboseprint("RF computation took {0} sec".format(rftime)) scoreResult['python_call_timer'] = rftime return scoreResult
def scoreRF(scoreParseResult, trainResult, vactual=None, timeoutSecs=120, **kwargs): # Run validation on dataset parseKey = scoreParseResult['destination_key'] # this is how we're supposed to do scorin? rfModelKey = trainResult['drf_model']['_key'] predictKey = 'Predict.hex' start = time.time() predictResult = h2o_cmd.runPredict( data_key=parseKey, model_key=rfModelKey, destination_key=predictKey, timeoutSecs=timeoutSecs, **kwargs) h2o_cmd.runInspect(key='Predict.hex', verbose=True) predictCMResult = h2o.nodes[0].predict_confusion_matrix( actual=parseKey, vactual=vactual, predict=predictKey, vpredict='predict', timeoutSecs=timeoutSecs, **kwargs) rftime = time.time()-start cm = predictCMResult['cm'] # These will move into the h2o_gbm.py pctWrong = h2o_gbm.pp_cm_summary(cm); print "\nTest\n==========\n" print h2o_gbm.pp_cm(cm) scoreResult = predictCMResult rftime = time.time()-start h2o.verboseprint("RF score results: ", scoreResult) h2o.verboseprint("RF computation took {0} sec".format(rftime)) scoreResult['python_call_timer'] = rftime return scoreResult
def test_GLM_enums_unbalanced(self): SYNDATASETS_DIR = h2o.make_syn_dir() n = 2000 tryList = [ (n, 1, 'cD', 300), (n, 2, 'cE', 300), (n, 4, 'cF', 300), (n, 8, 'cG', 300), (n, 16, 'cH', 300), (n, 32, 'cI', 300), ] for (rowCount, colCount, hex_key, timeoutSecs) in tryList: # using the comma is nice to ensure no craziness colSepHexString = '2c' # comma colSepChar = colSepHexString.decode('hex') colSepInt = int(colSepHexString, base=16) print "colSepChar:", colSepChar rowSepHexString = '0a' # newline rowSepChar = rowSepHexString.decode('hex') print "rowSepChar:", rowSepChar SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str( colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename csvScoreFilename = 'syn_enums_score_' + str(rowCount) + 'x' + str( colCount) + '.csv' csvScorePathname = SYNDATASETS_DIR + '/' + csvScoreFilename enumList = create_enum_list(listSize=10) # use half of the enums for creating the scoring dataset enumListForScore = random.sample(enumList, 5) print "Creating random", csvPathname, "for glm2 model building" write_syn_dataset(csvPathname, enumList, rowCount, colCount, SEEDPERFILE, colSepChar=colSepChar, rowSepChar=rowSepChar) print "Creating another random", csvScorePathname, "for glm2 scoring with prior model (using enum subset)" write_syn_dataset(csvScorePathname, enumListForScore, rowCount, colCount, SEEDPERFILE, colSepChar=colSepChar, rowSepChar=rowSepChar) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30, separator=colSepInt) print csvFilename, 'parse time:', parseResult['response']['time'] print "Parse result['destination_key']:", parseResult[ 'destination_key'] print "\n" + csvFilename (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \ h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=True) testDataKey = "score_" + hex_key parseResult = 
h2i.import_parse(path=csvScorePathname, schema='put', hex_key=testDataKey, timeoutSecs=30, separator=colSepInt) y = colCount modelKey = 'glm_model' kwargs = { 'standardize': 0, 'destination_key': modelKey, 'response': 'C' + str(y + 1), 'max_iter': 200, 'family': 'binomial', 'n_folds': 0, 'alpha': 0, 'lambda': 0, } start = time.time() updateList = [ { 'alpha': 0.5, 'lambda': 1e-4 }, { 'alpha': 0.25, 'lambda': 1e-6 }, { 'alpha': 0.0, 'lambda': 1e-12 }, { 'alpha': 0.5, 'lambda': 1e-12 }, { 'alpha': 0.0, 'lambda': 1e-12 }, { 'alpha': 0.0, 'lambda': 0 }, ] # Try each one h2o.beta_features = True for updateDict in updateList: print "\n#################################################################" print updateDict kwargs.update(updateDict) print "If we poll, we get a message saying it was cancelled by user??" glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=180, noPoll=True, **kwargs) h2j.pollWaitJobs(timeoutSecs=300, pollTimeoutSecs=300, retryDelaySecs=5, errorIfCancelled=True) glm = h2o.nodes[0].glm_view(_modelKey=modelKey) print "glm2 end on ", parseResult[ 'destination_key'], 'took', time.time() - start, 'seconds' glm_model = glm['glm_model'] _names = glm_model['_names'] modelKey = glm_model['_key'] coefficients_names = glm_model['coefficients_names'] submodels = glm_model['submodels'][0] beta = submodels['beta'] norm_beta = submodels['norm_beta'] iteration = submodels['iteration'] validation = submodels['validation'] if not validation or 'avg_err' not in validation: raise Exception("glm: %s" % h2o.dump_json(glm) + \ "\nNo avg_err in validation." + \ "\nLikely if you look back, the job was cancelled, so there's no cross validation.") avg_err = validation['avg_err'] auc = validation['auc'] aic = validation['aic'] null_deviance = validation['null_deviance'] residual_deviance = validation['residual_deviance'] print '_names', _names print 'coefficients_names', coefficients_names # did beta get shortened? 
the simple check confirms names/beta/norm_beta are same length print 'beta', beta print 'iteration', iteration print 'avg_err', avg_err print 'auc', auc h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) if iteration > 20: raise Exception( "Why take so many iterations: %s in this glm2 training?" % iterations) # Score ********************************************** print "Problems with test data having different enums than train? just use train for now" testDataKey = hex_key predictKey = 'Predict.hex' start = time.time() predictResult = h2o_cmd.runPredict(data_key=testDataKey, model_key=modelKey, destination_key=predictKey, timeoutSecs=timeoutSecs) predictCMResult = h2o.nodes[0].predict_confusion_matrix( actual=testDataKey, vactual='C' + str(y), predict=predictKey, vpredict='predict', ) cm = predictCMResult['cm'] # These will move into the h2o_gbm.py pctWrong = h2o_gbm.pp_cm_summary(cm) self.assertLess( pctWrong, 8, "Should see less than 7 pct error (class = 4): %s" % pctWrong) print "\nTest\n==========\n" print h2o_gbm.pp_cm(cm) if 1 == 0: # stuff from GLM1 classErr = glmScore['validation']['classErr'] auc = glmScore['validation']['auc'] err = glmScore['validation']['err'] nullDev = glmScore['validation']['nullDev'] resDev = glmScore['validation']['resDev'] h2o_glm.simpleCheckGLMScore(self, glmScore, **kwargs) print "score classErr:", classErr print "score err:", err print "score auc:", auc print "score resDev:", resDev print "score nullDev:", nullDev if math.isnan(resDev): emsg = "Why is this resDev = 'nan'?? %6s %s" % ( "resDev:\t", validation['resDev']) raise Exception(emsg) # what is reasonable? # self.assertAlmostEqual(err, 0.3, delta=0.15, msg="actual err: %s not close enough to 0.3" % err) self.assertAlmostEqual( auc, 0.5, delta=0.15, msg="actual auc: %s not close enough to 0.5" % auc) if math.isnan(err): emsg = "Why is this err = 'nan'?? %6s %s" % ("err:\t", err) raise Exception(emsg) if math.isnan(resDev): emsg = "Why is this resDev = 'nan'?? 
%6s %s" % ( "resDev:\t", resDev) raise Exception(emsg) if math.isnan(nullDev): emsg = "Why is this nullDev = 'nan'?? %6s %s" % ( "nullDev:\t", nullDev)
def test_DeepLearning_mnist(self): #h2b.browseTheCloud() csvPathname_train = 'mnist/train.csv.gz' csvPathname_test = 'mnist/test.csv.gz' hex_key = 'mnist_train.hex' validation_key = 'mnist_test.hex' timeoutSecs = 300 parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname_train, schema='put', hex_key=hex_key, timeoutSecs=timeoutSecs) parseResultV = h2i.import_parse(bucket='smalldata', path=csvPathname_test, schema='put', hex_key=validation_key, timeoutSecs=timeoutSecs) inspect = h2o_cmd.runInspect(None, hex_key) print "\n" + csvPathname_train, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) response = inspect['numCols'] - 1 #Making random id identifier = ''.join(random.sample(string.ascii_lowercase + string.digits, 10)) model_key = 'deeplearning_' + identifier + '.hex' kwargs = { 'ignored_cols' : None, 'response' : response, 'classification' : 1, 'activation' : 'RectifierWithDropout', 'input_dropout_ratio' : 0.2, 'hidden' : '1024,1024,2048', 'adaptive_rate' : 1, 'rho' : 0.99, 'epsilon' : 1e-8, 'train_samples_per_iteration' : -1, ## 0: better accuracy! -1: best scalability! 10000: best accuracy? 
# 'rate' : 0.01, # 'rate_annealing' : 1e-6, # 'momentum_start' : 0.5, # 'momentum_ramp' : 1800000, # 'momentum_stable' : 0.99, 'l1' : 1e-5, 'l2' : 0.0, 'seed' : 98037452452, 'loss' : 'CrossEntropy', 'max_w2' : 15, 'initial_weight_distribution' : 'UniformAdaptive', 'epochs' : 128, #enough for 64 nodes 'destination_key' : model_key, 'validation' : validation_key, 'score_interval' : 10000 #don't score until the end } timeoutSecs = 7200 start = time.time() deeplearning = h2o_cmd.runDeepLearning(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) print "neural net end on ", csvPathname_train, " and ", csvPathname_test, 'took', time.time() - start, 'seconds' predict_key = 'score_' + identifier + '.hex' kwargs = { 'data_key': validation_key, 'destination_key': predict_key, 'model_key': model_key } predictResult = h2o_cmd.runPredict(timeoutSecs=timeoutSecs, **kwargs) h2o_cmd.runInspect(key=predict_key, verbose=True) kwargs = { } predictCMResult = h2o.nodes[0].predict_confusion_matrix( actual=validation_key, vactual=response, predict=predict_key, vpredict='predict', timeoutSecs=timeoutSecs, **kwargs) cm = predictCMResult['cm'] print h2o_gbm.pp_cm(cm) actualErr = h2o_gbm.pp_cm_summary(cm)/100.; print "actual classification error:" + format(actualErr)
def test_c10_rel_gbm(self): print "not necessarily run as user=0xcust..., can't use a schema='put' here" print "Want to be able to run python as jenkins" print "I guess for big 0xcust files, we don't need schema='put'" print "For files that we want to put (for testing put), we can get non-private files" # Parse Test*********************************************************** importFolderPath = '/mnt/0xcustomer-datasets/c3' testFilename = 'classification1Test.txt' testPathname = importFolderPath + "/" + testFilename start = time.time() parseTestResult = h2i.import_parse(path=testPathname, schema='local', timeoutSecs=500, doSummary=True) print "Parse of", parseTestResult['destination_key'], "took", time.time() - start, "seconds" # Parse Train*********************************************************** importFolderPath = '/mnt/0xcustomer-datasets/c3' trainFilename = 'classification1Train.txt' trainPathname = importFolderPath + "/" + trainFilename start = time.time() parseTrainResult = h2i.import_parse(path=trainPathname, schema='local', timeoutSecs=500, doSummary=True) print "Parse of", parseTrainResult['destination_key'], "took", time.time() - start, "seconds" start = time.time() inspect = h2o_cmd.runInspect(None, parseTrainResult['destination_key'], timeoutSecs=500) print "Inspect:", parseTrainResult['destination_key'], "took", time.time() - start, "seconds" h2o_cmd.infoFromInspect(inspect, trainPathname) # num_rows = inspect['num_rows'] # num_cols = inspect['num_cols'] # do summary of the parsed dataset last, since we know it fails on this dataset summaryResult = h2o_cmd.runSummary(key=parseTrainResult['destination_key']) h2o_cmd.infoFromSummary(summaryResult, noPrint=False) # GBM Train*********************************************************** x = [6,7,8,10,12,31,32,33,34,35,36,37,40,41,42,43,44,45,46,47,49,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70] # response = 0 # doesn't work if index is used? 
response = 'outcome' # x = range(inspect['num_cols']) # del x[response] ntrees = 10 # fails with 40 params = { 'learn_rate': .2, 'nbins': 1024, 'ntrees': ntrees, 'max_depth': 20, 'min_rows': 2, 'response': response, 'cols': x, # 'ignored_cols_by_name': None, } print "Using these parameters for GBM: ", params kwargs = params.copy() modelKey = 'GBMModelKey' timeoutSecs = 900 trainStart = time.time() gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult, timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs) trainElapsed = time.time() - trainStart print "GBM training completed in", trainElapsed, "seconds. On dataset: ", trainFilename gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey) # errrs from end of list? is that the last tree? errsLast = gbmTrainView['gbm_model']['errs'][-1] print "GBM 'errsLast'", errsLast # get the last cm cm = gbmTrainView['gbm_model']['cms'][-1]['_arr'] pctWrongTrain = h2o_gbm.pp_cm_summary(cm); print "Last line of this cm might be NAs, not CM" print "\nTrain\n==========\n" print h2o_gbm.pp_cm(cm) # GBM test**************************************** predictKey = 'Predict.hex' h2o_cmd.runInspect(key=parseTestResult['destination_key']) start = time.time() gbmTestResult = h2o_cmd.runPredict( data_key=parseTestResult['destination_key'], model_key=modelKey, destination_key=predictKey, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "GBM predict completed in", elapsed, "seconds. On dataset: ", testFilename if DO_PREDICT_CM: gbmPredictCMResult =h2o.nodes[0].predict_confusion_matrix( actual=parseTestResult['destination_key'], vactual='predict', predict=predictKey, vpredict='predict', # choices are 7 (now) and 'predict' ) # errrs from end of list? is that the last tree? # all we get is cm cm = gbmPredictCMResult['cm'] # These will move into the h2o_gbm.py pctWrong = h2o_gbm.pp_cm_summary(cm); print "Last line of this cm is really NAs, not CM" print "\nTest\n==========\n" print h2o_gbm.pp_cm(cm)
def test_DeepLearning_twovalues(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() csvFilename = "syn_twovalues.csv" csvPathname = SYNDATASETS_DIR + '/' + csvFilename rowDataTrue = "1, 0, 65, 1, 2, 1, 1, 4, 1, 4, 1, 4" rowDataFalse = "0, 1, 0, -1, -2, -1, -1, -4, -1, -4, -1, -4" twoValueList = [ ('A','B',0, 14), ('A','B',1, 14), (0,1,0, 12), (0,1,1, 12), (0,1,'NaN', 12), (1,0,'NaN', 12), (-1,1,0, 12), (-1,1,1, 12), (-1e1,1e1,1e1, 12), (-1e1,1e1,-1e1, 12), ] trial = 0 for (outputTrue, outputFalse, case, coeffNum) in twoValueList: write_syn_dataset(csvPathname, 20, rowDataTrue, rowDataFalse, str(outputTrue), str(outputFalse)) start = time.time() hex_key = csvFilename + "_" + str(trial) model_key = 'trial_' + str(trial) + '.hex' validation_key = hex_key parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key) print "using outputTrue: %s outputFalse: %s" % (outputTrue, outputFalse) inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print "\n" + csvPathname, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) response = inspect['numCols'] response = 'C' + str(response) kwargs = { 'ignored_cols' : None, 'response' : response, 'classification' : 1, 'activation' : 'Tanh', #'input_dropout_ratio' : 0.2, 'hidden' : '113,71,54', 'rate' : 0.01, 'rate_annealing' : 1e-6, 'momentum_start' : 0, 'momentum_stable' : 0, 'l1' : 0.0, 'l2' : 1e-6, 'seed' : 80023842348, 'loss' : 'CrossEntropy', #'max_w2' : 15, 'initial_weight_distribution' : 'UniformAdaptive', #'initial_weight_scale' : 0.01, 'epochs' : 100, 'destination_key' : model_key, 'validation' : hex_key, } timeoutSecs = 60 start = time.time() h2o_cmd.runDeepLearning(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) print "trial #", trial, "Deep Learning end on ", csvFilename, ' took', time.time() - start, 'seconds' #### Now score using the model, and check the validation error expectedErr = 0.00 relTol = 0.01 
predict_key = 'Predict.hex' kwargs = { 'data_key': validation_key, 'destination_key': predict_key, 'model_key': model_key } predictResult = h2o_cmd.runPredict(timeoutSecs=timeoutSecs, **kwargs) h2o_cmd.runInspect(key=predict_key, verbose=True) kwargs = { } predictCMResult = h2o.nodes[0].predict_confusion_matrix( actual=validation_key, vactual=response, predict=predict_key, vpredict='predict', timeoutSecs=timeoutSecs, **kwargs) cm = predictCMResult['cm'] print h2o_gbm.pp_cm(cm) actualErr = h2o_gbm.pp_cm_summary(cm)/100. print "actual classification error:" + format(actualErr) print "expected classification error:" + format(expectedErr) if actualErr != expectedErr and abs((expectedErr - actualErr)/expectedErr) > relTol: raise Exception("Scored classification error of %s is not within %s %% relative error of %s" % (actualErr, float(relTol)*100, expectedErr)) trial += 1
def test_c9_GLM_airlines_fvec(self): h2o.beta_features = True files = [('airlines', 'airlines_all.csv', 'airlines_all.hex', 1800, 'IsDepDelayed')] for importFolderPath, csvFilename, trainKey, timeoutSecs, response in files: # PARSE train**************************************** csvPathname = importFolderPath + "/" + csvFilename start = time.time() parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='local', hex_key=trainKey, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", csvFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseResult['destination_key'] # GLM (train)**************************************** params = { # 'lambda': 1e-4, # 'alpha': 0.5, 'lambda': 1e-8, 'alpha': 0.0, 'max_iter': 30, 'n_folds': 3, 'family': 'binomial', 'destination_key': "GLMKEY", 'response': response, 'ignored_cols': 'CRSDepTime,CRSArrTime,ActualElapsedTime,CRSElapsedTime,AirTime,ArrDelay,DepDelay,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,IsArrDelayed' } kwargs = params.copy() timeoutSecs = 1800 start = time.time() glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) elapsed = time.time() - start print "GLM training completed in", elapsed, "seconds. On dataset: ", csvFilename h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) if h2o.beta_features: modelKey = glm['glm_model']['_key'] submodels = glm['glm_model']['submodels'] # hackery to make it work when there's just one validation = submodels[-1]['validation'] best_threshold = validation['best_threshold'] thresholds = validation['thresholds'] # have to look up the index for the cm, from the thresholds list best_index = None for i, t in enumerate(thresholds): if t == best_threshold: best_index = i break cms = validation['_cms'] cm = cms[best_index] pctWrong = h2o_gbm.pp_cm_summary(cm['_arr']) # FIX! 
should look at prediction error/class error? # self.assertLess(pctWrong, 9,"Should see less than 40% error") print "\nTrain\n==========\n" print h2o_gbm.pp_cm(cm['_arr']) # Score ******************************* # this messes up if you use case_mode/case_vale above predictKey = 'Predict.hex' start = time.time() predictResult = h2o_cmd.runPredict(data_key=trainKey, model_key=modelKey, destination_key=predictKey, timeoutSecs=timeoutSecs) predictCMResult = h2o.nodes[0].predict_confusion_matrix( actual=trainKey, vactual=response, predict=predictKey, vpredict='predict', ) cm = predictCMResult['cm'] # These will move into the h2o_gbm.py pctWrong = h2o_gbm.pp_cm_summary(cm) # self.assertLess(pctWrong, 40,"Should see less than 40% error") print "\nTest\n==========\n" print h2o_gbm.pp_cm(cm) h2i.delete_keys_at_all_nodes(timeoutSecs=600)
def test_DeepLearning_mnist(self): #h2b.browseTheCloud() h2o.beta_features = True csvPathname_train = 'mnist/train.csv.gz' csvPathname_test = 'mnist/test.csv.gz' hex_key = 'mnist_train.hex' validation_key = 'mnist_test.hex' timeoutSecs = 300 parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname_train, schema='put', hex_key=hex_key, timeoutSecs=timeoutSecs) parseResultV = h2i.import_parse(bucket='smalldata', path=csvPathname_test, schema='put', hex_key=validation_key, timeoutSecs=timeoutSecs) inspect = h2o_cmd.runInspect(None, hex_key) print "\n" + csvPathname_train, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) response = inspect['numCols'] - 1 #Making random id identifier = ''.join(random.sample(string.ascii_lowercase + string.digits, 10)) model_key = 'deeplearning_' + identifier + '.hex' kwargs = { 'ignored_cols' : None, 'response' : response, 'classification' : 1, 'activation' : 'RectifierWithDropout', 'input_dropout_ratio' : 0.2, 'hidden' : '1024,1024,2048', 'adaptive_rate' : 1, 'rho' : 0.99, 'epsilon' : 1e-8, 'train_samples_per_iteration' : -1, ## 0: better accuracy! -1: best scalability! 10000: best accuracy? 
# 'rate' : 0.01, # 'rate_annealing' : 1e-6, # 'momentum_start' : 0.5, # 'momentum_ramp' : 1800000, # 'momentum_stable' : 0.99, 'l1' : 1e-5, 'l2' : 0.0, 'seed' : 98037452452, 'loss' : 'CrossEntropy', 'max_w2' : 15, 'initial_weight_distribution' : 'UniformAdaptive', 'epochs' : 128, #enough for 64 nodes 'destination_key' : model_key, 'validation' : validation_key, 'score_interval' : 10000 #don't score until the end } timeoutSecs = 7200 start = time.time() deeplearning = h2o_cmd.runDeepLearning(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) print "neural net end on ", csvPathname_train, " and ", csvPathname_test, 'took', time.time() - start, 'seconds' predict_key = 'score_' + identifier + '.hex' kwargs = { 'data_key': validation_key, 'destination_key': predict_key, 'model_key': model_key } h2o.beta_features = True predictResult = h2o_cmd.runPredict(timeoutSecs=timeoutSecs, **kwargs) h2o_cmd.runInspect(key=predict_key, verbose=True) kwargs = { } predictCMResult = h2o.nodes[0].predict_confusion_matrix( actual=validation_key, vactual=response, predict=predict_key, vpredict='predict', timeoutSecs=timeoutSecs, **kwargs) cm = predictCMResult['cm'] print h2o_gbm.pp_cm(cm) actualErr = h2o_gbm.pp_cm_summary(cm)/100.; print "actual classification error:" + format(actualErr) h2o.beta_features = False
def test_GLM2_mnist(self): if not SCIPY_INSTALLED: pass else: SYNDATASETS_DIR = h2o.make_syn_dir() csvFilelist = [ (10000, 500, 'cA', 60), ] trial = 0 for (rowCount, colCount, hex_key, timeoutSecs) in csvFilelist: trialStart = time.time() # PARSE test**************************************** csvFilename = 'syn_' + "binary" + "_" + str( rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + "/" + csvFilename write_syn_dataset(csvPathname, rowCount, colCount) start = time.time() parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", csvFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) # GLM**************************************** modelKey = 'GLM_model' y = colCount kwargs = { 'response': 'C' + str(y + 1), 'family': 'binomial', 'lambda': 1e-4, 'alpha': 0, 'max_iter': 15, 'n_folds': 1, 'beta_epsilon': 1.0E-4, 'destination_key': modelKey, } # GLM wants the output col to be strictly 0,1 integer execExpr = "aHack=%s; aHack[,%s] = aHack[,%s]==1" % ( hex_key, y + 1, y + 1) h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) aHack = {'destination_key': 'aHack'} timeoutSecs = 1800 start = time.time() glm = h2o_cmd.runGLM(parseResult=aHack, timeoutSecs=timeoutSecs, pollTimeoutSecs=60, **kwargs) elapsed = time.time() - start print "GLM completed in", elapsed, "seconds.", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) h2o_glm.simpleCheckGLM(self, glm, None, noPrint=True, **kwargs) modelKey = glm['glm_model']['_key'] # This seems wrong..what's the format of the cm? lambdaMax = glm['glm_model']['lambda_max'] print "lambdaMax:", lambdaMax best_threshold = glm['glm_model']['submodels'][0][ 'validation']['best_threshold'] print "best_threshold", best_threshold # pick the middle one? 
cm = glm['glm_model']['submodels'][0]['validation']['_cms'][5][ '_arr'] print "cm:", cm pctWrong = h2o_gbm.pp_cm_summary(cm) # self.assertLess(pctWrong, 9,"Should see less than 9% error (class = 4)") print "\nTrain\n==========\n" print h2o_gbm.pp_cm(cm) # Score ******************************* # this messes up if you use case_mode/case_vale above print "\nPredict\n==========\n" predictKey = 'Predict.hex' start = time.time() predictResult = h2o_cmd.runPredict(data_key='aHack', model_key=modelKey, destination_key=predictKey, timeoutSecs=timeoutSecs) predictCMResult = h2o.nodes[0].predict_confusion_matrix( actual='aHack', vactual='C' + str(y + 1), predict=predictKey, vpredict='predict', ) cm = predictCMResult['cm'] # These will move into the h2o_gbm.py pctWrong = h2o_gbm.pp_cm_summary(cm) self.assertLess(pctWrong, 50, "Should see less than 50% error") print "\nTest\n==========\n" print h2o_gbm.pp_cm(cm)
def test_GLM2_ints_unbalanced(self): h2o.beta_features = True ### h2b.browseTheCloud() SYNDATASETS_DIR = h2o.make_syn_dir() n = 2000 tryList = [ (n, 1, 'cD', 300), (n, 2, 'cE', 300), (n, 4, 'cF', 300), (n, 8, 'cG', 300), (n, 16, 'cH', 300), (n, 32, 'cI', 300), ] for (rowCount, colCount, hex_key, timeoutSecs) in tryList: # using the comma is nice to ensure no craziness colSepHexString = '2c' # comma colSepChar = colSepHexString.decode('hex') colSepInt = int(colSepHexString, base=16) print "colSepChar:", colSepChar rowSepHexString = '0a' # newline rowSepChar = rowSepHexString.decode('hex') print "rowSepChar:", rowSepChar SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str( colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename csvScoreFilename = 'syn_enums_score_' + str(rowCount) + 'x' + str( colCount) + '.csv' csvScorePathname = SYNDATASETS_DIR + '/' + csvScoreFilename enumList = create_enum_list() # use half of the enums for creating the scoring dataset enumListForScore = random.sample(enumList, 5) print "Creating random", csvPathname, "for glm model building" write_syn_dataset(csvPathname, enumList, rowCount, colCount, SEEDPERFILE, colSepChar=colSepChar, rowSepChar=rowSepChar) print "Creating random", csvScorePathname, "for glm scoring with prior model (using enum subset)" write_syn_dataset(csvScorePathname, enumListForScore, rowCount, colCount, SEEDPERFILE, colSepChar=colSepChar, rowSepChar=rowSepChar) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30, separator=colSepInt) print "Parse result['destination_key']:", parseResult[ 'destination_key'] print "\n" + csvFilename (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \ h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=True) y = colCount modelKey = 'xyz' kwargs = { 'destination_key': modelKey, 'response': y, 'max_iter': 200, 'family': 
'binomial', 'n_folds': 10, 'alpha': 0, 'lambda': 0, } start = time.time() updateList = [ { 'alpha': 0.5, 'lambda': 1e-4 }, { 'alpha': 0.25, 'lambda': 1e-6 }, { 'alpha': 0.0, 'lambda': 1e-8 }, { 'alpha': 0.5, 'lambda': 0.0 }, { 'alpha': 0.0, 'lambda': 0.0 }, ] # Try each one for updateDict in updateList: print "\n#################################################################" print updateDict kwargs.update(updateDict) glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=180, **kwargs) print "glm end on ", parseResult[ 'destination_key'], 'took', time.time() - start, 'seconds' h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) parseResult = h2i.import_parse(path=csvScorePathname, schema='put', hex_key="B.hex", timeoutSecs=30, separator=colSepInt) predictKey = 'Predict.hex' predictResult = h2o_cmd.runPredict(data_key="B.hex", model_key=modelKey, destination_key=predictKey, timeoutSecs=timeoutSecs) predictCMResult = h2o.nodes[0].predict_confusion_matrix( actual="B.hex", vactual='C' + str(y + 1), predict=predictKey, vpredict='predict', ) cm = predictCMResult['cm'] pctWrong = h2o_gbm.pp_cm_summary(cm) print "\nTest\n==========\n" print h2o_gbm.pp_cm(cm)
def test_NN_airlines_small(self): #h2b.browseTheCloud() h2o.beta_features = True csvPathname_train = 'airlines/AirlinesTrain.csv.zip' csvPathname_test = 'airlines/AirlinesTest.csv.zip' hex_key = 'airlines_train.hex' validation_key = 'airlines_test.hex' timeoutSecs = 30 parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname_train, schema='put', hex_key=hex_key, timeoutSecs=timeoutSecs) parseResultV = h2i.import_parse(bucket='smalldata', path=csvPathname_test, schema='put', hex_key=validation_key, timeoutSecs=timeoutSecs) inspect = h2o_cmd.runInspect(None, hex_key) print "\n" + csvPathname_train, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) # this gives the last col number, which is IsDepDelayed_REC (1 or -1) # response = inspect['numCols'] - 1 # this is "YES"/"NO" response = 'IsDepDelayed' #Making random id identifier = ''.join(random.sample(string.ascii_lowercase + string.digits, 10)) model_key = 'nn_' + identifier + '.hex' # get the column names colNames = [c['name'] for c in inspect['cols']] print "colNames:", colNames usedCols = ("Origin", "Dest", "fDayofMonth", "fYear", "UniqueCarrier", "fDayOfWeek", "fMonth", "DepTime", "ArrTime", "Distance") ignoredCols = [] for c in colNames: # don't put the response in the ignore list (is there a problem if so?) if c not in usedCols and c != response: ignoredCols.append(c) ignoredColsString = ",".join(ignoredCols) print "Telling h2o to ignore these cols:" print ignoredColsString kwargs = { 'ignored_cols' : ignoredColsString, 'response' : response, 'classification' : 1, 'destination_key' : model_key, } expectedErr = 0.45 ## expected validation error for the above model relTol = 0.50 ## 20% rel. error tolerance due to Hogwild! 
timeoutSecs = 600 start = time.time() nn = h2o_cmd.runDeepLearning(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) print "neural net end on ", csvPathname_train, " and ", csvPathname_test, 'took', time.time() - start, 'seconds' predict_key = 'score_' + identifier + '.hex' kwargs = { 'data_key': validation_key, 'destination_key': predict_key, 'model_key': model_key } h2o.beta_features = True predictResult = h2o_cmd.runPredict(timeoutSecs=timeoutSecs, **kwargs) h2o_cmd.runInspect(key=predict_key, verbose=True) kwargs = { } predictCMResult = h2o.nodes[0].predict_confusion_matrix( actual=validation_key, vactual=response, predict=predict_key, vpredict='predict', timeoutSecs=timeoutSecs, **kwargs) cm = predictCMResult['cm'] print h2o_gbm.pp_cm(cm) actualErr = h2o_gbm.pp_cm_summary(cm)/100.; print "actual classification error:" + format(actualErr) print "expected classification error:" + format(expectedErr) if actualErr != expectedErr and abs((expectedErr - actualErr)/expectedErr) > relTol: raise Exception("Scored classification error of %s is not within %s %% relative error of %s" % (actualErr, float(relTol)*100, expectedErr)) h2o.beta_features = False
def test_GLM2_ints_unbalanced(self): h2o.beta_features = True ### h2b.browseTheCloud() SYNDATASETS_DIR = h2o.make_syn_dir() n = 2000 tryList = [ (n, 1, 'cD', 300), (n, 2, 'cE', 300), (n, 4, 'cF', 300), (n, 8, 'cG', 300), (n, 16, 'cH', 300), (n, 32, 'cI', 300), ] for (rowCount, colCount, hex_key, timeoutSecs) in tryList: # using the comma is nice to ensure no craziness colSepHexString = '2c' # comma colSepChar = colSepHexString.decode('hex') colSepInt = int(colSepHexString, base=16) print "colSepChar:", colSepChar rowSepHexString = '0a' # newline rowSepChar = rowSepHexString.decode('hex') print "rowSepChar:", rowSepChar SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename csvScoreFilename = 'syn_enums_score_' + str(rowCount) + 'x' + str(colCount) + '.csv' csvScorePathname = SYNDATASETS_DIR + '/' + csvScoreFilename enumList = create_enum_list() # use half of the enums for creating the scoring dataset enumListForScore = random.sample(enumList,5) print "Creating random", csvPathname, "for glm model building" write_syn_dataset(csvPathname, enumList, rowCount, colCount, SEEDPERFILE, colSepChar=colSepChar, rowSepChar=rowSepChar) print "Creating random", csvScorePathname, "for glm scoring with prior model (using enum subset)" write_syn_dataset(csvScorePathname, enumListForScore, rowCount, colCount, SEEDPERFILE, colSepChar=colSepChar, rowSepChar=rowSepChar) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30, separator=colSepInt) print "Parse result['destination_key']:", parseResult['destination_key'] print "\n" + csvFilename (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \ h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=True) y = colCount modelKey = 'xyz' kwargs = { 'destination_key': modelKey, 'response': y, 'max_iter': 200, 'family': 
'binomial', 'n_folds': 10, 'alpha': 0, 'lambda': 0, } start = time.time() updateList= [ {'alpha': 0.5, 'lambda': 1e-4}, {'alpha': 0.25, 'lambda': 1e-6}, {'alpha': 0.0, 'lambda': 1e-8}, {'alpha': 0.5, 'lambda': 0.0}, {'alpha': 0.0, 'lambda': 0.0}, ] # Try each one for updateDict in updateList: print "\n#################################################################" print updateDict kwargs.update(updateDict) glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=180, **kwargs) print "glm end on ", parseResult['destination_key'], 'took', time.time() - start, 'seconds' h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) parseResult = h2i.import_parse(path=csvScorePathname, schema='put', hex_key="B.hex", timeoutSecs=30, separator=colSepInt) predictKey = 'Predict.hex' predictResult = h2o_cmd.runPredict( data_key="B.hex", model_key=modelKey, destination_key=predictKey, timeoutSecs=timeoutSecs) predictCMResult = h2o.nodes[0].predict_confusion_matrix( actual="B.hex", vactual='C' + str(y+1), predict=predictKey, vpredict='predict', ) cm = predictCMResult['cm'] pctWrong = h2o_gbm.pp_cm_summary(cm); print "\nTest\n==========\n" print h2o_gbm.pp_cm(cm)
def test_GBM_manyfiles_train_test(self): bucket = 'home-0xdiag-datasets' modelKey = 'GBMModelKey' if localhost: files = [ # None forces num_cols to be used. assumes you set it from Inspect ('manyfiles-nflx-gz', 'file_1[0-9][0-9].dat.gz', 'file_100.hex', 1800, None, 'file_1.dat.gz', 'file_1_test.hex' ) ] else: files = [ # None forces num_cols to be used. assumes you set it from Inspect ('manyfiles-nflx-gz', 'file_[0-9].dat.gz', 'file_10.hex', 1800, None, 'file_1[0-9].dat.gz', 'file_10_test.hex') ] # if I got to hdfs, it's here # hdfs://192.168.1.176/datasets/manyfiles-nflx-gz/file_99.dat.gz # h2b.browseTheCloud() for (importFolderPath, trainFilename, trainKey, timeoutSecs, response, testFilename, testKey) in files: h2o.beta_features = False #turn off beta_features # PARSE train**************************************** start = time.time() xList = [] eList = [] fList = [] # Parse (train)**************************************** if h2o.beta_features: print "Parsing to fvec directly! Have to noPoll=true!, and doSummary=False!" csvPathname = importFolderPath + "/" + trainFilename parseTrainResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='s3n', hex_key=trainKey, timeoutSecs=timeoutSecs, noPoll=h2o.beta_features, doSummary=False) # hack if h2o.beta_features: h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs) print "Filling in the parseTrainResult['destination_key'] for h2o" parseTrainResult['destination_key'] = trainKey elapsed = time.time() - start print "train parse end on ", trainFilename, 'took', elapsed, 'seconds',\ "%d pct. 
of timeout" % ((elapsed*100)/timeoutSecs) print "train parse result:", parseTrainResult['destination_key'] ### h2o_cmd.runSummary(key=parsTraineResult['destination_key']) # if you set beta_features here, the fvec translate will happen with the Inspect not the GBM # h2o.beta_features = True inspect = h2o_cmd.runInspect( key=parseTrainResult['destination_key']) print "\n" + csvPathname, \ " num_rows:", "{:,}".format(inspect['num_rows']), \ " num_cols:", "{:,}".format(inspect['num_cols']) num_rows = inspect['num_rows'] num_cols = inspect['num_cols'] # Make col 378 it something we can do binomial regression on! execExpr = '%s[,378] = %s[,378]>15 ? 1 : 0' % (trainKey, trainKey) resultExec = h2o_cmd.runExec(str=execExpr, timeoutSecs=500) # Parse (test)**************************************** if h2o.beta_features: print "Parsing to fvec directly! Have to noPoll=true!, and doSummary=False!" parseTestResult = h2i.import_parse(bucket=bucket, path=importFolderPath + "/" + testFilename, schema='local', hex_key=testKey, timeoutSecs=timeoutSecs, noPoll=h2o.beta_features, doSummary=False) # hack if h2o.beta_features: h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs) print "Filling in the parseTestResult['destination_key'] for h2o" parseTestResult['destination_key'] = testKey elapsed = time.time() - start print "test parse end on ", testFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "test parse result:", parseTestResult['destination_key'] # Make col 378 it something we can do binomial regression on! print "Slow! exec is converting all imported keys?, not just what was parsed" execExpr = '%s[,378] = %s[,378]>15 ? 1 : 0' % (testKey, testKey, testKey) resultExec = h2o_cmd.runExec(str=execExpr, timeoutSecs=300) # Note ..no inspect of test data here..so translate happens later? 
# GBM (train iterate)**************************************** # if not response: # response = num_cols - 1 response = 378 print "Using the same response %s for train and test (which should have a output value too)" % response ntrees = 10 for max_depth in [5, 10, 20, 40]: params = { 'learn_rate': .2, 'nbins': 1024, 'ntrees': ntrees, 'max_depth': max_depth, 'min_rows': 10, 'response': response, # 'ignored_cols': } print "Using these parameters for GBM: ", params kwargs = params.copy() h2o.beta_features = True # GBM train**************************************** trainStart = time.time() gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult, noPoll=True, timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs) # hack if h2o.beta_features: h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs) trainElapsed = time.time() - trainStart print "GBM training completed in", trainElapsed, "seconds. On dataset: ", trainFilename gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey) # errrs from end of list? is that the last tree? errsLast = gbmTrainView['gbm_model']['errs'][-1] print "GBM 'errsLast'", errsLast cm = gbmTrainView['gbm_model']['cm'] pctWrongTrain = h2o_gbm.pp_cm_summary(cm) print "Last line of this cm might be NAs, not CM" print "\nTrain\n==========\n" print h2o_gbm.pp_cm(cm) # GBM test**************************************** if doPredict: predictKey = 'Predict.hex' ### h2o_cmd.runInspect(key=parseTestResult['destination_key']) start = time.time() gbmTestResult = h2o_cmd.runPredict( data_key=parseTestResult['destination_key'], model_key=modelKey, destination_key=predictKey, timeoutSecs=timeoutSecs) # hack if h2o.beta_features: h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs) elapsed = time.time() - start print "GBM predict completed in", elapsed, "seconds. On dataset: ", testFilename print "This is crazy!" 
gbmPredictCMResult = h2o.nodes[0].predict_confusion_matrix( actual=parseTestResult['destination_key'], vactual=response, predict=predictKey, vpredict='predict', # choices are 0 and 'predict' ) # errrs from end of list? is that the last tree? # all we get is cm cm = gbmPredictCMResult['cm'] # These will move into the h2o_gbm.py pctWrong = h2o_gbm.pp_cm_summary(cm) print "Last line of this cm is really NAs, not CM" print "\nTest\n==========\n" print h2o_gbm.pp_cm(cm) # xList.append(ntrees) xList.append(max_depth) eList.append(pctWrong) fList.append(trainElapsed) h2o.beta_features = False if doPredict: xLabel = 'max_depth' eLabel = 'pctWrong' fLabel = 'trainElapsed' eListTitle = "" fListTitle = "" h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
def test_c10_rel_gbm(self): h2o.beta_features = True print "not necessarily run as user=0xcust..., can't use a schema='put' here" print "Want to be able to run python as jenkins" print "I guess for big 0xcust files, we don't need schema='put'" print "For files that we want to put (for testing put), we can get non-private files" # Parse Test*********************************************************** importFolderPath = '/mnt/0xcustomer-datasets/c3' testFilename = 'classification1Test.txt' testPathname = importFolderPath + "/" + testFilename start = time.time() parseTestResult = h2i.import_parse(path=testPathname, schema='local', timeoutSecs=500, doSummary=True) print "Parse of", parseTestResult['destination_key'], "took", time.time() - start, "seconds" # Parse Train*********************************************************** importFolderPath = '/mnt/0xcustomer-datasets/c3' trainFilename = 'classification1Train.txt' trainPathname = importFolderPath + "/" + trainFilename start = time.time() parseTrainResult = h2i.import_parse(path=trainPathname, schema='local', timeoutSecs=500, doSummary=True) print "Parse of", parseTrainResult['destination_key'], "took", time.time() - start, "seconds" start = time.time() inspect = h2o_cmd.runInspect(None, parseTrainResult['destination_key'], timeoutSecs=500) print "Inspect:", parseTrainResult['destination_key'], "took", time.time() - start, "seconds" h2o_cmd.infoFromInspect(inspect, trainPathname) # num_rows = inspect['num_rows'] # num_cols = inspect['num_cols'] # do summary of the parsed dataset last, since we know it fails on this dataset summaryResult = h2o_cmd.runSummary(key=parseTrainResult['destination_key']) h2o_cmd.infoFromSummary(summaryResult, noPrint=False) # GBM Train*********************************************************** x = [6,7,8,10,12,31,32,33,34,35,36,37,40,41,42,43,44,45,46,47,49,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70] # response = 0 # doesn't work if index is used? 
response = 'outcome' # x = range(inspect['num_cols']) # del x[response] ntrees = 10 # fails with 40 params = { 'learn_rate': .2, 'nbins': 1024, 'ntrees': ntrees, 'max_depth': 20, 'min_rows': 2, 'response': response, 'cols': x, # 'ignored_cols_by_name': None, } print "Using these parameters for GBM: ", params kwargs = params.copy() modelKey = 'GBMModelKey' timeoutSecs = 900 trainStart = time.time() gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult, timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs) trainElapsed = time.time() - trainStart print "GBM training completed in", trainElapsed, "seconds. On dataset: ", trainFilename gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey) # errrs from end of list? is that the last tree? errsLast = gbmTrainView['gbm_model']['errs'][-1] print "GBM 'errsLast'", errsLast # get the last cm cm = gbmTrainView['gbm_model']['cms'][-1]['_arr'] pctWrongTrain = h2o_gbm.pp_cm_summary(cm); print "Last line of this cm might be NAs, not CM" print "\nTrain\n==========\n" print h2o_gbm.pp_cm(cm) # GBM test**************************************** predictKey = 'Predict.hex' h2o_cmd.runInspect(key=parseTestResult['destination_key']) start = time.time() gbmTestResult = h2o_cmd.runPredict( data_key=parseTestResult['destination_key'], model_key=modelKey, destination_key=predictKey, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "GBM predict completed in", elapsed, "seconds. On dataset: ", testFilename if DO_PREDICT_CM: gbmPredictCMResult =h2o.nodes[0].predict_confusion_matrix( actual=parseTestResult['destination_key'], vactual='predict', predict=predictKey, vpredict='predict', # choices are 7 (now) and 'predict' ) # errrs from end of list? is that the last tree? # all we get is cm cm = gbmPredictCMResult['cm'] # These will move into the h2o_gbm.py pctWrong = h2o_gbm.pp_cm_summary(cm); print "Last line of this cm is really NAs, not CM" print "\nTest\n==========\n" print h2o_gbm.pp_cm(cm)
def test_GBM_params_rand2(self): h2o.beta_features = False bucket = 'home-0xdiag-datasets' modelKey = 'GBMModelKey' files = [ # ('standard', 'covtype.shuffled.90pct.data', 'covtype.train.hex', 1800, 54, 'covtype.shuffled.10pct.data', 'covtype.test.hex') ('standard', 'covtype.shuffled.10pct.sorted.data', 'covtype.train.hex', 1800, 54, 'covtype.shuffled.10pct.data', 'covtype.test.hex') ] for (importFolderPath, trainFilename, trainKey, timeoutSecs, response, testFilename, testKey) in files: h2o.beta_features = False #turn off beta_features # PARSE train**************************************** start = time.time() xList = [] eList = [] fList = [] # Parse (train)**************************************** if h2o.beta_features: print "Parsing to fvec directly! Have to noPoll=true!, and doSummary=False!" parseTrainResult = h2i.import_parse(bucket=bucket, path=importFolderPath + "/" + trainFilename, schema='local', hex_key=trainKey, timeoutSecs=timeoutSecs, noPoll=h2o.beta_features, doSummary=False) # hack if h2o.beta_features: h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs) print "Filling in the parseTrainResult['destination_key'] for h2o" parseTrainResult['destination_key'] = trainKey elapsed = time.time() - start print "train parse end on ", trainFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "train parse result:", parseTrainResult['destination_key'] # Parse (test)**************************************** if h2o.beta_features: print "Parsing to fvec directly! Have to noPoll=true!, and doSummary=False!" 
parseTestResult = h2i.import_parse(bucket=bucket, path=importFolderPath + "/" + testFilename, schema='local', hex_key=testKey, timeoutSecs=timeoutSecs, noPoll=h2o.beta_features, doSummary=False) # hack if h2o.beta_features: h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs) print "Filling in the parseTestResult['destination_key'] for h2o" parseTestResult['destination_key'] = testKey elapsed = time.time() - start print "test parse end on ", testFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "test parse result:", parseTestResult['destination_key'] # GBM (train iterate)**************************************** inspect = h2o_cmd.runInspect(key=parseTestResult['destination_key']) paramsDict = define_gbm_params() for trial in range(3): h2o.beta_features = True # translate it (only really need to do once . out of loop? h2o_cmd.runInspect(key=parseTrainResult['destination_key']) ### h2o_cmd.runSummary(key=parsTraineResult['destination_key']) # use this to set any defaults you want if the pick doesn't set print "Regression!" params = {'response': 54, 'ignored_cols_by_name': '5,6,7,8,9', 'ntrees': 2, 'classification': 0} h2o_gbm.pickRandGbmParams(paramsDict, params) print "Using these parameters for GBM: ", params kwargs = params.copy() # GBM train**************************************** trainStart = time.time() gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult, noPoll=True, timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs) # hack if h2o.beta_features: h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs) trainElapsed = time.time() - trainStart print "GBM training completed in", trainElapsed, "seconds. On dataset: ", trainFilename gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey) # errrs from end of list? is that the last tree? 
errsLast = gbmTrainView['gbm_model']['errs'][-1] print "GBM 'errsLast'", errsLast cm = gbmTrainView['gbm_model']['cm'] pctWrongTrain = h2o_gbm.pp_cm_summary(cm); print "Last line of this cm might be NAs, not CM" print "\nTrain\n==========\n" print h2o_gbm.pp_cm(cm) # GBM test**************************************** predictKey = 'Predict.hex' h2o_cmd.runInspect(key=parseTestResult['destination_key']) start = time.time() gbmTestResult = h2o_cmd.runPredict( data_key=parseTestResult['destination_key'], model_key=modelKey, destination_key=predictKey, timeoutSecs=timeoutSecs) # hack if h2o.beta_features: h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs) elapsed = time.time() - start print "GBM predict completed in", elapsed, "seconds. On dataset: ", testFilename print "FIX! where do we get the summary info on the test data after predict?" h2o.beta_features = False
def test_GLM2_enums_score_superset(self): h2o.beta_features = True print "FIX!: this should cause an error. We should detect that it's not causing an error/warning?" SYNDATASETS_DIR = h2o.make_syn_dir() n = 200 tryList = [ (n, 1, "cD", 300), (n, 2, "cE", 300), (n, 3, "cF", 300), (n, 4, "cG", 300), (n, 5, "cH", 300), (n, 6, "cI", 300), ] for (rowCount, colCount, hex_key, timeoutSecs) in tryList: # using the comma is nice to ensure no craziness colSepHexString = "2c" # comma colSepChar = colSepHexString.decode("hex") colSepInt = int(colSepHexString, base=16) print "colSepChar:", colSepChar rowSepHexString = "0a" # newline rowSepChar = rowSepHexString.decode("hex") print "rowSepChar:", rowSepChar SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = "syn_enums_" + str(rowCount) + "x" + str(colCount) + ".csv" csvPathname = SYNDATASETS_DIR + "/" + csvFilename csvScoreFilename = "syn_enums_score_" + str(rowCount) + "x" + str(colCount) + ".csv" csvScorePathname = SYNDATASETS_DIR + "/" + csvScoreFilename enumList = create_enum_list(listSize=10) # use half of the enums for creating the scoring dataset enumListForScore = random.sample(enumList, 5) # add a extra enum for scoring that's not in the model enumList enumListForScore.append("xyzzy") print "Creating random", csvPathname, "for glm model building" write_syn_dataset( csvPathname, enumList, rowCount, colCount, SEEDPERFILE, colSepChar=colSepChar, rowSepChar=rowSepChar ) print "Creating random", csvScorePathname, "for glm scoring with prior model (using enum subset)" write_syn_dataset( csvScorePathname, enumListForScore, rowCount, colCount, SEEDPERFILE, colSepChar=colSepChar, rowSepChar=rowSepChar, ) parseResult = h2i.import_parse( path=csvPathname, schema="put", hex_key=hex_key, timeoutSecs=30, separator=colSepInt ) print "Parse result['destination_key']:", parseResult["destination_key"] print "\n" + csvFilename ( missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict, ) = 
h2o_cmd.columnInfoFromInspect(parseResult["destination_key"], exceptionOnMissingValues=True) y = colCount modelKey = "enums" kwargs = { "destination_key": modelKey, "response": y, "max_iter": 1, "n_folds": 1, "alpha": 0.2, "lambda": 1e-5, "family": "binomial", } start = time.time() glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=180, **kwargs) print "glm end on ", parseResult["destination_key"], "took", time.time() - start, "seconds" h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) scoreDataKey = "score_" + hex_key parseResult = h2i.import_parse( path=csvScorePathname, schema="put", hex_key=scoreDataKey, timeoutSecs=30, separator=colSepInt ) # Score ******************************* # this messes up if you use case_mode/case_vale above predictKey = "Predict.hex" start = time.time() predictResult = h2o_cmd.runPredict( data_key=scoreDataKey, model_key=modelKey, destination_key=predictKey, timeoutSecs=timeoutSecs ) # just get a predict and AUC on the same data. has to be binomial result resultAUC = h2o.nodes[0].generate_auc( thresholds=None, actual=scoreDataKey, predict="Predict.hex", vactual=y, vpredict=1 ) auc = resultAUC["AUC"] self.assertAlmostEqual(auc, 0.5, delta=0.15, msg="actual auc: %s not close enough to 0.5" % auc) predictCMResult = h2o.nodes[0].predict_confusion_matrix( actual=scoreDataKey, predict=predictKey, vactual="C" + str(y + 1), vpredict="predict" ) cm = predictCMResult["cm"] # These will move into the h2o_gbm.py pctWrong = h2o_gbm.pp_cm_summary(cm) print "\nTest\n==========\n" print h2o_gbm.pp_cm(cm)
def test_NN_airlines_small(self): #h2b.browseTheCloud() csvPathname_train = 'airlines/AirlinesTrain.csv.zip' csvPathname_test = 'airlines/AirlinesTest.csv.zip' hex_key = 'airlines_train.hex' validation_key = 'airlines_test.hex' timeoutSecs = 30 parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname_train, schema='put', hex_key=hex_key, timeoutSecs=timeoutSecs) parseResultV = h2i.import_parse(bucket='smalldata', path=csvPathname_test, schema='put', hex_key=validation_key, timeoutSecs=timeoutSecs) inspect = h2o_cmd.runInspect(None, hex_key) print "\n" + csvPathname_train, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) # this gives the last col number, which is IsDepDelayed_REC (1 or -1) # response = inspect['numCols'] - 1 # this is "YES"/"NO" response = 'IsDepDelayed' #Making random id identifier = ''.join( random.sample(string.ascii_lowercase + string.digits, 10)) model_key = 'nn_' + identifier + '.hex' # get the column names colNames = [c['name'] for c in inspect['cols']] print "colNames:", colNames usedCols = ("Origin", "Dest", "fDayofMonth", "fYear", "UniqueCarrier", "fDayOfWeek", "fMonth", "DepTime", "ArrTime", "Distance") ignoredCols = [] for c in colNames: # don't put the response in the ignore list (is there a problem if so?) if c not in usedCols and c != response: ignoredCols.append(c) ignoredColsString = ",".join(ignoredCols) print "Telling h2o to ignore these cols:" print ignoredColsString kwargs = { 'ignored_cols': ignoredColsString, 'response': response, 'classification': 1, 'destination_key': model_key, } expectedErr = 0.45 ## expected validation error for the above model relTol = 0.50 ## 20% rel. error tolerance due to Hogwild! 
timeoutSecs = 600 start = time.time() nn = h2o_cmd.runDeepLearning(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) print "neural net end on ", csvPathname_train, " and ", csvPathname_test, 'took', time.time( ) - start, 'seconds' predict_key = 'score_' + identifier + '.hex' kwargs = { 'data_key': validation_key, 'destination_key': predict_key, 'model_key': model_key } predictResult = h2o_cmd.runPredict(timeoutSecs=timeoutSecs, **kwargs) h2o_cmd.runInspect(key=predict_key, verbose=True) kwargs = {} predictCMResult = h2o.nodes[0].predict_confusion_matrix( actual=validation_key, vactual=response, predict=predict_key, vpredict='predict', timeoutSecs=timeoutSecs, **kwargs) cm = predictCMResult['cm'] print h2o_gbm.pp_cm(cm) actualErr = h2o_gbm.pp_cm_summary(cm) / 100. print "actual classification error:" + format(actualErr) print "expected classification error:" + format(expectedErr) if actualErr != expectedErr and abs( (expectedErr - actualErr) / expectedErr) > relTol: raise Exception( "Scored classification error of %s is not within %s %% relative error of %s" % (actualErr, float(relTol) * 100, expectedErr))
def test_GBM_params_rand2(self): bucket = 'home-0xdiag-datasets' modelKey = 'GBMModelKey' files = [ # ('standard', 'covtype.shuffled.90pct.data', 'covtype.train.hex', 1800, 54, 'covtype.shuffled.10pct.data', 'covtype.test.hex') ('standard', 'covtype.shuffled.10pct.sorted.data', 'covtype.train.hex', 1800, 54, 'covtype.shuffled.10pct.data', 'covtype.test.hex') ] for (importFolderPath, trainFilename, trainKey, timeoutSecs, response, testFilename, testKey) in files: # PARSE train**************************************** start = time.time() xList = [] eList = [] fList = [] # Parse (train)**************************************** parseTrainResult = h2i.import_parse(bucket=bucket, path=importFolderPath + "/" + trainFilename, schema='local', hex_key=trainKey, timeoutSecs=timeoutSecs, doSummary=False) elapsed = time.time() - start print "train parse end on ", trainFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "train parse result:", parseTrainResult['destination_key'] # Parse (test)**************************************** parseTestResult = h2i.import_parse(bucket=bucket, path=importFolderPath + "/" + testFilename, schema='local', hex_key=testKey, timeoutSecs=timeoutSecs, doSummary=False) elapsed = time.time() - start print "test parse end on ", testFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "test parse result:", parseTestResult['destination_key'] # GBM (train iterate)**************************************** inspect = h2o_cmd.runInspect( key=parseTestResult['destination_key']) paramsDict = define_gbm_params() for trial in range(3): # translate it (only really need to do once . out of loop? 
h2o_cmd.runInspect(key=parseTrainResult['destination_key']) ### h2o_cmd.runSummary(key=parsTraineResult['destination_key']) # use this to set any defaults you want if the pick doesn't set params = { 'response': 54, 'ignored_cols_by_name': 'C1,C2,C3,C4,C5', 'ntrees': 2, 'validation': parseTestResult['destination_key'], } h2o_gbm.pickRandGbmParams(paramsDict, params) print "Using these parameters for GBM: ", params kwargs = params.copy() # GBM train**************************************** trainStart = time.time() gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult, timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs) trainElapsed = time.time() - trainStart print "GBM training completed in", trainElapsed, "seconds. On dataset: ", trainFilename gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey) # errrs from end of list? is that the last tree? errsLast = gbmTrainView['gbm_model']['errs'][-1] print "GBM 'errsLast'", errsLast cm = gbmTrainView['gbm_model']['cms'][-1][ '_arr'] # use the last one pctWrongTrain = h2o_gbm.pp_cm_summary(cm) print "Last line of this cm might be NAs, not CM" print "\nTrain\n==========\n" print h2o_gbm.pp_cm(cm) # GBM test**************************************** predictKey = 'Predict.hex' h2o_cmd.runInspect(key=parseTestResult['destination_key']) start = time.time() gbmTestResult = h2o_cmd.runPredict( data_key=parseTestResult['destination_key'], model_key=modelKey, destination_key=predictKey, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "GBM predict completed in", elapsed, "seconds. On dataset: ", testFilename if DO_PREDICT_CM: gbmPredictCMResult = h2o.nodes[0].predict_confusion_matrix( actual=parseTestResult['destination_key'], vactual='predict', predict=predictKey, vpredict='predict', # choices are 7 (now) and 'predict' ) # errrs from end of list? is that the last tree? 
# all we get is cm cm = gbmPredictCMResult['cm'] # These will move into the h2o_gbm.py pctWrong = h2o_gbm.pp_cm_summary(cm) print "Last line of this cm is really NAs, not CM" print "\nTest\n==========\n" print h2o_gbm.pp_cm(cm) # xList.append(ntrees) if 'max_depth' in params and params['max_depth']: xList.append(params['max_depth']) eList.append(pctWrongTrain) fList.append(trainElapsed) xLabel = 'max_depth' eLabel = 'pctWrongTrain' fLabel = 'trainElapsed' eListTitle = "" fListTitle = "" h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
def test_DeepLearning_c21(self): importFolderPath = '/mnt/0xcustomer-datasets/c21' csvPathname_train = importFolderPath + '/persona_clean_deep.tsv.zip' csvPathname_test = importFolderPath + '/persona_clean_deep.tsv.zip' hex_key = 'train.hex' validation_key = 'test.hex' timeoutSecs = 300 parseResult = h2i.import_parse(path=csvPathname_train, hex_key=hex_key, timeoutSecs=timeoutSecs, doSummary=DO_SUMMARY) parseResultV = h2i.import_parse(path=csvPathname_test, hex_key=validation_key, timeoutSecs=timeoutSecs, doSummary=DO_SUMMARY) inspect = h2o_cmd.runInspect(None, hex_key) print "\n" + csvPathname_train, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) response = 'any_response' #Making random id identifier = ''.join(random.sample(string.ascii_lowercase + string.digits, 10)) model_key = 'nn_' + identifier + '.hex' # use defaults otherwise # need to change epochs otherwise it takes too long kwargs = { 'epochs' : 0.001, 'response' : response, 'destination_key' : model_key, 'validation' : validation_key, } ###expectedErr = 0.0362 ## from single-threaded mode expectedErr = 0.03 ## observed actual value with Hogwild timeoutSecs = 600 start = time.time() nn = h2o_cmd.runDeepLearning(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) print "neural net end on ", csvPathname_train, " and ", csvPathname_test, 'took', time.time() - start, 'seconds' #### Now score using the model, and check the validation error expectedErr = 0.046 relTol = 0.35 # allow 35% tolerance. 
kbn predict_key = 'Predict.hex' kwargs = { 'data_key': validation_key, 'destination_key': predict_key, 'model_key': model_key } predictResult = h2o_cmd.runPredict(timeoutSecs=timeoutSecs, **kwargs) h2o_cmd.runInspect(key=predict_key, verbose=True) kwargs = { } predictCMResult = h2o.nodes[0].predict_confusion_matrix( actual=validation_key, vactual=response, predict=predict_key, vpredict='predict', timeoutSecs=timeoutSecs, **kwargs) cm = predictCMResult['cm'] print h2o_gbm.pp_cm(cm) actualErr = h2o_gbm.pp_cm_summary(cm)/100. print "actual classification error:" + format(actualErr) print "expected classification error:" + format(expectedErr) if actualErr != expectedErr and abs((expectedErr - actualErr)/expectedErr) > relTol: raise Exception("Scored classification error of %s is not within %s %% relative error of %s" % (actualErr, float(relTol)*100, expectedErr))
def test_GBM_covtype_train_test(self): h2o.beta_features = False bucket = 'home-0xdiag-datasets' modelKey = 'GBMModelKey' files = [ ('standard', 'covtype.shuffled.90pct.data', 'covtype.train.hex', 1800, 'C55', 'covtype.shuffled.10pct.data', 'covtype.test.hex') ] # h2b.browseTheCloud() for (importFolderPath, trainFilename, trainKey, timeoutSecs, response, testFilename, testKey) in files: h2o.beta_features = False #turn off beta_features # PARSE train**************************************** start = time.time() xList = [] eList = [] fList = [] # Parse (train)**************************************** if h2o.beta_features: print "Parsing to fvec directly! Have to noPoll=true!, and doSummary=False!" parseTrainResult = h2i.import_parse(bucket=bucket, path=importFolderPath + "/" + trainFilename, schema='local', hex_key=trainKey, timeoutSecs=timeoutSecs, noPoll=h2o.beta_features, doSummary=False) # hack if h2o.beta_features: h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs) print "Filling in the parseTrainResult['destination_key'] for h2o" parseTrainResult['destination_key'] = trainKey elapsed = time.time() - start print "train parse end on ", trainFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "train parse result:", parseTrainResult['destination_key'] # Parse (test)**************************************** if h2o.beta_features: print "Parsing to fvec directly! Have to noPoll=true!, and doSummary=False!" 
parseTestResult = h2i.import_parse(bucket=bucket, path=importFolderPath + "/" + testFilename, schema='local', hex_key=testKey, timeoutSecs=timeoutSecs, noPoll=h2o.beta_features, doSummary=False) # hack if h2o.beta_features: h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs) print "Filling in the parseTestResult['destination_key'] for h2o" parseTestResult['destination_key'] = testKey elapsed = time.time() - start print "test parse end on ", testFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "test parse result:", parseTestResult['destination_key'] # GBM (train iterate)**************************************** inspect = h2o_cmd.runInspect(key=parseTestResult['destination_key']) ntrees = 2 # fails with 40 for max_depth in [40, 5]: params = { 'learn_rate': .2, 'nbins': 1024, 'ntrees': ntrees, 'max_depth': max_depth, 'min_rows': 10, 'response': response, 'ignored_cols_by_name': None, } print "Using these parameters for GBM: ", params kwargs = params.copy() h2o.beta_features = True # translate it (only really need to do once . out of loop? h2o_cmd.runInspect(key=parseTrainResult['destination_key']) ### h2o_cmd.runSummary(key=parsTraineResult['destination_key']) # GBM train**************************************** trainStart = time.time() gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult, noPoll=True, timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs) # hack if h2o.beta_features: h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs) trainElapsed = time.time() - trainStart print "GBM training completed in", trainElapsed, "seconds. On dataset: ", trainFilename gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey) # errrs from end of list? is that the last tree? 
errsLast = gbmTrainView['gbm_model']['errs'][-1] print "GBM 'errsLast'", errsLast cm = gbmTrainView['gbm_model']['cms'][-1]['_arr'] pctWrongTrain = h2o_gbm.pp_cm_summary(cm); print "Last line of this cm might be NAs, not CM" print "\nTrain\n==========\n" print h2o_gbm.pp_cm(cm) # GBM test**************************************** predictKey = 'Predict.hex' h2o_cmd.runInspect(key=parseTestResult['destination_key']) start = time.time() gbmTestResult = h2o_cmd.runPredict( data_key=parseTestResult['destination_key'], model_key=modelKey, destination_key=predictKey, timeoutSecs=timeoutSecs) # hack if h2o.beta_features: h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs) elapsed = time.time() - start print "GBM predict completed in", elapsed, "seconds. On dataset: ", testFilename gbmPredictCMResult =h2o.nodes[0].predict_confusion_matrix( actual=parseTestResult['destination_key'], vactual=response, predict=predictKey, vpredict='predict', # choices are 7 (now) and 'predict' ) # errrs from end of list? is that the last tree? # all we get is cm cm = gbmPredictCMResult['cm'] # These will move into the h2o_gbm.py pctWrong = h2o_gbm.pp_cm_summary(cm); print "\nTest\n==========\n" print h2o_gbm.pp_cm(cm) # xList.append(ntrees) xList.append(max_depth) eList.append(pctWrong) fList.append(trainElapsed) h2o.beta_features = False xLabel = 'max_depth' eLabel = 'pctWrong' fLabel = 'trainElapsed' eListTitle = "" fListTitle = "" h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
def test_GLM2_tmp(self): importFolderPath = "/tmp" csvFilename = 's.csv' bcFilename = 'bc.csv' csvPathname = importFolderPath + "/" + csvFilename bcPathname = importFolderPath + "/" + bcFilename hex_key = csvFilename + ".hex" bc_key = bcFilename + ".hex" # Parse parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=180) inspect = h2o_cmd.runInspect(key=hex_key) print "\n" + csvPathname, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) bcResult = h2i.import_parse(path=bcPathname, schema='put', hex_key=bc_key, timeoutSecs=180) inspect = h2o_cmd.runInspect(key=bc_key) print "\n" + bcPathname, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) # Split Test/Train************************************************ # how many rows for each pct? numRows = inspect['numRows'] trainDataKey = hex_key testDataKey = hex_key # GLM, predict, CM*******************************************************8 kwargs = { 'response': "response", 'non_negative': 0, 'standardize': 1, 'strong_rules': 1, 'alpha': 0, 'max_iter': 100, 'lambda_min_ratio': -1, 'higher_accuracy': 1, 'beta_constraints': bc_key, 'link': "family_default", 'use_all_factor_levels': 0, 'variable_importances': 0, 'lambda': 0, 'prior': 0.00301875221383974, 'nlambdas': -1, 'source': hex_key, 'lambda_search': 0, 'disable_line_search': 0, 'n_folds': 0, 'family': "binomial", 'beta_epsilon': 1e-04, 'intercept': 1, 'max_predictors': -1, # "used_cols"': "4,5,18,37,38,53,66,73,90,93,95,96,112,117,135,158,165,166,168,177,180", # 'ignored_cols': 
"1,2,3,4,5,6,7,8,9,11,12,14,15,16,17,18,19,20,21,22,23,24,25,26,27,29,31,32,34,35,36,37,38,40,41,42,43,44,45,46,47,48,49,51,52,53,54,55,56,57,58,59,60,61,62,63,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,91,92,93,94,95,96,97,98,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,119,120,121,123,124,125,126,128,129,133,134,135,136,137,138,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,157,158,159,160,161,162,163,164,165,166,167,168,169,170,171,173,174,176,177,178,179", } timeoutSecs = 180 for trial in range(10): parseKey = trainDataKey # GLM **********************************************8 start = time.time() glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=180, **kwargs) print "glm end on ", parseResult[ 'destination_key'], 'took', time.time() - start, 'seconds' h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) modelKey = glm['glm_model']['_key'] # Score ********************************************** predictKey = 'Predict.hex' start = time.time() predictResult = h2o_cmd.runPredict(data_key=testDataKey, model_key=modelKey, destination_key=predictKey, timeoutSecs=timeoutSecs) predictCMResult = h2o.nodes[0].predict_confusion_matrix( actual=testDataKey, vactual='response', predict=predictKey, vpredict='predict', ) cm = predictCMResult['cm'] # These will move into the h2o_gbm.py pctWrong = h2o_gbm.pp_cm_summary(cm) self.assertLess(pctWrong, 8, "Should see less than 7% error") print "\nTest\n==========\n" print h2o_gbm.pp_cm(cm) print "Trial #", trial, "completed"
errsLast = gbmTrainView['gbm_model']['errs'][-1] print "GBM 'errsLast'", errsLast cm = gbmTrainView['gbm_model']['cms'][-1]['_arr'] pctWrongTrain = h2o_gbm.pp_cm_summary(cm); print "Last line of this cm might be NAs, not CM" print "\nTrain\n==========\n" print h2o_gbm.pp_cm(cm) # GBM test**************************************** predictKey = 'Predict.hex' h2o_cmd.runInspect(key=parseTestResult['destination_key']) start = time.time() gbmTestResult = h2o_cmd.runPredict( data_key=parseTestResult['destination_key'], model_key=modelKey, destination_key=predictKey, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "GBM predict completed in", elapsed, "seconds. On dataset: ", testFilename gbmPredictCMResult =h2o.nodes[0].predict_confusion_matrix( actual=parseTestResult['destination_key'], vactual=response, predict=predictKey, vpredict='predict', # choices are 7 (now) and 'predict' ) # errrs from end of list? is that the last tree? # all we get is cm cm = gbmPredictCMResult['cm']
def test_GLM2_covtype_train_predict_all_all(self): importFolderPath = "standard" csvFilename = "covtype.shuffled.data" csvPathname = importFolderPath + "/" + csvFilename hex_key = csvFilename + ".hex" # Parse and Exec************************************************ parseResult = h2i.import_parse( bucket="home-0xdiag-datasets", path=csvPathname, schema="put", hex_key=hex_key, timeoutSecs=180 ) execExpr = "A.hex=%s" % parseResult["destination_key"] h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) # use exec to change the output col to binary, case_mode/case_val doesn't work if we use predict # will have to live with random extract. will create variance # class 4 = 1, everything else 0 y = 54 execExpr = "A.hex[,%s]=(A.hex[,%s]==%s)" % (y + 1, y + 1, 1) # class 1 h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) inspect = h2o_cmd.runInspect(key="A.hex") print "\n" + csvPathname, " numRows:", "{:,}".format(inspect["numRows"]), " numCols:", "{:,}".format( inspect["numCols"] ) print "Use same data (full) for train and test" trainDataKey = "A.hex" testDataKey = "A.hex" # start at 90% rows + 1 # GLM, predict, CM*******************************************************8 kwargs = { "response": "C" + str(y + 1), "max_iter": 20, "n_folds": 0, # 'alpha': 0.1, # 'lambda': 1e-5, "alpha": 0.0, "lambda": None, "family": "binomial", } timeoutSecs = 60 for trial in range(1): # test/train split **********************************************8 aHack = {"destination_key": trainDataKey} # GLM **********************************************8 start = time.time() glm = h2o_cmd.runGLM(parseResult=aHack, timeoutSecs=timeoutSecs, pollTimeoutSecs=180, **kwargs) print "glm end on ", parseResult["destination_key"], "took", time.time() - start, "seconds" h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) modelKey = glm["glm_model"]["_key"] submodels = glm["glm_model"]["submodels"] # hackery to make it work when there's just one validation = submodels[-1]["validation"] best_threshold = 
validation["best_threshold"] thresholds = validation["thresholds"] # have to look up the index for the cm, from the thresholds list best_index = None for i, t in enumerate(thresholds): if t == best_threshold: best_index = i break cms = validation["_cms"] cm = cms[best_index] trainPctWrong = h2o_gbm.pp_cm_summary(cm["_arr"]) # Score ********************************************** predictKey = "Predict.hex" start = time.time() predictResult = h2o_cmd.runPredict( data_key=testDataKey, model_key=modelKey, destination_key=predictKey, timeoutSecs=timeoutSecs ) predictCMResult = h2o.nodes[0].predict_confusion_matrix( actual=testDataKey, vactual="C" + str(y + 1), predict=predictKey, vpredict="predict" ) cm = predictCMResult["cm"] # These will move into the h2o_gbm.py pctWrong = h2o_gbm.pp_cm_summary(cm) self.assertEqual( pctWrong, trainPctWrong, "Should see the same error rate on train and predict? (same data set)" ) print "\nTest\n==========\n" print h2o_gbm.pp_cm(cm) print "Trial #", trial, "completed"
def test_DeepLearning_c21(self): importFolderPath = '/mnt/0xcustomer-datasets/c21' csvPathname_train = importFolderPath + '/persona_clean_deep.tsv.zip' csvPathname_test = importFolderPath + '/persona_clean_deep.tsv.zip' hex_key = 'train.hex' validation_key = 'test.hex' timeoutSecs = 300 parseResult = h2i.import_parse(path=csvPathname_train, hex_key=hex_key, timeoutSecs=timeoutSecs, doSummary=DO_SUMMARY) parseResultV = h2i.import_parse(path=csvPathname_test, hex_key=validation_key, timeoutSecs=timeoutSecs, doSummary=DO_SUMMARY) inspect = h2o_cmd.runInspect(None, hex_key) print "\n" + csvPathname_train, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) response = 'any_response' #Making random id identifier = ''.join( random.sample(string.ascii_lowercase + string.digits, 10)) model_key = 'nn_' + identifier + '.hex' # use defaults otherwise # need to change epochs otherwise it takes too long kwargs = { 'epochs': 0.001, 'response': response, 'destination_key': model_key, 'validation': validation_key, } ###expectedErr = 0.0362 ## from single-threaded mode expectedErr = 0.03 ## observed actual value with Hogwild timeoutSecs = 600 start = time.time() nn = h2o_cmd.runDeepLearning(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) print "neural net end on ", csvPathname_train, " and ", csvPathname_test, 'took', time.time( ) - start, 'seconds' #### Now score using the model, and check the validation error expectedErr = 0.046 relTol = 0.35 # allow 35% tolerance. 
kbn predict_key = 'Predict.hex' kwargs = { 'data_key': validation_key, 'destination_key': predict_key, 'model_key': model_key } predictResult = h2o_cmd.runPredict(timeoutSecs=timeoutSecs, **kwargs) h2o_cmd.runInspect(key=predict_key, verbose=True) kwargs = {} predictCMResult = h2o.nodes[0].predict_confusion_matrix( actual=validation_key, vactual=response, predict=predict_key, vpredict='predict', timeoutSecs=timeoutSecs, **kwargs) cm = predictCMResult['cm'] print h2o_gbm.pp_cm(cm) actualErr = h2o_gbm.pp_cm_summary(cm) / 100. print "actual classification error:" + format(actualErr) print "expected classification error:" + format(expectedErr) if actualErr != expectedErr and abs( (expectedErr - actualErr) / expectedErr) > relTol: raise Exception( "Scored classification error of %s is not within %s %% relative error of %s" % (actualErr, float(relTol) * 100, expectedErr))
def test_GLM2_mnist(self): if DO_HDFS: importFolderPath = "mnist" bucket = None schema = 'hdfs' else: importFolderPath = "mnist" bucket = 'home-0xdiag-datasets' schema = 'local' csvFilelist = [ ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600), ] trial = 0 for (trainCsvFilename, testCsvFilename, timeoutSecs) in csvFilelist: trialStart = time.time() # PARSE test**************************************** testKey = testCsvFilename + "_" + str(trial) + ".hex" csvPathname = importFolderPath + "/" + testCsvFilename start = time.time() parseTestResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema=schema, hex_key=testKey, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", testCsvFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseTestResult['destination_key'] print "We won't use this pruning of x on test data. See if it prunes the same as the training" y = 0 # first column is pixel value print "y:" ignoreX = h2o_glm.goodXFromColumnInfo( y, key=parseTestResult['destination_key'], timeoutSecs=300, returnIgnoreX=True) # PARSE train**************************************** trainKey = trainCsvFilename + "_" + str(trial) + ".hex" csvPathname = importFolderPath + "/" + testCsvFilename start = time.time() parseTrainResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema=schema, hex_key=trainKey, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", trainCsvFilename, 'took', elapsed, 'seconds',\ "%d pct. 
of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseTrainResult['destination_key'] # GLM**************************************** print "This is the pruned x we'll use" ignoreX = h2o_glm.goodXFromColumnInfo( y, key=parseTrainResult['destination_key'], timeoutSecs=300, returnIgnoreX=True) print "ignoreX:", ignoreX modelKey = 'GLM_model' params = { 'ignored_cols': ignoreX, 'response': 'C' + str(y + 1), 'family': 'binomial', 'lambda': 0.5, 'alpha': 1e-4, 'max_iter': 15, ## 'thresholds': 0.5, 'n_folds': 1, 'beta_epsilon': 1.0E-4, 'destination_key': modelKey, } if DO_ALL_DIGITS: cases = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] else: cases = [8] for c in cases: kwargs = params.copy() print "Trying binomial with case:", c # kwargs['case_val'] = c # do the binomial conversion with Exec2, for both training and test (h2o won't work otherwise) if DO_BUG: execExpr = "A.hex=%s;A.hex[,%s]=(A.hex[,%s]==%s)" % ( trainKey, y + 1, y + 1, c) h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) else: execExpr = "A.hex=%s" % (trainKey) h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) execExpr = "A.hex[,%s]=(A.hex[,%s]==%s)" % (y + 1, y + 1, c) h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) if DO_BUG: execExpr = "B.hex=%s;B.hex[,%s]=(B.hex[,%s]==%s)" % ( testKey, y + 1, y + 1, c) h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) else: execExpr = "B.hex=%s" % (testKey) h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) execExpr = "B.hex[,%s]=(B.hex[,%s]==%s)" % (y + 1, y + 1, c) h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) timeoutSecs = 1800 start = time.time() aHack = {'destination_key': 'A.hex'} glmFirstResult = h2o_cmd.runGLM(parseResult=aHack, timeoutSecs=timeoutSecs, pollTimeoutSecs=60, noPoll=True, **kwargs) print "\nglmFirstResult:", h2o.dump_json(glmFirstResult) job_key = glmFirstResult['job_key'] h2o_jobs.pollStatsWhileBusy(timeoutSecs=timeoutSecs, pollTimeoutSecs=60, retryDelaySecs=5) # double check...how come the model is bogus? 
h2o_jobs.pollWaitJobs() glm = h2o.nodes[0].glm_view(_modelKey=modelKey) elapsed = time.time() - start print "GLM completed in", elapsed, "seconds.", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) h2o_glm.simpleCheckGLM(self, glm, None, noPrint=True, **kwargs) modelKey = glm['glm_model']['_key'] # This seems wrong..what's the format of the cm? cm = glm['glm_model']['submodels'][0]['validation']['_cms'][ -1]['_arr'] print "cm:", cm pctWrong = h2o_gbm.pp_cm_summary(cm) # self.assertLess(pctWrong, 9,"Should see less than 9% error (class = 4)") print "\nTrain\n==========\n" print h2o_gbm.pp_cm(cm) # Score ******************************* # this messes up if you use case_mode/case_vale above predictKey = 'Predict.hex' start = time.time() predictResult = h2o_cmd.runPredict(data_key='B.hex', model_key=modelKey, destination_key=predictKey, timeoutSecs=timeoutSecs) predictCMResult = h2o.nodes[0].predict_confusion_matrix( actual='B.hex', vactual='C' + str(y + 1), predict=predictKey, vpredict='predict', ) cm = predictCMResult['cm'] # These will move into the h2o_gbm.py pctWrong = h2o_gbm.pp_cm_summary(cm) self.assertLess(pctWrong, 9, "Should see less than 9% error (class = 4)") print "\nTest\n==========\n" print h2o_gbm.pp_cm(cm)
def test_GBM_params_rand2(self): h2o.beta_features = False bucket = 'home-0xdiag-datasets' modelKey = 'GBMModelKey' files = [ # ('standard', 'covtype.shuffled.90pct.data', 'covtype.train.hex', 1800, 54, 'covtype.shuffled.10pct.data', 'covtype.test.hex') ('standard', 'covtype.shuffled.10pct.sorted.data', 'covtype.train.hex', 1800, 54, 'covtype.shuffled.10pct.data', 'covtype.test.hex') ] for (importFolderPath, trainFilename, trainKey, timeoutSecs, response, testFilename, testKey) in files: h2o.beta_features = False #turn off beta_features # PARSE train**************************************** start = time.time() xList = [] eList = [] fList = [] # Parse (train)**************************************** if h2o.beta_features: print "Parsing to fvec directly! Have to noPoll=true!, and doSummary=False!" parseTrainResult = h2i.import_parse(bucket=bucket, path=importFolderPath + "/" + trainFilename, schema='local', hex_key=trainKey, timeoutSecs=timeoutSecs, noPoll=h2o.beta_features, doSummary=False) # hack if h2o.beta_features: h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs) print "Filling in the parseTrainResult['destination_key'] for h2o" parseTrainResult['destination_key'] = trainKey elapsed = time.time() - start print "train parse end on ", trainFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "train parse result:", parseTrainResult['destination_key'] # Parse (test)**************************************** if h2o.beta_features: print "Parsing to fvec directly! Have to noPoll=true!, and doSummary=False!" 
parseTestResult = h2i.import_parse(bucket=bucket, path=importFolderPath + "/" + testFilename, schema='local', hex_key=testKey, timeoutSecs=timeoutSecs, noPoll=h2o.beta_features, doSummary=False) # hack if h2o.beta_features: h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs) print "Filling in the parseTestResult['destination_key'] for h2o" parseTestResult['destination_key'] = testKey elapsed = time.time() - start print "test parse end on ", testFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "test parse result:", parseTestResult['destination_key'] # GBM (train iterate)**************************************** inspect = h2o_cmd.runInspect(key=parseTestResult['destination_key']) paramsDict = define_gbm_params() for trial in range(3): h2o.beta_features = True # translate it (only really need to do once . out of loop? h2o_cmd.runInspect(key=parseTrainResult['destination_key']) ### h2o_cmd.runSummary(key=parsTraineResult['destination_key']) # use this to set any defaults you want if the pick doesn't set params = { 'response': 54, 'ignored_cols_by_name': '0,1,2,3,4', 'ntrees': 2, 'validation': parseTestResult['destination_key'], } h2o_gbm.pickRandGbmParams(paramsDict, params) print "Using these parameters for GBM: ", params kwargs = params.copy() # GBM train**************************************** trainStart = time.time() gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult, noPoll=True, timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs) # hack if h2o.beta_features: h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs) trainElapsed = time.time() - trainStart print "GBM training completed in", trainElapsed, "seconds. On dataset: ", trainFilename gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey) # errrs from end of list? is that the last tree? 
errsLast = gbmTrainView['gbm_model']['errs'][-1] print "GBM 'errsLast'", errsLast cm = gbmTrainView['gbm_model']['cms'][-1] # use the last one pctWrongTrain = h2o_gbm.pp_cm_summary(cm); print "Last line of this cm might be NAs, not CM" print "\nTrain\n==========\n" print h2o_gbm.pp_cm(cm) # GBM test**************************************** predictKey = 'Predict.hex' h2o_cmd.runInspect(key=parseTestResult['destination_key']) start = time.time() gbmTestResult = h2o_cmd.runPredict( data_key=parseTestResult['destination_key'], model_key=modelKey, destination_key=predictKey, timeoutSecs=timeoutSecs) # hack if h2o.beta_features: h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs) elapsed = time.time() - start print "GBM predict completed in", elapsed, "seconds. On dataset: ", testFilename if DO_PREDICT_CM: gbmPredictCMResult = h2o.nodes[0].predict_confusion_matrix( actual=parseTestResult['destination_key'], vactual='predict', predict=predictKey, vpredict='predict', # choices are 7 (now) and 'predict' ) # errrs from end of list? is that the last tree? # all we get is cm cm = gbmPredictCMResult['cms'][-1] # use the last one # These will move into the h2o_gbm.py pctWrong = h2o_gbm.pp_cm_summary(cm); print "Last line of this cm is really NAs, not CM" print "\nTest\n==========\n" print h2o_gbm.pp_cm(cm) # xList.append(ntrees) if 'max_depth' in params and params['max_depth']: xList.append(params['max_depth']) eList.append(pctWrongTrain) fList.append(trainElapsed) h2o.beta_features = False xLabel = 'max_depth' eLabel = 'pctWrongTrain' fLabel = 'trainElapsed' eListTitle = "" fListTitle = "" h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
def test_GLM2_enums_score_superset(self): h2o.beta_features = True print "FIX!: this should cause an error. We should detect that it's not causing an error/warning?" SYNDATASETS_DIR = h2o.make_syn_dir() n = 200 tryList = [ (n, 1, 'cD', 300), (n, 2, 'cE', 300), (n, 3, 'cF', 300), (n, 4, 'cG', 300), (n, 5, 'cH', 300), (n, 6, 'cI', 300), ] for (rowCount, colCount, hex_key, timeoutSecs) in tryList: # using the comma is nice to ensure no craziness colSepHexString = '2c' # comma colSepChar = colSepHexString.decode('hex') colSepInt = int(colSepHexString, base=16) print "colSepChar:", colSepChar rowSepHexString = '0a' # newline rowSepChar = rowSepHexString.decode('hex') print "rowSepChar:", rowSepChar SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str( colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename csvScoreFilename = 'syn_enums_score_' + str(rowCount) + 'x' + str( colCount) + '.csv' csvScorePathname = SYNDATASETS_DIR + '/' + csvScoreFilename enumList = create_enum_list(listSize=10) # use half of the enums for creating the scoring dataset enumListForScore = random.sample(enumList, 5) # add a extra enum for scoring that's not in the model enumList enumListForScore.append("xyzzy") print "Creating random", csvPathname, "for glm model building" write_syn_dataset(csvPathname, enumList, rowCount, colCount, SEEDPERFILE, colSepChar=colSepChar, rowSepChar=rowSepChar) print "Creating random", csvScorePathname, "for glm scoring with prior model (using enum subset)" write_syn_dataset(csvScorePathname, enumListForScore, rowCount, colCount, SEEDPERFILE, colSepChar=colSepChar, rowSepChar=rowSepChar) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30, separator=colSepInt) print "Parse result['destination_key']:", parseResult[ 'destination_key'] print "\n" + csvFilename (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \ 
h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=True) y = colCount modelKey = 'enums' kwargs = { 'destination_key': modelKey, 'response': y, 'max_iter': 1, 'n_folds': 1, 'alpha': 0.2, 'lambda': 1e-5, 'family': 'binomial' } start = time.time() glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=180, **kwargs) print "glm end on ", parseResult[ 'destination_key'], 'took', time.time() - start, 'seconds' h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) scoreDataKey = "score_" + hex_key parseResult = h2i.import_parse(path=csvScorePathname, schema='put', hex_key=scoreDataKey, timeoutSecs=30, separator=colSepInt) # Score ******************************* # this messes up if you use case_mode/case_vale above predictKey = 'Predict.hex' start = time.time() predictResult = h2o_cmd.runPredict(data_key=scoreDataKey, model_key=modelKey, destination_key=predictKey, timeoutSecs=timeoutSecs) # just get a predict and AUC on the same data. has to be binomial result resultAUC = h2o.nodes[0].generate_auc(thresholds=None, actual=scoreDataKey, predict='Predict.hex', vactual=y, vpredict=1) auc = resultAUC['AUC'] self.assertAlmostEqual( auc, 0.5, delta=0.15, msg="actual auc: %s not close enough to 0.5" % auc) predictCMResult = h2o.nodes[0].predict_confusion_matrix( actual=scoreDataKey, predict=predictKey, vactual='C' + str(y + 1), vpredict='predict', ) cm = predictCMResult['cm'] # These will move into the h2o_gbm.py pctWrong = h2o_gbm.pp_cm_summary(cm) print "\nTest\n==========\n" print h2o_gbm.pp_cm(cm)
def test_GBM_regression_rand2(self): bucket = 'home-0xdiag-datasets' modelKey = 'GBMModelKey' files = [ # ('standard', 'covtype.shuffled.90pct.data', 'covtype.train.hex', 1800, 54, 'covtype.shuffled.10pct.data', 'covtype.test.hex') ('standard', 'covtype.shuffled.10pct.sorted.data', 'covtype.train.hex', 1800, 'C55', 'covtype.shuffled.10pct.data', 'covtype.test.hex') ] for (importFolderPath, trainFilename, trainKey, timeoutSecs, response, testFilename, testKey) in files: # PARSE train**************************************** start = time.time() xList = [] eList = [] fList = [] # Parse (train)**************************************** parseTrainResult = h2i.import_parse(bucket=bucket, path=importFolderPath + "/" + trainFilename, schema='local', hex_key=trainKey, timeoutSecs=timeoutSecs, doSummary=False) elapsed = time.time() - start print "train parse end on ", trainFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "train parse result:", trainKey # Parse (test)**************************************** parseTestResult = h2i.import_parse(bucket=bucket, path=importFolderPath + "/" + testFilename, schema='local', hex_key=testKey, timeoutSecs=timeoutSecs, doSummary=False) elapsed = time.time() - start print "test parse end on ", testFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "test parse result:", testKey paramsDict = define_gbm_params() for trial in range(3): # use this to set any defaults you want if the pick doesn't set print "Regression!" 
params = { 'response': 'C55', # 'ignored_cols_by_name': 'C5,C6,C7,C8,C9', 'ntrees': 2, 'classification': 0, 'validation': testKey, } h2o_gbm.pickRandGbmParams(paramsDict, params) print "Using these parameters for GBM: ", params kwargs = params.copy() # GBM train**************************************** trainStart = time.time() gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult, timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs) trainElapsed = time.time() - trainStart print "GBM training completed in", trainElapsed, "seconds. On dataset: ", trainFilename gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey) print "gbmTrainView:", h2o.dump_json(gbmTrainView) # errrs from end of list? is that the last tree? errsLast = gbmTrainView['gbm_model']['errs'][-1] print "GBM 'errsLast'", errsLast # for regression, the cms are all null, so don't print # GBM test**************************************** predictKey = 'Predict.hex' start = time.time() gbmTestResult = h2o_cmd.runPredict(data_key=testKey, model_key=modelKey, destination_key=predictKey, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "GBM predict completed in", elapsed, "seconds. On dataset: ", testFilename print "FIX! where do we get the summary info on the test data after predict?"
def test_DeepLearning_twovalues(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() csvFilename = "syn_twovalues.csv" csvPathname = SYNDATASETS_DIR + '/' + csvFilename rowDataTrue = "1, 0, 65, 1, 2, 1, 1, 4, 1, 4, 1, 4" rowDataFalse = "0, 1, 0, -1, -2, -1, -1, -4, -1, -4, -1, -4" twoValueList = [ ('A', 'B', 0, 14), ('A', 'B', 1, 14), (0, 1, 0, 12), (0, 1, 1, 12), (0, 1, 'NaN', 12), (1, 0, 'NaN', 12), (-1, 1, 0, 12), (-1, 1, 1, 12), (-1e1, 1e1, 1e1, 12), (-1e1, 1e1, -1e1, 12), ] trial = 0 for (outputTrue, outputFalse, case, coeffNum) in twoValueList: write_syn_dataset(csvPathname, 20, rowDataTrue, rowDataFalse, str(outputTrue), str(outputFalse)) start = time.time() hex_key = csvFilename + "_" + str(trial) model_key = 'trial_' + str(trial) + '.hex' validation_key = hex_key parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key) print "using outputTrue: %s outputFalse: %s" % (outputTrue, outputFalse) inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print "\n" + csvPathname, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) response = inspect['numCols'] - 1 kwargs = { 'ignored_cols': None, 'response': 'C' + str(response), 'classification': 1, 'activation': 'Tanh', #'input_dropout_ratio' : 0.2, 'hidden': '500', 'rate': 0.01, 'rate_annealing': 1e-6, 'momentum_start': 0, 'momentum_stable': 0, 'l1': 0.0, 'l2': 1e-4, 'seed': 80023842348, 'loss': 'CrossEntropy', #'max_w2' : 15, #'warmup_samples' : 0, 'initial_weight_distribution': 'UniformAdaptive', #'initial_weight_scale' : 0.01, 'epochs': 1.0, 'destination_key': model_key, 'validation': hex_key, } timeoutSecs = 60 start = time.time() h2o_cmd.runDeepLearning(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) print "trial #", trial, "Deep Learning end on ", csvFilename, ' took', time.time( ) - start, 'seconds' #### Now score using the model, and check the validation error expectedErr = 0.001 relTol = 0.01 
predict_key = 'Predict.hex' kwargs = { 'data_key': validation_key, 'destination_key': predict_key, 'model_key': model_key } predictResult = h2o_cmd.runPredict(timeoutSecs=timeoutSecs, **kwargs) h2o_cmd.runInspect(key=predict_key, verbose=True) kwargs = {} predictCMResult = h2o.nodes[0].predict_confusion_matrix( actual=validation_key, vactual=response, predict=predict_key, vpredict='predict', timeoutSecs=timeoutSecs, **kwargs) cm = predictCMResult['cm'] print h2o_gbm.pp_cm(cm) actualErr = h2o_gbm.pp_cm_summary(cm) / 100. print "actual classification error:" + format(actualErr) print "expected classification error:" + format(expectedErr) if actualErr != expectedErr and abs( (expectedErr - actualErr) / expectedErr) > relTol: raise Exception( "Scored classification error of %s is not within %s %% relative error of %s" % (actualErr, float(relTol) * 100, expectedErr)) trial += 1
def test_rf_covtype20x_fvec(self): h2o.beta_features = True importFolderPath = 'standard' if DO_SMALL: csvFilenameTrain = 'covtype.data' hex_key = 'covtype1x.data.A.hex' else: csvFilenameTrain = 'covtype20x.data' hex_key = 'covtype20x.data.A.hex' csvPathname = importFolderPath + "/" + csvFilenameTrain parseResultTrain = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, hex_key=hex_key, timeoutSecs=500) inspect = h2o_cmd.runInspect(key=parseResultTrain['destination_key']) dataKeyTrain = parseResultTrain['destination_key'] print "Parse end", dataKeyTrain # have to re import since source key is gone # we could just copy the key, but sometimes we change the test/train data to covtype.data if DO_SMALL: csvFilenameTest = 'covtype.data' hex_key = 'covtype1x.data.B.hex' dataKeyTest2 = 'covtype1x.data.C.hex' else: csvFilenameTest = 'covtype20x.data' hex_key = 'covtype20x.data.B.hex' dataKeyTest2 = 'covtype20x.data.C.hex' csvPathname = importFolderPath + "/" + csvFilenameTest parseResultTest = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, hex_key=hex_key, timeoutSecs=500) print "Parse result['destination_key']:", parseResultTest['destination_key'] inspect = h2o_cmd.runInspect(key=parseResultTest['destination_key']) dataKeyTest = parseResultTest['destination_key'] print "Parse end", dataKeyTest # make a 3rd key so the predict is uncached too! execExpr = dataKeyTest2 + "=" + dataKeyTest if h2o.beta_features: kwargs = {'str': execExpr, 'timeoutSecs': 15} else: kwargs = {'expression': execExpr, 'timeoutSecs': 15} resultExec = h2o_cmd.runExec(**kwargs) # train # this does RFView to understand when RF completes, so the time reported for RFView here, should be # considered the "first RFView" times..subsequent have some caching?. # unless the no_confusion_matrix works # params is mutable. This is default. 
if h2o.beta_features: paramDict = drf2ParamDict params = { 'ntrees': 20, 'destination_key': 'RF_model' } else: paramDict = drf1ParamDict params = { 'ntree': 20, 'out_of_bag_error_estimate': 1, 'model_key': 'RF_model' } colX = h2o_rf.pickRandRfParams(paramDict, params) kwargs = params.copy() if h2o.beta_features: timeoutSecs = 30 + kwargs['ntrees'] * 60 else: timeoutSecs = 30 + kwargs['ntree'] * 60 start = time.time() rf = h2o_cmd.runRF(parseResult=parseResultTrain, timeoutSecs=timeoutSecs, retryDelaySecs=1, **kwargs) print "rf job end on ", dataKeyTrain, 'took', time.time() - start, 'seconds' print "\nRFView start after job completion" if h2o.beta_features: model_key = kwargs['destination_key'] ntree = kwargs['ntrees'] else: model_key = kwargs['model_key'] ntree = kwargs['ntree'] start = time.time() # this does the RFModel view for v2. but only model_key is used. Data doesn't matter? (nor ntree) h2o_cmd.runRFView(None, dataKeyTrain, model_key, ntree=ntree, timeoutSecs=timeoutSecs) print "First rfview end on ", dataKeyTrain, 'took', time.time() - start, 'seconds' for trial in range(1): # scoring start = time.time() rfView = h2o_cmd.runRFView(None, dataKeyTest, model_key, ntree=ntree, timeoutSecs=timeoutSecs, out_of_bag_error_estimate=0, retryDelaySecs=1) print "rfview", trial, "end on ", dataKeyTest, 'took', time.time() - start, 'seconds.' (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView, ntree=ntree) self.assertAlmostEqual(classification_error, 50, delta=50, msg="Classification error %s differs too much" % classification_error) start = time.time() predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key=dataKeyTest2) print "predict", trial, "end on ", dataKeyTest, 'took', time.time() - start, 'seconds.' 
parseKey = parseResultTrain['destination_key'] rfModelKey = rfView['drf_model']['_key'] predictKey = 'Predict.hex' start = time.time() predictResult = h2o_cmd.runPredict( data_key=parseKey, model_key=rfModelKey, destination_key=predictKey, timeoutSecs=timeoutSecs) predictCMResult = h2o.nodes[0].predict_confusion_matrix( actual=parseKey, vactual='C54', predict=predictKey, vpredict='predict', ) cm = predictCMResult['cm'] # These will move into the h2o_gbm.py pctWrong = h2o_gbm.pp_cm_summary(cm); print "\nTest\n==========\n" print h2o_gbm.pp_cm(cm) print "Trial #", trial, "completed"
def test_GLM2_covtype20x_train(self): h2o.beta_features = True importFolderPath = "standard" csvFilename = 'covtype20x.data' csvPathname = importFolderPath + "/" + csvFilename hex_key = csvFilename + ".hex" # Parse and Exec************************************************ parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=180) execExpr="A.hex=%s" % parseResult['destination_key'] h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) # use exec to change the output col to binary, case_mode/case_val doesn't work if we use predict # will have to live with random extract. will create variance # class 4 = 1, everything else 0 y = 54 execExpr="A.hex[,%s]=(A.hex[,%s]==%s)" % (y+1, y+1, 4) h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) inspect = h2o_cmd.runInspect(key="A.hex") print "\n" + csvPathname, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) # Split Test/Train************************************************ # how many rows for each pct? 
numRows = inspect['numRows'] pct10 = int(numRows * .1) rowsForPct = [i * pct10 for i in range(0,11)] # this can be slightly less than 10% last10 = numRows - rowsForPct[9] rowsForPct[10] = last10 # use mod below for picking "rows-to-do" in case we do more than 9 trials # use 10 if 0 just to see (we copied 10 to 0 above) rowsForPct[0] = rowsForPct[10] print "Creating the key of the last 10% data, for scoring" trainDataKey = "rTrain" testDataKey = "rTest" # start at 90% rows + 1 # GLM, predict, CM*******************************************************8 kwargs = { 'response': 'C' + str(y), 'max_iter': 20, 'n_folds': 0, 'alpha': 0.1, 'lambda': 1e-5, 'family': 'binomial', 'classification': 1, } timeoutSecs = 60 for trial in range(100): # always slice from the beginning rowsToUse = rowsForPct[trial%10] # test/train split **********************************************8 h2o_cmd.createTestTrain(srcKey='A.hex', trainDstKey=trainDataKey, testDstKey=testDataKey, trainPercent=90) aHack = {'destination_key': trainDataKey} parseKey = trainDataKey # GLM **********************************************8 start = time.time() glm = h2o_cmd.runGLM(parseResult=aHack, timeoutSecs=timeoutSecs, pollTimeoutSecs=180, **kwargs) print "glm end on ", parseResult['destination_key'], 'took', time.time() - start, 'seconds' h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) modelKey = glm['glm_model']['_key'] # Score ********************************************** predictKey = 'Predict.hex' start = time.time() predictResult = h2o_cmd.runPredict( data_key=testDataKey, model_key=modelKey, destination_key=predictKey, timeoutSecs=timeoutSecs) predictCMResult = h2o.nodes[0].predict_confusion_matrix( actual=testDataKey, vactual='C' + str(y), predict=predictKey, vpredict='predict', ) cm = predictCMResult['cm'] # These will move into the h2o_gbm.py pctWrong = h2o_gbm.pp_cm_summary(cm); self.assertLess(pctWrong, 8,"Should see less than 7% error (class = 4)") print "\nTest\n==========\n" print h2o_gbm.pp_cm(cm) 
print "Trial #", trial, "completed", "using %6.2f" % (rowsToUse*100.0/numRows), "pct. of all rows"
def test_GLM_enums_unbalanced(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() n = 2000 tryList = [ (n, 1, 'cD', 300), (n, 2, 'cE', 300), (n, 4, 'cF', 300), (n, 8, 'cG', 300), (n, 16, 'cH', 300), (n, 32, 'cI', 300), ] for (rowCount, colCount, hex_key, timeoutSecs) in tryList: # using the comma is nice to ensure no craziness colSepHexString = '2c' # comma colSepChar = colSepHexString.decode('hex') colSepInt = int(colSepHexString, base=16) print "colSepChar:", colSepChar rowSepHexString = '0a' # newline rowSepChar = rowSepHexString.decode('hex') print "rowSepChar:", rowSepChar SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename csvScoreFilename = 'syn_enums_score_' + str(rowCount) + 'x' + str(colCount) + '.csv' csvScorePathname = SYNDATASETS_DIR + '/' + csvScoreFilename enumList = create_enum_list(listSize=10) # use half of the enums for creating the scoring dataset enumListForScore = random.sample(enumList,5) print "Creating random", csvPathname, "for glm2 model building" write_syn_dataset(csvPathname, enumList, rowCount, colCount, SEEDPERFILE, colSepChar=colSepChar, rowSepChar=rowSepChar) print "Creating another random", csvScorePathname, "for glm2 scoring with prior model (using enum subset)" write_syn_dataset(csvScorePathname, enumListForScore, rowCount, colCount, SEEDPERFILE, colSepChar=colSepChar, rowSepChar=rowSepChar) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30, separator=colSepInt) print "Parse result['destination_key']:", parseResult['destination_key'] print "\n" + csvFilename (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \ h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=True) testDataKey = "score_" + hex_key parseResult = h2i.import_parse(path=csvScorePathname, schema='put', hex_key=testDataKey, 
timeoutSecs=30, separator=colSepInt) y = colCount modelKey = 'glm_model' kwargs = { 'standardize': 0, 'destination_key': modelKey, 'response': 'C' + str(y+1), 'max_iter': 200, 'family': 'binomial', 'n_folds': 0, 'alpha': 0, 'lambda': 0, } start = time.time() updateList= [ {'alpha': 0.5, 'lambda': 1e-4}, {'alpha': 0.25, 'lambda': 1e-6}, {'alpha': 0.0, 'lambda': 1e-12}, {'alpha': 0.5, 'lambda': 1e-12}, {'alpha': 0.0, 'lambda': 1e-12}, {'alpha': 0.0, 'lambda': 0}, ] # Try each one for updateDict in updateList: print "\n#################################################################" print updateDict kwargs.update(updateDict) print "If we poll, we get a message saying it was cancelled by user??" glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=180, **kwargs) print "glm2 end on ", parseResult['destination_key'], 'took', time.time() - start, 'seconds' glm_model = glm['glm_model'] _names = glm_model['_names'] modelKey = glm_model['_key'] coefficients_names = glm_model['coefficients_names'] submodels = glm_model['submodels'][0] beta = submodels['beta'] norm_beta = submodels['norm_beta'] iteration = submodels['iteration'] validation = submodels['validation'] if not validation or 'avg_err' not in validation: raise Exception("glm: %s" % h2o.dump_json(glm) + \ "\nNo avg_err in validation." + \ "\nLikely if you look back, the job was cancelled, so there's no cross validation.") avg_err = validation['avg_err'] auc = validation['auc'] aic = validation['aic'] null_deviance = validation['null_deviance'] residual_deviance = validation['residual_deviance'] print '_names', _names print 'coefficients_names', coefficients_names # did beta get shortened? the simple check confirms names/beta/norm_beta are same length print 'beta', beta print 'iteration', iteration print 'avg_err', avg_err print 'auc', auc h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) if iteration > 20: raise Exception("Why take so many iterations: %s in this glm2 training?" 
% iterations) # Score ********************************************** print "Problems with test data having different enums than train? just use train for now" testDataKey = hex_key predictKey = 'Predict.hex' start = time.time() predictResult = h2o_cmd.runPredict( data_key=testDataKey, model_key=modelKey, destination_key=predictKey, timeoutSecs=timeoutSecs) predictCMResult = h2o.nodes[0].predict_confusion_matrix( actual=testDataKey, vactual='C' + str(y+1), predict=predictKey, vpredict='predict', ) cm = predictCMResult['cm'] # These will move into the h2o_gbm.py pctWrong = h2o_gbm.pp_cm_summary(cm); # self.assertLess(pctWrong, 8,"Should see less than 7 pct error (class = 4): %s" % pctWrong) print "\nTest\n==========\n" print h2o_gbm.pp_cm(cm) if 1==0: # stuff from GLM1 classErr = glmScore['validation']['classErr'] auc = glmScore['validation']['auc'] err = glmScore['validation']['err'] nullDev = glmScore['validation']['nullDev'] resDev = glmScore['validation']['resDev'] h2o_glm.simpleCheckGLMScore(self, glmScore, **kwargs) print "score classErr:", classErr print "score err:", err print "score auc:", auc print "score resDev:", resDev print "score nullDev:", nullDev if math.isnan(resDev): emsg = "Why is this resDev = 'nan'?? %6s %s" % ("resDev:\t", validation['resDev']) raise Exception(emsg) # what is reasonable? # self.assertAlmostEqual(err, 0.3, delta=0.15, msg="actual err: %s not close enough to 0.3" % err) self.assertAlmostEqual(auc, 0.5, delta=0.15, msg="actual auc: %s not close enough to 0.5" % auc) if math.isnan(err): emsg = "Why is this err = 'nan'?? %6s %s" % ("err:\t", err) raise Exception(emsg) if math.isnan(resDev): emsg = "Why is this resDev = 'nan'?? %6s %s" % ("resDev:\t", resDev) raise Exception(emsg) if math.isnan(nullDev): emsg = "Why is this nullDev = 'nan'?? %6s %s" % ("nullDev:\t", nullDev)
def test_GBM_manyfiles_train_test(self):
    """Train GBM at two tree depths on manyfiles-nflx-gz data and plot error/time.

    For each entry in ``files`` the train and test files are parsed with
    ``h2o.beta_features`` switched off, and column 378 of both frames is
    rewritten through exec to 1 where the value is above 15 and 0 otherwise.
    GBM is then trained (with ``h2o.beta_features`` switched on) once per
    ``max_depth`` in ``[5, 40]`` with 300 randomly chosen non-response
    columns ignored. The train confusion matrix is printed, the test frame
    is predicted and its confusion matrix printed, and the depth / percent
    wrong / training time are collected and passed to ``h2o_gbm.plotLists``.

    Nothing is asserted; the test fails only if a call raises.
    """
    bucket = 'home-0xdiag-datasets'
    modelKey = 'GBMModelKey'
    # NOTE(review): `localhost` is a bare name here; presumably a module-level
    # flag defined outside this block. Confirm.
    if localhost:
        files = [
            # None forces num_cols to be used. assumes you set it from Inspect
            # problems with categoricals not in the train data set? (warnings in h2o stdout)
            ## ('manyfiles-nflx-gz', 'file_1.dat.gz', 'file_1.hex', 1800, None, 'file_11.dat.gz', 'test.hex')
            # just use matching
            ('manyfiles-nflx-gz', 'file_1.dat.gz', 'train.hex', 1800, None, 'file_1.dat.gz', 'test.hex')
            ]
    else:
        files = [
            # None forces num_cols to be used. assumes you set it from Inspect
            ('manyfiles-nflx-gz', 'file_[0-9].dat.gz', 'train.hex', 1800, None, 'file_1[0-9].dat.gz', 'test.hex')
            ]

    # if I got to hdfs, it's here
    # hdfs://192.168.1.176/datasets/manyfiles-nflx-gz/file_99.dat.gz

    # h2b.browseTheCloud()
    for (importFolderPath, trainFilename, trainKey, timeoutSecs, response, testFilename, testKey) in files:
        # Parsing and exec run with beta_features off; it is turned on just
        # before GBM inside the max_depth loop below.
        h2o.beta_features = False #turn off beta_features
        # PARSE train****************************************
        start = time.time()
        # Accumulators for the plot: depth, test pct wrong, train seconds
        xList = []
        eList = []
        fList = []

        # Parse (train)****************************************
        # beta_features was just set False above, so this branch and the
        # "hack" branch below are skipped unless that assignment is changed.
        if h2o.beta_features:
            print "Parsing to fvec directly! Have to noPoll=true!, and doSummary=False!"
        csvPathname = importFolderPath + "/" + trainFilename
        parseTrainResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local',
            hex_key=trainKey, timeoutSecs=timeoutSecs, noPoll=h2o.beta_features, doSummary=False)
        # hack
        if h2o.beta_features:
            h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs)
            print "Filling in the parseTrainResult['destination_key'] for h2o"
            parseTrainResult['destination_key'] = trainKey

        elapsed = time.time() - start
        print "train parse end on ", trainFilename, 'took', elapsed, 'seconds',\
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
        print "train parse result:", parseTrainResult['destination_key']

        ### h2o_cmd.runSummary(key=parsTraineResult['destination_key'])

        # if you set beta_features here, the fvec translate will happen with the Inspect not the GBM
        # h2o.beta_features = True
        inspect = h2o_cmd.runInspect(key=parseTrainResult['destination_key'])
        print "\n" + csvPathname, \
            " num_rows:", "{:,}".format(inspect['num_rows']), \
            " num_cols:", "{:,}".format(inspect['num_cols'])
        num_rows = inspect['num_rows']
        num_cols = inspect['num_cols']

        # Make col 378 something we can do binomial regression on!
        execExpr = '%s=colSwap(%s,378,(%s[378]>15 ? 1 : 0))' % (trainKey, trainKey, trainKey)
        resultExec = h2o_cmd.runExec(expression=execExpr, timeoutSecs=60)

        # Parse (test)****************************************
        if h2o.beta_features:
            print "Parsing to fvec directly! Have to noPoll=true!, and doSummary=False!"
        parseTestResult = h2i.import_parse(bucket=bucket, path=importFolderPath + "/" + testFilename, schema='local',
            hex_key=testKey, timeoutSecs=timeoutSecs, noPoll=h2o.beta_features, doSummary=False)
        # hack
        if h2o.beta_features:
            h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs)
            print "Filling in the parseTestResult['destination_key'] for h2o"
            parseTestResult['destination_key'] = testKey

        # NOTE(review): `start` was not restarted after the train parse, so
        # this figure also covers the train parse, inspect and exec above.
        elapsed = time.time() - start
        print "test parse end on ", testFilename, 'took', elapsed, 'seconds',\
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
        print "test parse result:", parseTestResult['destination_key']

        # Make col 378 something we can do binomial regression on!
        execExpr = '%s=colSwap(%s,378,(%s[378]>15 ? 1 : 0))' % (testKey, testKey, testKey)
        resultExec = h2o_cmd.runExec(expression=execExpr, timeoutSecs=60)

        # Note ..no inspect of test data here..so translate happens later?

        # GBM (train iterate)****************************************
        # if not response:
        #     response = num_cols - 1
        # The response from the files tuple (None) is overridden here.
        response = 378

        # randomly ignore a bunch of cols, just to make it go faster
        # Relies on range() returning a list (Python 2) so the response
        # index can be deleted from the candidates.
        x = range(num_cols)
        del x[response]
        ignored_cols_by_name = ",".join(map(str,random.sample(x, 300)))

        print "Using the same response %s for train and test (which should have a output value too)" % response

        ntrees = 10
        # ignore random cols (not the response); the sample above takes 300
        for max_depth in [5, 40]:
            params = {
                'learn_rate': .2,
                'nbins': 1024,
                'ntrees': ntrees,
                'max_depth': max_depth,
                'min_rows': 10,
                'response': response,
                'ignored_cols_by_name': ignored_cols_by_name,
            }

            # FORCE_FAIL_CASE is a name from outside this block. When set,
            # the parameters above are replaced wholesale by this fixed set
            # (presumably captured from a failing run; confirm).
            if FORCE_FAIL_CASE:
                params = {'learn_rate': 0.2, 'classification': None, 'min_rows': 10, 'ntrees': 10, 'response': 378, 'nbins': 1024, 'ignored_cols_by_name': '256, 382, 399, 50, 176, 407, 375, 113, 170, 313, 364, 33, 361, 426, 121, 371, 232, 327, 480, 75, 37, 312, 225, 195, 244, 406, 268, 230, 321, 257, 274, 197, 35, 501, 360, 72, 213, 79, 1, 466, 362, 160, 444, 437, 5, 59, 108, 454, 73, 374, 509, 337, 183, 252, 21, 314, 100, 200, 159, 379, 405, 367, 432, 181, 8, 420, 118, 284, 281, 465, 456, 359, 291, 330, 258, 523, 243, 487, 408, 392, 15, 231, 482, 481, 70, 171, 182, 31, 409, 492, 471, 53, 45, 448, 83, 527, 452, 350, 423, 93, 447, 130, 126, 54, 354, 169, 253, 49, 42, 431, 305, 498, 216, 189, 508, 122, 308, 228, 190, 293, 451, 63, 133, 304, 397, 425, 333, 19, 158, 391, 153, 282, 112, 64, 502, 7, 16, 469, 163, 136, 40, 99, 302, 264, 325, 434, 187, 311, 286, 278, 179, 109, 348, 287, 467, 400, 164, 384, 422, 43, 117, 91, 276, 211, 175, 329, 541, 438, 145, 534, 218, 177, 317, 222, 210, 162, 402, 98, 299, 245, 385, 233, 188, 516, 143, 13, 532, 429, 172, 455, 470, 518, 236, 296, 388, 468, 110, 395, 185, 25, 489, 196, 120, 435, 165, 168, 271, 74, 510, 36, 76, 208, 223, 270, 515, 421, 87, 66, 473, 220, 46, 486, 102, 38, 156, 48, 132, 331, 51, 403, 234, 23, 449, 341, 303, 410, 479, 203, 413, 512, 513, 9, 446, 511, 55, 6, 339, 418, 476, 178, 266, 22, 141, 259, 349, 86, 144, 34, 290, 326, 318, 519, 424, 127, 174, 472, 116, 17, 152, 280, 215, 514, 103, 377, 537, 373, 238, 47, 353, 428, 94, 214, 61, 123, 386, 351, 246, 411, 101, 249, 240, 520, 307, 288, 199, 147, 436, 77, 464, 414', 'source': u'test.hex', 'validation': u'test.hex', 'max_depth': 5}

            ### print "Using these parameters for GBM: ", params
            kwargs = params.copy()
            # From here on the model calls run with beta_features enabled.
            h2o.beta_features = True
            # GBM train****************************************
            trainStart = time.time()
            gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult,
                noPoll=True, timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs)
            # hack
            # runGBM was started with noPoll=True, so wait for the job here.
            if h2o.beta_features:
                h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs)
            trainElapsed = time.time() - trainStart
            print "GBM training completed in", trainElapsed, "seconds. On dataset: ", trainFilename

            gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey)
            # errs from end of list? is that the last tree?
            errsLast = gbmTrainView['gbm_model']['errs'][-1]
            print "GBM 'errsLast'", errsLast

            cm = gbmTrainView['gbm_model']['cm']
            pctWrongTrain = h2o_gbm.pp_cm_summary(cm);
            print "Last line of this cm might be NAs, not CM"
            print "\nTrain\n==========\n"
            print h2o_gbm.pp_cm(cm)

            # GBM test****************************************
            predictKey = 'Predict.hex'
            ### h2o_cmd.runInspect(key=parseTestResult['destination_key'])
            start = time.time()
            gbmTestResult = h2o_cmd.runPredict(
                data_key=parseTestResult['destination_key'],
                model_key=modelKey,
                destination_key=predictKey,
                timeoutSecs=timeoutSecs)
            # hack
            if h2o.beta_features:
                h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "GBM predict completed in", elapsed, "seconds. On dataset: ", testFilename

            print "This is crazy!"
            gbmPredictCMResult =h2o.nodes[0].predict_confusion_matrix(
                actual=parseTestResult['destination_key'],
                vactual=response,
                predict=predictKey,
                vpredict='predict', # choices are 0 and 'predict'
                )

            # errs from end of list? is that the last tree?
            # all we get is cm
            cm = gbmPredictCMResult['cm']

            # These will move into the h2o_gbm.py
            pctWrong = h2o_gbm.pp_cm_summary(cm);
            print "Last line of this cm is really NAs, not CM"
            print "\nTest\n==========\n"
            print h2o_gbm.pp_cm(cm)

            # xList.append(ntrees)
            # One plot point per depth: x = depth, e = test pct wrong, f = train time
            xList.append(max_depth)
            eList.append(pctWrong)
            fList.append(trainElapsed)

        # Restore the legacy mode before plotting / the next file.
        h2o.beta_features = False
        xLabel = 'max_depth'
        eLabel = 'pctWrong'
        fLabel = 'trainElapsed'
        eListTitle = ""
        fListTitle = ""
        h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
def test_c9_GLM_rc_fvec(self): h2o.beta_features = True files = [ ('c16', '140k_train_anonymised.csv', 'rc.hex', 1800, None) ] for importFolderPath, csvFilename, trainKey, timeoutSecs, response in files: # PARSE train**************************************** csvPathname = importFolderPath + "/" + csvFilename start = time.time() # avoid printing the coefficient names in jenkins output # the last col is the response, so we use a number to point to it below parseResult = h2i.import_parse(bucket='0xcustomer-datasets', path=csvPathname, schema='local', hex_key=trainKey, header=0, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", csvFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) inspect = h2o_cmd.runInspect(key=parseResult['destination_key']) numRows = inspect['numRows'] numCols = inspect['numCols'] response = numCols-1 # GLM (train)**************************************** params = { # 'lambda': 1e-4, # 'alpha': 0.5, 'lambda': 1e-8, 'alpha': 0.0, 'max_iter': 10, 'n_folds': 0, 'family': 'binomial', 'destination_key': "GLMKEY", 'response': response, } kwargs = params.copy() timeoutSecs = 1800 start = time.time() glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs,**kwargs) elapsed = time.time() - start print "GLM training completed in", elapsed, "seconds. On dataset: ", csvFilename h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) if h2o.beta_features: modelKey = glm['glm_model']['_key'] submodels = glm['glm_model']['submodels'] # hackery to make it work when there's just one validation = submodels[-1]['validation'] best_threshold = validation['best_threshold'] thresholds = validation['thresholds'] # have to look up the index for the cm, from the thresholds list best_index = None for i,t in enumerate(thresholds): if t == best_threshold: best_index = i break cms = validation['_cms'] cm = cms[best_index] pctWrong = h2o_gbm.pp_cm_summary(cm['_arr']); # FIX! 
should look at prediction error/class error? # self.assertLess(pctWrong, 9,"Should see less than 40% error") print "\nTrain\n==========\n" print h2o_gbm.pp_cm(cm['_arr']) # Score ******************************* # this messes up if you use case_mode/case_vale above predictKey = 'Predict.hex' start = time.time() predictResult = h2o_cmd.runPredict( data_key=trainKey, model_key=modelKey, destination_key=predictKey, timeoutSecs=timeoutSecs) predictCMResult = h2o.nodes[0].predict_confusion_matrix( actual=trainKey, vactual=response, predict=predictKey, vpredict='predict', ) cm = predictCMResult['cm'] # These will move into the h2o_gbm.py pctWrong = h2o_gbm.pp_cm_summary(cm); # self.assertLess(pctWrong, 40,"Should see less than 40% error") print "\nTest\n==========\n" print h2o_gbm.pp_cm(cm) h2i.delete_keys_at_all_nodes(timeoutSecs=600)
def test_GBM_regression_rand2(self): h2o.beta_features = True bucket = 'home-0xdiag-datasets' modelKey = 'GBMModelKey' files = [ # ('standard', 'covtype.shuffled.90pct.data', 'covtype.train.hex', 1800, 54, 'covtype.shuffled.10pct.data', 'covtype.test.hex') ('standard', 'covtype.shuffled.10pct.sorted.data', 'covtype.train.hex', 1800, 'C54', 'covtype.shuffled.10pct.data', 'covtype.test.hex') ] for (importFolderPath, trainFilename, trainKey, timeoutSecs, response, testFilename, testKey) in files: # PARSE train**************************************** start = time.time() xList = [] eList = [] fList = [] # Parse (train)**************************************** parseTrainResult = h2i.import_parse(bucket=bucket, path=importFolderPath + "/" + trainFilename, schema='local', hex_key=trainKey, timeoutSecs=timeoutSecs, doSummary=False) elapsed = time.time() - start print "train parse end on ", trainFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "train parse result:", trainKey # Parse (test)**************************************** parseTestResult = h2i.import_parse(bucket=bucket, path=importFolderPath + "/" + testFilename, schema='local', hex_key=testKey, timeoutSecs=timeoutSecs, doSummary=False) elapsed = time.time() - start print "test parse end on ", testFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "test parse result:", testKey paramsDict = define_gbm_params() for trial in range(3): # use this to set any defaults you want if the pick doesn't set print "Regression!" 
params = {'response': 'C54', 'ignored_cols_by_name': 'C5,C6,C7,C8,C9', 'ntrees': 2, 'classification': 0} h2o_gbm.pickRandGbmParams(paramsDict, params) print "Using these parameters for GBM: ", params kwargs = params.copy() # GBM train**************************************** trainStart = time.time() gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult, timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs) trainElapsed = time.time() - trainStart print "GBM training completed in", trainElapsed, "seconds. On dataset: ", trainFilename gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey) # errrs from end of list? is that the last tree? errsLast = gbmTrainView['gbm_model']['errs'][-1] print "GBM 'errsLast'", errsLast # for regression, the cms are all null, so don't print # GBM test**************************************** predictKey = 'Predict.hex' start = time.time() gbmTestResult = h2o_cmd.runPredict( data_key=testKey, model_key=modelKey, destination_key=predictKey, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "GBM predict completed in", elapsed, "seconds. On dataset: ", testFilename print "FIX! where do we get the summary info on the test data after predict?"
def test_GLM2_covtype_train_predict_all_all(self): h2o.beta_features = True importFolderPath = "standard" csvFilename = 'covtype.shuffled.data' csvPathname = importFolderPath + "/" + csvFilename hex_key = csvFilename + ".hex" # Parse and Exec************************************************ parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=180) execExpr="A.hex=%s" % parseResult['destination_key'] h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) # use exec to change the output col to binary, case_mode/case_val doesn't work if we use predict # will have to live with random extract. will create variance # class 4 = 1, everything else 0 y = 54 execExpr="A.hex[,%s]=(A.hex[,%s]==%s)" % (y+1, y+1, 1) # class 1 h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) inspect = h2o_cmd.runInspect(key="A.hex") print "\n" + csvPathname, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) print "Use same data (full) for train and test" trainDataKey = "A.hex" testDataKey = "A.hex" # start at 90% rows + 1 # GLM, predict, CM*******************************************************8 kwargs = { 'response': 'C' + str(y+1), 'max_iter': 20, 'n_folds': 0, # 'alpha': 0.1, # 'lambda': 1e-5, 'alpha': 0.0, 'lambda': None, 'family': 'binomial', } timeoutSecs = 60 for trial in range(1): # test/train split **********************************************8 aHack = {'destination_key': trainDataKey} # GLM **********************************************8 start = time.time() glm = h2o_cmd.runGLM(parseResult=aHack, timeoutSecs=timeoutSecs, pollTimeoutSecs=180, **kwargs) print "glm end on ", parseResult['destination_key'], 'took', time.time() - start, 'seconds' h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) modelKey = glm['glm_model']['_key'] submodels = glm['glm_model']['submodels'] # hackery to make it work when there's just one validation = submodels[-1]['validation'] best_threshold = 
validation['best_threshold'] thresholds = validation['thresholds'] # have to look up the index for the cm, from the thresholds list best_index = None for i,t in enumerate(thresholds): if t == best_threshold: best_index = i break cms = validation['_cms'] cm = cms[best_index] trainPctWrong = h2o_gbm.pp_cm_summary(cm['_arr']); # Score ********************************************** predictKey = 'Predict.hex' start = time.time() predictResult = h2o_cmd.runPredict( data_key=testDataKey, model_key=modelKey, destination_key=predictKey, timeoutSecs=timeoutSecs) predictCMResult = h2o.nodes[0].predict_confusion_matrix( actual=testDataKey, vactual='C' + str(y+1), predict=predictKey, vpredict='predict', ) cm = predictCMResult['cm'] # These will move into the h2o_gbm.py pctWrong = h2o_gbm.pp_cm_summary(cm); self.assertEqual(pctWrong, trainPctWrong,"Should see the same error rate on train and predict? (same data set)") print "\nTest\n==========\n" print h2o_gbm.pp_cm(cm) print "Trial #", trial, "completed"
def test_GBM_manyfiles_train_test(self): bucket = 'home-0xdiag-datasets' modelKey = 'GBMModelKey' if h2o.localhost: files = [ # None forces num_cols to be used. assumes you set it from Inspect ('manyfiles-nflx-gz', 'file_1[0-9][0-9].dat.gz', 'file_100.hex', 1800, None, 'file_1.dat.gz', 'file_1_test.hex') ] else: files = [ # None forces num_cols to be used. assumes you set it from Inspect ('manyfiles-nflx-gz', 'file_[0-9].dat.gz', 'file_10.hex', 1800, None, 'file_1[0-9].dat.gz', 'file_10_test.hex') ] # if I got to hdfs, it's here # hdfs://172.16.2.176/datasets/manyfiles-nflx-gz/file_99.dat.gz # h2b.browseTheCloud() for (importFolderPath, trainFilename, trainKey, timeoutSecs, response, testFilename, testKey) in files: # PARSE train**************************************** start = time.time() xList = [] eList = [] fList = [] # Parse (train)**************************************** csvPathname = importFolderPath + "/" + trainFilename parseTrainResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='s3n', hex_key=trainKey, timeoutSecs=timeoutSecs, doSummary=False) elapsed = time.time() - start print "train parse end on ", trainFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "train parse result:", parseTrainResult['destination_key'] ### h2o_cmd.runSummary(key=parsTraineResult['destination_key']) inspect = h2o_cmd.runInspect(key=parseTrainResult['destination_key']) print "\n" + csvPathname, \ " num_rows:", "{:,}".format(inspect['num_rows']), \ " num_cols:", "{:,}".format(inspect['num_cols']) num_rows = inspect['num_rows'] num_cols = inspect['num_cols'] # Make col 378 it something we can do binomial regression on! execExpr = '%s[,378] = %s[,378]>15 ? 
1 : 0' % (trainKey, trainKey) resultExec = h2o_cmd.runExec(str=execExpr, timeoutSecs=500) # Parse (test)**************************************** parseTestResult = h2i.import_parse(bucket=bucket, path=importFolderPath + "/" + testFilename, schema='local', hex_key=testKey, timeoutSecs=timeoutSecs, doSummary=False) elapsed = time.time() - start print "test parse end on ", testFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "test parse result:", parseTestResult['destination_key'] # Make col 378 it something we can do binomial regression on! print "Slow! exec is converting all imported keys?, not just what was parsed" execExpr = '%s[,378] = %s[,378]>15 ? 1 : 0' % (testKey, testKey, testKey) resultExec = h2o_cmd.runExec(str=execExpr, timeoutSecs=300) # Note ..no inspect of test data here..so translate happens later? # GBM (train iterate)**************************************** # if not response: # response = num_cols - 1 response = 378 print "Using the same response %s for train and test (which should have a output value too)" % response ntrees = 10 for max_depth in [5,10,20,40]: params = { 'learn_rate': .2, 'nbins': 1024, 'ntrees': ntrees, 'max_depth': max_depth, 'min_rows': 10, 'response': response, # 'ignored_cols': } print "Using these parameters for GBM: ", params kwargs = params.copy() # GBM train**************************************** trainStart = time.time() gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult, timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs) trainElapsed = time.time() - trainStart print "GBM training completed in", trainElapsed, "seconds. On dataset: ", trainFilename gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey) # errrs from end of list? is that the last tree? 
errsLast = gbmTrainView['gbm_model']['errs'][-1] print "GBM 'errsLast'", errsLast cm = gbmTrainView['gbm_model']['cm'] pctWrongTrain = h2o_gbm.pp_cm_summary(cm); print "Last line of this cm might be NAs, not CM" print "\nTrain\n==========\n" print h2o_gbm.pp_cm(cm) # GBM test**************************************** if doPredict: predictKey = 'Predict.hex' ### h2o_cmd.runInspect(key=parseTestResult['destination_key']) start = time.time() gbmTestResult = h2o_cmd.runPredict( data_key=parseTestResult['destination_key'], model_key=modelKey, destination_key=predictKey, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "GBM predict completed in", elapsed, "seconds. On dataset: ", testFilename print "This is crazy!" gbmPredictCMResult =h2o.nodes[0].predict_confusion_matrix( actual=parseTestResult['destination_key'], vactual=response, predict=predictKey, vpredict='predict', # choices are 0 and 'predict' ) # errrs from end of list? is that the last tree? # all we get is cm cm = gbmPredictCMResult['cm'] # These will move into the h2o_gbm.py pctWrong = h2o_gbm.pp_cm_summary(cm); print "Last line of this cm is really NAs, not CM" print "\nTest\n==========\n" print h2o_gbm.pp_cm(cm) # xList.append(ntrees) xList.append(max_depth) eList.append(pctWrong) fList.append(trainElapsed) if doPredict: xLabel = 'max_depth' eLabel = 'pctWrong' fLabel = 'trainElapsed' eListTitle = "" fListTitle = "" h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
def test_GLM2_mnist_reals(self): h2o.beta_features = True importFolderPath = "mnist" csvFilelist = [ ("mnist_reals_training.csv.gz", "mnist_reals_testing.csv.gz", 600), ] trial = 0 for (trainCsvFilename, testCsvFilename, timeoutSecs) in csvFilelist: trialStart = time.time() # PARSE test**************************************** testKey = testCsvFilename + "_" + str(trial) + ".hex" start = time.time() parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path="mnist/" + testCsvFilename, schema='put', hex_key=testKey, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", testCsvFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "We won't use this pruning of x on test data. See if it prunes the same as the training" y = 0 # first column is pixel value print "y:" x = h2o_glm.goodXFromColumnInfo(y, key=parseResult['destination_key'], timeoutSecs=300) # PARSE train**************************************** trainKey = trainCsvFilename + "_" + str(trial) + ".hex" start = time.time() parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path="mnist/" + trainCsvFilename, schema='put', hex_key=trainKey, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", trainCsvFilename, 'took', elapsed, 'seconds',\ "%d pct. 
of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseResult['destination_key'] # GLM**************************************** print "This is the pruned x GLM will use" x = h2o_glm.goodXFromColumnInfo(y, key=parseResult['destination_key'], timeoutSecs=300) print "x:", x modelKey = "mnist" params = { 'response': y, 'family': 'binomial', 'lambda': 1.0E-5, 'alpha': 0.0, 'max_iter': 10, 'n_folds': 1, 'beta_epsilon': 1.0E-4, 'destination_key': modelKey } # for c in [0,1,2,3,4,5,6,7,8,9]: # just do a couple digits for c in [0,7]: print "Trying binomial with case:", c execExpr="A.hex=%s;A.hex[,%s]=(A.hex[,%s]==%s)" % (trainKey, y+1, y+1, c) h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) kwargs = params.copy() timeoutSecs = 1800 start = time.time() aHack = {'destination_key': 'A.hex'} glm = h2o_cmd.runGLM(parseResult=aHack, timeoutSecs=timeoutSecs, pollTimeoutSecs=60, **kwargs) elapsed = time.time() - start print "GLM completed in", elapsed, "seconds.", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) h2o_glm.simpleCheckGLM(self, glm, None, noPrint=True, **kwargs) # Score ********************************************** execExpr="B.hex=%s;B.hex[,%s]=(B.hex[,%s]==%s)" % (testKey, y+1, y+1, c) h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) print "Problems with test data having different enums than train? just use train for now" predictKey = 'Predict.hex' start = time.time() predictResult = h2o_cmd.runPredict( data_key="B.hex", model_key=modelKey, destination_key=predictKey, timeoutSecs=timeoutSecs) predictCMResult = h2o.nodes[0].predict_confusion_matrix( actual="B.hex", vactual='C' + str(y+1), predict=predictKey, vpredict='predict', ) cm = predictCMResult['cm'] # These will move into the h2o_gbm.py pctWrong = h2o_gbm.pp_cm_summary(cm); # self.assertLess(pctWrong, 8,"Should see less than 7 pct error (class = 4): %s" % pctWrong) print "\nTest\n==========\n" print h2o_gbm.pp_cm(cm)
def test_NN_mnist(self): #h2b.browseTheCloud() csvPathname_train = 'standard/covtype.shuffled.90pct.data' csvPathname_test = 'standard/covtype.shuffled.10pct.data' hex_key = 'covtype.hex' validation_key = 'covtype.hex' timeoutSecs = 30 parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname_train, schema='local', hex_key=hex_key, timeoutSecs=timeoutSecs) parseResultV = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname_test, schema='local', hex_key=validation_key, timeoutSecs=timeoutSecs) inspect = h2o_cmd.runInspect(None, hex_key) print "\n" + csvPathname_train, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) response = inspect['numCols'] - 1 #Making random id identifier = ''.join( random.sample(string.ascii_lowercase + string.digits, 10)) model_key = 'nn_' + identifier + '.hex' kwargs = { 'ignored_cols': None, 'response': response, 'classification': 1, 'activation': 'RectifierWithDropout', 'input_dropout_ratio': 0.2, 'hidden': '117,131,129', 'adaptive_rate': 0, 'rate': 0.005, 'rate_annealing': 1e-6, 'momentum_start': 0.5, 'momentum_ramp': 100000, 'momentum_stable': 0.9, 'l1': 0.00001, 'l2': 0.0000001, 'seed': 98037452452, 'loss': 'CrossEntropy', 'max_w2': 15, 'initial_weight_distribution': 'UniformAdaptive', #'initial_weight_scale' : 0.01, 'epochs': 96.0, 'destination_key': model_key, 'validation': validation_key, 'score_interval': 10000 } expectedErr = 0.24 ## expected validation error for the above model relTol = 0.20 ## 20% rel. error tolerance due to Hogwild! 
timeoutSecs = 600 start = time.time() nn = h2o_cmd.runDeepLearning(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) print "neural net end on ", csvPathname_train, " and ", csvPathname_test, 'took', time.time( ) - start, 'seconds' predict_key = 'score_' + identifier + '.hex' kwargs = { 'data_key': validation_key, 'destination_key': predict_key, 'model_key': model_key } predictResult = h2o_cmd.runPredict(timeoutSecs=timeoutSecs, **kwargs) h2o_cmd.runInspect(key=predict_key, verbose=True) kwargs = {} predictCMResult = h2o.nodes[0].predict_confusion_matrix( actual=validation_key, vactual=response, predict=predict_key, vpredict='predict', timeoutSecs=timeoutSecs, **kwargs) cm = predictCMResult['cm'] print h2o_gbm.pp_cm(cm) actualErr = h2o_gbm.pp_cm_summary(cm) / 100. print "actual classification error:" + format(actualErr) print "expected classification error:" + format(expectedErr) if actualErr != expectedErr and abs( (expectedErr - actualErr) / expectedErr) > relTol: raise Exception( "Scored classification error of %s is not within %s %% relative error of %s" % (actualErr, float(relTol) * 100, expectedErr))
def test_NN_mnist(self): #h2b.browseTheCloud() csvPathname_train = 'mnist/train.csv.gz' csvPathname_test = 'mnist/test.csv.gz' hex_key = 'mnist_train.hex' validation_key = 'mnist_test.hex' timeoutSecs = 30 parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname_train, schema='put', hex_key=hex_key, timeoutSecs=timeoutSecs) parseResultV = h2i.import_parse(bucket='smalldata', path=csvPathname_test, schema='put', hex_key=validation_key, timeoutSecs=timeoutSecs) inspect = h2o_cmd.runInspect(None, hex_key) print "\n" + csvPathname_train, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) response = inspect['numCols'] - 1 #Making random id identifier = ''.join(random.sample(string.ascii_lowercase + string.digits, 10)) model_key = 'nn_' + identifier + '.hex' kwargs = { 'ignored_cols' : None, 'response' : response, 'classification' : 1, 'activation' : 'RectifierWithDropout', 'input_dropout_ratio' : 0.2, 'hidden' : '117,131,129', 'adaptive_rate' : 0, 'rate' : 0.005, 'rate_annealing' : 1e-6, 'momentum_start' : 0.5, 'momentum_ramp' : 100000, 'momentum_stable' : 0.9, 'l1' : 0.00001, 'l2' : 0.0000001, 'seed' : 98037452452, 'loss' : 'CrossEntropy', 'max_w2' : 15, 'initial_weight_distribution' : 'UniformAdaptive', #'initial_weight_scale' : 0.01, 'epochs' : 2.0, 'destination_key' : model_key, 'validation' : validation_key, 'score_interval' : 10000 } expectedErr = 0.057 ## expected validation error for the above model relTol = 0.20 ## 20% rel. error tolerance due to Hogwild! 
timeoutSecs = 600 start = time.time() nn = h2o_cmd.runDeepLearning(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) print "neural net end on ", csvPathname_train, " and ", csvPathname_test, 'took', time.time() - start, 'seconds' predict_key = 'score_' + identifier + '.hex' kwargs = { 'data_key': validation_key, 'destination_key': predict_key, 'model_key': model_key } predictResult = h2o_cmd.runPredict(timeoutSecs=timeoutSecs, **kwargs) h2o_cmd.runInspect(key=predict_key, verbose=True) kwargs = { } predictCMResult = h2o.nodes[0].predict_confusion_matrix( actual=validation_key, vactual=response, predict=predict_key, vpredict='predict', timeoutSecs=timeoutSecs, **kwargs) cm = predictCMResult['cm'] print h2o_gbm.pp_cm(cm) actualErr = h2o_gbm.pp_cm_summary(cm)/100.; print "actual classification error:" + format(actualErr) print "expected classification error:" + format(expectedErr) if actualErr != expectedErr and abs((expectedErr - actualErr)/expectedErr) > relTol: raise Exception("Scored classification error of %s is not within %s %% relative error of %s" % (actualErr, float(relTol)*100, expectedErr))
def test_rf_covtype20x_fvec(self): h2o.beta_features = True importFolderPath = 'standard' if DO_SMALL: csvFilenameTrain = 'covtype.data' hex_key = 'covtype1x.data.A.hex' else: csvFilenameTrain = 'covtype20x.data' hex_key = 'covtype20x.data.A.hex' csvPathname = importFolderPath + "/" + csvFilenameTrain parseResultTrain = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, hex_key=hex_key, timeoutSecs=500) inspect = h2o_cmd.runInspect(key=parseResultTrain['destination_key']) dataKeyTrain = parseResultTrain['destination_key'] print "Parse end", dataKeyTrain # have to re import since source key is gone # we could just copy the key, but sometimes we change the test/train data to covtype.data if DO_SMALL: csvFilenameTest = 'covtype.data' hex_key = 'covtype1x.data.B.hex' dataKeyTest2 = 'covtype1x.data.C.hex' else: csvFilenameTest = 'covtype20x.data' hex_key = 'covtype20x.data.B.hex' dataKeyTest2 = 'covtype20x.data.C.hex' csvPathname = importFolderPath + "/" + csvFilenameTest parseResultTest = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, hex_key=hex_key, timeoutSecs=500) print "Parse result['destination_key']:", parseResultTest[ 'destination_key'] inspect = h2o_cmd.runInspect(key=parseResultTest['destination_key']) dataKeyTest = parseResultTest['destination_key'] print "Parse end", dataKeyTest # make a 3rd key so the predict is uncached too! execExpr = dataKeyTest2 + "=" + dataKeyTest kwargs = {'str': execExpr, 'timeoutSecs': 15} resultExec = h2o_cmd.runExec(**kwargs) # train # this does RFView to understand when RF completes, so the time reported for RFView here, should be # considered the "first RFView" times..subsequent have some caching?. # unless the no_confusion_matrix works # params is mutable. This is default. 
paramDict = drf2ParamDict params = {'ntrees': 20, 'destination_key': 'RF_model'} colX = h2o_rf.pickRandRfParams(paramDict, params) kwargs = params.copy() timeoutSecs = 30 + kwargs['ntrees'] * 60 start = time.time() rf = h2o_cmd.runRF(parseResult=parseResultTrain, timeoutSecs=timeoutSecs, retryDelaySecs=1, **kwargs) print "rf job end on ", dataKeyTrain, 'took', time.time( ) - start, 'seconds' print "\nRFView start after job completion" model_key = kwargs['destination_key'] ntree = kwargs['ntrees'] start = time.time() # this does the RFModel view for v2. but only model_key is used. Data doesn't matter? (nor ntree) h2o_cmd.runRFView(None, dataKeyTrain, model_key, ntree=ntree, timeoutSecs=timeoutSecs) print "First rfview end on ", dataKeyTrain, 'took', time.time( ) - start, 'seconds' for trial in range(1): # scoring start = time.time() rfView = h2o_cmd.runRFView(None, dataKeyTest, model_key, ntree=ntree, timeoutSecs=timeoutSecs, out_of_bag_error_estimate=0, retryDelaySecs=1) print "rfview", trial, "end on ", dataKeyTest, 'took', time.time( ) - start, 'seconds.' (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView, ntree=ntree) self.assertAlmostEqual( classification_error, 50, delta=50, msg="Classification error %s differs too much" % classification_error) start = time.time() predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key=dataKeyTest2) print "predict", trial, "end on ", dataKeyTest, 'took', time.time( ) - start, 'seconds.' 
parseKey = parseResultTrain['destination_key'] rfModelKey = rfView['drf_model']['_key'] predictKey = 'Predict.hex' start = time.time() predictResult = h2o_cmd.runPredict(data_key=parseKey, model_key=rfModelKey, destination_key=predictKey, timeoutSecs=timeoutSecs) predictCMResult = h2o.nodes[0].predict_confusion_matrix( actual=parseKey, vactual='C55', predict=predictKey, vpredict='predict', ) cm = predictCMResult['cm'] # These will move into the h2o_gbm.py pctWrong = h2o_gbm.pp_cm_summary(cm) print "\nTest\n==========\n" print h2o_gbm.pp_cm(cm) print "Trial #", trial, "completed"
def test_GLM2_covtype_train(self): h2o.beta_features = True importFolderPath = "standard" csvFilename = 'covtype.shuffled.data' csvPathname = importFolderPath + "/" + csvFilename hex_key = csvFilename + ".hex" # Parse and Exec************************************************ parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=180) execExpr="A.hex=%s" % parseResult['destination_key'] h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) # use exec to change the output col to binary, case_mode/case_val doesn't work if we use predict # will have to live with random extract. will create variance # class 4 = 1, everything else 0 y = 54 execExpr="A.hex[,%s]=(A.hex[,%s]==%s)" % (y+1, y+1, 4) h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) inspect = h2o_cmd.runInspect(key="A.hex") print "\n" + csvPathname, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) # Split Test/Train************************************************ # how many rows for each pct? 
numRows = inspect['numRows'] pct10 = int(numRows * .1) rowsForPct = [i * pct10 for i in range(0,11)] # this can be slightly less than 10% last10 = numRows - rowsForPct[9] rowsForPct[10] = last10 # use mod below for picking "rows-to-do" in case we do more than 9 trials # use 10 if 0 just to see (we copied 10 to 0 above) rowsForPct[0] = rowsForPct[10] print "Creating the key of the last 10% data, for scoring" trainDataKey = "rTrain" testDataKey = "rTest" # start at 90% rows + 1 # GLM, predict, CM*******************************************************8 kwargs = { 'response': 'C' + str(y+1), 'max_iter': 20, 'n_folds': 0, 'alpha': 0.1, 'lambda': 1e-5, 'family': 'binomial', } timeoutSecs = 180 for trial in range(10): # always slice from the beginning rowsToUse = rowsForPct[trial%10] # test/train split **********************************************8 h2o_cmd.createTestTrain(srcKey='A.hex', trainDstKey=trainDataKey, testDstKey=testDataKey, trainPercent=90) aHack = {'destination_key': trainDataKey} parseKey = trainDataKey # GLM **********************************************8 start = time.time() glm = h2o_cmd.runGLM(parseResult=aHack, timeoutSecs=timeoutSecs, pollTimeoutSecs=180, **kwargs) print "glm end on ", parseResult['destination_key'], 'took', time.time() - start, 'seconds' h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) modelKey = glm['glm_model']['_key'] # Score ********************************************** predictKey = 'Predict.hex' start = time.time() predictResult = h2o_cmd.runPredict( data_key=testDataKey, model_key=modelKey, destination_key=predictKey, timeoutSecs=timeoutSecs) predictCMResult = h2o.nodes[0].predict_confusion_matrix( actual=testDataKey, vactual='C' + str(y+1), predict=predictKey, vpredict='predict', ) cm = predictCMResult['cm'] # These will move into the h2o_gbm.py pctWrong = h2o_gbm.pp_cm_summary(cm); self.assertLess(pctWrong, 8,"Should see less than 7% error (class = 4)") print "\nTest\n==========\n" print h2o_gbm.pp_cm(cm) print "Trial #", 
trial, "completed", "using %6.2f" % (rowsToUse*100.0/numRows), "pct. of all rows"
def test_GBM_manyfiles_train_test(self): h2o.beta_features = True bucket = 'home-0xdiag-datasets' modelKey = 'GBMModelKey' if localhost: files = [ # None forces numCols to be used. assumes you set it from Inspect # problems with categoricals not in the train data set? (warnings in h2o stdout) ## ('manyfiles-nflx-gz', 'file_1.dat.gz', 'file_1.hex', 1800, None, 'file_11.dat.gz', 'test.hex') # just use matching ('manyfiles-nflx-gz', 'file_1.dat.gz', 'train.hex', 1800, None, 'file_1.dat.gz', 'test.hex') ] else: files = [ # None forces numCols to be used. assumes you set it from Inspect ('manyfiles-nflx-gz', 'file_[0-9].dat.gz', 'train.hex', 1800, None, 'file_1[0-9].dat.gz', 'test.hex') ] # if I got to hdfs, it's here # hdfs://172.16.2.176/datasets/manyfiles-nflx-gz/file_99.dat.gz h2b.browseTheCloud() for (importFolderPath, trainFilename, trainKey, timeoutSecs, response, testFilename, testKey) in files: # PARSE train**************************************** start = time.time() xList = [] eList = [] fList = [] # Parse (train)**************************************** csvPathname = importFolderPath + "/" + trainFilename parseTrainResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local', hex_key=trainKey, timeoutSecs=timeoutSecs, doSummary=False) elapsed = time.time() - start print "train parse end on ", trainFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "train parse result:", parseTrainResult['destination_key'] ### h2o_cmd.runSummary(key=parsTraineResult['destination_key']) inspect = h2o_cmd.runInspect(key=parseTrainResult['destination_key']) print "\n" + csvPathname, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) numRows = inspect['numRows'] numCols = inspect['numCols'] # Make col 378 it something we can do binomial regression on! 
execExpr = '%s[,378+1]=%s[,378+1]>15' % (trainKey, trainKey) resultExec = h2o_cmd.runExec(str=execExpr, timeoutSecs=300) # Parse (test)**************************************** parseTestResult = h2i.import_parse(bucket=bucket, path=importFolderPath + "/" + testFilename, schema='local', hex_key=testKey, timeoutSecs=timeoutSecs, doSummary=False) elapsed = time.time() - start print "test parse end on ", testFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "test parse result:", parseTestResult['destination_key'] # Make col 378 it something we can do binomial regression on! execExpr = '%s[,378+1]=%s[,378+1]>15' % (testKey, testKey) resultExec = h2o_cmd.runExec(str=execExpr, timeoutSecs=300) # Note ..no inspect of test data here..so translate happens later? # GBM (train iterate)**************************************** # if not response: # response = numCols - 1 response = 378 # randomly ignore a bunch of cols, just to make it go faster x = range(numCols) del x[response] ignored_cols_by_name = ",".join(map(lambda x: 'C' + str(x+1), random.sample(x, 300))) print "Using the same response %s for train and test (which should have a output value too)" % "C" + str(response+1) ntrees = 10 # ignore 200 random cols (not the response) for max_depth in [5, 40]: params = { 'learn_rate': .2, 'nbins': 1024, 'ntrees': ntrees, 'max_depth': max_depth, 'min_rows': 10, 'response': 'C' + str(response+1), 'ignored_cols_by_name': ignored_cols_by_name, } ### print "Using these parameters for GBM: ", params kwargs = params.copy() # GBM train**************************************** trainStart = time.time() gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult, timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs) # hack trainElapsed = time.time() - trainStart print "GBM training completed in", trainElapsed, "seconds. On dataset: ", trainFilename gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey) # errrs from end of list? 
is that the last tree? errsLast = gbmTrainView['gbm_model']['errs'][-1] print "GBM 'errsLast'", errsLast cm = gbmTrainView['gbm_model']['cms'][-1]['_arr'] # use the last one pctWrongTrain = h2o_gbm.pp_cm_summary(cm); print "Last line of this cm might be NAs, not CM" print "\nTrain\n==========\n" print h2o_gbm.pp_cm(cm) # GBM test**************************************** predictKey = 'Predict.hex' ### h2o_cmd.runInspect(key=parseTestResult['destination_key']) start = time.time() gbmTestResult = h2o_cmd.runPredict( data_key=parseTestResult['destination_key'], model_key=modelKey, destination_key=predictKey, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "GBM predict completed in", elapsed, "seconds. On dataset: ", testFilename gbmPredictCMResult =h2o.nodes[0].predict_confusion_matrix( actual=parseTestResult['destination_key'], vactual='C' + str(response+1), predict=predictKey, vpredict='predict', # choices are 0 and 'predict' ) # errrs from end of list? is that the last tree? # all we get is cm cm = gbmPredictCMResult['cm'] # These will move into the h2o_gbm.py pctWrong = h2o_gbm.pp_cm_summary(cm); print "Last line of this cm is really NAs, not CM" print "\nTest\n==========\n" print h2o_gbm.pp_cm(cm) # xList.append(ntrees) xList.append(max_depth) eList.append(pctWrong) fList.append(trainElapsed) xLabel = 'max_depth' eLabel = 'pctWrong' fLabel = 'trainElapsed' eListTitle = "" fListTitle = "" h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
def test_GBM_covtype_train_test(self): h2o.beta_features = False bucket = "home-0xdiag-datasets" modelKey = "GBMModelKey" files = [ ( "standard", "covtype.shuffled.90pct.data", "covtype.train.hex", 1800, 54, "covtype.shuffled.10pct.data", "covtype.test.hex", ) ] # h2b.browseTheCloud() for (importFolderPath, trainFilename, trainKey, timeoutSecs, response, testFilename, testKey) in files: h2o.beta_features = False # turn off beta_features # PARSE train**************************************** start = time.time() xList = [] eList = [] fList = [] # Parse (train)**************************************** if h2o.beta_features: print "Parsing to fvec directly! Have to noPoll=true!, and doSummary=False!" parseTrainResult = h2i.import_parse( bucket=bucket, path=importFolderPath + "/" + trainFilename, schema="local", hex_key=trainKey, timeoutSecs=timeoutSecs, noPoll=h2o.beta_features, doSummary=False, ) # hack if h2o.beta_features: h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs) print "Filling in the parseTrainResult['destination_key'] for h2o" parseTrainResult["destination_key"] = trainKey elapsed = time.time() - start print "train parse end on ", trainFilename, "took", elapsed, "seconds", "%d pct. of timeout" % ( (elapsed * 100) / timeoutSecs ) print "train parse result:", parseTrainResult["destination_key"] # Parse (test)**************************************** if h2o.beta_features: print "Parsing to fvec directly! Have to noPoll=true!, and doSummary=False!" 
parseTestResult = h2i.import_parse( bucket=bucket, path=importFolderPath + "/" + testFilename, schema="local", hex_key=testKey, timeoutSecs=timeoutSecs, noPoll=h2o.beta_features, doSummary=False, ) # hack if h2o.beta_features: h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs) print "Filling in the parseTestResult['destination_key'] for h2o" parseTestResult["destination_key"] = testKey elapsed = time.time() - start print "test parse end on ", testFilename, "took", elapsed, "seconds", "%d pct. of timeout" % ( (elapsed * 100) / timeoutSecs ) print "test parse result:", parseTestResult["destination_key"] # GBM (train iterate)**************************************** inspect = h2o_cmd.runInspect(key=parseTestResult["destination_key"]) x = range(inspect["num_cols"]) del x[response] ntrees = 2 # fails with 40 for max_depth in [40, 5]: params = { "learn_rate": 0.2, "nbins": 1024, "ntrees": ntrees, "max_depth": max_depth, "min_rows": 10, "response": response, "ignored_cols_by_name": None, } print "Using these parameters for GBM: ", params kwargs = params.copy() h2o.beta_features = True # translate it (only really need to do once . out of loop? h2o_cmd.runInspect(key=parseTrainResult["destination_key"]) ### h2o_cmd.runSummary(key=parsTraineResult['destination_key']) # GBM train**************************************** trainStart = time.time() gbmTrainResult = h2o_cmd.runGBM( parseResult=parseTrainResult, noPoll=True, timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs ) # hack if h2o.beta_features: h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs) trainElapsed = time.time() - trainStart print "GBM training completed in", trainElapsed, "seconds. On dataset: ", trainFilename gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey) # errrs from end of list? is that the last tree? 
errsLast = gbmTrainView["gbm_model"]["errs"][-1] print "GBM 'errsLast'", errsLast cm = gbmTrainView["gbm_model"]["cms"][5] # use the mid point pctWrongTrain = h2o_gbm.pp_cm_summary(cm) print "Last line of this cm might be NAs, not CM" print "\nTrain\n==========\n" print h2o_gbm.pp_cm(cm) # GBM test**************************************** predictKey = "Predict.hex" h2o_cmd.runInspect(key=parseTestResult["destination_key"]) start = time.time() gbmTestResult = h2o_cmd.runPredict( data_key=parseTestResult["destination_key"], model_key=modelKey, destination_key=predictKey, timeoutSecs=timeoutSecs, ) # hack if h2o.beta_features: h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs) elapsed = time.time() - start print "GBM predict completed in", elapsed, "seconds. On dataset: ", testFilename gbmPredictCMResult = h2o.nodes[0].predict_confusion_matrix( actual=parseTestResult["destination_key"], vactual=response, predict=predictKey, vpredict="predict", # choices are 7 (now) and 'predict' ) # errrs from end of list? is that the last tree? # all we get is cm cm = gbmPredictCMResult["cms"][-1] # use the last one # These will move into the h2o_gbm.py pctWrong = h2o_gbm.pp_cm_summary(cm) print "\nTest\n==========\n" print h2o_gbm.pp_cm(cm) # xList.append(ntrees) xList.append(max_depth) eList.append(pctWrong) fList.append(trainElapsed) h2o.beta_features = False xLabel = "max_depth" eLabel = "pctWrong" fLabel = "trainElapsed" eListTitle = "" fListTitle = "" h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
def test_GLM2_mnist(self): h2o.beta_features = True if DO_HDFS: importFolderPath = "mnist" bucket = None schema = 'hdfs' else: importFolderPath = "mnist" bucket = 'home-0xdiag-datasets' schema = 'local' csvFilelist = [ ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600), ] trial = 0 for (trainCsvFilename, testCsvFilename, timeoutSecs) in csvFilelist: trialStart = time.time() # PARSE test**************************************** testKey = testCsvFilename + "_" + str(trial) + ".hex" csvPathname = importFolderPath + "/" + testCsvFilename start = time.time() parseTestResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema=schema, hex_key=testKey, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", testCsvFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseTestResult['destination_key'] print "We won't use this pruning of x on test data. See if it prunes the same as the training" y = 0 # first column is pixel value print "y:" ignoreX = h2o_glm.goodXFromColumnInfo(y, key=parseTestResult['destination_key'], timeoutSecs=300, forRF=True) # PARSE train**************************************** trainKey = trainCsvFilename + "_" + str(trial) + ".hex" csvPathname = importFolderPath + "/" + testCsvFilename start = time.time() parseTrainResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema=schema, hex_key=trainKey, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", trainCsvFilename, 'took', elapsed, 'seconds',\ "%d pct. 
of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseTrainResult['destination_key'] # GLM**************************************** print "This is the pruned x we'll use" ignoreX = h2o_glm.goodXFromColumnInfo(y, key=parseTrainResult['destination_key'], timeoutSecs=300, forRF=True) print "ignoreX:", ignoreX modelKey = 'GLM_model' params = { 'ignored_cols': ignoreX, 'response': 'C' + str(y), # 'case_mode': '=', # 'case_val': 0, 'family': 'binomial', 'lambda': 0.5, 'alpha': 1e-4, 'max_iter': 15, ## 'thresholds': 0.5, ## 'weight': 1.0, 'n_folds': 1, 'beta_epsilon': 1.0E-4, 'destination_key': modelKey, } if DO_ALL_DIGITS: cases = [0,1,2,3,4,5,6,7,8,9] else: cases = [8] for c in cases: kwargs = params.copy() print "Trying binomial with case:", c # kwargs['case_val'] = c # do the binomial conversion with Exec2, for both training and test (h2o won't work otherwise) if DO_BUG: execExpr="A.hex=%s;A.hex[,%s]=(A.hex[,%s]==%s)" % (trainKey, y+1, y+1, c) h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) else: execExpr="A.hex=%s" % (trainKey) h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) execExpr="A.hex[,%s]=(A.hex[,%s]==%s)" % (y+1, y+1, c) h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) if DO_BUG: execExpr="B.hex=%s;B.hex[,%s]=(B.hex[,%s]==%s)" % (testKey, y+1, y+1, c) h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) else: execExpr="B.hex=%s" % (testKey) h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) execExpr="B.hex[,%s]=(B.hex[,%s]==%s)" % (y+1, y+1, c) h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) timeoutSecs = 1800 start = time.time() aHack = {'destination_key': 'A.hex'} glmFirstResult = h2o_cmd.runGLM(parseResult=aHack, timeoutSecs=timeoutSecs, pollTimeoutSecs=60, noPoll=True, **kwargs) h2o_jobs.pollStatsWhileBusy(timeoutSecs=timeoutSecs, pollTimeoutSecs=60, retryDelaySecs=5) glm = h2o.nodes[0].glm_view(_modelKey=modelKey) elapsed = time.time() - start print "GLM completed in", elapsed, "seconds.", \ "%d pct. 
of timeout" % ((elapsed*100)/timeoutSecs) h2o_glm.simpleCheckGLM(self, glm, None, noPrint=True, **kwargs) modelKey = glm['glm_model']['_selfKey'] # This seems wrong..what's the format of the cm? if 1==0: cm = glm['glm_model']['submodels'][0]['validation']['_cms'][0]['_arr'] print "cm:", cm pctWrong = h2o_gbm.pp_cm_summary(cm); # self.assertLess(pctWrong, 9,"Should see less than 9% error (class = 4)") print "\nTrain\n==========\n" print h2o_gbm.pp_cm(cm) # Score ******************************* # this messes up if you use case_mode/case_vale above predictKey = 'Predict.hex' start = time.time() predictResult = h2o_cmd.runPredict( data_key='B.hex', model_key=modelKey, destination_key=predictKey, timeoutSecs=timeoutSecs) predictCMResult = h2o.nodes[0].predict_confusion_matrix( actual='B.hex', vactual='C' + str(y), predict=predictKey, vpredict='predict', ) cm = predictCMResult['cm'] # These will move into the h2o_gbm.py pctWrong = h2o_gbm.pp_cm_summary(cm); self.assertLess(pctWrong, 9,"Should see less than 9% error (class = 4)") print "\nTest\n==========\n" print h2o_gbm.pp_cm(cm)
def test_GLM2_covtype_train_predict_all_all(self): h2o.beta_features = True importFolderPath = "standard" csvFilename = 'covtype.shuffled.data' csvPathname = importFolderPath + "/" + csvFilename hex_key = csvFilename + ".hex" # Parse and Exec************************************************ parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=180) execExpr = "A.hex=%s" % parseResult['destination_key'] h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) # use exec to change the output col to binary, case_mode/case_val doesn't work if we use predict # will have to live with random extract. will create variance # class 4 = 1, everything else 0 y = 54 execExpr = "A.hex[,%s]=(A.hex[,%s]==%s)" % (y + 1, y + 1, 1) # class 1 h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) inspect = h2o_cmd.runInspect(key="A.hex") print "\n" + csvPathname, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) print "Use same data (full) for train and test" trainDataKey = "A.hex" testDataKey = "A.hex" # start at 90% rows + 1 # GLM, predict, CM*******************************************************8 kwargs = { 'response': 'C' + str(y + 1), 'max_iter': 20, 'n_folds': 0, # 'alpha': 0.1, # 'lambda': 1e-5, 'alpha': 0.0, 'lambda': None, 'family': 'binomial', } timeoutSecs = 60 for trial in range(1): # test/train split **********************************************8 aHack = {'destination_key': trainDataKey} # GLM **********************************************8 start = time.time() glm = h2o_cmd.runGLM(parseResult=aHack, timeoutSecs=timeoutSecs, pollTimeoutSecs=180, **kwargs) print "glm end on ", parseResult[ 'destination_key'], 'took', time.time() - start, 'seconds' h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) modelKey = glm['glm_model']['_key'] submodels = glm['glm_model']['submodels'] # hackery to make it work when there's just one validation = submodels[-1]['validation'] 
best_threshold = validation['best_threshold'] thresholds = validation['thresholds'] # have to look up the index for the cm, from the thresholds list best_index = None for i, t in enumerate(thresholds): if t == best_threshold: best_index = i break cms = validation['_cms'] cm = cms[best_index] trainPctWrong = h2o_gbm.pp_cm_summary(cm['_arr']) # Score ********************************************** predictKey = 'Predict.hex' start = time.time() predictResult = h2o_cmd.runPredict(data_key=testDataKey, model_key=modelKey, destination_key=predictKey, timeoutSecs=timeoutSecs) predictCMResult = h2o.nodes[0].predict_confusion_matrix( actual=testDataKey, vactual='C' + str(y + 1), predict=predictKey, vpredict='predict', ) cm = predictCMResult['cm'] # These will move into the h2o_gbm.py pctWrong = h2o_gbm.pp_cm_summary(cm) self.assertEqual( pctWrong, trainPctWrong, "Should see the same error rate on train and predict? (same data set)" ) print "\nTest\n==========\n" print h2o_gbm.pp_cm(cm) print "Trial #", trial, "completed"
def test_NN2_mnist_multi(self): #h2b.browseTheCloud() h2o.beta_features = True csvPathname_train = 'mnist/train.csv.gz' csvPathname_test = 'mnist/test.csv.gz' hex_key = 'mnist_train.hex' validation_key = 'mnist_test.hex' timeoutSecs = 90 parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname_train, schema='put', hex_key=hex_key, timeoutSecs=timeoutSecs) parseResultV = h2i.import_parse(bucket='smalldata', path=csvPathname_test, schema='put', hex_key=validation_key, timeoutSecs=timeoutSecs) inspect = h2o_cmd.runInspect(None, hex_key) print "\n" + csvPathname_train, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) response = inspect['numCols'] - 1 #Making random id identifier = ''.join( random.sample(string.ascii_lowercase + string.digits, 10)) model_key = 'nn_' + identifier + '.hex' kwargs = { 'ignored_cols': None, 'response': response, 'classification': 1, 'activation': 'RectifierWithDropout', 'input_dropout_ratio': 0.2, 'hidden': '117,131,129', 'rate': 0.005, 'rate_annealing': 1e-6, 'momentum_start': 0.5, 'momentum_ramp': 100000, 'momentum_stable': 0.9, 'l1': 0.00001, 'l2': 0.0000001, 'seed': 98037452452, 'loss': 'CrossEntropy', 'max_w2': 15, 'initial_weight_distribution': 'UniformAdaptive', #'initial_weight_scale' : 0.01, 'epochs': 20.0, 'destination_key': model_key, 'validation': validation_key, } ###expectedErr = 0.0362 ## from single-threaded mode expectedErr = 0.03 ## observed actual value with Hogwild timeoutSecs = 600 start = time.time() nn = h2o_cmd.runDeepLearning(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) print "neural net end on ", csvPathname_train, " and ", csvPathname_test, 'took', time.time( ) - start, 'seconds' #### Now score using the model, and check the validation error expectedErr = 0.046 relTol = 0.1 predict_key = 'Predict.hex' kwargs = { 'data_key': validation_key, 'destination_key': predict_key, 'model_key': model_key } predictResult = 
h2o_cmd.runPredict(timeoutSecs=timeoutSecs, **kwargs) h2o_cmd.runInspect(key=predict_key, verbose=True) kwargs = {} predictCMResult = h2o.nodes[0].predict_confusion_matrix( actual=validation_key, vactual=response, predict=predict_key, vpredict='predict', timeoutSecs=timeoutSecs, **kwargs) cm = predictCMResult['cm'] print h2o_gbm.pp_cm(cm) actualErr = h2o_gbm.pp_cm_summary(cm) / 100. print "actual classification error:" + format(actualErr) print "expected classification error:" + format(expectedErr) if actualErr != expectedErr and abs( (expectedErr - actualErr) / expectedErr) > relTol: raise Exception( "Scored classification error of %s is not within %s %% relative error of %s" % (actualErr, float(relTol) * 100, expectedErr))
def test_GLM2_mnist(self): if not SCIPY_INSTALLED: pass else: h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() csvFilelist = [ (10000, 500, 'cA', 60), ] trial = 0 for (rowCount, colCount, hex_key, timeoutSecs) in csvFilelist: trialStart = time.time() # PARSE test**************************************** csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + "/" + csvFilename write_syn_dataset(csvPathname, rowCount, colCount) start = time.time() parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", csvFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) # GLM**************************************** modelKey = 'GLM_model' y = colCount kwargs = { 'response': 'C' + str(y+1), 'family': 'binomial', 'lambda': 1e-4, 'alpha': 0, 'max_iter': 15, 'n_folds': 1, 'beta_epsilon': 1.0E-4, 'destination_key': modelKey, } # GLM wants the output col to be strictly 0,1 integer execExpr = "aHack=%s; aHack[,%s] = aHack[,%s]==1" % (hex_key, y+1, y+1) h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) aHack = {'destination_key': 'aHack'} timeoutSecs = 1800 start = time.time() glm = h2o_cmd.runGLM(parseResult=aHack, timeoutSecs=timeoutSecs, pollTimeoutSecs=60, **kwargs) elapsed = time.time() - start print "GLM completed in", elapsed, "seconds.", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) h2o_glm.simpleCheckGLM(self, glm, None, noPrint=True, **kwargs) modelKey = glm['glm_model']['_key'] # This seems wrong..what's the format of the cm? lambdaMax = glm['glm_model']['lambda_max'] print "lambdaMax:", lambdaMax best_threshold= glm['glm_model']['submodels'][0]['validation']['best_threshold'] print "best_threshold", best_threshold # pick the middle one? 
cm = glm['glm_model']['submodels'][0]['validation']['_cms'][5]['_arr'] print "cm:", cm pctWrong = h2o_gbm.pp_cm_summary(cm); # self.assertLess(pctWrong, 9,"Should see less than 9% error (class = 4)") print "\nTrain\n==========\n" print h2o_gbm.pp_cm(cm) # Score ******************************* # this messes up if you use case_mode/case_vale above print "\nPredict\n==========\n" predictKey = 'Predict.hex' start = time.time() predictResult = h2o_cmd.runPredict( data_key='aHack', model_key=modelKey, destination_key=predictKey, timeoutSecs=timeoutSecs) predictCMResult = h2o.nodes[0].predict_confusion_matrix( actual='aHack', vactual='C' + str(y+1), predict=predictKey, vpredict='predict', ) cm = predictCMResult['cm'] # These will move into the h2o_gbm.py pctWrong = h2o_gbm.pp_cm_summary(cm); self.assertLess(pctWrong, 50,"Should see less than 50% error") print "\nTest\n==========\n" print h2o_gbm.pp_cm(cm)
def test_GBM_manyfiles_train_test(self): h2o.beta_features = True bucket = 'home-0xdiag-datasets' modelKey = 'GBMModelKey' if localhost: files = [ # None forces numCols to be used. assumes you set it from Inspect # problems with categoricals not in the train data set? (warnings in h2o stdout) ## ('manyfiles-nflx-gz', 'file_1.dat.gz', 'file_1.hex', 1800, None, 'file_11.dat.gz', 'test.hex') # just use matching ('manyfiles-nflx-gz', 'file_1.dat.gz', 'train.hex', 1800, None, 'file_1.dat.gz', 'test.hex') ] else: files = [ # None forces numCols to be used. assumes you set it from Inspect ('manyfiles-nflx-gz', 'file_[0-9].dat.gz', 'train.hex', 1800, None, 'file_1[0-9].dat.gz', 'test.hex') ] # if I got to hdfs, it's here # hdfs://192.168.1.176/datasets/manyfiles-nflx-gz/file_99.dat.gz h2b.browseTheCloud() for (importFolderPath, trainFilename, trainKey, timeoutSecs, response, testFilename, testKey) in files: # PARSE train**************************************** start = time.time() xList = [] eList = [] fList = [] # Parse (train)**************************************** csvPathname = importFolderPath + "/" + trainFilename parseTrainResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local', hex_key=trainKey, timeoutSecs=timeoutSecs, doSummary=False) elapsed = time.time() - start print "train parse end on ", trainFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "train parse result:", parseTrainResult['destination_key'] ### h2o_cmd.runSummary(key=parsTraineResult['destination_key']) inspect = h2o_cmd.runInspect(key=parseTrainResult['destination_key']) print "\n" + csvPathname, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) numRows = inspect['numRows'] numCols = inspect['numCols'] # Make col 378 it something we can do binomial regression on! 
execExpr = '%s[,378+1]=%s[,378+1]>15' % (trainKey, trainKey) resultExec = h2o_cmd.runExec(str=execExpr, timeoutSecs=60) # Parse (test)**************************************** parseTestResult = h2i.import_parse(bucket=bucket, path=importFolderPath + "/" + testFilename, schema='local', hex_key=testKey, timeoutSecs=timeoutSecs, doSummary=False) elapsed = time.time() - start print "test parse end on ", testFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "test parse result:", parseTestResult['destination_key'] # Make col 378 it something we can do binomial regression on! execExpr = '%s[,378+1]=%s[,378+1]>15' % (testKey, testKey) resultExec = h2o_cmd.runExec(str=execExpr, timeoutSecs=60) # Note ..no inspect of test data here..so translate happens later? # GBM (train iterate)**************************************** # if not response: # response = numCols - 1 # response = 378 response = 'C379' # randomly ignore a bunch of cols, just to make it go faster x = range(numCols) del x[response] ignored_cols_by_name = ",".join(map(lambda x: 'C' + str(x), random.sample(x, 300))) print "Using the same response %s for train and test (which should have a output value too)" % response ntrees = 10 # ignore 200 random cols (not the response) for max_depth in [5, 40]: params = { 'learn_rate': .2, 'nbins': 1024, 'ntrees': ntrees, 'max_depth': max_depth, 'min_rows': 10, 'response': 'C' + str(response), 'ignored_cols_by_name': ignored_cols_by_name, } ### print "Using these parameters for GBM: ", params kwargs = params.copy() # GBM train**************************************** trainStart = time.time() gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult, timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs) # hack trainElapsed = time.time() - trainStart print "GBM training completed in", trainElapsed, "seconds. On dataset: ", trainFilename gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey) # errrs from end of list? 
is that the last tree? errsLast = gbmTrainView['gbm_model']['errs'][-1] print "GBM 'errsLast'", errsLast cm = gbmTrainView['gbm_model']['cms'][-1]['_arr'] # use the last one pctWrongTrain = h2o_gbm.pp_cm_summary(cm); print "Last line of this cm might be NAs, not CM" print "\nTrain\n==========\n" print h2o_gbm.pp_cm(cm) # GBM test**************************************** predictKey = 'Predict.hex' ### h2o_cmd.runInspect(key=parseTestResult['destination_key']) start = time.time() gbmTestResult = h2o_cmd.runPredict( data_key=parseTestResult['destination_key'], model_key=modelKey, destination_key=predictKey, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "GBM predict completed in", elapsed, "seconds. On dataset: ", testFilename gbmPredictCMResult =h2o.nodes[0].predict_confusion_matrix( actual=parseTestResult['destination_key'], vactual='C' + str(response), predict=predictKey, vpredict='predict', # choices are 0 and 'predict' ) # errrs from end of list? is that the last tree? # all we get is cm cm = gbmPredictCMResult['cm'] # These will move into the h2o_gbm.py pctWrong = h2o_gbm.pp_cm_summary(cm); print "Last line of this cm is really NAs, not CM" print "\nTest\n==========\n" print h2o_gbm.pp_cm(cm) # xList.append(ntrees) xList.append(max_depth) eList.append(pctWrong) fList.append(trainElapsed) xLabel = 'max_depth' eLabel = 'pctWrong' fLabel = 'trainElapsed' eListTitle = "" fListTitle = "" h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
def test_c9_GLM_airlines_hdfs(self): files = [ ('datasets', 'airlines_all.csv', 'airlines_all.hex', 1800, 'IsDepDelayed') ] for importFolderPath, csvFilename, trainKey, timeoutSecs, response in files: # PARSE train**************************************** csvPathname = importFolderPath + "/" + csvFilename start = time.time() parseResult = h2i.import_parse(path=csvPathname, schema='hdfs', hex_key=trainKey, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", csvFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseResult['destination_key'] # GLM (train)**************************************** params = { # 'lambda': 1e-4, # 'alpha': 0.5, 'lambda': 1e-8, 'alpha': 0.0, 'max_iter': 10, 'n_folds': 3, 'family': 'binomial', 'destination_key': "GLMKEY", 'response': response, 'ignored_cols': 'CRSDepTime,CRSArrTime,ActualElapsedTime,CRSElapsedTime,AirTime,ArrDelay,DepDelay,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,IsArrDelayed' } kwargs = params.copy() timeoutSecs = 1800 start = time.time() glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs,**kwargs) elapsed = time.time() - start print "GLM training completed in", elapsed, "seconds. On dataset: ", csvFilename h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) modelKey = glm['glm_model']['_key'] submodels = glm['glm_model']['submodels'] # hackery to make it work when there's just one validation = submodels[-1]['validation'] best_threshold = validation['best_threshold'] thresholds = validation['thresholds'] # have to look up the index for the cm, from the thresholds list best_index = None for i,t in enumerate(thresholds): if t == best_threshold: best_index = i break cms = validation['_cms'] cm = cms[best_index] pctWrong = h2o_gbm.pp_cm_summary(cm['_arr']); # FIX! should look at prediction error/class error? 
# self.assertLess(pctWrong, 9,"Should see less than 40% error") print "\nTrain\n==========\n" print h2o_gbm.pp_cm(cm['_arr']) # Score ******************************* # this messes up if you use case_mode/case_vale above predictKey = 'Predict.hex' start = time.time() predictResult = h2o_cmd.runPredict( data_key=trainKey, model_key=modelKey, destination_key=predictKey, timeoutSecs=timeoutSecs) predictCMResult = h2o.nodes[0].predict_confusion_matrix( actual=trainKey, vactual=response, predict=predictKey, vpredict='predict', ) cm = predictCMResult['cm'] # These will move into the h2o_gbm.py pctWrong = h2o_gbm.pp_cm_summary(cm); # self.assertLess(pctWrong, 40,"Should see less than 40% error") print "\nTest\n==========\n" print h2o_gbm.pp_cm(cm) h2i.delete_keys_at_all_nodes(timeoutSecs=600)