def glm_score(self, csvFilename, csvPathname, modelKey, thresholds="0.5", timeoutSecs=30, pollTimeoutSecs=30): print "\nStarting GLM score of", csvFilename key2 = csvFilename + ".hex" parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=key2, timeoutSecs=timeoutSecs, pollTimeoutSecs=pollTimeoutSecs) y = "10" x = "" kwargs = {'x': x, 'y': y, 'case': -1, 'thresholds': 0.5} start = time.time() glmScore = h2o_cmd.runGLMScore(key=key2, model_key=modelKey, thresholds="0.5", timeoutSecs=timeoutSecs) print "GLMScore in", (time.time() - start), "secs (python)" h2o.verboseprint(h2o.dump_json(glmScore)) ### h2o_glm.simpleCheckGLM(self, glm, 7, **kwargs) # compare this glm to the first one. since the files are replications, # the results # should be similar? # UPDATE: format for returning results is slightly different than normal GLM validation = glmScore['validation'] if self.validations1: h2o_glm.compareToFirstGlm(self, 'err', validation, self.validations1) else: self.validations1 = copy.deepcopy(validation)
def glm_score(self, csvFilename, csvPathname, modelKey, timeoutSecs=3): print "\nStarting GLM score of", csvFilename key2 = csvFilename + ".hex" parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=key2, timeoutSecs=10) y = "10" x = "" kwargs = {'x': x, 'y': y, 'case': -1, 'thresholds': 0.5} start = time.time() glmScore = h2o_cmd.runGLMScore(key=key2, model_key=modelKey, timeoutSecs=timeoutSecs) print "GLMScore in", (time.time() - start), "secs (python)" h2o.verboseprint(h2o.dump_json(glmScore)) ### h2o_glm.simpleCheckGLM(self, glm, 7, **kwargs) # compare this glm to the first one. since the files are replications, the results # should be similar? GLMModel = glmScore['GLMModel'] validationsList = glmScore['GLMModel']['validations'] validations = validationsList[0] if self.validations1: h2o_glm.compareToFirstGlm(self, 'err', validations, self.validations1) else: self.validations1 = copy.deepcopy(validations)
def test_GLM_enums_unbalanced(self): SYNDATASETS_DIR = h2o.make_syn_dir() n = 2000 tryList = [ (n, 1, 'cD', 300), (n, 2, 'cE', 300), (n, 4, 'cF', 300), (n, 8, 'cG', 300), (n, 16, 'cH', 300), (n, 32, 'cI', 300), ] for (rowCount, colCount, hex_key, timeoutSecs) in tryList: # using the comma is nice to ensure no craziness colSepHexString = '2c' # comma colSepChar = colSepHexString.decode('hex') colSepInt = int(colSepHexString, base=16) print "colSepChar:", colSepChar rowSepHexString = '0a' # newline rowSepChar = rowSepHexString.decode('hex') print "rowSepChar:", rowSepChar SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename csvScoreFilename = 'syn_enums_score_' + str(rowCount) + 'x' + str(colCount) + '.csv' csvScorePathname = SYNDATASETS_DIR + '/' + csvScoreFilename enumList = create_enum_list(listSize=10) # use half of the enums for creating the scoring dataset enumListForScore = random.sample(enumList,5) print "Creating random", csvPathname, "for glm model building" write_syn_dataset(csvPathname, enumList, rowCount, colCount, SEEDPERFILE, colSepChar=colSepChar, rowSepChar=rowSepChar) print "Creating random", csvScorePathname, "for glm scoring with prior model (using enum subset)" write_syn_dataset(csvScorePathname, enumListForScore, rowCount, colCount, SEEDPERFILE, colSepChar=colSepChar, rowSepChar=rowSepChar) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30, separator=colSepInt) print csvFilename, 'parse time:', parseResult['response']['time'] print "Parse result['destination_key']:", parseResult['destination_key'] print "\n" + csvFilename (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \ h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=True) parseResult = h2i.import_parse(path=csvScorePathname, schema='put', hex_key="score_" + 
hex_key, timeoutSecs=30, separator=colSepInt) y = colCount kwargs = { 'y': y, 'max_iter': 200, 'family': 'binomial', 'n_folds': 10, 'alpha': 0, 'lambda': 0, 'thresholds': 0.5, # 'case_mode': '=', # 'case': 0, } start = time.time() updateList= [ {'alpha': 0.5, 'lambda': 1e-4}, {'alpha': 0.25, 'lambda': 1e-6}, {'alpha': 0.0, 'lambda': 1e-8}, {'alpha': 0.5, 'lambda': 0.0}, {'alpha': 0.0, 'lambda': 0.0}, ] # Try each one h2o.beta_features = True for updateDict in updateList: print "\n#################################################################" print updateDict kwargs.update(updateDict) glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=180, **kwargs) print "glm end on ", parseResult['destination_key'], 'took', time.time() - start, 'seconds' GLMModel = glm['GLMModel'] # submodels0 = GLMModel['submodels'][0] iterations = GLMModel['iterations'] modelKey = GLMModel['model_key'] h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) if iterations > 20: raise Exception("Why take so many iterations: %s in this glm training?" % iterations) start = time.time() # score with same dataset (will change to recreated dataset with one less enum glmScore = h2o_cmd.runGLMScore(key=parseResult['destination_key'], model_key=modelKey, thresholds="0.5", timeoutSecs=timeoutSecs) print "glm end on ", parseResult['destination_key'], 'took', time.time() - start, 'seconds' ### print h2o.dump_json(glmScore) classErr = glmScore['validation']['classErr'] auc = glmScore['validation']['auc'] err = glmScore['validation']['err'] nullDev = glmScore['validation']['nullDev'] resDev = glmScore['validation']['resDev'] h2o_glm.simpleCheckGLMScore(self, glmScore, **kwargs) print "classErr:", classErr print "err:", err print "auc:", auc print "resDev:", resDev print "nullDev:", nullDev if math.isnan(resDev): emsg = "Why is this resDev = 'nan'?? %6s %s" % ("resDev:\t", validation['resDev']) raise Exception(emsg) # what is reasonable? 
# self.assertAlmostEqual(err, 0.3, delta=0.15, msg="actual err: %s not close enough to 0.3" % err) self.assertAlmostEqual(auc, 0.5, delta=0.15, msg="actual auc: %s not close enough to 0.5" % auc) if math.isnan(err): emsg = "Why is this err = 'nan'?? %6s %s" % ("err:\t", err) raise Exception(emsg) if math.isnan(resDev): emsg = "Why is this resDev = 'nan'?? %6s %s" % ("resDev:\t", resDev) raise Exception(emsg) if math.isnan(nullDev): emsg = "Why is this nullDev = 'nan'?? %6s %s" % ("nullDev:\t", nullDev)
def test_c10_rel_glm(self): h2o.beta_features = False print "Since the python is not necessarily run as user=0xcust..., can't use a schema='put' here" print "Want to be able to run python as jenkins" print "I guess for big 0xcust files, we don't need schema='put'" print "For files that we want to put (for testing put), we can get non-private files" # Parse Train*********************************************************** importFolderPath = '/mnt/0xcustomer-datasets/c3' csvFilename = 'classification1Train.txt' csvPathname = importFolderPath + "/" + csvFilename start = time.time() parseResult = h2i.import_parse(path=csvPathname, schema='local', timeoutSecs=500, doSummary=False) print "Parse of", parseResult['destination_key'], "took", time.time() - start, "seconds" print "Parse result['destination_key']:", parseResult['destination_key'] start = time.time() inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=500) print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds" h2o_cmd.infoFromInspect(inspect, csvPathname) # num_rows = inspect['num_rows'] # num_cols = inspect['num_cols'] # do summary of the parsed dataset last, since we know it fails on this dataset summaryResult = h2o_cmd.runSummary(key=parseResult['destination_key']) h2o_cmd.infoFromSummary(summaryResult, noPrint=False) # keepList = [] # h2o_glm.findXFromColumnInfo(key=parseResult['destination_key'], keepList=keepList) # see README.txt in 0xcustomer-datasets/c3 for the col names to use in keepList above, to get the indices # since we're no long zero based, increment by 1 x_from_zero = [6,7,8,10,12,31,32,33,34,35,36,37,40,41,42,43,44,45,46,47,49,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70] x = ['C' + str(i + 1) for i in x_from_zero] y = 0 # GLM Train*********************************************************** keepPattern = None # don't need the intermediate Dicts produced from columnInfoFromInspect x = h2o_glm.goodXFromColumnInfo(y, 
keepPattern=keepPattern, key=parseResult['destination_key'], timeoutSecs=300) print "from goodX (not used) x:", x print "y:", y # have to use named cols, and they start with 1 kwargs = { 'x': x, 'y': y, # 'case_mode': '>', # 'case': 0, 'family': 'binomial', 'lambda': 1.0E-5, 'alpha': 0.5, 'max_iter': 4, 'thresholds': 0.5, 'n_folds': 1, 'weight': 100, 'beta_epsilon': 1.0E-4, } timeoutSecs = 3600 start = time.time() glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=60, **kwargs) elapsed = time.time() - start print "glm completed in", elapsed, "seconds.", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) # Parse Test*********************************************************** GLMModel = glm['GLMModel'] modelKey = GLMModel['model_key'] csvFilename = 'classification1Test.txt' csvPathname = importFolderPath + "/" + csvFilename parseResult = h2i.import_parse(path=csvPathname, schema='local', timeoutSecs=500, doSummary=False) print "Parse of", parseResult['destination_key'], "took", time.time() - start, "seconds" # GLMScore Test*********************************************************** start = time.time() # score with same dataset (will change to recreated dataset with one less enum glmScore = h2o_cmd.runGLMScore(key=parseResult['destination_key'], model_key=modelKey, thresholds="0.5", timeoutSecs=timeoutSecs) print "glmScore end on ", parseResult['destination_key'], 'took', time.time() - start, 'seconds'
def test_GLM_enums_score_subset(self):
    # Build a GLM on a synthetic all-enum dataset, then score a second dataset
    # generated from a random half of the same enum levels (a value SUBSET).
    SYNDATASETS_DIR = h2o.make_syn_dir()
    n = 200
    # (rowCount, colCount, key2, timeoutSecs)
    tryList = [
        (n, 1, 'cD', 300),
        (n, 2, 'cE', 300),
        (n, 3, 'cF', 300),
        (n, 4, 'cG', 300),
        (n, 5, 'cH', 300),
        (n, 6, 'cI', 300),
    ]

    for (rowCount, colCount, key2, timeoutSecs) in tryList:
        # using the comma is nice to ensure no craziness
        colSepHexString = '2c'  # comma
        colSepChar = colSepHexString.decode('hex')
        colSepInt = int(colSepHexString, base=16)
        print "colSepChar:", colSepChar

        rowSepHexString = '0a'  # newline
        rowSepChar = rowSepHexString.decode('hex')
        print "rowSepChar:", rowSepChar

        SEEDPERFILE = random.randint(0, sys.maxint)
        csvFilename = 'syn_enums_' + str(
            colCount) + 'x' + str(colCount) + '.csv' if False else 'syn_enums_' + str(rowCount) + 'x' + str(
            colCount) + '.csv'
        csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str(
            colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        csvScoreFilename = 'syn_enums_score_' + str(rowCount) + 'x' + str(
            colCount) + '.csv'
        csvScorePathname = SYNDATASETS_DIR + '/' + csvScoreFilename

        enumList = create_enum_list(listSize=10)
        # use half of the enums for creating the scoring dataset
        enumListForScore = random.sample(enumList, 5)

        print "Creating random", csvPathname, "for glm model building"
        write_syn_dataset(csvPathname, enumList, rowCount, colCount, SEEDPERFILE,
                          colSepChar=colSepChar, rowSepChar=rowSepChar)

        print "Creating random", csvScorePathname, "for glm scoring with prior model (using enum subset)"
        write_syn_dataset(csvScorePathname, enumListForScore, rowCount, colCount, SEEDPERFILE,
                          colSepChar=colSepChar, rowSepChar=rowSepChar)

        # parse the training dataset
        parseKey = h2o_cmd.parseFile(None, csvPathname, key2=key2, timeoutSecs=30, separator=colSepInt)
        print csvFilename, 'parse time:', parseKey['response']['time']
        print "Parse result['destination_key']:", parseKey[
            'destination_key']

        print "\n" + csvFilename
        # raises if any training column came back with missing values
        (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \
            h2o_cmd.columnInfoFromInspect(parseKey['destination_key'], exceptionOnMissingValues=True)

        y = colCount  # response column index; presumably the last column -- TODO confirm
        kwargs = {
            'y': y,
            'max_iter': 1,
            'n_folds': 1,
            'alpha': 0.2,
            'lambda': 1e-5,
            'case_mode': '=',
            'case': 0
        }

        start = time.time()
        glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, pollTimeoutSecs=180, **kwargs)
        print "glm end on ", parseKey[
            'destination_key'], 'took', time.time() - start, 'seconds'
        h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)

        GLMModel = glm['GLMModel']
        modelKey = GLMModel['model_key']

        # parse the scoring dataset under a distinct key
        parseKey = h2o_cmd.parseFile(None, csvScorePathname, key2="score_" + key2, timeoutSecs=30, separator=colSepInt)

        start = time.time()
        # score with same dataset (will change to recreated dataset with one less enum
        glmScore = h2o_cmd.runGLMScore(key=parseKey['destination_key'], model_key=modelKey,
                                       thresholds="0.5", timeoutSecs=timeoutSecs)
        print "glm end on ", parseKey[
            'destination_key'], 'took', time.time() - start, 'seconds'
        ### print h2o.dump_json(glmScore)
        classErr = glmScore['validation']['classErr']
        auc = glmScore['validation']['auc']
        err = glmScore['validation']['err']
        print "classErr:", classErr
        print "err:", err
        print "auc:", auc
def test_GLM_mnist_reals(self): importFolderPath = "mnist" csvFilelist = [ ("mnist_reals_training.csv.gz", "mnist_reals_testing.csv.gz", 600), ] trial = 0 for (trainCsvFilename, testCsvFilename, timeoutSecs) in csvFilelist: trialStart = time.time() # PARSE test**************************************** testKey2 = testCsvFilename + "_" + str(trial) + ".hex" start = time.time() parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path="mnist/" + testCsvFilename, schema='put', hex_key=testKey2, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", testCsvFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseResult['destination_key'] print "We won't use this pruning of x on test data. See if it prunes the same as the training" y = 0 # first column is pixel value print "y:" x = h2o_glm.goodXFromColumnInfo(y, key=parseResult['destination_key'], timeoutSecs=300) # PARSE train**************************************** trainKey2 = trainCsvFilename + "_" + str(trial) + ".hex" start = time.time() parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path="mnist/" + trainCsvFilename, hex_key=trainKey2, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", trainCsvFilename, 'took', elapsed, 'seconds',\ "%d pct. 
of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseResult['destination_key'] # GLM**************************************** print "This is the pruned x we'll use" x = h2o_glm.goodXFromColumnInfo(y, key=parseResult['destination_key'], timeoutSecs=300) print "x:", x params = { 'x': x, 'y': y, 'case_mode': '=', 'case': 0, 'family': 'binomial', 'lambda': 1.0E-5, 'alpha': 0.0, 'max_iter': 5, 'thresholds': 0.5, 'n_folds': 1, 'weight': 1, 'beta_epsilon': 1.0E-4, } for c in [0,1,2,3,4,5,6,7,8,9]: kwargs = params.copy() print "Trying binomial with case:", c kwargs['case'] = c timeoutSecs = 1800 start = time.time() glm = h2o_cmd.runGLMOnly(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutsecs=60, **kwargs) elapsed = time.time() - start print "GLM completed in", elapsed, "seconds.", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) h2o_glm.simpleCheckGLM(self, glm, None, noPrint=True, **kwargs) GLMModel = glm['GLMModel'] modelKey = GLMModel['model_key'] start = time.time() glmScore = h2o_cmd.runGLMScore(key=testKey2, model_key=modelKey, thresholds="0.5", timeoutSecs=60) elapsed = time.time() - start print "GLMScore in", elapsed, "secs", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) h2o_glm.simpleCheckGLMScore(self, glmScore, **kwargs)
def test_GLM_mnist_s3n(self): URI = "s3n://home-0xdiag-datasets/mnist/" csvFilelist = [ ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600), ("mnist_testing.csv.gz", "mnist_training.csv.gz", 600), ("mnist_training.csv.gz", "mnist_training.csv.gz", 600), ] # IMPORT********************************************** importHDFSResult = h2o.nodes[0].import_hdfs(URI) ### print "importHDFSResult:", h2o.dump_json(importHDFSResult) s3nFullList = importHDFSResult['succeeded'] ### print "s3nFullList:", h2o.dump_json(s3nFullList) self.assertGreater(len(s3nFullList),1,"Should see more than 1 files in s3n?") trial = 0 for (trainCsvFilename, testCsvFilename, timeoutSecs) in csvFilelist: trialStart = time.time() # PARSE test**************************************** s3nKey = URI + testCsvFilename testKey2 = testCsvFilename + "_" + str(trial) + ".hex" print "Loading s3n key: ", s3nKey, 'thru HDFS' start = time.time() parseKey = h2o.nodes[0].parse(s3nKey, testKey2, timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=120) elapsed = time.time() - start print "parse end on ", s3nKey, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseKey['destination_key'] # PARSE train**************************************** s3nKey = URI + trainCsvFilename trainKey2 = trainCsvFilename + "_" + str(trial) + ".hex" print "Loading s3n key: ", s3nKey, 'thru HDFS' start = time.time() parseKey = h2o.nodes[0].parse(s3nKey, trainKey2, timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=120) elapsed = time.time() - start print "parse end on ", s3nKey, 'took', elapsed, 'seconds',\ "%d pct. 
of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseKey['destination_key'] # GLM**************************************** y = 0 # first column is pixel value print "y:" # don't need the intermediate Dicts produced from columnInfoFromInspect x = h2o_glm.goodXFromColumnInfo(y, key=parseKey['destination_key'], timeoutSecs=300) print "x:", x kwargs = { 'x': x, 'y': y, # 'case_mode': '>', # 'case': 0, 'family': 'gaussian', 'lambda': 1.0E-5, 'alpha': 0.5, 'max_iter': 5, 'thresholds': 0.5, 'n_folds': 1, 'weight': 1, 'beta_epsilon': 1.0E-4, } timeoutSecs = 1800 start = time.time() glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, pollTimeoutsecs=60, **kwargs) elapsed = time.time() - start print "GLM completed in", elapsed, "seconds.", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) GLMModel = glm['GLMModel'] modelKey = GLMModel['model_key'] kwargs = {'x': x, 'y': y, 'thresholds': 0.5} start = time.time() glmScore = h2o_cmd.runGLMScore(key=testKey2, model_key=modelKey, thresholds="0.5", timeoutSecs=60) print "GLMScore in", (time.time() - start), "secs", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) h2o.verboseprint(h2o.dump_json(glmScore))
def test_GLM_enums_score_superset(self):
    # Build a GLM on a synthetic all-enum dataset, then score a dataset whose
    # enum levels are a SUPERSET of the training levels (extra "xyzzy" value).
    print "FIX!: this should cause an error. We should detect that it's not causing an error/warning?"
    SYNDATASETS_DIR = h2o.make_syn_dir()
    n = 200
    # (rowCount, colCount, key2, timeoutSecs)
    tryList = [
        (n, 1, 'cD', 300),
        (n, 2, 'cE', 300),
        (n, 3, 'cF', 300),
        (n, 4, 'cG', 300),
        (n, 5, 'cH', 300),
        (n, 6, 'cI', 300),
    ]

    for (rowCount, colCount, key2, timeoutSecs) in tryList:
        # using the comma is nice to ensure no craziness
        colSepHexString = '2c'  # comma
        colSepChar = colSepHexString.decode('hex')
        colSepInt = int(colSepHexString, base=16)
        print "colSepChar:", colSepChar

        rowSepHexString = '0a'  # newline
        rowSepChar = rowSepHexString.decode('hex')
        print "rowSepChar:", rowSepChar

        SEEDPERFILE = random.randint(0, sys.maxint)
        csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        csvScoreFilename = 'syn_enums_score_' + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvScorePathname = SYNDATASETS_DIR + '/' + csvScoreFilename

        enumList = create_enum_list(listSize=10)
        # use half of the enums for creating the scoring dataset
        enumListForScore = random.sample(enumList,5)
        # add a extra enum for scoring that's not in the model enumList
        enumListForScore.append("xyzzy")

        print "Creating random", csvPathname, "for glm model building"
        write_syn_dataset(csvPathname, enumList, rowCount, colCount, SEEDPERFILE,
            colSepChar=colSepChar, rowSepChar=rowSepChar)

        print "Creating random", csvScorePathname, "for glm scoring with prior model (using enum subset)"
        write_syn_dataset(csvScorePathname, enumListForScore, rowCount, colCount, SEEDPERFILE,
            colSepChar=colSepChar, rowSepChar=rowSepChar)

        # parse the training dataset
        parseKey = h2o_cmd.parseFile(None, csvPathname, key2=key2, timeoutSecs=30, separator=colSepInt)
        print csvFilename, 'parse time:', parseKey['response']['time']
        print "Parse result['destination_key']:", parseKey['destination_key']

        print "\n" + csvFilename
        # fail loudly if any training column got parsed as NAs
        missingValuesDict = h2o_cmd.check_enums_from_inspect(parseKey)
        if missingValuesDict:
            m = [str(k) + ":" + str(v) for k,v in missingValuesDict.iteritems()]
            raise Exception("Looks like columns got flipped to NAs: " + ", ".join(m))

        y = colCount  # response column index; presumably the last column -- TODO confirm
        kwargs = {'y': y, 'max_iter': 1, 'n_folds': 1, 'alpha': 0.2, 'lambda': 1e-5,
            'case_mode': '=', 'case': 0}

        start = time.time()
        glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, pollTimeoutSecs=180, **kwargs)
        print "glm end on ", parseKey['destination_key'], 'took', time.time() - start, 'seconds'
        h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)

        GLMModel = glm['GLMModel']
        modelKey = GLMModel['model_key']

        # parse the (superset) scoring dataset under a distinct key
        parseKey = h2o_cmd.parseFile(None, csvScorePathname, key2="score_" + key2, timeoutSecs=30, separator=colSepInt)

        start = time.time()
        # score with same dataset (will change to recreated dataset with one less enum
        glmScore = h2o_cmd.runGLMScore(key=parseKey['destination_key'], model_key=modelKey,
            thresholds="0.5", timeoutSecs=timeoutSecs)
        print "glm end on ", parseKey['destination_key'], 'took', time.time() - start, 'seconds'
        ### print h2o.dump_json(glmScore)
        classErr = glmScore['validation']['classErr']
        auc = glmScore['validation']['auc']
        err = glmScore['validation']['err']
        print "classErr:", classErr
        print "err:", err
        print "auc:", auc
def test_GLM_mnist_s3n(self): csvFilelist = [ ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600), ("mnist_testing.csv.gz", "mnist_training.csv.gz", 600), ("mnist_training.csv.gz", "mnist_training.csv.gz", 600), ] importFolderPath = "mnist" csvPathname = importFolderPath + "/*" (importHDFSResult, importPattern) = h2i.import_only(bucket='home-0xdiag-datasets', path=csvPathname, schema='s3n', timeoutSecs=120) s3nFullList = importHDFSResult['succeeded'] self.assertGreater(len(s3nFullList), 1, "Should see more than 1 files in s3n?") trial = 0 for (trainCsvFilename, testCsvFilename, timeoutSecs) in csvFilelist: # PARSE test**************************************** csvPathname = importFolderPath + "/" + testCsvFilename testHexKey = testCsvFilename + "_" + str(trial) + ".hex" start = time.time() parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='s3n', hex_key=testHexKey, timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=120) elapsed = time.time() - start print "parse end on ", parseResult['destination_key'], 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) # PARSE train**************************************** csvPathname = importFolderPath + "/" + trainCsvFilename trainHexKey = trainCsvFilename + "_" + str(trial) + ".hex" start = time.time() parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='s3n', hex_key=trainHexKey, timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=120) elapsed = time.time() - start print "parse end on ", parseResult['destination_key'], 'took', elapsed, 'seconds',\ "%d pct. 
of timeout" % ((elapsed*100)/timeoutSecs) # GLM**************************************** y = 0 # first column is pixel value print "y:" # don't need the intermediate Dicts produced from columnInfoFromInspect x = h2o_glm.goodXFromColumnInfo(y, key=parseResult['destination_key'], timeoutSecs=300) print "x:", x kwargs = { 'x': x, 'y': y, # 'case_mode': '>', # 'case': 0, 'family': 'gaussian', 'lambda': 1.0E-5, 'alpha': 0.5, 'max_iter': 5, 'thresholds': 0.5, 'n_folds': 1, 'weight': 1, 'beta_epsilon': 1.0E-4, } timeoutSecs = 1800 start = time.time() glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=60, **kwargs) elapsed = time.time() - start print "GLM completed in", elapsed, "seconds.", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) GLMModel = glm['GLMModel'] modelKey = GLMModel['model_key'] kwargs = {'x': x, 'y': y, 'thresholds': 0.5} start = time.time() glmScore = h2o_cmd.runGLMScore(key=testHexKey, model_key=modelKey, thresholds="0.5", timeoutSecs=60) print "GLMScore in", (time.time() - start), "secs", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) h2o.verboseprint(h2o.dump_json(glmScore))
def test_GLM_mnist_reals(self): importFolderPath = "/home/0xdiag/datasets/mnist" csvFilelist = [ ("mnist_reals_training.csv.gz", "mnist_reals_testing.csv.gz", 600), ] # IMPORT********************************************** # since H2O deletes the source key, we should re-import every iteration if we re-use the src in the list importFolderResult = h2i.setupImportFolder(None, importFolderPath) ### print "importHDFSResult:", h2o.dump_json(importFolderResult) succeededList = importFolderResult['files'] ### print "succeededList:", h2o.dump_json(succeededList) self.assertGreater(len(succeededList), 1, "Should see more than 1 files in the import?") # why does this hang? can't look at storeview after import? print "\nTrying StoreView after the import folder" h2o_cmd.runStoreView(timeoutSecs=30) trial = 0 for (trainCsvFilename, testCsvFilename, timeoutSecs) in csvFilelist: trialStart = time.time() # PARSE test**************************************** testKey2 = testCsvFilename + "_" + str(trial) + ".hex" start = time.time() parseKey = h2i.parseImportFolderFile(None, testCsvFilename, importFolderPath, key2=testKey2, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", testCsvFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseKey['destination_key'] print "We won't use this pruning of x on test data. See if it prunes the same as the training" y = 0 # first column is pixel value print "y:" x = h2o_glm.goodXFromColumnInfo(y, key=parseKey['destination_key'], timeoutSecs=300) # PARSE train**************************************** trainKey2 = trainCsvFilename + "_" + str(trial) + ".hex" start = time.time() parseKey = h2i.parseImportFolderFile(None, trainCsvFilename, importFolderPath, key2=trainKey2, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", trainCsvFilename, 'took', elapsed, 'seconds',\ "%d pct. 
of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseKey['destination_key'] # GLM**************************************** print "This is the pruned x we'll use" x = h2o_glm.goodXFromColumnInfo(y, key=parseKey['destination_key'], timeoutSecs=300) print "x:", x params = { 'x': x, 'y': y, 'case_mode': '=', 'case': 0, 'family': 'binomial', 'lambda': 1.0E-5, 'alpha': 0.0, 'max_iter': 5, 'thresholds': 0.5, 'n_folds': 1, 'weight': 1, 'beta_epsilon': 1.0E-4, } for c in [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]: kwargs = params.copy() print "Trying binomial with case:", c kwargs['case'] = c timeoutSecs = 1800 start = time.time() glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, pollTimeoutsecs=60, **kwargs) elapsed = time.time() - start print "GLM completed in", elapsed, "seconds.", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) h2o_glm.simpleCheckGLM(self, glm, None, noPrint=True, **kwargs) GLMModel = glm['GLMModel'] modelKey = GLMModel['model_key'] start = time.time() glmScore = h2o_cmd.runGLMScore(key=testKey2, model_key=modelKey, thresholds="0.5", timeoutSecs=60) elapsed = time.time() - start print "GLMScore in", elapsed, "secs", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) h2o_glm.simpleCheckGLMScore(self, glmScore, **kwargs)
} timeoutSecs = 3600 start = time.time() glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=60, **kwargs) elapsed = time.time() - start print "glm completed in", elapsed, "seconds.", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) # Parse Test*********************************************************** GLMModel = glm['GLMModel'] modelKey = GLMModel['model_key'] csvFilename = 'classification1Test.txt' csvPathname = importFolderPath + "/" + csvFilename parseResult = h2i.import_parse(path=csvPathname, schema='local', timeoutSecs=500, doSummary=False) print "Parse of", parseResult['destination_key'], "took", time.time() - start, "seconds" # GLMScore Test*********************************************************** start = time.time() # score with same dataset (will change to recreated dataset with one less enum glmScore = h2o_cmd.runGLMScore(key=parseResult['destination_key'], model_key=modelKey, thresholds="0.5", timeoutSecs=timeoutSecs) print "glmScore end on ", parseResult['destination_key'], 'took', time.time() - start, 'seconds' if __name__ == '__main__': h2o.unit_main()
def doGLM(f, folderPath, family, link, lambda_, alpha, nfolds, y, x, testFilehex, row): debug = False bench = "bench" if debug: print "DOING GLM DEBUG" bench = "bench/debug" date = '-'.join([str(z) for z in list(time.localtime())][0:3]) overallWallStart = time.time() pre = "" if debug: pre = "DEBUG" glmbenchcsv = 'benchmarks/' + build + '/' + pre + 'glmbench.csv' if not os.path.exists(glmbenchcsv): output = open(glmbenchcsv, 'w') output.write(','.join(csv_header) + '\n') else: output = open(glmbenchcsv, 'a') csvWrt = csv.DictWriter(output, fieldnames=csv_header, restval=None, dialect='excel', extrasaction='ignore', delimiter=',') try: java_heap_GB = h2o.nodes[0].java_heap_GB importFolderPath = bench + "/" + folderPath if (f in [ 'AirlinesTrain1x', 'AllBedroomsTrain1x', 'AllBedroomsTrain10x', 'AllBedroomsTrain100x' ]): csvPathname = importFolderPath + "/" + f + '.csv' else: csvPathname = importFolderPath + "/" + f + "/*linked*" hex_key = f + '.hex' hK = folderPath + "Header.csv" headerPathname = importFolderPath + "/" + hK h2i.import_only(bucket='home-0xdiag-datasets', path=headerPathname) headerKey = h2i.find_key(hK) trainParseWallStart = time.time() parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='local', hex_key=hex_key, header=1, header_from_file=headerKey, separator=44, timeoutSecs=7200, retryDelaySecs=5, pollTimeoutSecs=7200, doSummary=False) parseWallTime = time.time() - trainParseWallStart print "Parsing training file took ", parseWallTime, " seconds." 
inspect_train = h2o.nodes[0].inspect(parseResult['destination_key'], timeoutSecs=7200) inspect_test = h2o.nodes[0].inspect(testFilehex, timeoutSecs=7200) nMachines = 1 if len(h2o_hosts.hosts) is 0 else len(h2o_hosts.hosts) row.update({ 'h2o_build': build, 'nMachines': nMachines, 'nJVMs': len(h2o.nodes), 'Xmx/JVM': java_heap_GB, 'dataset': f, 'nTrainRows': inspect_train['num_rows'], 'nTestRows': inspect_test['num_rows'], 'nCols': inspect_train['num_cols'], 'trainParseWallTime': parseWallTime, 'nfolds': nfolds, 'family': family, }) params = { 'y': y, 'x': x, 'family': family, 'link': link, 'lambda': lambda_, 'alpha': alpha, 'n_folds': nfolds, 'case_mode': "n/a", 'destination_key': "GLM(" + f + ")", 'expert_settings': 0, } kwargs = params.copy() glmStart = time.time() glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=7200, **kwargs) glmTime = time.time() - glmStart row.update({ 'glmBuildTime': glmTime, #'AverageErrorOver10Folds' : glm['GLMModel']['validations'][0]['err'], }) glmScoreStart = time.time() glmScore = h2o_cmd.runGLMScore(key=testFilehex, model_key=params['destination_key'], timeoutSecs=1800) scoreTime = time.time() - glmScoreStart cmd = 'bash startloggers.sh ' + json + ' stop_' os.system(cmd) if family == "binomial": row.update({ 'scoreTime': scoreTime, 'AUC': glmScore['validation']['auc'], 'AIC': glmScore['validation']['aic'], 'error': glmScore['validation']['err'], }) else: row.update({ 'scoreTime': scoreTime, 'AIC': glmScore['validation']['aic'], 'AUC': 'NA', 'error': glmScore['validation']['err'], }) csvWrt.writerow(row) finally: output.close()
def test_GLM2_mnist(self): h2o.beta_features = True if DO_HDFS: importFolderPath = "mnist" bucket = None schema = 'hdfs' else: importFolderPath = "mnist" bucket = 'home-0xdiag-datasets' schema = 'local' csvFilelist = [ ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600), ] trial = 0 for (trainCsvFilename, testCsvFilename, timeoutSecs) in csvFilelist: trialStart = time.time() # PARSE test**************************************** testKey = testCsvFilename + "_" + str(trial) + ".hex" csvPathname = importFolderPath + "/" + testCsvFilename start = time.time() parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema=schema, hex_key=testKey, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", testCsvFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseResult['destination_key'] print "We won't use this pruning of x on test data. See if it prunes the same as the training" y = 'C0' # first column is pixel value print "y:" ignoreX = h2o_glm.goodXFromColumnInfo(y, key=parseResult['destination_key'], timeoutSecs=300, forRF=True) # PARSE train**************************************** trainKey = trainCsvFilename + "_" + str(trial) + ".hex" csvPathname = importFolderPath + "/" + testCsvFilename start = time.time() parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema=schema, hex_key=trainKey, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", trainCsvFilename, 'took', elapsed, 'seconds',\ "%d pct. 
of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseResult['destination_key'] # GLM**************************************** print "This is the pruned x we'll use" ignoreX = h2o_glm.goodXFromColumnInfo(y, key=parseResult['destination_key'], timeoutSecs=300, forRF=True) print "ignoreX:", ignoreX params = { 'ignored_cols': ignoreX, 'response': y, 'case_mode': '=', 'case_val': 0, 'family': 'binomial', 'lambda': 0.5, 'alpha': 1e-4, 'max_iter': 5, ## 'thresholds': 0.5, ## 'weight': 1.0, 'n_folds': 1, 'beta_epsilon': 1.0E-4, } if DO_ALL_DIGITS: cases = [0,1,2,3,4,5,6,7,8,9] else: cases = [8] for c in cases: kwargs = params.copy() print "Trying binomial with case:", c kwargs['case_val'] = c timeoutSecs = 1800 start = time.time() glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=60, **kwargs) elapsed = time.time() - start print "GLM completed in", elapsed, "seconds.", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) h2o_glm.simpleCheckGLM(self, glm, None, noPrint=True, **kwargs) GLMModel = glm['GLMModel'] modelKey = GLMModel['model_key'] start = time.time() glmScore = h2o_cmd.runGLMScore(key=testKey, model_key=modelKey, thresholds="0.5", timeoutSecs=60) elapsed = time.time() - start print "GLMScore in", elapsed, "secs", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) h2o_glm.simpleCheckGLMScore(self, glmScore, **kwargs)
def test_GLM_enums_unbalanced(self): SYNDATASETS_DIR = h2o.make_syn_dir() n = 2000 tryList = [ (n, 1, 'cD', 300), (n, 2, 'cE', 300), (n, 4, 'cF', 300), (n, 8, 'cG', 300), (n, 16, 'cH', 300), (n, 32, 'cI', 300), ] for (rowCount, colCount, hex_key, timeoutSecs) in tryList: # using the comma is nice to ensure no craziness colSepHexString = '2c' # comma colSepChar = colSepHexString.decode('hex') colSepInt = int(colSepHexString, base=16) print "colSepChar:", colSepChar rowSepHexString = '0a' # newline rowSepChar = rowSepHexString.decode('hex') print "rowSepChar:", rowSepChar SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename csvScoreFilename = 'syn_enums_score_' + str(rowCount) + 'x' + str(colCount) + '.csv' csvScorePathname = SYNDATASETS_DIR + '/' + csvScoreFilename enumList = create_enum_list(listSize=10) # use half of the enums for creating the scoring dataset enumListForScore = random.sample(enumList,5) print "Creating random", csvPathname, "for glm2 model building" write_syn_dataset(csvPathname, enumList, rowCount, colCount, SEEDPERFILE, colSepChar=colSepChar, rowSepChar=rowSepChar) print "Creating another random", csvScorePathname, "for glm2 scoring with prior model (using enum subset)" write_syn_dataset(csvScorePathname, enumListForScore, rowCount, colCount, SEEDPERFILE, colSepChar=colSepChar, rowSepChar=rowSepChar) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30, separator=colSepInt) print csvFilename, 'parse time:', parseResult['response']['time'] print "Parse result['destination_key']:", parseResult['destination_key'] print "\n" + csvFilename (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \ h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=True) parseResult = h2i.import_parse(path=csvScorePathname, schema='put', hex_key="score_" 
+ hex_key, timeoutSecs=30, separator=colSepInt) y = colCount modelKey = 'glm_model' kwargs = { 'standardize': 0, 'destination_key': modelKey, 'response': 'C' + str(y+1), 'max_iter': 200, 'family': 'binomial', 'n_folds': 10, 'alpha': 0, 'lambda': 0, # 'case_mode': '=', # 'case': 0, } start = time.time() updateList= [ {'alpha': 0.5, 'lambda': 1e-4}, {'alpha': 0.25, 'lambda': 1e-6}, {'alpha': 0.0, 'lambda': 1e-12}, {'alpha': 0.5, 'lambda': 1e-12}, {'alpha': 0.0, 'lambda': 1e-12}, {'alpha': 0.0, 'lambda': 0}, ] # Try each one h2o.beta_features = True for updateDict in updateList: print "\n#################################################################" print updateDict kwargs.update(updateDict) print "If we poll, we get a message saying it was cancelled by user??" glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=180, noPoll=True, **kwargs) h2j.pollWaitJobs(timeoutSecs=300, pollTimeoutSecs=300, retryDelaySecs=5, errorIfCancelled=True) glm = h2o.nodes[0].glm_view(_modelKey=modelKey) print "glm2 end on ", parseResult['destination_key'], 'took', time.time() - start, 'seconds' glm_model = glm['glm_model'] _names = glm_model['_names'] modelKey = glm_model['_key'] coefficients_names = glm_model['coefficients_names'] submodels = glm_model['submodels'][0] beta = submodels['beta'] norm_beta = submodels['norm_beta'] iteration = submodels['iteration'] validation = submodels['validation'] if not validation or 'avg_err' not in validation: raise Exception("glm: %s" % h2o.dump_json(glm) + \ "\nNo avg_err in validation." + \ "\nLikely if you look back, the job was cancelled, so there's no cross validation.") avg_err = validation['avg_err'] auc = validation['auc'] aic = validation['aic'] null_deviance = validation['null_deviance'] residual_deviance = validation['residual_deviance'] print '_names', _names print 'coefficients_names', coefficients_names # did beta get shortened? 
the simple check confirms names/beta/norm_beta are same length print 'beta', beta print 'iteration', iteration print 'avg_err', avg_err print 'auc', auc h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) if iterations > 20: raise Exception("Why take so many iterations: %s in this glm2 training?" % iterations) start = time.time() # score with same dataset (will change to recreated dataset with one less enum glmScore = h2o_cmd.runGLMScore(key=parseResult['destination_key'], model_key=modelKey, thresholds="0.5", timeoutSecs=timeoutSecs) print "glm2 end on ", parseResult['destination_key'], 'took', time.time() - start, 'seconds' ### print h2o.dump_json(glmScore) classErr = glmScore['validation']['classErr'] auc = glmScore['validation']['auc'] err = glmScore['validation']['err'] nullDev = glmScore['validation']['nullDev'] resDev = glmScore['validation']['resDev'] h2o_glm.simpleCheckGLMScore(self, glmScore, **kwargs) print "score classErr:", classErr print "score err:", err print "score auc:", auc print "score resDev:", resDev print "score nullDev:", nullDev if math.isnan(resDev): emsg = "Why is this resDev = 'nan'?? %6s %s" % ("resDev:\t", validation['resDev']) raise Exception(emsg) # what is reasonable? # self.assertAlmostEqual(err, 0.3, delta=0.15, msg="actual err: %s not close enough to 0.3" % err) self.assertAlmostEqual(auc, 0.5, delta=0.15, msg="actual auc: %s not close enough to 0.5" % auc) if math.isnan(err): emsg = "Why is this err = 'nan'?? %6s %s" % ("err:\t", err) raise Exception(emsg) if math.isnan(resDev): emsg = "Why is this resDev = 'nan'?? %6s %s" % ("resDev:\t", resDev) raise Exception(emsg) if math.isnan(nullDev): emsg = "Why is this nullDev = 'nan'?? %6s %s" % ("nullDev:\t", nullDev)
def doGLM(f, folderPath, family, link, lambda_, alpha, nfolds, y, x, testFilehex, row): debug = False bench = "bench" if debug: print "DOING GLM DEBUG" bench = "bench/debug" date = '-'.join([str(z) for z in list(time.localtime())][0:3]) overallWallStart = time.time() pre = "" if debug: pre = "DEBUG" glmbenchcsv = 'benchmarks/'+build+'/'+pre+'glmbench.csv' if not os.path.exists(glmbenchcsv): output = open(glmbenchcsv,'w') output.write(','.join(csv_header)+'\n') else: output = open(glmbenchcsv,'a') csvWrt = csv.DictWriter(output, fieldnames=csv_header, restval=None, dialect='excel', extrasaction='ignore',delimiter=',') try: java_heap_GB = h2o.nodes[0].java_heap_GB importFolderPath = bench + "/" + folderPath if (f in ['AirlinesTrain1x','AllBedroomsTrain1x', 'AllBedroomsTrain10x', 'AllBedroomsTrain100x']): csvPathname = importFolderPath + "/" + f + '.csv' else: csvPathname = importFolderPath + "/" + f + "/*linked*" hex_key = f + '.hex' hK = folderPath + "Header.csv" headerPathname = importFolderPath + "/" + hK h2i.import_only(bucket='home-0xdiag-datasets', path=headerPathname) headerKey = h2i.find_key(hK) trainParseWallStart = time.time() parseResult = h2i.import_parse(bucket = 'home-0xdiag-datasets', path = csvPathname, schema = 'local', hex_key = hex_key, header = 1, header_from_file = headerKey, separator = 44, timeoutSecs = 7200, retryDelaySecs = 5, pollTimeoutSecs = 7200, doSummary = False ) parseWallTime = time.time() - trainParseWallStart print "Parsing training file took ", parseWallTime ," seconds." 
inspect_train = h2o.nodes[0].inspect(parseResult['destination_key'], timeoutSecs=7200) inspect_test = h2o.nodes[0].inspect(testFilehex, timeoutSecs=7200) nMachines = 1 if len(h2o_hosts.hosts) is 0 else len(h2o_hosts.hosts) row.update( {'h2o_build' : build, 'nMachines' : nMachines, 'nJVMs' : len(h2o.nodes), 'Xmx/JVM' : java_heap_GB, 'dataset' : f, 'nTrainRows' : inspect_train['num_rows'], 'nTestRows' : inspect_test['num_rows'], 'nCols' : inspect_train['num_cols'], 'trainParseWallTime' : parseWallTime, 'nfolds' : nfolds, 'family' : family, }) params = {'y' : y, 'x' : x, 'family' : family, 'link' : link, 'lambda' : lambda_, 'alpha' : alpha, 'n_folds' : nfolds, 'case_mode' : "n/a", 'destination_key' : "GLM("+f+")", 'expert_settings' : 0, } kwargs = params.copy() glmStart = time.time() glm = h2o_cmd.runGLM(parseResult = parseResult, timeoutSecs = 7200, **kwargs) glmTime = time.time() - glmStart row.update( {'glmBuildTime' : glmTime, #'AverageErrorOver10Folds' : glm['GLMModel']['validations'][0]['err'], }) glmScoreStart = time.time() glmScore = h2o_cmd.runGLMScore(key = testFilehex, model_key = params['destination_key'], timeoutSecs = 1800) scoreTime = time.time() - glmScoreStart cmd = 'bash startloggers.sh ' + json + ' stop_' os.system(cmd) if family == "binomial": row.update( {'scoreTime' : scoreTime, 'AUC' : glmScore['validation']['auc'], 'AIC' : glmScore['validation']['aic'], 'error' : glmScore['validation']['err'], }) else: row.update( {'scoreTime' : scoreTime, 'AIC' : glmScore['validation']['aic'], 'AUC' : 'NA', 'error' : glmScore['validation']['err'], }) csvWrt.writerow(row) finally: output.close()
def test_GLM_mnist_reals(self):
    """Parse the mnist_reals test and train sets from HDFS, then run a
    binomial GLM per digit (case 0..9) and score each model against the
    test set."""
    importFolderPath = "mnist"
    csvFilelist = [("mnist_reals_training.csv.gz", "mnist_reals_testing.csv.gz", 600)]
    trial = 0
    for (trainCsvFilename, testCsvFilename, timeoutSecs) in csvFilelist:
        trialStart = time.time()

        # PARSE test****************************************
        csvPathname = importFolderPath + "/" + testCsvFilename
        testKey = testCsvFilename + "_" + str(trial) + ".hex"
        start = time.time()
        # Both hex_key and key2 are passed with the same value -- presumably
        # covering old/new parse API names; confirm against h2i.import_parse.
        parseResult = h2i.import_parse(
            path=csvPathname, schema="hdfs", hex_key=testKey, key2=testKey, timeoutSecs=timeoutSecs
        )
        elapsed = time.time() - start
        print "parse end on ", testCsvFilename, "took", elapsed, "seconds", "%d pct. of timeout" % (
            (elapsed * 100) / timeoutSecs
        )
        print "parse result:", parseResult["destination_key"]

        print "We won't use this pruning of x on test data. See if it prunes the same as the training"
        y = 0  # first column is pixel value
        print "y:"
        # Pruned column list from the test parse; computed only for comparison
        # against the training-side pruning below.
        x = h2o_glm.goodXFromColumnInfo(y, key=parseResult["destination_key"], timeoutSecs=300)

        # PARSE train****************************************
        trainKey = trainCsvFilename + "_" + str(trial) + ".hex"
        start = time.time()
        csvPathname = importFolderPath + "/" + trainCsvFilename
        parseResult = h2i.import_parse(
            path=csvPathname, schema="hdfs", hex_key=trainKey, key2=trainKey, timeoutSecs=timeoutSecs
        )
        elapsed = time.time() - start
        print "parse end on ", trainCsvFilename, "took", elapsed, "seconds", "%d pct. of timeout" % (
            (elapsed * 100) / timeoutSecs
        )
        print "parse result:", parseResult["destination_key"]

        # GLM****************************************
        print "This is the pruned x we'll use"
        # Recompute x from the training parse; this is the list actually used.
        x = h2o_glm.goodXFromColumnInfo(y, key=parseResult["destination_key"], timeoutSecs=300)
        print "x:", x

        params = {
            "x": x,
            "y": y,
            "case_mode": "=",   # binomial: class = current case digit
            "case": 0,
            "family": "binomial",
            "lambda": 1.0e-5,
            "alpha": 0.0,
            "max_iter": 5,
            "thresholds": 0.5,
            "n_folds": 1,
            "weight": 1,
            "beta_epsilon": 1.0e-4,
        }
        # One binomial GLM per digit: 'case' selects which digit is the
        # positive class.
        for c in [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]:
            kwargs = params.copy()
            print "Trying binomial with case:", c
            kwargs["case"] = c

            timeoutSecs = 1800
            start = time.time()
            glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=60, **kwargs)
            elapsed = time.time() - start
            print "GLM completed in", elapsed, "seconds.", "%d pct. of timeout" % ((elapsed * 100) / timeoutSecs)

            h2o_glm.simpleCheckGLM(self, glm, None, noPrint=True, **kwargs)
            GLMModel = glm["GLMModel"]
            modelKey = GLMModel["model_key"]

            # Score the just-built model against the previously parsed test key.
            start = time.time()
            glmScore = h2o_cmd.runGLMScore(key=testKey, model_key=modelKey, thresholds="0.5", timeoutSecs=60)
            elapsed = time.time() - start
            print "GLMScore in", elapsed, "secs", "%d pct. of timeout" % ((elapsed * 100) / timeoutSecs)
            h2o_glm.simpleCheckGLMScore(self, glmScore, **kwargs)
def test_GLM_ints_unbalanced(self): ### h2b.browseTheCloud() SYNDATASETS_DIR = h2o.make_syn_dir() n = 2000 tryList = [ (n, 1, "cD", 300), (n, 2, "cE", 300), (n, 4, "cF", 300), (n, 8, "cG", 300), (n, 16, "cH", 300), (n, 32, "cI", 300), ] for (rowCount, colCount, hex_key, timeoutSecs) in tryList: # using the comma is nice to ensure no craziness colSepHexString = "2c" # comma colSepChar = colSepHexString.decode("hex") colSepInt = int(colSepHexString, base=16) print "colSepChar:", colSepChar rowSepHexString = "0a" # newline rowSepChar = rowSepHexString.decode("hex") print "rowSepChar:", rowSepChar SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = "syn_enums_" + str(rowCount) + "x" + str(colCount) + ".csv" csvPathname = SYNDATASETS_DIR + "/" + csvFilename csvScoreFilename = "syn_enums_score_" + str(rowCount) + "x" + str(colCount) + ".csv" csvScorePathname = SYNDATASETS_DIR + "/" + csvScoreFilename enumList = create_enum_list() # use half of the enums for creating the scoring dataset enumListForScore = random.sample(enumList, 5) print "Creating random", csvPathname, "for glm model building" write_syn_dataset( csvPathname, enumList, rowCount, colCount, SEEDPERFILE, colSepChar=colSepChar, rowSepChar=rowSepChar ) print "Creating random", csvScorePathname, "for glm scoring with prior model (using enum subset)" write_syn_dataset( csvScorePathname, enumListForScore, rowCount, colCount, SEEDPERFILE, colSepChar=colSepChar, rowSepChar=rowSepChar, ) parseResult = h2i.import_parse( path=csvPathname, schema="put", hex_key=hex_key, timeoutSecs=30, separator=colSepInt ) print csvFilename, "parse time:", parseResult["response"]["time"] print "Parse result['destination_key']:", parseResult["destination_key"] print "\n" + csvFilename ( missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict, ) = h2o_cmd.columnInfoFromInspect(parseResult["destination_key"], exceptionOnMissingValues=True) y = colCount kwargs = { "y": y, "max_iter": 200, "family": "binomial", 
"n_folds": 10, "alpha": 0, "lambda": 0, "thresholds": 0.5, # 'case_mode': '=', # 'case': 0, } start = time.time() updateList = [ {"alpha": 0.5, "lambda": 1e-4}, {"alpha": 0.25, "lambda": 1e-6}, {"alpha": 0.0, "lambda": 1e-8}, {"alpha": 0.5, "lambda": 0.0}, {"alpha": 0.0, "lambda": 0.0}, ] # Try each one for updateDict in updateList: print "\n#################################################################" print updateDict kwargs.update(updateDict) glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=180, **kwargs) print "glm end on ", parseResult["destination_key"], "took", time.time() - start, "seconds" GLMModel = glm["GLMModel"] # submodels0 = GLMModel['submodels'][0] iterations = GLMModel["iterations"] modelKey = GLMModel["model_key"] h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) # if iterations > 20: # raise Exception("Why take so many iterations: %s in this glm training?" % iterations) parseResult = h2i.import_parse( path=csvScorePathname, schema="put", hex_key="score_" + hex_key, timeoutSecs=30, separator=colSepInt ) start = time.time() # score with same dataset (will change to recreated dataset with one less enum glmScore = h2o_cmd.runGLMScore( key=parseResult["destination_key"], model_key=modelKey, thresholds="0.5", timeoutSecs=timeoutSecs ) print "glm end on ", parseResult["destination_key"], "took", time.time() - start, "seconds" ### print h2o.dump_json(glmScore) classErr = glmScore["validation"]["classErr"] auc = glmScore["validation"]["auc"] err = glmScore["validation"]["err"] nullDev = glmScore["validation"]["nullDev"] resDev = glmScore["validation"]["resDev"] h2o_glm.simpleCheckGLMScore(self, glmScore, **kwargs) print "classErr:", classErr print "err:", err print "auc:", auc print "resDev:", resDev print "nullDev:", nullDev if math.isnan(resDev): emsg = "Why is this resDev = 'nan'?? %6s %s" % ("resDev:\t", validation["resDev"]) raise Exception(emsg) # what is reasonable? 
# self.assertAlmostEqual(err, 0.3, delta=0.15, msg="actual err: %s not close enough to 0.3" % err) # self.assertAlmostEqual(auc, 0.5, delta=0.15, msg="actual auc: %s not close enough to 0.5" % auc) if math.isnan(err): emsg = "Why is this err = 'nan'?? %6s %s" % ("err:\t", err) raise Exception(emsg) if math.isnan(resDev): emsg = "Why is this resDev = 'nan'?? %6s %s" % ("resDev:\t", resDev) raise Exception(emsg) if math.isnan(nullDev): emsg = "Why is this nullDev = 'nan'?? %6s %s" % ("nullDev:\t", nullDev)
def test_GLM_mnist(self): importFolderPath = "mnist" csvFilelist = [ ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600), ] trial = 0 for (trainCsvFilename, testCsvFilename, timeoutSecs) in csvFilelist: trialStart = time.time() # PARSE test**************************************** testKey = testCsvFilename + "_" + str(trial) + ".hex" csvPathname = importFolderPath + "/" + testCsvFilename start = time.time() parseResult = h2i.import_parse(path=csvPathname, schema='hdfs', hex_key=testKey, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", testCsvFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseResult['destination_key'] print "We won't use this pruning of x on test data. See if it prunes the same as the training" y = 0 # first column is pixel value print "y:" x = h2o_glm.goodXFromColumnInfo(y, key=parseResult['destination_key'], timeoutSecs=300) # PARSE train**************************************** trainKey = trainCsvFilename + "_" + str(trial) + ".hex" csvPathname = importFolderPath + "/" + testCsvFilename start = time.time() parseResult = h2i.import_parse(path=csvPathname, schema='hdfs', hex_key=trainKey, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", trainCsvFilename, 'took', elapsed, 'seconds',\ "%d pct. 
of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseResult['destination_key'] # GLM**************************************** print "This is the pruned x we'll use" x = h2o_glm.goodXFromColumnInfo(y, key=parseResult['destination_key'], timeoutSecs=300) print "x:", x params = { 'x': x, 'y': y, 'case_mode': '=', 'case': 0, 'family': 'binomial', 'lambda': 1.0E-5, 'alpha': 0.0, 'max_iter': 5, 'thresholds': 0.5, 'n_folds': 1, 'weight': 1, 'beta_epsilon': 1.0E-4, } for c in [0,1,2,3,4,5,6,7,8,9]: kwargs = params.copy() print "Trying binomial with case:", c kwargs['case'] = c timeoutSecs = 1800 start = time.time() glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=60, **kwargs) elapsed = time.time() - start print "GLM completed in", elapsed, "seconds.", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) h2o_glm.simpleCheckGLM(self, glm, None, noPrint=True, **kwargs) GLMModel = glm['GLMModel'] modelKey = GLMModel['model_key'] start = time.time() glmScore = h2o_cmd.runGLMScore(key=testKey, model_key=modelKey, thresholds="0.5", timeoutSecs=60) elapsed = time.time() - start print "GLMScore in", elapsed, "secs", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) h2o_glm.simpleCheckGLMScore(self, glmScore, **kwargs)
def test_c10_rel_glm(self): h2o.beta_features = False print "Since the python is not necessarily run as user=0xcust..., can't use a schema='put' here" print "Want to be able to run python as jenkins" print "I guess for big 0xcust files, we don't need schema='put'" print "For files that we want to put (for testing put), we can get non-private files" # Parse Train*********************************************************** importFolderPath = '/mnt/0xcustomer-datasets/c3' csvFilename = 'classification1Train.txt' csvPathname = importFolderPath + "/" + csvFilename start = time.time() parseResult = h2i.import_parse(path=csvPathname, schema='local', timeoutSecs=500, doSummary=False) print "Parse of", parseResult['destination_key'], "took", time.time() - start, "seconds" print "Parse result['destination_key']:", parseResult['destination_key'] start = time.time() inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=500) print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds" h2o_cmd.infoFromInspect(inspect, csvPathname) # num_rows = inspect['num_rows'] # num_cols = inspect['num_cols'] # do summary of the parsed dataset last, since we know it fails on this dataset summaryResult = h2o_cmd.runSummary(key=parseResult['destination_key']) h2o_cmd.infoFromSummary(summaryResult, noPrint=False) # keepList = [] # h2o_glm.findXFromColumnInfo(key=parseResult['destination_key'], keepList=keepList) # see README.txt in 0xcustomer-datasets/c3 for the col names to use in keepList above, to get the indices # since we're no long zero based, increment by 1 x_from_zero = [6,7,8,10,12,31,32,33,34,35,36,37,40,41,42,43,44,45,46,47,49,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70] x = ['C' + str(i + 1) for i in x_from_zero] y = 0 # GLM Train*********************************************************** keepPattern = None # don't need the intermediate Dicts produced from columnInfoFromInspect x = h2o_glm.goodXFromColumnInfo(y, 
keepPattern=keepPattern, key=parseResult['destination_key'], timeoutSecs=300) print "from goodX (not used) x:", x print "y:", y # have to use named cols, and they start with 1 kwargs = { 'x': x, 'y': y, # 'case_mode': '>', # 'case': 0, 'family': 'binomial', 'lambda': 1.0E-5, 'alpha': 0.5, 'max_iter': 10, 'thresholds': 0.5, 'n_folds': 1, 'weight': 100, 'beta_epsilon': 1.0E-4, } timeoutSecs = 3600 start = time.time() glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=60, **kwargs) elapsed = time.time() - start print "glm completed in", elapsed, "seconds.", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) # Parse Test*********************************************************** GLMModel = glm['GLMModel'] modelKey = GLMModel['model_key'] csvFilename = 'classification1Test.txt' csvPathname = importFolderPath + "/" + csvFilename parseResult = h2i.import_parse(path=csvPathname, schema='local', timeoutSecs=500, doSummary=False) print "Parse of", parseResult['destination_key'], "took", time.time() - start, "seconds" # GLMScore Test*********************************************************** start = time.time() # score with same dataset (will change to recreated dataset with one less enum glmScore = h2o_cmd.runGLMScore(key=parseResult['destination_key'], model_key=modelKey, thresholds="0.5", timeoutSecs=timeoutSecs) print "glmScore end on ", parseResult['destination_key'], 'took', time.time() - start, 'seconds'
def test_GLM_covtype_train(self):
    """Parse covtype.shuffled.data, slice off the last 10% as a scoring set,
    then train a binomial GLM on growing 10%..100% row slices and score each
    model against the held-out slice.

    Uses the legacy import/parse/runGLMOnly harness APIs and h2o exec
    'slice' expressions for the row subsetting.
    """
    print "\nMichal will hate me for another file needed: covtype.shuffled.data"
    importFolderPath = "/home/0xdiag/datasets/standard"
    csvFilename = 'covtype.shuffled.data'
    csvPathname = importFolderPath + "/" + csvFilename
    key2 = csvFilename + ".hex"
    h2i.setupImportFolder(None, importFolderPath)

    print "\nUsing header=0 on the normal covtype.data"
    parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath,
                                         key2=key2, header=0, timeoutSecs=180)

    inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
    print "\n" + csvPathname, \
        " num_rows:", "{:,}".format(inspect['num_rows']), \
        " num_cols:", "{:,}".format(inspect['num_cols'])

    # how many rows for each pct?
    num_rows = inspect['num_rows']
    pct10 = int(num_rows * .1)
    rowsForPct = [i * pct10 for i in range(0, 11)]
    # this can be slightly less than 10%
    last10 = num_rows - rowsForPct[9]
    rowsForPct[10] = last10
    # use mod below for picking "rows-to-do" in case we do more than 9 trials
    # use 10 if 0 just to see (we copied 10 to 0 above)
    rowsForPct[0] = rowsForPct[10]

    print "Creating the key of the last 10% data, for scoring"
    dataKeyTest = "rTest"
    # start at 90% rows + 1
    execExpr = dataKeyTest + " = slice(" + key2 + "," + str(rowsForPct[9] + 1) + ")"
    h2o_exec.exec_expr(None, execExpr, resultKey=dataKeyTest, timeoutSecs=10)

    kwargs = {
        'y': 54,                 # covtype class column
        'max_iter': 20,
        'n_folds': 0,
        'thresholds': 0.5,
        'alpha': 0.1,
        'lambda': 1e-5,
        'family': 'binomial',
        'case_mode': '=',        # binomial: positive class is cover type 2
        'case': 2
    }
    timeoutSecs = 60

    for trial in range(10):
        # always slice from the beginning
        rowsToUse = rowsForPct[trial % 10]
        resultKey = "r" + str(trial)
        execExpr = resultKey + " = slice(" + key2 + ",1," + str(rowsToUse) + ")"
        h2o_exec.exec_expr(None, execExpr, resultKey=resultKey, timeoutSecs=10)
        # NOTE: the parseKey dict is mutated in place so runGLMOnly trains on
        # the freshly sliced key rather than the full parse.
        parseKey['destination_key'] = resultKey

        # adjust timeoutSecs with the number of trees
        # seems ec2 can be really slow
        start = time.time()
        glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs,
                                 pollTimeoutSecs=180, **kwargs)
        print "glm end on ", parseKey['destination_key'], 'took', time.time() - start, 'seconds'
        h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)

        GLMModel = glm['GLMModel']
        modelKey = GLMModel['model_key']

        start = time.time()
        glmScore = h2o_cmd.runGLMScore(key=dataKeyTest, model_key=modelKey,
                                       thresholds="0.5", timeoutSecs=timeoutSecs)
        print "glmScore end on ", dataKeyTest, 'took', time.time() - start, 'seconds'
        ### print h2o.dump_json(glmScore)
        classErr = glmScore['validation']['classErr']
        auc = glmScore['validation']['auc']
        err = glmScore['validation']['err']
        print "classErr:", classErr
        print "err:", err
        print "auc:", auc

        print "Trial #", trial, "completed", "using %6.2f" % (rowsToUse * 100.0 / num_rows), "pct. of all rows"
def test_GLM_ints_unbalanced(self): ### h2b.browseTheCloud() SYNDATASETS_DIR = h2o.make_syn_dir() n = 2000 tryList = [ (n, 1, 'cD', 300), (n, 2, 'cE', 300), (n, 4, 'cF', 300), (n, 8, 'cG', 300), (n, 16, 'cH', 300), (n, 32, 'cI', 300), ] for (rowCount, colCount, hex_key, timeoutSecs) in tryList: # using the comma is nice to ensure no craziness colSepHexString = '2c' # comma colSepChar = colSepHexString.decode('hex') colSepInt = int(colSepHexString, base=16) print "colSepChar:", colSepChar rowSepHexString = '0a' # newline rowSepChar = rowSepHexString.decode('hex') print "rowSepChar:", rowSepChar SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str( colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename csvScoreFilename = 'syn_enums_score_' + str(rowCount) + 'x' + str( colCount) + '.csv' csvScorePathname = SYNDATASETS_DIR + '/' + csvScoreFilename enumList = create_enum_list() # use half of the enums for creating the scoring dataset enumListForScore = random.sample(enumList, 5) print "Creating random", csvPathname, "for glm model building" write_syn_dataset(csvPathname, enumList, rowCount, colCount, SEEDPERFILE, colSepChar=colSepChar, rowSepChar=rowSepChar) print "Creating random", csvScorePathname, "for glm scoring with prior model (using enum subset)" write_syn_dataset(csvScorePathname, enumListForScore, rowCount, colCount, SEEDPERFILE, colSepChar=colSepChar, rowSepChar=rowSepChar) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30, separator=colSepInt) print csvFilename, 'parse time:', parseResult['response']['time'] print "Parse result['destination_key']:", parseResult[ 'destination_key'] print "\n" + csvFilename (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \ h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=True) y = colCount kwargs = { 'y': y, 'max_iter': 200, 'family': 'binomial', 
'n_folds': 10, 'alpha': 0, 'lambda': 0, 'thresholds': 0.5, # 'case_mode': '=', # 'case': 0, } start = time.time() updateList = [ { 'alpha': 0.5, 'lambda': 1e-4 }, { 'alpha': 0.25, 'lambda': 1e-6 }, { 'alpha': 0.0, 'lambda': 1e-8 }, { 'alpha': 0.5, 'lambda': 0.0 }, { 'alpha': 0.0, 'lambda': 0.0 }, ] # Try each one for updateDict in updateList: print "\n#################################################################" print updateDict kwargs.update(updateDict) glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=180, **kwargs) print "glm end on ", parseResult[ 'destination_key'], 'took', time.time() - start, 'seconds' GLMModel = glm['GLMModel'] # submodels0 = GLMModel['submodels'][0] iterations = GLMModel['iterations'] modelKey = GLMModel['model_key'] h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) # if iterations > 20: # raise Exception("Why take so many iterations: %s in this glm training?" % iterations) parseResult = h2i.import_parse(path=csvScorePathname, schema='put', hex_key="score_" + hex_key, timeoutSecs=30, separator=colSepInt) start = time.time() # score with same dataset (will change to recreated dataset with one less enum glmScore = h2o_cmd.runGLMScore( key=parseResult['destination_key'], model_key=modelKey, thresholds="0.5", timeoutSecs=timeoutSecs) print "glm end on ", parseResult[ 'destination_key'], 'took', time.time() - start, 'seconds' ### print h2o.dump_json(glmScore) classErr = glmScore['validation']['classErr'] auc = glmScore['validation']['auc'] err = glmScore['validation']['err'] nullDev = glmScore['validation']['nullDev'] resDev = glmScore['validation']['resDev'] h2o_glm.simpleCheckGLMScore(self, glmScore, **kwargs) print "classErr:", classErr print "err:", err print "auc:", auc print "resDev:", resDev print "nullDev:", nullDev if math.isnan(resDev): emsg = "Why is this resDev = 'nan'?? %6s %s" % ( "resDev:\t", validation['resDev']) raise Exception(emsg) # what is reasonable? 
# self.assertAlmostEqual(err, 0.3, delta=0.15, msg="actual err: %s not close enough to 0.3" % err) # self.assertAlmostEqual(auc, 0.5, delta=0.15, msg="actual auc: %s not close enough to 0.5" % auc) if math.isnan(err): emsg = "Why is this err = 'nan'?? %6s %s" % ("err:\t", err) raise Exception(emsg) if math.isnan(resDev): emsg = "Why is this resDev = 'nan'?? %6s %s" % ( "resDev:\t", resDev) raise Exception(emsg) if math.isnan(nullDev): emsg = "Why is this nullDev = 'nan'?? %6s %s" % ( "nullDev:\t", nullDev)
def test_GLM_mnist_s3n(self): URI = "s3n://home-0xdiag-datasets/mnist/" csvFilelist = [ ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600), ("mnist_testing.csv.gz", "mnist_training.csv.gz", 600), ("mnist_training.csv.gz", "mnist_training.csv.gz", 600), ] # IMPORT********************************************** importHDFSResult = h2o.nodes[0].import_hdfs(URI) ### print "importHDFSResult:", h2o.dump_json(importHDFSResult) s3nFullList = importHDFSResult['succeeded'] ### print "s3nFullList:", h2o.dump_json(s3nFullList) self.assertGreater(len(s3nFullList), 1, "Should see more than 1 files in s3n?") trial = 0 for (trainCsvFilename, testCsvFilename, timeoutSecs) in csvFilelist: trialStart = time.time() # PARSE test**************************************** s3nKey = URI + testCsvFilename testKey2 = testCsvFilename + "_" + str(trial) + ".hex" print "Loading s3n key: ", s3nKey, 'thru HDFS' start = time.time() parseKey = h2o.nodes[0].parse(s3nKey, testKey2, timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=120) elapsed = time.time() - start print "parse end on ", s3nKey, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseKey['destination_key'] # PARSE train**************************************** s3nKey = URI + trainCsvFilename trainKey2 = trainCsvFilename + "_" + str(trial) + ".hex" print "Loading s3n key: ", s3nKey, 'thru HDFS' start = time.time() parseKey = h2o.nodes[0].parse(s3nKey, trainKey2, timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=120) elapsed = time.time() - start print "parse end on ", s3nKey, 'took', elapsed, 'seconds',\ "%d pct. 
of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseKey['destination_key'] # GLM**************************************** y = 0 # first column is pixel value print "y:" # don't need the intermediate Dicts produced from columnInfoFromInspect x = h2o_glm.goodXFromColumnInfo(y, key=parseKey['destination_key'], timeoutSecs=300) print "x:", x kwargs = { 'x': x, 'y': y, # 'case_mode': '>', # 'case': 0, 'family': 'gaussian', 'lambda': 1.0E-5, 'alpha': 0.5, 'max_iter': 5, 'thresholds': 0.5, 'n_folds': 1, 'weight': 1, 'beta_epsilon': 1.0E-4, } timeoutSecs = 1800 start = time.time() glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, pollTimeoutsecs=60, **kwargs) elapsed = time.time() - start print "GLM completed in", elapsed, "seconds.", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) GLMModel = glm['GLMModel'] modelKey = GLMModel['model_key'] kwargs = {'x': x, 'y': y, 'thresholds': 0.5} start = time.time() glmScore = h2o_cmd.runGLMScore(key=testKey2, model_key=modelKey, thresholds="0.5", timeoutSecs=60) print "GLMScore in", (time.time() - start), "secs", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) h2o.verboseprint(h2o.dump_json(glmScore))
def test_GLM_mnist_s3n(self): csvFilelist = [ ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600), ("mnist_testing.csv.gz", "mnist_training.csv.gz", 600), ("mnist_training.csv.gz", "mnist_training.csv.gz", 600), ] importFolderPath = "mnist" csvPathname = importFolderPath + "/*" (importHDFSResult, importPattern) = h2i.import_only(bucket='home-0xdiag-datasets', path=csvPathname, schema='s3n', timeoutSecs=120) s3nFullList = importHDFSResult['succeeded'] self.assertGreater(len(s3nFullList),1,"Should see more than 1 files in s3n?") trial = 0 for (trainCsvFilename, testCsvFilename, timeoutSecs) in csvFilelist: # PARSE test**************************************** csvPathname = importFolderPath + "/" + testCsvFilename testHexKey = testCsvFilename + "_" + str(trial) + ".hex" start = time.time() parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='s3n', hex_key=testHexKey, timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=120) elapsed = time.time() - start print "parse end on ", parseResult['destination_key'], 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) # PARSE train**************************************** csvPathname = importFolderPath + "/" + trainCsvFilename trainHexKey = trainCsvFilename + "_" + str(trial) + ".hex" start = time.time() parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='s3n', hex_key=trainHexKey, timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=120) elapsed = time.time() - start print "parse end on ", parseResult['destination_key'], 'took', elapsed, 'seconds',\ "%d pct. 
of timeout" % ((elapsed*100)/timeoutSecs) # GLM**************************************** y = 0 # first column is pixel value print "y:" # don't need the intermediate Dicts produced from columnInfoFromInspect x = h2o_glm.goodXFromColumnInfo(y, key=parseResult['destination_key'], timeoutSecs=300) print "x:", x kwargs = { 'x': x, 'y': y, # 'case_mode': '>', # 'case': 0, 'family': 'gaussian', 'lambda': 1.0E-5, 'alpha': 0.5, 'max_iter': 5, 'thresholds': 0.5, 'n_folds': 1, 'weight': 1, 'beta_epsilon': 1.0E-4, } timeoutSecs = 1800 start = time.time() glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=60, **kwargs) elapsed = time.time() - start print "GLM completed in", elapsed, "seconds.", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) GLMModel = glm['GLMModel'] modelKey = GLMModel['model_key'] kwargs = {'x': x, 'y': y, 'thresholds': 0.5} start = time.time() glmScore = h2o_cmd.runGLMScore(key=testHexKey, model_key=modelKey, thresholds="0.5", timeoutSecs=60) print "GLMScore in", (time.time() - start), "secs", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) h2o.verboseprint(h2o.dump_json(glmScore))
def test_GLM_mnist_reals(self): importFolderPath = "/home/0xdiag/datasets/mnist" csvFilelist = [ ("mnist_reals_training.csv.gz", "mnist_reals_testing.csv.gz", 600), ] # IMPORT********************************************** # since H2O deletes the source key, we should re-import every iteration if we re-use the src in the list importFolderResult = h2i.setupImportFolder(None, importFolderPath) ### print "importHDFSResult:", h2o.dump_json(importFolderResult) succeededList = importFolderResult['files'] ### print "succeededList:", h2o.dump_json(succeededList) self.assertGreater(len(succeededList),1,"Should see more than 1 files in the import?") # why does this hang? can't look at storeview after import? print "\nTrying StoreView after the import folder" h2o_cmd.runStoreView(timeoutSecs=30) trial = 0 for (trainCsvFilename, testCsvFilename, timeoutSecs) in csvFilelist: trialStart = time.time() # PARSE test**************************************** testKey2 = testCsvFilename + "_" + str(trial) + ".hex" start = time.time() parseKey = h2i.parseImportFolderFile(None, testCsvFilename, importFolderPath, key2=testKey2, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", testCsvFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseKey['destination_key'] print "We won't use this pruning of x on test data. See if it prunes the same as the training" y = 0 # first column is pixel value print "y:" x = h2o_glm.goodXFromColumnInfo(y, key=parseKey['destination_key'], timeoutSecs=300) # PARSE train**************************************** trainKey2 = trainCsvFilename + "_" + str(trial) + ".hex" start = time.time() parseKey = h2i.parseImportFolderFile(None, trainCsvFilename, importFolderPath, key2=trainKey2, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", trainCsvFilename, 'took', elapsed, 'seconds',\ "%d pct. 
of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseKey['destination_key'] # GLM**************************************** print "This is the pruned x we'll use" x = h2o_glm.goodXFromColumnInfo(y, key=parseKey['destination_key'], timeoutSecs=300) print "x:", x params = { 'x': x, 'y': y, 'case_mode': '=', 'case': 0, 'family': 'binomial', 'lambda': 1.0E-5, 'alpha': 0.0, 'max_iter': 5, 'thresholds': 0.5, 'n_folds': 1, 'weight': 1, 'beta_epsilon': 1.0E-4, } for c in [0,1,2,3,4,5,6,7,8,9]: kwargs = params.copy() print "Trying binomial with case:", c kwargs['case'] = c timeoutSecs = 1800 start = time.time() glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, pollTimeoutsecs=60, **kwargs) elapsed = time.time() - start print "GLM completed in", elapsed, "seconds.", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) h2o_glm.simpleCheckGLM(self, glm, None, noPrint=True, **kwargs) GLMModel = glm['GLMModel'] modelKey = GLMModel['model_key'] start = time.time() glmScore = h2o_cmd.runGLMScore(key=testKey2, model_key=modelKey, thresholds="0.5", timeoutSecs=60) elapsed = time.time() - start print "GLMScore in", elapsed, "secs", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) h2o_glm.simpleCheckGLMScore(self, glmScore, **kwargs)
def test_GLM_covtype_train(self): print "\nMichal will hate me for another file needed: covtype.shuffled.data" importFolderPath = "/home/0xdiag/datasets/standard" csvFilename = 'covtype.shuffled.data' csvPathname = importFolderPath + "/" + csvFilename key2 = csvFilename + ".hex" h2i.setupImportFolder(None, importFolderPath) print "\nUsing header=0 on the normal covtype.data" parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, key2=key2, header=0, timeoutSecs=180) inspect = h2o_cmd.runInspect(None, parseKey['destination_key']) print "\n" + csvPathname, \ " num_rows:", "{:,}".format(inspect['num_rows']), \ " num_cols:", "{:,}".format(inspect['num_cols']) # how many rows for each pct? num_rows = inspect['num_rows'] pct10 = int(num_rows * .1) rowsForPct = [i * pct10 for i in range(0,11)] # this can be slightly less than 10% last10 = num_rows - rowsForPct[9] rowsForPct[10] = last10 # use mod below for picking "rows-to-do" in case we do more than 9 trials # use 10 if 0 just to see (we copied 10 to 0 above) rowsForPct[0] = rowsForPct[10] print "Creating the key of the last 10% data, for scoring" dataKeyTest = "rTest" # start at 90% rows + 1 execExpr = dataKeyTest + " = slice(" + key2 + "," + str(rowsForPct[9]+1) + ")" h2o_exec.exec_expr(None, execExpr, resultKey=dataKeyTest, timeoutSecs=10) kwargs = { 'y': 54, 'max_iter': 20, 'n_folds': 0, 'thresholds': 0.5, 'alpha': 0.1, 'lambda': 1e-5, 'family': 'binomial', 'case_mode': '=', 'case': 2 } timeoutSecs = 60 for trial in range(10): # always slice from the beginning rowsToUse = rowsForPct[trial%10] resultKey = "r" + str(trial) execExpr = resultKey + " = slice(" + key2 + ",1," + str(rowsToUse) + ")" h2o_exec.exec_expr(None, execExpr, resultKey=resultKey, timeoutSecs=10) parseKey['destination_key'] = resultKey # adjust timeoutSecs with the number of trees # seems ec2 can be really slow start = time.time() glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, pollTimeoutSecs=180, **kwargs) 
print "glm end on ", parseKey['destination_key'], 'took', time.time() - start, 'seconds' h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) GLMModel = glm['GLMModel'] modelKey = GLMModel['model_key'] start = time.time() glmScore = h2o_cmd.runGLMScore(key=dataKeyTest, model_key=modelKey, thresholds="0.5", timeoutSecs=timeoutSecs) print "glmScore end on ", dataKeyTest, 'took', time.time() - start, 'seconds' ### print h2o.dump_json(glmScore) classErr = glmScore['validation']['classErr'] auc = glmScore['validation']['auc'] err = glmScore['validation']['err'] print "classErr:", classErr print "err:", err print "auc:", auc print "Trial #", trial, "completed", "using %6.2f" % (rowsToUse*100.0/num_rows), "pct. of all rows"
def test_GLM_enums_score_subset(self): SYNDATASETS_DIR = h2o.make_syn_dir() n = 200 tryList = [ (n, 1, 'cD', 300), (n, 2, 'cE', 300), (n, 3, 'cF', 300), (n, 4, 'cG', 300), (n, 5, 'cH', 300), (n, 6, 'cI', 300), ] for (rowCount, colCount, key2, timeoutSecs) in tryList: # using the comma is nice to ensure no craziness colSepHexString = '2c' # comma colSepChar = colSepHexString.decode('hex') colSepInt = int(colSepHexString, base=16) print "colSepChar:", colSepChar rowSepHexString = '0a' # newline rowSepChar = rowSepHexString.decode('hex') print "rowSepChar:", rowSepChar SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename csvScoreFilename = 'syn_enums_score_' + str(rowCount) + 'x' + str(colCount) + '.csv' csvScorePathname = SYNDATASETS_DIR + '/' + csvScoreFilename enumList = create_enum_list(listSize=10) # use half of the enums for creating the scoring dataset enumListForScore = random.sample(enumList,5) print "Creating random", csvPathname, "for glm model building" write_syn_dataset(csvPathname, enumList, rowCount, colCount, SEEDPERFILE, colSepChar=colSepChar, rowSepChar=rowSepChar) print "Creating random", csvScorePathname, "for glm scoring with prior model (using enum subset)" write_syn_dataset(csvScorePathname, enumListForScore, rowCount, colCount, SEEDPERFILE, colSepChar=colSepChar, rowSepChar=rowSepChar) parseKey = h2o_cmd.parseFile(None, csvPathname, key2=key2, timeoutSecs=30, separator=colSepInt) print csvFilename, 'parse time:', parseKey['response']['time'] print "Parse result['destination_key']:", parseKey['destination_key'] print "\n" + csvFilename (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \ h2o_cmd.columnInfoFromInspect(parseKey['destination_key'], exceptionOnMissingValues=True) y = colCount kwargs = {'y': y, 'max_iter': 1, 'n_folds': 1, 'alpha': 0.2, 'lambda': 1e-5, 'case_mode': '=', 'case': 0} start = 
time.time() glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, pollTimeoutSecs=180, **kwargs) print "glm end on ", parseKey['destination_key'], 'took', time.time() - start, 'seconds' h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) GLMModel = glm['GLMModel'] modelKey = GLMModel['model_key'] parseKey = h2o_cmd.parseFile(None, csvScorePathname, key2="score_" + key2, timeoutSecs=30, separator=colSepInt) start = time.time() # score with same dataset (will change to recreated dataset with one less enum glmScore = h2o_cmd.runGLMScore(key=parseKey['destination_key'], model_key=modelKey, thresholds="0.5", timeoutSecs=timeoutSecs) print "glm end on ", parseKey['destination_key'], 'took', time.time() - start, 'seconds' ### print h2o.dump_json(glmScore) classErr = glmScore['validation']['classErr'] auc = glmScore['validation']['auc'] err = glmScore['validation']['err'] print "classErr:", classErr print "err:", err print "auc:", auc