def test_parse_covtype_2(self):
    tryList = [
        ('covtype.data', 1, 30),
        # ('covtype20x.data', 20, 120),
    ]

    for (csvFilename, multiplyExpected, timeoutSecs) in tryList:
        for trial in range(16, 24):
            # import_result = a_node.import_files(path=find_file("smalldata/logreg/prostate.csv"))
            importFolderPath = "standard"
            hex_key = 'covtype.hex'
            csvPathname = importFolderPath + "/" + csvFilename
            chunk_size = 2 ** trial
            print "Trial %s. Trying chunk_size %s (power of 2)" % (trial, chunk_size)

            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='local',
                timeoutSecs=timeoutSecs, hex_key=hex_key, chunk_size=chunk_size, doSummary=False)

            pA = h2o_cmd.ParseObj(parseResult)
            iA = h2o_cmd.InspectObj(pA.parse_key)
            print iA.missingList, iA.labelList, iA.numRows, iA.numCols

            for i in range(1):
                co = h2o_cmd.runSummary(key=hex_key, column=i)

            k = parseResult['frames'][0]['frame_id']['name']
            # print "parseResult:", dump_json(parseResult)

            a_node = h2o.nodes[0]
            frames_result = a_node.frames(key=k, row_count=5)
            # print "frames_result from the first parseResult key", dump_json(frames_result)
            parseKeyIndexedCheck(frames_result, multiplyExpected)
def test_parse_nfs(self):
    print "run as user 0xcustomer on machine with nfs /mnt/0xcustomer-datasets/c1"
    tryList = [
        ('iris2.csv', 'iris2.hex', 1, 30),
    ]

    for (csvFilename, hex_key, multiplyExpected, timeoutSecs) in tryList:
        importFolderPath = "/mnt/0xcustomer-datasets/c1"
        csvPathname = importFolderPath + "/" + csvFilename

        parseResult = h2i.import_parse(path=csvPathname, schema='local', timeoutSecs=timeoutSecs,
            hex_key=hex_key, chunk_size=4194304 / 2, doSummary=False)

        pA = h2o_cmd.ParseObj(parseResult)
        iA = h2o_cmd.InspectObj(pA.parse_key,
            expectedNumRows=150 * multiplyExpected, expectedNumCols=5, expectedMissinglist=[])
        print iA.missingList, iA.labelList, iA.numRows, iA.numCols

        for i in range(0):
            print "Summary on column", i
            co = h2o_cmd.runSummary(key=hex_key, column=i)

        k = parseResult['frames'][0]['frame_id']['name']
        frames_result = h2o.nodes[0].frames(key=k, row_count=5)
        # print "frames_result from the first parseResult key", dump_json(frames_result)
        parseKeyIndexedCheck(frames_result, multiplyExpected)
def test_delete_all_keys(self):
    # FIX! should have some model keys in here too, from RF etc.
    importFolderPath = 'standard'
    timeoutSecs = 500
    csvFilenameAll = [
        "covtype.data",
        "covtype20x.data",
    ]
    # csvFilenameList = random.sample(csvFilenameAll,1)
    csvFilenameList = csvFilenameAll

    for trial in range(2):
        for csvFilename in csvFilenameList:
            csvPathname = importFolderPath + "/" + csvFilename
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, timeoutSecs=500)

            pA = h2o_cmd.ParseObj(parseResult)
            iA = h2o_cmd.InspectObj(pA.parse_key)
            parse_key = pA.parse_key
            numRows = iA.numRows
            numCols = iA.numCols
            labelList = iA.labelList

            h2i.delete_keys_at_all_nodes()
            print "Delete all keys. Shouldn't be any more?"
            h2o.nodes[0].remove_all_keys()

        print "\nTrial", trial, "completed\n"
def test_parse_100k_cols(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()
    tryList = [
        (10, 1000, 'cA', 200, 200),
        (10, 2000, 'cA', 200, 200),
        (10, 4000, 'cA', 200, 200),
        (10, 8000, 'cA', 200, 200),
        (10, 9000, 'cA', 200, 200),
        (10, 10000, 'cA', 200, 200),
        (10, 100000, 'cA', 200, 200),
        # (10, 200000, 'cB', 200, 200),
        # (10, 300000, 'cB', 200, 200),
        # we timeout/fail on 500k? stop at 200k
        # (10, 500000, 'cC', 200, 200),
        # (10, 1000000, 'cD', 200, 360),
        # (10, 1100000, 'cE', 60, 100),
        # (10, 1200000, 'cF', 60, 120),
    ]

    # h2b.browseTheCloud()
    for (rowCount, colCount, hex_key, timeoutSecs, timeoutSecs2) in tryList:
        SEEDPERFILE = random.randint(0, sys.maxint)
        csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        print "\nCreating random", csvPathname
        write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

        parseResult = h2i.import_parse(path=csvPathname, schema='local', hex_key=hex_key,
            timeoutSecs=timeoutSecs, doSummary=False, columnNames=None,
            intermediateResults=DO_INTERMEDIATE_RESULTS)

        pA = h2o_cmd.ParseObj(parseResult, expectedNumRows=rowCount, expectedNumCols=colCount)
        print pA.numRows
        print pA.numCols
        print pA.parse_key
        # this guy can take json object as first thing, or re-read with key
        iA = h2o_cmd.InspectObj(pA.parse_key,
            expectedNumRows=rowCount, expectedNumCols=colCount, expectedMissinglist=[])

        print "Skipping the delete keys for now"
        if 1 == 0:
            # if not h2o.browse_disable:
            #     h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
            #     time.sleep(5)
            h2i.delete_keys_at_all_nodes()
def test_split_frame(self):
    csvFilename = 'iris.csv'
    csvPathname = 'iris/' + csvFilename
    hex_key = "iris.hex"

    parseResultA = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=hex_key, timeoutSecs=10)
    pA = h2o_cmd.ParseObj(parseResultA)
    print pA.numRows
    print pA.numCols
    print pA.parse_key

    print "Just split away and see if anything blows up"
    splitMe = hex_key
    iA = h2o_cmd.InspectObj(splitMe)
    origNumRows = iA.numRows
    origNumCols = iA.numCols

    for s in range(10):
        iA = h2o_cmd.InspectObj(splitMe)
        numRows = iA.numRows

        fsResult = h2o.n0.split_frame(dataset=splitMe, ratios='[0.5]')
        fs = OutputObj(fsResult, 'split_frame')
        d = fs.jobs[0].destination_frames
        split_keys = [split.name for split in d]

        # modelResult = h2o.n0.models(key=model_key)
        # model = OutputObj(modelResult['models'][0]['output'], 'split_frame')
        # print "model:", dump_json(model)
        # split_keys = [split._key.name for split in model.splits]

        iB = h2o_cmd.InspectObj(split_keys[0])
        iC = h2o_cmd.InspectObj(split_keys[1])
        numCols = iB.numCols
        split0_rows = iB.numRows
        split1_rows = iC.numRows
        # print "Iteration", s, "split0_rows:", split0_rows, "split1_rows:", split1_rows
        splitMe = split_keys[1]

        # split should be within 1 row accuracy. let's say within 20 for now
        self.assertLess(abs(split1_rows - split0_rows), 2)
        self.assertEqual(numRows, (split1_rows + split0_rows))
        self.assertEqual(numCols, origNumCols)
def test_frame_split(self):
    csvFilename = 'covtype.data'
    csvPathname = 'standard/' + csvFilename
    hex_key = "covtype.hex"

    parseResultA = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, hex_key=hex_key, timeoutSecs=20)
    pA = h2o_cmd.ParseObj(parseResultA)
    print pA.numRows
    print pA.numCols
    print pA.parse_key

    print "Just split away and see if anything blows up"
    splitMe = hex_key
    iA = h2o_cmd.InspectObj(splitMe)
    origNumRows = iA.numRows
    origNumCols = iA.numCols

    for s in range(20):
        iA = h2o_cmd.InspectObj(splitMe)
        numRows = iA.numRows

        fsResult = h2o.n0.frame_split(training_frame=splitMe, ratios='[0.5]')
        fs = OutputObj(fsResult, 'frame_split')
        model_key = fs.jobs[0].dest.name

        modelResult = h2o.n0.models(key=model_key)
        model = OutputObj(modelResult['models'][0]['output'], 'frame_split')
        # print "model:", dump_json(model)
        split_keys = [split._key.name for split in model.splits]

        iB = h2o_cmd.InspectObj(split_keys[0])
        iC = h2o_cmd.InspectObj(split_keys[1])
        numCols = iB.numCols
        split0_rows = iB.numRows
        split1_rows = iC.numRows
        # print "Iteration", s, "split0_rows:", split0_rows, "split1_rows:", split1_rows
        splitMe = split_keys[1]

        # split should be within 1 row accuracy. let's say within 20 for now
        self.assertLess(abs(split1_rows - split0_rows), 2)
        self.assertEqual(numRows, (split1_rows + split0_rows))
        self.assertEqual(numCols, origNumCols)

        if split1_rows <= 1:
            break
def test_exec2_sum(self):
    print "Replicating covtype.data by 2x for results comparison to 1x"
    filename1x = 'covtype.data'
    pathname1x = h2i.find_folder_and_filename('home-0xdiag-datasets', 'standard/covtype.data', returnFullPath=True)
    filename2x = "covtype_2x.data"
    pathname2x = SYNDATASETS_DIR + '/' + filename2x
    h2o_util.file_cat(pathname1x, pathname1x, pathname2x)

    csvAll = [
        (pathname1x, "cA", 5, 1),
        (pathname2x, "cB", 5, 2),
        (pathname2x, "cC", 5, 2),
    ]

    # h2b.browseTheCloud()
    lenNodes = len(h2o.nodes)

    firstDone = False
    for (csvPathname, hex_key, timeoutSecs, resultMult) in csvAll:
        parseResultA = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key)
        pA = h2o_cmd.ParseObj(parseResultA)
        print pA.numRows
        print pA.numCols
        print pA.parse_key
        iA = h2o_cmd.InspectObj(pA.parse_key)

        k = Key(hex_key)
        colResultList = []
        for i in range(pA.numCols):
            result = Expr(Fcn('sum', k[:, i], True)).result
            colResultList.append(result)
        print "\ncolResultList", colResultList

        if not firstDone:
            colResultList0 = list(colResultList)
            good = [float(x) for x in colResultList0]
            firstDone = True
        else:
            print "\n", colResultList0, "\n", colResultList
            # create the expected answer...i.e. N * first
            compare = [float(x) / resultMult for x in colResultList]
            print "\n", good, "\n", compare
            self.assertEqual(good, compare, 'compare is not equal to good (first try * resultMult)')
def test_parse_syn_gz_cat(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()
    tryList = [
        # summary fails with 100000 cols
        # overwrite the key each time to save space?
        (100, 100, 'cF', 600),
        (100, 5000, 'cF', 600),
        (100, 10000, 'cF', 600),
        # (100, 12000, 'cF', 600),
        # (100, 15000, 'cF', 600),
        # (100, 17000, 'cF', 600),
        (100, 20000, 'cF', 600),
        (100, 40000, 'cF', 600),
    ]

    # h2b.browseTheCloud()
    for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
        SEEDPERFILE = random.randint(0, sys.maxint)
        csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        print "Creating random", csvPathname
        write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

        csvFilenamegz = csvFilename + ".gz"
        csvPathnamegz = SYNDATASETS_DIR + '/' + csvFilenamegz
        h2o_util.file_gzip(csvPathname, csvPathnamegz)

        parseResult = h2i.import_parse(path=csvPathnamegz, schema='put', hex_key=hex_key,
            timeoutSecs=timeoutSecs, doSummary=DOSUMMARY)

        pA = h2o_cmd.ParseObj(parseResult, expectedNumRows=rowCount, expectedNumCols=colCount)
        print pA.numRows
        print pA.numCols
        print pA.parse_key
        # this guy can take json object as first thing, or re-read with key
        iA = h2o_cmd.InspectObj(pA.parse_key,
            expectedNumRows=rowCount, expectedNumCols=colCount, expectedMissinglist=[])
def test_parse_covtype(self):
    tryList = [
        ('covtype.data', 1, 30),
        ('covtype20x.data', 20, 120),
    ]

    for (csvFilename, multiplyExpected, timeoutSecs) in tryList:
        # h2o-dev doesn't take ../.. type paths? make find_file return an absolute path
        a_node = h2o.nodes[0]
        importFolderPath = os.path.expanduser("~/home-0xdiag-datasets/standard")
        csvPathname = importFolderPath + "/" + csvFilename

        importResult = a_node.import_files(path=csvPathname)
        # print "importResult:", dump_json(importResult)
        hex_key = importResult['destination_frames'][0]

        if CAUSE_FAIL:
            # note: 'k' is only assigned after the parse below (previous trial's value on later iterations)
            frames_result = a_node.frames(key=k, row_count=5, timeoutSecs=timeoutSecs)
            # print "frames_result from the first importResult key", dump_json(frames_result)

        parseResult = a_node.parse(key=hex_key, timeoutSecs=timeoutSecs, chunk_size=4194304 * 4)

        pA = h2o_cmd.ParseObj(parseResult)
        iA = h2o_cmd.InspectObj(pA.parse_key,
            expectedNumRows=581012 * multiplyExpected, expectedNumCols=55, expectedMissinglist=[])
        print iA.missingList, iA.labelList, iA.numRows, iA.numCols

        for i in range(0):
            print "Summary on column", i
            co = h2o_cmd.runSummary(key=hex_key, column=i)

        k = parseResult['frames'][0]['frame_id']['name']
        # print "parseResult:", dump_json(parseResult)
        frames_result = a_node.frames(key=k, row_count=5)
        # print "frames_result from the first parseResult key", dump_json(frames_result)
        parseKeyIndexedCheck(frames_result, multiplyExpected)
def test_plot_remove_keys(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()
    tryList = [
        (100000, 100, 'cG', 400),
        (200000, 100, 'cH', 400),
        (400000, 100, 'cI', 400),
        (800000, 100, 'cJ', 400),
        (1000000, 100, 'cK', 400),
    ]

    xList = []
    eList = []
    fList = []
    for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
        SEEDPERFILE = random.randint(0, sys.maxint)
        NUM_CASES = h2o_util.fp_format()
        sel = random.randint(0, NUM_CASES - 1)
        csvFilename = "syn_%s_%s_%s_%s.csv" % (SEEDPERFILE, sel, rowCount, colCount)
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        print "Creating random", csvPathname
        write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, sel)

        parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key,
            timeoutSecs=timeoutSecs, doSummary=False)
        pA = h2o_cmd.ParseObj(parseResult, expectedNumRows=rowCount, expectedNumCols=colCount)
        iA = h2o_cmd.InspectObj(pA.parse_key)

        parseElapsed = pA.python_elapsed
        parse_key = pA.parse_key
        byteSize = pA.byteSize
        numRows = iA.numRows
        numCols = iA.numCols
        print parse_key, parseElapsed, byteSize, numRows, numCols
        labelList = iA.labelList

        node = h2o.nodes[0]
        print "Deleting", hex_key, "at", node.http_addr, "Shouldn't matter what node the delete happens at..global?"
        start = time.time()
        node.remove_key(hex_key, timeoutSecs=30)
        removeElapsed = time.time() - start
        print "Deleting", hex_key, "took", removeElapsed, "seconds"

        # xList.append(ntrees)
        xList.append(byteSize)
        eList.append(parseElapsed)
        fList.append(removeElapsed)

    # just plot the last one
    if 1 == 1:
        xLabel = 'byteSize'
        eLabel = 'parseElapsed'
        fLabel = 'removeElapsed'
        eListTitle = ""
        fListTitle = ""
        h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
def test_GLM_basic_1(self):
    importFolderPath = "logreg"
    csvFilename = "benign.csv"
    hex_key = "benign.hex"
    csvPathname = importFolderPath + "/" + csvFilename

    parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=hex_key,
        check_header=1, timeoutSecs=180, doSummary=False)

    pA = h2o_cmd.ParseObj(parseResult)
    iA = h2o_cmd.InspectObj(pA.parse_key)
    parse_key = pA.parse_key
    numRows = iA.numRows
    numCols = iA.numCols
    labelList = iA.labelList

    expected = []
    allowedDelta = 0

    # loop, to see if we get same centers
    labelListUsed = list(labelList)
    labelListUsed.remove('STR')
    labelListUsed.remove('FNDX')  # response removed also
    numColsUsed = numCols - 2

    for trial in range(1):
        # family [u'gaussian', u'binomial', u'poisson', u'gamma', u'tweedie']
        # link [u'family_default', u'identity', u'logit', u'log', u'inverse', u'tweedie']
        # can we do classification with probabilities?
        # are only lambda and alpha grid searchable?

        # glm parameters:
        # model_id                   Key<Model>   False  None            []
        # training_frame             Key<Frame>   False  None            []
        # validation_frame           Key<Frame>   False  None            []
        # ignored_columns            string[]     False  None            []
        # drop_na20_cols             boolean      False  False           []
        # score_each_iteration      boolean      False  False           []
        # response_column            VecSpecifier False  None            []
        # balance_classes            boolean      False  False           []
        # class_sampling_factors     float[]      False  None            []
        # max_after_balance_size     float        False  5.0             []
        # max_confusion_matrix_size  int          False  20              []
        # max_hit_ratio_k            int          False  10              []
        # family                     enum         False  gaussian        [u'gaussian', u'binomial', u'poisson', u'gamma']
        # solver                     enum         False  IRLSM           [u'AUTO', u'IRLSM', u'L_BFGS']
        # alpha                      double[]     False  None            []
        # lambda                     double[]     False  None            []
        # lambda_search              boolean      False  False           []
        # lambda_min_ratio           double       False  -1.0            []
        # nlambdas                   int          False  -1              []
        # standardize                boolean      False  True            []
        # max_iterations             int          False  -1              []
        # beta_epsilon               double       False  0.0001          []
        # link                       enum         False  family_default  [u'family_default', u'identity', u'logit', u'log', u'inverse', u'tweedie']
        # prior                      double       False  -1.0            []
        # use_all_factor_levels      boolean      False  False           []
        # beta_constraints           Key<Frame>   False  None            []
        # max_active_predictors      int          False  -1              []
        parameters = {
            'ignored_columns': '["STR"]',
            'response_column': 'FNDX',
            # FIX! when is this needed? redundant for binomial?
            'balance_classes': False,
            'max_after_balance_size': None,
            'standardize': False,
            'family': 'binomial',
            'link': None,
            'alpha': '[1e-4]',
            'lambda': '[0.5]',
            'prior1': None,
            'lambda_search': None,
            'nlambdas': None,
            'lambda_min_ratio': None,
            # 'use_all_factor_levels': False,
        }

        model_key = 'benign_glm.hex'
        bmResult = h2o.n0.build_model(algo='glm', model_id=model_key, training_frame=parse_key,
            parameters=parameters, timeoutSecs=10)
        bm = OutputObj(bmResult, 'bm')

        modelResult = h2o.n0.models(key=model_key)
        model = OutputObj(modelResult['models'][0]['output'], 'model')
        h2o_glm.simpleCheckGLM(self, model, parameters, labelList, labelListUsed)

        cmmResult = h2o.n0.compute_model_metrics(model=model_key, frame=parse_key, timeoutSecs=60)
        cmm = OutputObj(cmmResult, 'cmm')

        mcms = OutputObj({'data': cmm.max_criteria_and_metric_scores.data}, 'mcms')
        m1 = mcms.data[1:]
        h0 = mcms.data[0]
        print "\nmcms", tabulate(m1, headers=h0)

        thms = OutputObj(cmm.thresholds_and_metric_scores, 'thms')

        if 1 == 0:
            cmms = OutputObj({'cm': cmm.confusion_matrices}, 'cmms')
            print ""
            for i, c in enumerate(cmms.cm):
                print "\ncmms.cm[%s]" % i, tabulate(c)
            print ""

        mmResult = h2o.n0.model_metrics(model=model_key, frame=parse_key, timeoutSecs=60)
        mm = OutputObj(mmResult['model_metrics'][0], 'mm')

        prResult = h2o.n0.predict(model=model_key, frame=parse_key, timeoutSecs=60)
        pr = OutputObj(prResult['model_metrics'][0]['predictions'], 'pr')
def test_w2v_basic_1(self):
    global SYNDATASETS_DIR
    SYNDATASETS_DIR = h2o.make_syn_dir()

    n = 500000
    tryList = [
        (n, 1, 'cD', 300),
        (n, 2, 'cE', 300),
        (n, 3, 'cF', 300),
        (n, 4, 'cG', 300),
        (n, 5, 'cH', 300),
        (n, 6, 'cI', 300),
        (n, 7, 'cJ', 300),
        (n, 9, 'cK', 300),
    ]

    ### h2b.browseTheCloud()
    for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
        csvPathname = create_file_with_seps(rowCount, colCount)

        # just parse to make sure it's good
        parseResult = h2i.import_parse(path=csvPathname,
            checkHeader=1, delete_on_done=0, timeoutSecs=180, doSummary=False)
        pA = h2o_cmd.ParseObj(parseResult)
        iA = h2o_cmd.InspectObj(pA.parse_key)
        cA = h2o_test.OutputObj(iA.columns[0], "inspect_column")

        parse_key = pA.parse_key
        numRows = iA.numRows
        numCols = iA.numCols
        labelList = iA.labelList

        for i in range(colCount):
            print cA.type, cA.missing
            self.assertEqual(0, cA.missing,
                "Column %s Expected %s. missing: %s is incorrect" % (i, 0, cA.missing))
            self.assertEqual('string', cA.type,
                "Column %s Expected %s. type: %s is incorrect" % (i, 0, cA.type))

        if DO_SUMMARY:
            for i in range(colCount):
                co = h2o_cmd.runSummary(key=parse_key, column=i)
                print co.label, co.type, co.missing, co.domain, sum(co.bins)
                self.assertEqual(0, co.missing,
                    "Column %s Expected %s. missing: %s is incorrect" % (i, 0, co.missing))
                self.assertEqual('String', co.type,
                    "Column %s Expected %s. type: %s is incorrect" % (i, 0, co.type))

        # no cols ignored
        labelListUsed = list(labelList)
        numColsUsed = numCols

        for trial in range(1):
            parameters = {
                'validation_frame': parse_key,  # KeyIndexed False []
                'ignored_columns': None,  # string[] None []
                'score_each_iteration': None,  # boolean false []
                'minWordFreq': 5,  # int 5 []
                'wordModel': 'SkipGram',  # enum [u'CBOW', u'SkipGram']
                'normModel': 'HSM',  # enum [u'HSM', u'NegSampling']
                'negSampleCnt': 5,  # int 5 []
                'vecSize': 100,  # int 100
                'windowSize': 5,  # int 5
                'sentSampleRate': 0.001,  # float 0.001
                'initLearningRate': 0.05,  # float 0.05
                'epochs': 1,  # int 5
            }

            model_key = 'benign_w2v.hex'
            bmResult = h2o.n0.build_model(algo='word2vec', destination_key=model_key,
                training_frame=parse_key, parameters=parameters, timeoutSecs=60)
            bm = OutputObj(bmResult, 'bm')

            modelResult = h2o.n0.models(key=model_key)
            model = OutputObj(modelResult['models'][0]['output'], 'model')

            cmmResult = h2o.n0.compute_model_metrics(model=model_key, frame=parse_key, timeoutSecs=60)
            cmm = OutputObj(cmmResult, 'cmm')

            mmResult = h2o.n0.model_metrics(model=model_key, frame=parse_key, timeoutSecs=60)
            mm = OutputObj(mmResult['model_metrics'][0], 'mm')

            # not implemented?
            # prResult = h2o.n0.predict(model=model_key, frame=parse_key, timeoutSecs=60)
            # pr = OutputObj(prResult['model_metrics'][0]['predictions'], 'pr')

        h2o_cmd.runStoreView()
def test_bayes_basic(self):
    bucket = 'home-0xdiag-datasets'
    importFolderPath = 'standard'
    trainFilename = 'covtype.shuffled.90pct.data'
    train_key = 'covtype.train.hex'
    b = Key(train_key)
    model_key = 'bayesModelKey'
    timeoutSecs = 1800
    csvPathname = importFolderPath + "/" + trainFilename

    # FIX! do I need to force enum for classification? what if I do regression after this?
    columnTypeDict = {54: 'Enum'}
    parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, columnTypeDict=columnTypeDict,
        schema='local', chunk_size=4194304, hex_key=train_key, timeoutSecs=timeoutSecs)

    # don't have to make it enum, if 0/1 (can't operate on enums like this)
    # make 1-7 go to 0-6. 0 isn't there.
    # make 1 thru 6 go to 1
    # change columnTypeDict to None above if I do this
    # Assign(b[:,54], b[:,54]-1)
    # Assign(b[:,54], b[:,54]!=0)
    # now we have just 0 and 1

    pA = h2o_cmd.ParseObj(parseResult)
    iA = h2o_cmd.InspectObj(pA.parse_key)
    parse_key = pA.parse_key
    numRows = iA.numRows
    numCols = iA.numCols
    labelList = iA.labelList

    labelListUsed = list(labelList)
    numColsUsed = numCols

    # run through a couple of parameter sets
    parameters = []
    parameters.append({
        'response_column': 'C55',  # still 1-55 on colnames
    })  # just default

    model_key = 'covtype_bayes.hex'
    for p in parameters:
        bmResult = h2o.n0.build_model(algo='naivebayes', destination_key=model_key,
            training_frame=train_key, validation_frame=train_key, parameters=p, timeoutSecs=60)
        bm = OutputObj(bmResult, 'bm')

        modelResult = h2o.n0.models(key=model_key)
        model = OutputObj(modelResult['models'][0]['output'], 'model')

        cmmResult = h2o.n0.compute_model_metrics(model=model_key, frame=parse_key, timeoutSecs=60)
        cmm = OutputObj(cmmResult, 'cmm')

        mmResult = h2o.n0.model_metrics(model=model_key, frame=parse_key, timeoutSecs=60)
        mmResultShort = mmResult['model_metrics'][0]
        del mmResultShort['frame']  # too much!
        mm = OutputObj(mmResultShort, 'mm')

        prResult = h2o.n0.predict(model=model_key, frame=parse_key, timeoutSecs=60)
        pr = OutputObj(prResult['model_metrics'][0]['predictions'], 'pr')
def glm_doit(self, csvFilename, bucket, csvPathname, timeoutSecs=30):
    print "\nStarting GLM of", csvFilename
    # we can force a col type to enum now? with param columnTypes
    # make the last column enum
    # Instead of a string for parse, make this a dictionary of column index -> value,
    # used for updating the ColumnTypes array before making it a string for parse
    columnTypeDict = {10: 'Enum'}
    parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, columnTypeDict=columnTypeDict,
        hex_key=csvFilename + ".hex", schema='put', timeoutSecs=30)

    pA = h2o_cmd.ParseObj(parseResult)
    iA = h2o_cmd.InspectObj(pA.parse_key)
    parse_key = pA.parse_key
    numRows = iA.numRows
    numCols = iA.numCols
    labelList = iA.labelList

    for i in range(10):
        print "Summary on column", i
        # FIX! how come only 0 works here for column
        co = h2o_cmd.runSummary(key=parse_key, column=i)
        for k, v in co:
            print k, v

    expected = []
    allowedDelta = 0

    labelListUsed = list(labelList)
    labelListUsed.remove('C11')
    numColsUsed = numCols - 1

    parameters = {
        'validation_frame': parse_key,
        'ignored_columns': None,
        # FIX! for now just use a column that's binomial
        'response_column': 'C11',
        # FIX! when is this needed? redundant for binomial?
        'balance_classes': False,
        'max_after_balance_size': None,
        'standardize': False,
        'family': 'binomial',
        'link': None,
        'tweedie_variance_power': None,
        'tweedie_link_power': None,
        'alpha': '[1e-4]',
        'lambda': '[0.5,0.25, 0.1]',
        'prior1': None,
        'lambda_search': None,
        'nlambdas': None,
        'lambda_min_ratio': None,
        'use_all_factor_levels': False,
        'n_folds': 1,
    }

    start = time.time()
    model_key = 'hastie_glm.hex'
    bmResult = h2o.n0.build_model(algo='glm', destination_key=model_key,
        training_frame=parse_key, parameters=parameters, timeoutSecs=60)
    bm = OutputObj(bmResult, 'bm')

    modelResult = h2o.n0.models(key=model_key)
    model = OutputObj(modelResult['models'][0]['output'], 'model')
    h2o_glm.simpleCheckGLM(self, model, parameters, labelList, labelListUsed)

    cmmResult = h2o.n0.compute_model_metrics(model=model_key, frame=parse_key, timeoutSecs=60)
    cmm = OutputObj(cmmResult, 'cmm')

    mmResult = h2o.n0.model_metrics(model=model_key, frame=parse_key, timeoutSecs=60)
    mm = OutputObj(mmResult, 'mm')

    prResult = h2o.n0.predict(model=model_key, frame=parse_key, timeoutSecs=60)
    pr = OutputObj(prResult['model_metrics'][0]['predictions'], 'pr')

    # compare this glm to the first one. since the files are replications, the results
    # should be similar?
    # FIX! 'validation' is never built above, so the comparison below can't run yet
    if self.validation1:
        h2o_glm.compareToFirstGlm(self, 'AUC', validation, self.validation1)
    else:
        # self.validation1 = copy.deepcopy(validation)
        self.validation1 = None
def test_GLM_error1(self):
    importFolderPath = "covtype"
    csvFilename = "covtype.20k.data"
    hex_key = "covtype20k.hex"
    binomial_key = "covtype20k.b.hex"
    b = Key(hex_key)
    csvPathname = importFolderPath + "/" + csvFilename

    parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=hex_key,
        check_header=1, timeoutSecs=180, doSummary=False)

    ## columnTypeDict = {54: 'Enum'}
    columnTypeDict = None
    parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=binomial_key,
        columnTypeDict=columnTypeDict, check_header=1, timeoutSecs=180, doSummary=False)

    # don't have to make it enum, if 0/1 (can't operate on enums like this)
    # make 1-7 go to 0-6. 0 isn't there.
    Assign(b[:, 54], b[:, 54] - 1)
    # make 1 thru 6 go to 1
    Assign(b[:, 54], b[:, 54] != 0)
    # now we have just 0 and 1

    pA = h2o_cmd.ParseObj(parseResult)
    iA = h2o_cmd.InspectObj(pA.parse_key)
    parse_key = pA.parse_key
    numRows = iA.numRows
    numCols = iA.numCols
    labelList = iA.labelList

    expected = []
    allowedDelta = 0

    # loop, to see if we get same centers
    labelListUsed = list(labelList)
    numColsUsed = numCols

    for trial in range(5):
        parameters = {
            'response_column': 'C55',
            'max_iterations': 3,
            'solver': 'L_BFGS',
            'ignored_columns': '["C1"]',
            'alpha': '[0.1]',
            'max_after_balance_size': 1000.0,
            'class_sampling_factors': '[0.2]',
            # 'use_all_factor_levels': None,
            'lambda': '[0]',
        }

        bHack = hex_key

        co = h2o_cmd.runSummary(key=binomial_key, column=54)
        print "binomial_key summary:", co.label, co.type, co.missing_count, co.domain, sum(co.histogram_bins)
        co = h2o_cmd.runSummary(key=hex_key, column=54)
        print "hex_key summary:", co.label, co.type, co.missing_count, co.domain, sum(co.histogram_bins)

        model_key = 'rand_glm.hex'
        bmResult = h2o.n0.build_model(algo='glm', model_id=model_key, training_frame=bHack,
            parameters=parameters, timeoutSecs=10)
        bm = OutputObj(bmResult, 'bm')

        modelResult = h2o.n0.models(key=model_key)
        model = OutputObj(modelResult['models'][0]['output'], 'model')
        h2o_glm.simpleCheckGLM(self, model, parameters, labelList, labelListUsed, allowNaN=True)

        cmmResult = h2o.n0.compute_model_metrics(model=model_key, frame=parse_key, timeoutSecs=60)
        cmm = OutputObj(cmmResult, 'cmm')

        # FIX! when is this legal
        doClassification = False
        if doClassification:
            mcms = OutputObj({'data': cmm.max_criteria_and_metric_scores.data}, 'mcms')
            m1 = mcms.data[1:]
            h0 = mcms.data[0]
            print "\nmcms", tabulate(m1, headers=h0)

        if doClassification:
            thms = OutputObj(cmm.thresholds_and_metric_scores, 'thms')
            cmms = OutputObj({'cm': cmm.confusion_matrices}, 'cmms')
            if 1 == 0:
                print ""
                for i, c in enumerate(cmms.cm):
                    print "\ncmms.cm[%s]" % i, tabulate(c)
                print ""

        mmResult = h2o.n0.model_metrics(model=model_key, frame=parse_key, timeoutSecs=60)
        mm = OutputObj(mmResult['model_metrics'][0], 'mm')

        prResult = h2o.n0.predict(model=model_key, frame=parse_key, timeoutSecs=60)
        pr = OutputObj(prResult['model_metrics'][0]['predictions'], 'pr')
def test_GBMGrid_basic_many(self):
    trainFilename = 'prostate.csv'
    train_key = 'prostate.hex'
    timeoutSecs = 300
    csvPathname = "logreg/" + trainFilename

    parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=train_key, schema='put')
    pA = h2o_cmd.ParseObj(parseResult)
    iA = h2o_cmd.InspectObj(pA.parse_key)
    parse_key = pA.parse_key
    numRows = iA.numRows
    numCols = iA.numCols
    labelList = iA.labelList

    labelListUsed = list(labelList)
    numColsUsed = numCols

    parameters = {
        'validation_frame': train_key,
        'ignored_columns': "['ID']",  # this has to have []
        'response_column': 'CAPSULE',
        # 'balance_classes':
        # 'max_after_balance_size':
        # ??
        # 'ntrees': '[8, 10]',
        'ntrees': 8,
        # 'max_depth': '[8, 9]',
        'max_depth': 8,
        # ??
        # 'min_rows': '[1, 2]',
        'min_rows': 1,
        'nbins': 40,
        # ??
        # 'learn_rate': "[0.1, 0.2]",
        'learn_rate': 0.1,
        # FIX! doesn't like it?
        # 'loss': 'Bernoulli',
        # FIX..no variable importance for GBM yet?
        # 'variable_importance': False,
        # 'seed':
    }

    jobs = []
    # kick off 5 of these GBM grid jobs, with different tree choices
    start = time.time()
    totalGBMGridJobs = 0

    for i in range(5):
        modelKey = 'GBMGrid_prostate_%s' % i
        bmResult = h2o.n0.build_model(algo='gbm', destination_key=modelKey,
            training_frame=parse_key, parameters=parameters, timeoutSecs=60)
        bm = OutputObj(bmResult, 'bm')
        print "GBMResult:", h2o.dump_json(bm)

        # FIX! is this right for gridded?
        job_key = bm.jobs[0].key.name
        # FIX! this isn't a full formed name (%)
        model_key = bm.jobs[0].dest.name
        jobs.append((job_key, model_key))
        totalGBMGridJobs += 1

    h2o_jobs.pollWaitJobs(timeoutSecs=300)
    elapsed = time.time() - start
    print "All GBM jobs completed in", elapsed, "seconds."
    print "totalGBMGridJobs:", totalGBMGridJobs

    for job_key, model_key in jobs:
        modelResult = h2o.n0.models(key=model_key)
        model = OutputObj(modelResult['models'][0]['output'], 'model')

        cmmResult = h2o.n0.compute_model_metrics(model=model_key, frame=parse_key, timeoutSecs=60)
        cmm = OutputObj(cmmResult, 'cmm')
        print "\nLook!, can use dot notation: cmm.cm.confusion.matrix", cmm.cm.confusion_matrix, "\n"

        mmResult = h2o.n0.model_metrics(model=model_key, frame=parse_key, timeoutSecs=60)
        mmResultShort = mmResult['model_metrics'][0]
        del mmResultShort['frame']  # too much!
        mm = OutputObj(mmResultShort, 'mm')

        prResult = h2o.n0.predict(model=model_key, frame=parse_key, timeoutSecs=60)
        pr = OutputObj(prResult['model_metrics'][0]['predictions'], 'pr')
def test_GLM_many_cols_4(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()
    translateList = [
        'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k',
        'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u'
    ]
    tryList = [
        (100000, 10, 'cA', 600),
        (100000, 100, 'cA', 600),
    ]

    ### h2b.browseTheCloud()
    lenNodes = len(h2o.nodes)

    for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
        SEEDPERFILE = random.randint(0, sys.maxint)
        csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        print "Creating random", csvPathname
        write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, translateList)

        parseResult = h2i.import_parse(path=csvPathname, hex_key=hex_key, timeoutSecs=180, doSummary=False)
        pA = h2o_cmd.ParseObj(parseResult)
        iA = h2o_cmd.InspectObj(pA.parse_key)
        parse_key = pA.parse_key
        numRows = iA.numRows
        numCols = iA.numCols
        labelList = iA.labelList

        expected = []
        allowedDelta = 0

        labelListUsed = list(labelList)
        print "labelListUsed", labelListUsed
        response = labelListUsed[-1]
        labelListUsed.remove(response)
        numColsUsed = numCols - 1

        for trial in range(1):
            # family [u'gaussian', u'binomial', u'poisson', u'gamma', u'tweedie']
            # link [u'family_default', u'identity', u'logit', u'log', u'inverse', u'tweedie']
            # can we do classification with probabilities?
            # are only lambda and alpha grid searchable?
            parameters = {
                'validation_frame': parse_key,
                'ignored_columns': None,
                # FIX! for now just use a column that's binomial
                'response_column': response,  # can't take index now?
                # FIX! when is this needed? redundant for binomial?
                'balance_classes': False,
                'max_after_balance_size': None,
                'standardize': False,
                'family': 'binomial',
                'link': None,
                'tweedie_variance_power': None,
                'tweedie_link_power': None,
                'alpha': '[1e-4]',
                'lambda': '[0.5,0.25, 0.1]',
                'prior1': None,
                'lambda_search': None,
                'nlambdas': None,
                'lambda_min_ratio': None,
                'use_all_factor_levels': False,
                'n_folds': 1,
            }

            model_key = 'many_cols_glm.hex'
            bmResult = h2o.n0.build_model(algo='glm', destination_key=model_key,
                training_frame=parse_key, parameters=parameters, timeoutSecs=60)
            bm = OutputObj(bmResult, 'bm')

            modelResult = h2o.n0.models(key=model_key)
            model = OutputObj(modelResult['models'][0]['output'], 'model')
            h2o_glm.simpleCheckGLM(self, model, parameters, labelList, labelListUsed)

            cmmResult = h2o.n0.compute_model_metrics(model=model_key, frame=parse_key, timeoutSecs=60)
            cmm = OutputObj(cmmResult, 'cmm')

            mmResult = h2o.n0.model_metrics(model=model_key, frame=parse_key, timeoutSecs=60)
            mm = OutputObj(mmResult, 'mm')

            prResult = h2o.n0.predict(model=model_key, frame=parse_key, timeoutSecs=60)
            pr = OutputObj(prResult['model_metrics'][0]['predictions'], 'pr')
def test_DL_basic(self):
    h2o.nodes[0].remove_all_keys()

    importFolderPath = "logreg"
    csvFilename = "benign.csv"
    hex_key = "benign.hex"
    csvPathname = importFolderPath + "/" + csvFilename

    parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=hex_key,
        check_header=1, timeoutSecs=180, doSummary=False)
    pA = h2o_cmd.ParseObj(parseResult)
    iA = h2o_cmd.InspectObj(pA.parse_key)
    parse_key = pA.parse_key
    numRows = iA.numRows
    numCols = iA.numCols
    labelList = iA.labelList

    expected = []
    allowedDelta = 0

    # no cols ignored
    labelListUsed = list(labelList)
    labelListUsed.remove('STR')
    numColsUsed = numCols - 1

    for trial in range(1):
        parameters = {
            # required now
            # loss enum True None [u'MeanSquare', u'CrossEntropy']
            'loss': 'CrossEntropy',
            'validation_frame': parse_key,  # KeyIndexed None
            'ignored_columns': '["STR"]',  # string[] None
            'response_column': 'FNDX',  # string None
            'balance_classes': None,  # boolean false
            'max_after_balance_size': None,  # float Infinity
            'keep_cross_validation_splits': None,  # boolean false
            'checkpoint': None,  # Key None
            'overwrite_with_best_model': None,  # boolean true
            'expert_mode': None,  # boolean false
            'autoencoder': None,  # boolean false
            # 'use_all_factor_levels': None,  # boolean true
            # [u'Tanh', u'TanhWithDropout', u'Rectifier', u'RectifierWithDropout', u'Maxout', u'MaxoutWithDropout']
            'activation': None,  # enum Rectifier
            'hidden': None,  # int[] [200, 200]
            'epochs': None,  # double 10.0
            'train_samples_per_iteration': None,  # long -2
            'target_ratio_comm_to_comp': None,  # double 0.02
            'seed': None,  # long 1679194146842485659
            'adaptive_rate': None,  # boolean true
            'rho': None,  # double 0.99
            'epsilon': None,  # double 1.0E-8
            'rate': None,  # double 0.005
            'rate_annealing': None,  # double 1.0E-6
            'rate_decay': None,  # double 1.0
            'momentum_start': None,  # double 0.0
            'momentum_ramp': None,  # double 1000000.0
            'momentum_stable': None,  # double 0.0
            'nesterov_accelerated_gradient': None,  # boolean true
            'input_dropout_ratio': None,  # double 0.0
            'hidden_dropout_ratios': None,  # double[] None (this can grid?)
            'l1': None,  # double 0.0
            'l2': None,  # double 0.0
            'max_w2': None,  # float Infinity
            'initial_weight_distribution': None,  # enum UniformAdaptive [u'UniformAdaptive', u'Uniform', u'Normal']
            'initial_weight_scale': None,  # double 1.0
            # duplicate key: 'loss' is already set (required) above, so don't let None override it
            # 'loss': None,  # enum MeanSquare [u'Automatic', u'MeanSquare', u'CrossEntropy']
            'score_interval': None,  # double 5.0
            'score_training_samples': None,  # long 10000
            'score_validation_samples': None,  # long 0
            'score_duty_cycle': None,  # double 0.1
            'classification_stop': None,  # double 0.0
            'regression_stop': None,  # double 1.0E-6
            'quiet_mode': None,  # boolean false
            'max_confusion_matrix_size': None,  # int 20
            'max_hit_ratio_k': None,  # int 10
            # duplicate keys; already set above
            # 'balance_classes': None,  # boolean false
            'class_sampling_factors': None,  # float[] None
            # 'max_after_balance_size': None,  # float Infinity
            'score_validation_sampling': None,  # enum Uniform [u'Uniform', u'Stratified']
            'diagnostics': None,  # boolean true
            'variable_importances': None,  # boolean false
            'fast_mode': None,  # boolean true
            'ignore_const_cols': None,  # boolean true
            'force_load_balance': None,  # boolean true
            'replicate_training_data': None,  # boolean false
            'single_node_mode': None,  # boolean false
            'shuffle_training_data': None,  # boolean false
            'missing_values_handling': None,  # enum MeanImputation [u'Skip', u'MeanImputation']
            'sparse': None,  # boolean false
            'col_major': None,  # boolean false
            'average_activation': None,  # double 0.0
            'sparsity_beta': None,  # double 0.0
        }

        model_key = 'benign_dl.hex'
        bmResult = h2o.n0.build_model(algo='deeplearning', model_id=model_key,
            training_frame=parse_key, parameters=parameters, timeoutSecs=10)
        print "bmResult:", dump_json(bmResult)
        bm = OutputObj(bmResult, 'bm')

        modelResult = h2o.n0.models(key=model_key)
        model = OutputObj(modelResult['models'][0]['output'], 'model')
        print "model:", dump_json(model)

        cmmResult = h2o.n0.compute_model_metrics(model=model_key, frame=parse_key, timeoutSecs=60)
        cmm = OutputObj(cmmResult, 'cmm')

        mmResult = h2o.n0.model_metrics(model=model_key, frame=parse_key, timeoutSecs=60)
        mm = OutputObj(mmResult['model_metrics'][0], 'mm')

        prResult = h2o.n0.predict(model=model_key, frame=parse_key, timeoutSecs=60)
        pr = OutputObj(prResult['model_metrics'][0]['predictions'], 'pr')

    h2o_cmd.runStoreView()
def test_billion_rows(self):
    # just do the import folder once
    timeoutSecs = 1500
    csvFilenameAll = [
        # quick test first
        # "covtype.data",
        # then the real thing
        "billion_rows.csv.gz",
    ]
    # csvFilenameList = random.sample(csvFilenameAll,1)
    csvFilenameList = csvFilenameAll

    # pop open a browser on the cloud
    ### h2b.browseTheCloud()

    for csvFilename in csvFilenameList:
        # creates csvFilename.hex from file in importFolder dir
        start = time.time()
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path='standard/' + csvFilename,
            timeoutSecs=timeoutSecs, pollTimeoutSecs=60)
        elapsed = time.time() - start
        print csvFilename, "completed in", elapsed, "seconds.", \
            "%d pct. of timeout" % ((elapsed * 100) / timeoutSecs)

        pA = h2o_cmd.ParseObj(parseResult)
        iA = h2o_cmd.InspectObj(pA.parse_key)
        parse_key = pA.parse_key
        numRows = iA.numRows
        numCols = iA.numCols
        labelList = iA.labelList
        # labelListUsed must be defined before simpleCheckGLM uses it below
        labelListUsed = labelList

        parameters = {
            'response_column': 1,
            'n_folds': 0,
            'alpha': 0,
            'lambda': 0,
        }

        model_key = 'B.hex'
        bmResult = h2o.n0.build_model(algo='glm', destination_key=model_key,
            training_frame=parse_key, parameters=parameters, timeoutSecs=10)
        bm = OutputObj(bmResult, 'bm')

        modelResult = h2o.n0.models(key=model_key)
        model = OutputObj(modelResult['models'][0]['output'], 'model')
        h2o_glm.simpleCheckGLM(self, model, parameters, labelList, labelListUsed)

        cmmResult = h2o.n0.compute_model_metrics(model=model_key, frame=parse_key, timeoutSecs=60)
        cmm = OutputObj(cmmResult, 'cmm')

        mmResult = h2o.n0.model_metrics(model=model_key, frame=parse_key, timeoutSecs=60)
        mm = OutputObj(mmResult, 'mm')

        prResult = h2o.n0.predict(model=model_key, frame=parse_key, timeoutSecs=60)
        pr = OutputObj(prResult['model_metrics'][0]['predictions'], 'pr')

        h2o_cmd.runStoreView()
def test_GBM_basic(self):
    bucket = 'home-0xdiag-datasets'
    importFolderPath = 'standard'
    trainFilename = 'covtype.shuffled.90pct.data'
    train_key = 'covtype.train.hex'
    model_key = 'GBMModelKey'
    timeoutSecs = 1800
    csvPathname = importFolderPath + "/" + trainFilename

    parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local',
        hex_key=train_key, timeoutSecs=timeoutSecs)
    pA = h2o_cmd.ParseObj(parseResult)
    iA = h2o_cmd.InspectObj(pA.parse_key)
    parse_key = pA.parse_key
    numRows = iA.numRows
    numCols = iA.numCols
    labelList = iA.labelList

    labelListUsed = list(labelList)
    numColsUsed = numCols

    parameters = {
        'validation_frame': train_key,
        'ignored_columns': None,
        'score_each_iteration': True,
        'response_column': 'C55',
        'do_classification': True,
        # 'balance_classes':
        # 'max_after_balance_size':
        'ntrees': 2,
        'max_depth': 10,
        'min_rows': 3,
        'nbins': 40,
        'learn_rate': 0.2,
        # FIX! doesn't like it?
        # 'loss': 'Bernoulli',
        # FIX..no variable importance for GBM yet?
        'variable_importance': False,
        # 'seed':
    }

    model_key = 'covtype_gbm.hex'
    bmResult = h2o.n0.build_model(algo='gbm', destination_key=model_key,
        training_frame=parse_key, parameters=parameters, timeoutSecs=60)
    bm = OutputObj(bmResult, 'bm')

    modelResult = h2o.n0.models(key=model_key)
    model = OutputObj(modelResult['models'][0]['output'], 'model')

    cmmResult = h2o.n0.compute_model_metrics(model=model_key, frame=parse_key, timeoutSecs=60)
    cmm = OutputObj(cmmResult, 'cmm')
    print "\nLook!, can use dot notation: cmm.cm.confusion.matrix", cmm.cm.confusion_matrix, "\n"

    mmResult = h2o.n0.model_metrics(model=model_key, frame=parse_key, timeoutSecs=60)
    mmResultShort = mmResult['model_metrics'][0]
    del mmResultShort['frame']  # too much!
    mm = OutputObj(mmResultShort, 'mm')

    prResult = h2o.n0.predict(model=model_key, frame=parse_key, timeoutSecs=60)
    pr = OutputObj(prResult['model_metrics'][0]['predictions'], 'pr')
def test_parse_rand_enum_compress(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()

    if DEBUG:
        n = 20
    else:
        n = 1000000

    # from command line arg -long
    if h2o_args.long_test_case:
        repeat = 1000
        scale = 10  # scale up the # of rows
        tryList = [
            (n * scale, 1, 'cI', 300),
            (n * scale, 1, 'cI', 300),
            (n * scale, 1, 'cI', 300),
        ]
    else:
        repeat = 1
        scale = 1
        tryList = [
            (n, 3, 'cI', 300),
            (n, 3, 'cI', 300),
            (n, 3, 'cI', 300),
        ]

    lastcolsHistory = []
    enumList = create_enum_list(listSize=ENUMS_NUM)

    for r in range(repeat):
        SEED_PER_FILE = random.randint(0, sys.maxint)
        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            # using the comma is nice to ensure no craziness
            colSepHexString = '2c'  # comma
            colSepChar = colSepHexString.decode('hex')
            colSepInt = int(colSepHexString, base=16)
            print "colSepChar:", colSepChar

            rowSepHexString = '0a'  # newline
            rowSepChar = rowSepHexString.decode('hex')
            print "rowSepChar:", rowSepChar

            csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            print "Creating random", csvPathname

            # same enum list/mapping, but different dataset?
            start = time.time()
            lastcols = write_syn_dataset(csvPathname, enumList, rowCount, colCount,
                scale=1, colSepChar=colSepChar, rowSepChar=rowSepChar, SEED=SEED_PER_FILE)
            elapsed = time.time() - start
            print "took %s seconds to create %s" % (elapsed, csvPathname)
            # why are we saving this?
            lastcolsHistory.append(lastcols)

            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key,
                check_header=0, timeoutSecs=30, separator=colSepInt, doSummary=DO_SUMMARY)
            # optional. only needed to extract parse_key?
            parseResultA = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key)

            pA = h2o_cmd.ParseObj(parseResultA, expectedNumRows=rowCount, expectedNumCols=colCount)
            print pA.numRows
            print pA.numCols
            print pA.parse_key
            # this guy can take json object as first thing, or re-read with key
            iA = h2o_cmd.InspectObj(pA.parse_key,
                expectedNumRows=rowCount, expectedNumCols=colCount, expectedMissinglist=[])

            self.assertEqual(rowCount, iA.numRows)
            self.assertEqual(colCount, iA.numCols)
def test_w2v_basic_2(self):
    global SYNDATASETS_DIR
    SYNDATASETS_DIR = h2o.make_syn_dir()

    n = 100
    tryList = [
        # (n, 1, 'cD', 300),
        (n, 2, 'cE', 300),
        (n, 3, 'cF', 300),
        (n, 4, 'cG', 300),
        (n, 5, 'cH', 300),
        (n, 6, 'cI', 300),
        (n, 7, 'cJ', 300),
        (n, 9, 'cK', 300),
    ]

    ### h2b.browseTheCloud()
    for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
        csvPathname = create_file_with_seps(rowCount, colCount)
        hex_key = "not_used.hex"

        # just parse to make sure it's good
        parseResult = h2i.import_parse(path=csvPathname,
            check_header=1, delete_on_done=0, timeoutSecs=180, doSummary=False)
        pA = h2o_cmd.ParseObj(parseResult)
        iA = h2o_cmd.InspectObj(pA.parse_key)
        parse_key = pA.parse_key
        numRows = iA.numRows
        numCols = iA.numCols
        labelList = iA.labelList

        src_key = h2i.find_key('syn_.*csv')

        # no cols ignored
        labelListUsed = list(labelList)
        numColsUsed = numCols

        for trial in range(1):
            parameters = {
                'validation_frame': parse_key,  # KeyIndexed False []
                'ignored_columns': None,  # string[] None []
                'minWordFreq': 1,  # int 5 []
                'wordModel': 'CBOW',  # enum [u'CBOW', u'SkipGram']
                'normModel': 'NegSampling',  # enum [u'HSM', u'NegSampling']
                'negSampleCnt': 1,  # int 5 []
                'vecSize': 10,  # int 100
                'windowSize': 2,  # int 5
                'sentSampleRate': 0.001,  # float 0.001
                'initLearningRate': 0.05,  # float 0.05
                'epochs': 1,  # int 5
            }

            model_key = 'benign_w2v.hex'
            bmResult = h2o.n0.build_model(algo='word2vec', destination_key=model_key,
                training_frame=parse_key, parameters=parameters, timeoutSecs=10)
            bm = OutputObj(bmResult, 'bm')

            modelResult = h2o.n0.models(key=model_key)
            model = OutputObj(modelResult['models'][0]['output'], 'model')

            cmmResult = h2o.n0.compute_model_metrics(model=model_key, frame=parse_key, timeoutSecs=60)
            cmm = OutputObj(cmmResult, 'cmm')

            mmResult = h2o.n0.model_metrics(model=model_key, frame=parse_key, timeoutSecs=60)
            mm = OutputObj(mmResult['model_metrics'][0], 'mm')

            prResult = h2o.n0.predict(model=model_key, frame=parse_key, timeoutSecs=60)
            pr = OutputObj(prResult['model_metrics'][0]['predictions'], 'pr')

        h2o_cmd.runStoreView()
def test_summary2_NY0(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()

    choicesList = [
        ('N', 'Y', '0'),
        ('n', 'y', '0'),
        ('F', 'T', '0'),
        ('f', 't', '0'),
        (' N', ' Y', ' 0'),
        (' n', ' y', ' 0'),
        (' F', ' T', ' 0'),
        (' f', ' t', ' 0'),
    ]

    # white space is stripped
    expectedList = [
        ('N', 'Y', '0'),
        ('n', 'y', '0'),
        ('F', 'T', '0'),
        ('f', 't', '0'),
        ('N', 'Y', '0'),
        ('n', 'y', '0'),
        ('F', 'T', '0'),
        ('f', 't', '0'),
    ]

    tryList = [
        # colname, (min, 25th, 50th, 75th, max)
        (100, 200, 'x.hex', choicesList[4], expectedList[4]),
        (100, 200, 'x.hex', choicesList[5], expectedList[5]),
        (100, 200, 'x.hex', choicesList[6], expectedList[6]),
        (100, 200, 'x.hex', choicesList[7], expectedList[7]),
        (100, 200, 'x.hex', choicesList[3], expectedList[3]),
        (1000, 200, 'x.hex', choicesList[2], expectedList[2]),
        (10000, 200, 'x.hex', choicesList[1], expectedList[1]),
        (100000, 200, 'x.hex', choicesList[0], expectedList[0]),
    ]

    timeoutSecs = 10
    trial = 1
    n = h2o.nodes[0]
    lenNodes = len(h2o.nodes)
    x = 0
    timeoutSecs = 60

    for (rowCount, colCount, hex_key, choices, expected) in tryList:
        # max error = half the bin size?
        SEEDPERFILE = random.randint(0, sys.maxint)
        x += 1

        csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True)

        print "Creating random", csvPathname
        expectedNaCnt = write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, choices)

        parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key,
            timeoutSecs=10, doSummary=False)
        pA = h2o_cmd.ParseObj(parseResult, expectedNumRows=rowCount, expectedNumCols=colCount)
        print pA.numRows, pA.numCols, pA.parse_key

        iA = h2o_cmd.InspectObj(pA.parse_key,
            expectedNumRows=rowCount, expectedNumCols=colCount, expectedMissinglist=[])
        print iA.missingList, iA.labelList, iA.numRows, iA.numCols

        for i in range(colCount):
            # walks across the columns triggering a summary on the col desired
            # runSummary returns a column object now. inspect and parse don't. They return json.
            # maybe eventually will make them return object? But I also pass expected stuff to them
            # should I pass expected to summary? no, more complex?
            co = h2o_cmd.runSummary(key=hex_key, column=i)
            print co.label, co.type, co.missing_count, co.domain, sum(co.histogram_bins)

            print "\nComparing column %s to expected" % i
            self.assertEqual(expectedNaCnt[i], co.missing_count,
                "Column %s Expected %s. missing: %s is incorrect" % (i, expectedNaCnt[i], co.missing_count))
            self.assertEqual(rowCount - expectedNaCnt[i], sum(co.histogram_bins))

        h2p.green_print("\nDone with trial", trial)
        trial += 1

    h2i.delete_keys_at_all_nodes()
def test_summary2_exp(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()
    LAMBD = random.uniform(0.005, 0.5)

    tryList = [
        # co.label, (min, 25th, 50th, 75th, max)
        # parse setup error ? supposedly fixed now
        # (1, 1, 'x.hex', 1, 20000, ['C1', None, None, None, None, None]),
        (5, 1, 'x.hex', 1, 20000, ['C1', None, None, None, None, None]),
        (10, 1, 'x.hex', 1, 20000, ['C1', None, None, None, None, None]),
        (100, 1, 'x.hex', 1, 20000, ['C1', None, None, None, None, None]),
        (1000, 1, 'x.hex', -5000, 0, ['C1', None, None, None, None, None]),
        (10000, 1, 'x.hex', -100000, 100000, ['C1', None, None, None, None, None]),
        (100000, 1, 'x.hex', -1, 1, ['C1', None, None, None, None, None]),
        (1000000, 1, 'A.hex', 1, 100, ['C1', None, None, None, None, None]),
    ]

    timeoutSecs = 10
    trial = 1
    n = h2o.nodes[0]
    lenNodes = len(h2o.nodes)
    x = 0
    timeoutSecs = 60

    for (rowCount, colCount, hex_key, rangeMin, rangeMax, expected) in tryList:
        SEEDPERFILE = random.randint(0, sys.maxint)
        x += 1

        csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        print "Creating random", csvPathname, "lambd:", LAMBD
        (expectedMin, expectedMax) = write_syn_dataset(csvPathname, rowCount, colCount,
            lambd=LAMBD, SEED=SEEDPERFILE)
        print "expectedMin:", expectedMin, "expectedMax:", expectedMax

        maxErr = ((expectedMax - expectedMin) / 20.0) / 2.0
        # add 5% for fp errors?
        maxErr = 1.05 * maxErr

        expected[1] = expectedMin
        expected[5] = expectedMax

        csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True)
        parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key,
            timeoutSecs=30, doSummary=False)
        pA = h2o_cmd.ParseObj(parseResult, expectedNumRows=rowCount, expectedNumCols=colCount)
        print pA.numRows, pA.numCols, pA.parse_key

        iA = h2o_cmd.InspectObj(pA.parse_key,
            expectedNumRows=rowCount, expectedNumCols=colCount, expectedMissinglist=[])
        print iA.missingList, iA.labelList, iA.numRows, iA.numCols

        # column 0 not used here
        assert len(expected) == 6
        co = h2o_cmd.runSummary(key=hex_key, column=0, expected=expected[1:], maxDelta=maxErr)

        trial += 1
        h2o.nodes[0].remove_all_keys()

        scipyCol = 0
        print "maxErr", maxErr

        if co.label != '' and expected[scipyCol]:
            # don't do for enums
            # also get the median with a sort (h2o_summ.percentileOnSortedlist()
            h2o_summ.quantile_comparisons(
                csvPathnameFull,
                skipHeader=False,
                col=scipyCol,
                datatype='float',
                quantile=0.5 if DO_MEDIAN else 0.99,
                h2oSummary2=co.percentiles[5 if DO_MEDIAN else 9],
                # h2oQuantilesApprox=qresult_single,
                # h2oQuantilesExact=qresult,
                h2oSummary2MaxErr=maxErr,
            )
def test_parse_multi_header_single(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()
    csvFilename = "syn_ints.csv"
    csvPathname = SYNDATASETS_DIR + '/' + csvFilename
    headerData = "ID,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON,output"

    # cols must be 9 to match the header above, otherwise a different bug is hit
    # extra output is added, so it's 10 total
    tryList = [
        (57, 300, 9, 'cA', 60, 0),
        # try with 1-3 data lines in the header file too
        (57, 300, 9, 'cB', 60, 1),
        (57, 300, 9, 'cC', 60, 2),
        (57, 300, 9, 'cD', 60, 3),
    ]

    trial = 0
    for (fileNum, rowCount, colCount, hex_key, timeoutSecs, dataRowsWithHeader) in tryList:
        trial += 1
        # FIX! should we add a header to them randomly???
        print "Wait while", fileNum, "synthetic files are created in", SYNDATASETS_DIR
        rowxcol = str(rowCount) + 'x' + str(colCount)
        totalCols = colCount + 1  # 1 extra for output
        totalDataRows = 0

        for fileN in range(fileNum):
            csvFilename = 'syn_' + str(fileN) + "_" + str(SEED) + "_" + rowxcol + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            rList = rand_rowData(colCount)
            dataRowsDone = write_syn_dataset(csvPathname, rowCount, headerData=None, rList=rList)
            totalDataRows += dataRowsDone

        # create the header file
        # can make it pass by not doing this
        if HEADER:
            csvFilename = 'syn_header_' + str(SEED) + "_" + rowxcol + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            dataRowsDone = write_syn_dataset(csvPathname, dataRowsWithHeader, headerData, rList)
            totalDataRows += dataRowsDone

        # make sure all key names are unique, when we re-put and re-parse (h2o caching issues)
        src_key = "syn_" + str(trial)
        hex_key = "syn_" + str(trial) + ".hex"

        # DON'T get redirected to S3! (EC2 hack in config, remember!)
        # use it at the node level directly (because we gen'ed the files).
        # I suppose we could force the redirect state bits in h2o.nodes[0] to False, instead?
        # put them, rather than using import files, so this works if remote h2o is used
        # and python creates the files locally
        fileList = os.listdir(SYNDATASETS_DIR)
        for f in fileList:
            h2i.import_only(path=SYNDATASETS_DIR + "/" + f, schema='put', noPrint=True)
            print f

        # fix. should we have a h2o.n0 for brevity? or h2o.n. ? so we can change it around if multi-node?
        # frames = h2o.nodes[0].frames()['frames']
        frames = h2o.n0.frames()['frames']
        frames_dict = h2o_util.list_to_dict(frames, 'key/name')
        # print "frames:", dump_json(frames)
        # print "frames_dict:", dump_json(frames_dict)

        if HEADER:
            header = h2i.find_key('syn_header')
            if not header:
                raise Exception("Didn't find syn_header* key in the import")
            print "Header Key = " + header

        # use regex. the only files in the dir will be the ones we just created with *fileN* match
        start = time.time()
        # does h2o-dev take a regex? or do we need to glob
        parseResult = h2i.parse_only(pattern='*' + rowxcol + '*',
            hex_key=hex_key, timeoutSecs=timeoutSecs, checkHeader="1")  # header_from_file=header

        pA = h2o_cmd.ParseObj(parseResult, expectedNumRows=totalDataRows, expectedNumCols=totalCols)
        print pA.numRows
        print pA.numCols
        print pA.parse_key

        expectedLabelList = headerData.split(",")
        iA = h2o_cmd.InspectObj(pA.parse_key, expectedNumRows=totalDataRows, expectedNumCols=totalCols,
            expectedMissinglist=[], expectedLabelList=expectedLabelList)

        if DO_RF:
            # put in an ignore param, that will fail unless headers were parsed correctly
            if HEADER:
                kwargs = {'sample_rate': 0.75, 'max_depth': 25, 'ntrees': 1, 'ignored_cols_by_name': 'ID,CAPSULE'}
            else:
                kwargs = {'sample_rate': 0.75, 'max_depth': 25, 'ntrees': 1}
            rfv = h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)

        h2o.check_sandbox_for_errors()
def test_kmeans_sphere100(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()
    csvFilename = 'syn_spheres100.csv'
    csvPathname = SYNDATASETS_DIR + '/' + csvFilename

    centersList = write_spheres_dataset(csvPathname, CLUSTERS, SPHERE_PTS)

    if SHUFFLE_SPHERES:
        # since we create spheres in order
        csvFilename2 = 'syn_spheres100_shuffled.csv'
        csvPathname2 = SYNDATASETS_DIR + '/' + csvFilename2
        h2o_util.file_shuffle(csvPathname, csvPathname2)
    else:
        csvFilename2 = csvFilename
        csvPathname2 = csvPathname

    print "\nStarting", csvFilename
    parseResult = h2i.import_parse(path=csvPathname2, schema='put', hex_key=csvFilename2 + ".hex")
    pA = h2o_cmd.ParseObj(parseResult)
    iA = h2o_cmd.InspectObj(pA.parse_key)
    parse_key = pA.parse_key
    numRows = iA.numRows
    numCols = iA.numCols
    labelList = iA.labelList

    numColsUsed = numCols
    labelListUsed = labelList

    ### h2b.browseTheCloud()

    # try 5 times, to see if all inits by h2o are good
    # does it break if cols is not specified?
    destination_key = 'syn_spheres100.hex'
    cols = ",".join(map(str, range(DIMENSIONS)))

    for trial in range(2):
        parameters = {
            'validation_frame': parse_key,
            'ignored_columns': None,
            'score_each_iteration': False,
            'k': CLUSTERS,
            'max_iterations': 50,
            'standardize': False,
            # 'seed': kmeansSeed,
            'init': 'Furthest',
        }

        timeoutSecs = 100
        model_key = 'sphere100_k.hex'
        kmeansResult = h2o.n0.build_model(algo='kmeans', destination_key=model_key,
            training_frame=parse_key, parameters=parameters, timeoutSecs=timeoutSecs)

        modelResult = h2o.n0.models(key=model_key)
        km = h2o_kmeans.KMeansObj(modelResult, parameters, numRows, numColsUsed, labelListUsed)

        # no expected row/error?
        expected = [(None, c, None, None) for c in centersList]
        expected.sort(key=lambda tup: sum(tup[1]))
        h2o_kmeans.compareResultsToExpected(km.tuplesSorted, expected, allowedDelta=[.01, .01, .01])

        print "Trial #", trial, "completed"
def test_DL_mnist(self):
    h2o.nodes[0].remove_all_keys()
    csvPathname_train = 'laptop/mnist/train.csv.gz'
    csvPathname_test = 'laptop/mnist/test.csv.gz'
    hex_key = 'mnist_train.hex'
    validation_key = 'mnist_test.hex'
    timeoutSecs = 60

    parseResult = h2i.import_parse(bucket='bigdata', path=csvPathname_train,
        hex_key=hex_key, timeoutSecs=timeoutSecs, doSummary=False)
    pA = h2o_cmd.ParseObj(parseResult)
    iA = h2o_cmd.InspectObj(pA.parse_key)
    numCols = iA.numCols
    labelList = iA.labelList

    parseResultV = h2i.import_parse(bucket='bigdata', path=csvPathname_test,
        hex_key=validation_key, timeoutSecs=timeoutSecs, doSummary=False)

    response = numCols - 1

    # Making random id
    identifier = ''.join(random.sample(string.ascii_lowercase + string.digits, 10))
    model_key = 'deeplearning_' + identifier + '.hex'

    parameters = {
        'validation_frame': validation_key,  # KeyIndexed None
        'ignored_columns': None,  # string[] None
        'response_column': labelList[response],  # string None
        'balance_classes': None,  # boolean false
        'max_after_balance_size': None,  # float Infinity
        'keep_cross_validation_splits': None,  # boolean false
        'checkpoint': None,  # Key None
        'overwrite_with_best_model': None,  # boolean true
        'expert_mode': None,  # boolean false
        'autoencoder': None,  # boolean false
        'use_all_factor_levels': None,  # boolean true
        # [u'Tanh', u'TanhWithDropout', u'Rectifier', u'RectifierWithDropout', u'Maxout', u'MaxoutWithDropout']
        'activation': 'RectifierWithDropout',  # enum Rectifier
        'hidden': '[117,131,129]',  # int[] [200, 200]
        'epochs': 2.0,  # double 10.0
        'train_samples_per_iteration': None,  # long -2
        'target_ratio_comm_to_comp': None,  # double 0.02
        'seed': None,  # long 1679194146842485659
        'adaptive_rate': False,  # boolean true
        'rho': None,  # double 0.99
        'epsilon': None,  # double 1.0E-8
        'rate': None,  # double 0.005
        'rate_annealing': None,  # double 1.0E-6
        'rate_decay': None,  # double 1.0
        'momentum_start': 0.5,  # double 0.0
        'momentum_ramp': 100000,  # double 1000000.0
        'momentum_stable': 0.9,  # double 0.0
        'nesterov_accelerated_gradient': None,  # boolean true
        'input_dropout_ratio': 0.2,  # double 0.0
        'hidden_dropout_ratios': None,  # double[] None (this can grid?)
        'l1': 1e-5,  # double 0.0
        'l2': 1e-7,  # double 0.0
        'max_w2': 15,  # float Infinity
        'initial_weight_distribution': None,  # enum UniformAdaptive [u'UniformAdaptive', u'Uniform', u'Normal']
        'initial_weight_scale': None,  # double 1.0
        'loss': 'CrossEntropy',  # enum MeanSquare [u'Automatic', u'MeanSquare', u'CrossEntropy']
        'score_interval': None,  # double 5.0
        'score_training_samples': None,  # long 10000
        'score_validation_samples': None,  # long 0
        'score_duty_cycle': None,  # double 0.1
        'classification_stop': None,  # double 0.0
        'regression_stop': None,  # double 1.0E-6
        'quiet_mode': None,  # boolean false
        'max_confusion_matrix_size': None,  # int 20
        'max_hit_ratio_k': None,  # int 10
        'class_sampling_factors': None,  # float[] None
        'score_validation_sampling': None,  # enum Uniform [u'Uniform', u'Stratified']
        'diagnostics': None,  # boolean true
        'variable_importances': None,  # boolean false
        'fast_mode': None,  # boolean true
        'ignore_const_cols': None,  # boolean true
        'force_load_balance': None,  # boolean true
        'replicate_training_data': None,  # boolean false
        'single_node_mode': None,  # boolean false
        'shuffle_training_data': None,  # boolean false
        'missing_values_handling': None,  # enum MeanImputation [u'Skip', u'MeanImputation']
        'sparse': None,  # boolean false
        'col_major': None,  # boolean false
        'average_activation': None,  # double 0.0
        'sparsity_beta': None,  # double 0.0
    }

    expectedErr = 0.057  ## expected validation error for the above model
    relTol = 0.20  ## 20% rel. error tolerance due to Hogwild!

    timeoutSecs = 60
    start = time.time()
    bmResult = h2o.n0.build_model(
        algo='deeplearning',
        model_id=model_key,
        training_frame=hex_key,
        parameters=parameters,
        timeoutSecs=timeoutSecs)
    bm = OutputObj(bmResult, 'bm')
    print 'deep learning took', time.time() - start, 'seconds'

    modelResult = h2o.n0.models(key=model_key)
    model = OutputObj(modelResult['models'][0]['output'], 'model')
    # print "model:", dump_json(model)

    cmmResult = h2o.n0.compute_model_metrics(model=model_key, frame=validation_key, timeoutSecs=60)
    cmm = OutputObj(cmmResult, 'cmm')

    mmResult = h2o.n0.model_metrics(model=model_key, frame=validation_key, timeoutSecs=60)
    mm = OutputObj(mmResult['model_metrics'][0], 'mm')

    prResult = h2o.n0.predict(model=model_key, frame=validation_key, timeoutSecs=60)
    pr = OutputObj(prResult['model_metrics'][0]['predictions'], 'pr')

    h2o_cmd.runStoreView()

    actualErr = model['errors']['valid_err']
    print "expected classification error: " + format(expectedErr)
    print "actual classification error: " + format(actualErr)

    if actualErr != expectedErr and abs((expectedErr - actualErr) / expectedErr) > relTol:
        raise Exception(
            "Scored classification error of %s is not within %s %% relative error of %s" %
            (actualErr, float(relTol) * 100, expectedErr))
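# Hedged sketch: the accept/reject logic above (and in test_DL_airlines_small below) boils
# down to a relative-error tolerance check around an expected validation error. Factored out
# here as a standalone, hypothetical helper for clarity -- it is not part of the h2o test
# harness.
def _check_rel_error_sketch(actual, expected, rel_tol):
    # exact match always passes; otherwise require |expected - actual| / expected <= rel_tol
    if actual != expected and abs((expected - actual) / expected) > rel_tol:
        raise Exception("error %s not within %s%% of expected %s" % (actual, rel_tol * 100, expected))

# e.g. _check_rel_error_sketch(actualErr, 0.057, 0.20) for the MNIST model above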
def test_GLM_covtype(self):
    importFolderPath = "standard"
    csvFilename = "covtype.data"
    hex_key = "covtype.hex"
    bucket = "home-0xdiag-datasets"
    csvPathname = importFolderPath + "/" + csvFilename

    parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, hex_key=hex_key,
        check_header=1, timeoutSecs=180, doSummary=False)
    pA = h2o_cmd.ParseObj(parseResult)
    iA = h2o_cmd.InspectObj(pA.parse_key)
    parse_key = pA.parse_key
    numRows = iA.numRows
    numCols = iA.numCols
    labelList = iA.labelList

    expected = []
    allowedDelta = 0

    labelListUsed = list(labelList)
    labelListUsed.remove('C54')
    numColsUsed = numCols - 1

    for trial in range(1):
        # family [u'gaussian', u'binomial', u'poisson', u'gamma', u'tweedie']
        # link [u'family_default', u'identity', u'logit', u'log', u'inverse', u'tweedie']
        # can we do classification with probabilities?
        # are only lambda and alpha grid searchable?
        parameters = {
            'validation_frame': parse_key,
            'ignored_columns': None,
            # FIX! for now just use a column that's binomial
            'response_column': 'C54',
            # FIX! when is this needed? redundant for binomial?
            'balance_classes': False,
            'max_after_balance_size': None,
            'standardize': False,
            'family': 'binomial',
            'link': None,
            'tweedie_variance_power': None,
            'tweedie_link_power': None,
            'alpha': '[1e-4]',
            'lambda': '[0.5,0.25, 0.1]',
            'prior1': None,
            'lambda_search': None,
            'nlambdas': None,
            'lambda_min_ratio': None,
            'use_all_factor_levels': False,
            # NPE with n_folds 2?
            'n_folds': 1,
        }

        model_key = 'covtype_glm.hex'
        bmResult = h2o.n0.build_model(
            algo='glm',
            destination_key=model_key,
            training_frame=parse_key,
            parameters=parameters,
            timeoutSecs=60)
        bm = OutputObj(bmResult, 'bm')

        modelResult = h2o.n0.models(key=model_key)
        model = OutputObj(modelResult['models'][0]['output'], 'model')
        h2o_glm.simpleCheckGLM(self, model, parameters, labelList, labelListUsed)

        cmmResult = h2o.n0.compute_model_metrics(model=model_key, frame=parse_key, timeoutSecs=60)
        cmm = OutputObj(cmmResult, 'cmm')

        mcms = OutputObj({'data': cmm.max_criteria_and_metric_scores.data}, 'mcms')
        m1 = mcms.data[1:]
        h0 = mcms.data[0]
        print "\nmcms", tabulate(m1, headers=h0)

        thms = OutputObj(cmm.thresholds_and_metric_scores, 'thms')
        cmms = OutputObj({'cm': cmm.confusion_matrices}, 'cmms')
        if 1 == 0:
            print ""
            for i, c in enumerate(cmms.cm):
                print "\ncmms.cm[%s]" % i, tabulate(c)
            print ""

        mmResult = h2o.n0.model_metrics(model=model_key, frame=parse_key, timeoutSecs=60)
        mm = OutputObj(mmResult['model_metrics'][0], 'mm')

        prResult = h2o.n0.predict(model=model_key, frame=parse_key, timeoutSecs=60)
        pr = OutputObj(prResult['model_metrics'][0]['predictions'], 'pr')
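# Hedged sketch: in the GLM call above, alpha and lambda are handed to the harness as
# bracketed strings ('[1e-4]', '[0.5,0.25, 0.1]') rather than Python lists. A tiny
# hypothetical helper like this could build that form from a list of floats; the name and
# the assumption that the REST layer expects exactly this formatting are mine, not the
# harness's.
def _as_bracketed_list_sketch(values):
    return '[' + ','.join(repr(float(v)) for v in values) + ']'

# e.g. _as_bracketed_list_sketch([0.5, 0.25, 0.1]) -> '[0.5,0.25,0.1]'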
def test_DL_airlines_small(self):
    h2o.nodes[0].remove_all_keys()
    csvPathname_train = 'airlines/AirlinesTrain.csv.zip'
    csvPathname_test = 'airlines/AirlinesTest.csv.zip'
    hex_key = 'train.hex'
    validation_key = 'validation.hex'
    timeoutSecs = 60

    parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname_train,
        hex_key=hex_key, timeoutSecs=timeoutSecs, doSummary=False)
    pA = h2o_cmd.ParseObj(parseResult)
    iA = h2o_cmd.InspectObj(pA.parse_key)

    parseResultV = h2i.import_parse(bucket='smalldata', path=csvPathname_test,
        hex_key=validation_key, timeoutSecs=timeoutSecs, doSummary=False)
    pAV = h2o_cmd.ParseObj(parseResultV)
    iAV = h2o_cmd.InspectObj(pAV.parse_key)

    # Making random id
    identifier = ''.join(random.sample(string.ascii_lowercase + string.digits, 10))
    model_key = 'deeplearning_' + identifier + '.hex'

    parameters = {
        'validation_frame': validation_key,  # KeyIndexed None
        'ignored_columns': "['IsDepDelayed_REC']",  # string[] None
        'response_column': 'IsDepDelayed',  # string None
        'loss': 'CrossEntropy',
    }

    expectedErr = 0.32  ## expected validation error for the above model
    relTol = 0.15  ## 15% rel. error tolerance due to Hogwild!

    timeoutSecs = 60
    start = time.time()
    bmResult = h2o.n0.build_model(
        algo='deeplearning',
        model_id=model_key,
        training_frame=hex_key,
        parameters=parameters,
        timeoutSecs=timeoutSecs)
    bm = OutputObj(bmResult, 'bm')
    print 'deep learning took', time.time() - start, 'seconds'

    modelResult = h2o.n0.models(key=model_key)
    model = OutputObj(modelResult['models'][0]['output'], 'model')
    # print "model:", dump_json(model)

    cmmResult = h2o.n0.compute_model_metrics(model=model_key, frame=validation_key, timeoutSecs=60)
    cmm = OutputObj(cmmResult, 'cmm')

    mmResult = h2o.n0.model_metrics(model=model_key, frame=validation_key, timeoutSecs=60)
    mm = OutputObj(mmResult['model_metrics'][0], 'mm')

    prResult = h2o.n0.predict(model=model_key, frame=validation_key, timeoutSecs=60)
    pr = OutputObj(prResult['model_metrics'][0]['predictions'], 'pr')

    h2o_cmd.runStoreView()

    actualErr = model['errors']['valid_err']
    print "expected classification error: " + format(expectedErr)
    print "actual classification error: " + format(actualErr)

    if actualErr != expectedErr and abs((expectedErr - actualErr) / expectedErr) > relTol:
        raise Exception(
            "Scored classification error of %s is not within %s %% relative error of %s" %
            (actualErr, float(relTol) * 100, expectedErr))
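# Hedged sketch: unlike the MNIST test above, this test overrides only four DL parameters
# and leaves everything else to H2O's server-side defaults. The assumption here (mine, not
# verified against the harness) is that None-valued entries are simply not sent, so a filter
# like this hypothetical helper describes the effective request payload.
def _explicit_params_sketch(parameters):
    # keep only the parameters the test actually overrides
    return dict((k, v) for k, v in parameters.items() if v is not None)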
def test_GBM_airlines(self):
    files = [
        ('datasets', 'airlines_all.05p.csv', 'airlines_all.05p.hex', 1800, 'IsDepDelayed'),
        # ('datasets', 'airlines_all.csv', 'airlines_all.hex', 1800, 'IsDepDelayed')
    ]

    for importFolderPath, csvFilename, trainKey, timeoutSecs, response in files:
        # PARSE train****************************************
        csvPathname = importFolderPath + "/" + csvFilename
        model_key = 'GBMModelKey'

        # IsDepDelayed might already be enum, but just to be sure
        parseResult = h2i.import_parse(
            path=csvPathname,
            schema='hdfs',
            hex_key=trainKey,
            columnTypeDict={'IsDepDelayed': 'Enum'},
            timeoutSecs=timeoutSecs)
        pA = h2o_cmd.ParseObj(parseResult)
        iA = h2o_cmd.InspectObj(pA.parse_key)
        parse_key = pA.parse_key
        numRows = iA.numRows
        numCols = iA.numCols
        labelList = iA.labelList

        labelListUsed = list(labelList)
        numColsUsed = numCols

        parameters = {
            'validation_frame': trainKey,
            # 'ignored_columns': '[CRSDepTime,CRSArrTime,ActualElapsedTime,CRSElapsedTime,AirTime,ArrDelay,DepDelay,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,IsArrDelayed]',
            'response_column': response,
            # 'balance_classes':
            # 'max_after_balance_size':
            'ntrees': 2,
            'max_depth': 10,
            'min_rows': 3,
            'nbins': 40,
            'learn_rate': 0.2,
            # 'loss': 'multinomial',
            # FIX! doesn't like it?
            # 'loss': 'Bernoulli',
            # FIX..no variable importance for GBM yet?
            # 'variable_importance': False,
            # 'seed':
        }

        bmResult = h2o.n0.build_model(
            algo='gbm',
            model_id=model_key,
            training_frame=parse_key,
            parameters=parameters,
            timeoutSecs=360)
        bm = OutputObj(bmResult, 'bm')

        modelResult = h2o.n0.models(key=model_key)
        model = OutputObj(modelResult['models'][0]['output'], 'model')

        cmmResult = h2o.n0.compute_model_metrics(model=model_key, frame=parse_key, timeoutSecs=60)
        cmm = OutputObj(cmmResult, 'cmm')
        # print "\nLook!, can use dot notation: cmm.cm.confusion_matrix", cmm.cm.confusion_matrix, "\n"

        mmResult = h2o.n0.model_metrics(model=model_key, frame=parse_key, timeoutSecs=60)
        mmResultShort = mmResult['model_metrics'][0]
        del mmResultShort['frame']  # too much!
        mm = OutputObj(mmResultShort, 'mm')

        prResult = h2o.n0.predict(model=model_key, frame=parse_key, timeoutSecs=60)
        pr = OutputObj(prResult['model_metrics'][0]['predictions'], 'pr')
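# Hedged sketch: a quick sanity bound on the GBM settings above, independent of the h2o
# harness. A binary tree of depth d has at most 2**d terminal nodes, and min_rows caps how
# many leaves can actually be populated, so the ensemble stays small even on the full
# airlines data. The function and numbers below are illustrative arithmetic only.
def _max_leaves_sketch(ntrees, max_depth, min_rows, num_rows):
    per_tree = min(2 ** max_depth, num_rows // min_rows)
    return ntrees * per_tree

# e.g. with ntrees=2, max_depth=10: at most 2 * 2**10 = 2048 leaves total
# (fewer if num_rows // min_rows is the binding constraint)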