def test_frame_split(self): csvFilename = 'iris.csv' csvPathname = 'iris/' + csvFilename hex_key = "iris.hex" parseResultA = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=hex_key, timeoutSecs=10) print "Just split away and see if anything blows up" splitMe = hex_key pA = h2o_cmd.ParseObj(parseResultA) print pA.numRows print pA.numCols print pA.parse_key print "Just split away and see if anything blows up" splitMe = hex_key iA = h2o_cmd.InspectObj(splitMe) origNumRows = iA.numRows origNumCols = iA.numCols for s in range(10): iA = h2o_cmd.InspectObj(splitMe) numRows = iA.numRows fsResult = h2o.n0.frame_split(training_frame=splitMe, ratios='[0.5]') fs = OutputObj(fsResult, 'frame_split') model_key = fs.jobs[0].dest.name modelResult = h2o.n0.models(key=model_key) model = OutputObj(modelResult['models'][0]['output'], 'frame_split') # print "model:", dump_json(model) split_keys = [split._key.name for split in model.splits] iB = h2o_cmd.InspectObj(split_keys[0]) iC = h2o_cmd.InspectObj(split_keys[1]) numCols = iB.numCols split0_rows = iB.numRows split1_rows = iC.numRows # print "Iteration", s, "split0_rows:", split0_rows, "split1_rows:", split1_rows splitMe = split_keys[1] # split should be within 1 row accuracy. let's say within 20 for now self.assertLess(abs(split1_rows - split0_rows), 2) self.assertEqual(numRows, (split1_rows + split0_rows)) self.assertEqual(numCols, origNumCols)
def test_split_frame(self): csvFilename = 'covtype.data' csvPathname = 'standard/' + csvFilename hex_key = "covtype.hex" parseResultA = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, hex_key=hex_key, timeoutSecs=20) pA = h2o_cmd.ParseObj(parseResultA) print pA.numRows print pA.numCols print pA.parse_key print "Just split away and see if anything blows up" splitMe = hex_key iA = h2o_cmd.InspectObj(splitMe) origNumRows = iA.numRows origNumCols = iA.numCols for s in range(20): iA = h2o_cmd.InspectObj(splitMe) numRows = iA.numRows fsResult = h2o.n0.split_frame(dataset=splitMe, ratios='[0.5]') fs = OutputObj(fsResult, 'split_frame') d = fs.jobs[0].destination_frames # modelResult = h2o.n0.models(key=model_key) # model = OutputObj(modelResult['models'][0]['output'], 'split_frame') # print "model:", dump_json(model) split_keys = [split.name for split in d] iB = h2o_cmd.InspectObj(split_keys[0]) iC = h2o_cmd.InspectObj(split_keys[1]) numCols = iB.numCols split0_rows = iB.numRows split1_rows = iC.numRows # print "Iteration", s, "split0_rows:", split0_rows, "split1_rows:", split1_rows splitMe = split_keys[1] # split should be within 1 row accuracy. let's say within 20 for now self.assertLess(abs(split1_rows - split0_rows), 2) self.assertEqual(numRows, (split1_rows + split0_rows)) self.assertEqual(numCols, origNumCols) if split1_rows <= 1: break
def test_GLM_covtype(self): importFolderPath = "standard" csvFilename = "covtype.data" hex_key = "covtype.hex" bucket = "home-0xdiag-datasets" csvPathname = importFolderPath + "/" + csvFilename parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, hex_key=hex_key, check_header=1, timeoutSecs=180, doSummary=False) pA = h2o_cmd.ParseObj(parseResult) iA = h2o_cmd.InspectObj(pA.parse_key) parse_key = pA.parse_key numRows = iA.numRows numCols = iA.numCols labelList = iA.labelList expected = [] allowedDelta = 0 labelListUsed = list(labelList) labelListUsed.remove('C54') numColsUsed = numCols - 1 for trial in range(1): # family [u'gaussian', u'binomial', u'poisson', u'gamma', u'tweedie'] # link [u'family_default', u'identity', u'logit', u'log', u'inverse', u'tweedie'] # can we do classification with probabilities? # are only lambda and alpha grid searchable? parameters = { 'validation_frame': parse_key, 'ignored_columns': None, # FIX! for now just use a column that's binomial 'response_column': 'C54', # FIX! when is this needed? redundant for binomial? 'balance_classes': False, 'max_after_balance_size': None, 'standardize': False, 'family': 'binomial', 'link': None, 'tweedie_variance_power': None, 'tweedie_link_power': None, 'alpha': '[1e-4]', 'lambda': '[0.5,0.25, 0.1]', 'prior1': None, 'lambda_search': None, 'nlambdas': None, 'lambda_min_ratio': None, 'use_all_factor_levels': False, # NPE with n_folds 2? 
'n_folds': 1, } model_key = 'covtype_glm.hex' bmResult = h2o.n0.build_model( algo='glm', destination_key=model_key, training_frame=parse_key, parameters=parameters, timeoutSecs=60) bm = OutputObj(bmResult, 'bm') modelResult = h2o.n0.models(key=model_key) model = OutputObj(modelResult['models'][0]['output'], 'model') h2o_glm.simpleCheckGLM(self, model, parameters, labelList, labelListUsed) cmmResult = h2o.n0.compute_model_metrics(model=model_key, frame=parse_key, timeoutSecs=60) cmm = OutputObj(cmmResult, 'cmm') mcms = OutputObj({'data': cmm.max_criteria_and_metric_scores.data}, 'mcms') m1 = mcms.data[1:] h0 = mcms.data[0] print "\nmcms", tabulate(m1, headers=h0) thms = OutputObj(cmm.thresholds_and_metric_scores, 'thms') cmms = OutputObj({'cm': cmm.confusion_matrices}, 'cmms') if 1==0: print "" for i,c in enumerate(cmms.cm): print "\ncmms.cm[%s]" % i, tabulate(c) print "" mmResult = h2o.n0.model_metrics(model=model_key, frame=parse_key, timeoutSecs=60) mm = OutputObj(mmResult['model_metrics'][0], 'mm') prResult = h2o.n0.predict(model=model_key, frame=parse_key, timeoutSecs=60) pr = OutputObj(prResult['model_metrics'][0]['predictions'], 'pr')
def test_w2v_basic_2(self):
    """Build a small word2vec model on several generated files.

    For each (rows, cols) shape in the try list a file is generated with
    create_file_with_seps, parsed, and used to build a word2vec model; the
    model, metrics and predict endpoints are then called on the parsed frame.
    """
    global SYNDATASETS_DIR
    SYNDATASETS_DIR = h2o.make_syn_dir()

    n = 100
    # (rowCount, colCount, hex_key, timeoutSecs)
    # the hex_key and timeoutSecs fields are not used by the loop below
    tryList = [
        # (n, 1, 'cD', 300),
        (n, 2, 'cE', 300),
        (n, 3, 'cF', 300),
        (n, 4, 'cG', 300),
        (n, 5, 'cH', 300),
        (n, 6, 'cI', 300),
        (n, 7, 'cJ', 300),
        (n, 9, 'cK', 300),
    ]

    ### h2b.browseTheCloud()
    for rowCount, colCount, _unused_key, _unused_timeout in tryList:
        syn_path = create_file_with_seps(rowCount, colCount)

        # just parse to make sure it's good
        parsed = h2i.import_parse(path=syn_path, check_header=1,
                                  delete_on_done=0, timeoutSecs=180,
                                  doSummary=False)
        frame_key = h2o_cmd.ParseObj(parsed).parse_key
        h2o_cmd.InspectObj(frame_key)

        h2i.find_key('syn_.*csv')

        for _ in range(1):
            # trailing comments give the parameter type and H2O default
            w2v_params = {
                'validation_frame': frame_key,  # KeyIndexed, False
                'ignored_columns': None,        # string[], None
                'minWordFreq': 1,               # int, 5
                'wordModel': 'CBOW',            # enum: CBOW, SkipGram
                'normModel': 'NegSampling',     # enum: HSM, NegSampling
                'negSampleCnt': 1,              # int, 5
                'vecSize': 10,                  # int, 100
                'windowSize': 2,                # int, 5
                'sentSampleRate': 0.001,        # float, 0.001
                'initLearningRate': 0.05,       # float, 0.05
                'epochs': 1,                    # int, 5
            }

            w2v_key = 'benign_w2v.hex'
            build_json = h2o.n0.build_model(
                algo='word2vec',
                destination_key=w2v_key,
                training_frame=frame_key,
                parameters=w2v_params,
                timeoutSecs=10)
            OutputObj(build_json, 'bm')

            models_json = h2o.n0.models(key=w2v_key)
            OutputObj(models_json['models'][0]['output'], 'model')

            metrics_json = h2o.n0.compute_model_metrics(
                model=w2v_key, frame=frame_key, timeoutSecs=60)
            OutputObj(metrics_json, 'cmm')

            mm_json = h2o.n0.model_metrics(
                model=w2v_key, frame=frame_key, timeoutSecs=60)
            OutputObj(mm_json['model_metrics'][0], 'mm')

            predict_json = h2o.n0.predict(
                model=w2v_key, frame=frame_key, timeoutSecs=60)
            OutputObj(predict_json['model_metrics'][0]['predictions'], 'pr')

            h2o_cmd.runStoreView()
def test_billion_rows(self): # just do the import folder once timeoutSecs = 1500 csvFilenameAll = [ # quick test first # "covtype.data", # then the real thing "billion_rows.csv.gz", ] # csvFilenameList = random.sample(csvFilenameAll,1) csvFilenameList = csvFilenameAll # pop open a browser on the cloud ### h2b.browseTheCloud() for csvFilename in csvFilenameList: # creates csvFilename.hex from file in importFolder dir start = time.time() parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path='standard/' + csvFilename, timeoutSecs=timeoutSecs, pollTimeoutSecs=60) elapsed = time.time() - start print csvFilename, "completed in", elapsed, "seconds.", "%d pct. of timeout" % ( (elapsed * 100) / timeoutSecs) pA = h2o_cmd.ParseObj(parseResult) iA = h2o_cmd.InspectObj(pA.parse_key) parse_key = pA.parse_key numRows = iA.numRows numCols = iA.numCols labelList = iA.labelList parameters = { 'response_column': 'C2', 'alpha': '[0]', 'lambda': '[0]', } model_key = 'B.hex' bmResult = h2o.n0.build_model(algo='glm', model_id=model_key, training_frame=parse_key, parameters=parameters, timeoutSecs=300) bm = OutputObj(bmResult, 'bm') modelResult = h2o.n0.models(key=model_key) model = OutputObj(modelResult['models'][0]['output'], 'model') h2o_glm.simpleCheckGLM(self, model, parameters, labelList, labelListUsed) cmmResult = h2o.n0.compute_model_metrics(model=model_key, frame=parse_key, timeoutSecs=60) cmm = OutputObj(cmmResult, 'cmm') mmResult = h2o.n0.model_metrics(model=model_key, frame=parse_key, timeoutSecs=60) mm = OutputObj(mmResult, 'mm') prResult = h2o.n0.predict(model=model_key, frame=parse_key, timeoutSecs=60) pr = OutputObj(prResult['model_metrics'][0]['predictions'], 'pr') h2o_cmd.runStoreView() labelListUsed = labelList h2o_glm.simpleCheckGLM(self, model, parameters, labelList, labelListUsed)
def test_DL_mnist(self): h2o.nodes[0].remove_all_keys() csvPathname_train = 'laptop/mnist/train.csv.gz' csvPathname_test = 'laptop/mnist/test.csv.gz' hex_key = 'mnist_train.hex' validation_key = 'mnist_test.hex' timeoutSecs = 60 parseResult = h2i.import_parse(bucket='bigdata', path=csvPathname_train, hex_key=hex_key, timeoutSecs=timeoutSecs, doSummary=False) pA = h2o_cmd.ParseObj(parseResult) iA = h2o_cmd.InspectObj(pA.parse_key) numCols = iA.numCols labelList = iA.labelList parseResultV = h2i.import_parse(bucket='bigdata', path=csvPathname_test, hex_key=validation_key, timeoutSecs=timeoutSecs, doSummary=False) response = numCols - 1 #Making random id identifier = ''.join( random.sample(string.ascii_lowercase + string.digits, 10)) model_key = 'deeplearning_' + identifier + '.hex' parameters = { 'validation_frame': validation_key, # KeyIndexed None 'ignored_columns': None, # string[] None 'response_column': labelList[response], # string None 'balance_classes': None, # boolean false 'max_after_balance_size': None, # float Infinity 'keep_cross_validation_splits': None, # boolean false 'checkpoint': None, # Key None 'overwrite_with_best_model': None, # boolean true 'expert_mode': None, # boolean false 'autoencoder': None, # boolean false 'use_all_factor_levels': None, # boolean true # [u'Tanh', u'TanhWithDropout', u'Rectifier', u'RectifierWithDropout', u'Maxout', u'MaxoutWithDropout'] 'activation': 'RectifierWithDropout', # enum Rectifier 'hidden': '[117,131,129]', # int[] [200, 200] 'epochs': 2.0, # double 10.0 'train_samples_per_iteration': None, # long -2 'target_ratio_comm_to_comp': None, # double 0.02 'seed': None, # long 1679194146842485659 'adaptive_rate': False, # boolean true 'rho': None, # double 0.99 'epsilon': None, # double 1.0E-8 'rate': None, # double 0.005 'rate_annealing': None, # double 1.0E-6 'rate_decay': None, # double 1.0 'momentum_start': 0.5, # double 0.0 'momentum_ramp': 100000, # double 1000000.0 'momentum_stable': 0.9, # double 0.0 
'nesterov_accelerated_gradient': None, # boolean true 'input_dropout_ratio': 0.2, # double 0.0 'hidden_dropout_ratios': None, # double[] None (this can grid?) 'l1': 1e-5, # double 0.0 'l2': 1e-7, # double 0.0 'max_w2': 15, # float Infinity 'initial_weight_distribution': None, # enum UniformAdaptive [u'UniformAdaptive', u'Uniform', u'Normal'] 'initial_weight_scale': None, # double 1.0 'loss': 'CrossEntropy', # enum MeanSquare [u'Automatic', u'MeanSquare', u'CrossEntropy'] 'score_interval': None, # double 5.0 'score_training_samples': None, # long 10000 'score_validation_samples': None, # long 0 'score_duty_cycle': None, # double 0.1 'classification_stop': None, # double 0.0 'regression_stop': None, # double 1.0E-6 'quiet_mode': None, # boolean false 'max_confusion_matrix_size': None, # int 20 'max_hit_ratio_k': None, # int 10 'balance_classes': None, # boolean false 'class_sampling_factors': None, # float[] None 'max_after_balance_size': None, # float Infinity 'score_validation_sampling': None, # enum Uniform [u'Uniform', u'Stratified'] 'diagnostics': None, # boolean true 'variable_importances': None, # boolean false 'fast_mode': None, # boolean true 'ignore_const_cols': None, # boolean true 'force_load_balance': None, # boolean true 'replicate_training_data': None, # boolean false 'single_node_mode': None, # boolean false 'shuffle_training_data': None, # boolean false 'missing_values_handling': None, # enum MeanImputation [u'Skip', u'MeanImputation'] 'sparse': None, # boolean false 'col_major': None, # boolean false 'average_activation': None, # double 0.0 'sparsity_beta': None, # double 0.0 } expectedErr = 0.057 ## expected validation error for the above model relTol = 0.20 ## 20% rel. error tolerance due to Hogwild! 
timeoutSecs = 60 start = time.time() bmResult = h2o.n0.build_model(algo='deeplearning', model_id=model_key, training_frame=hex_key, parameters=parameters, timeoutSecs=timeoutSecs) bm = OutputObj(bmResult, 'bm') print 'deep learning took', time.time() - start, 'seconds' modelResult = h2o.n0.models(key=model_key) model = OutputObj(modelResult['models'][0]['output'], 'model') # print "model:", dump_json(model) cmmResult = h2o.n0.compute_model_metrics(model=model_key, frame=validation_key, timeoutSecs=60) cmm = OutputObj(cmmResult, 'cmm') mmResult = h2o.n0.model_metrics(model=model_key, frame=validation_key, timeoutSecs=60) mm = OutputObj(mmResult['model_metrics'][0], 'mm') prResult = h2o.n0.predict(model=model_key, frame=validation_key, timeoutSecs=60) pr = OutputObj(prResult['model_metrics'][0]['predictions'], 'pr') h2o_cmd.runStoreView() actualErr = model['errors']['valid_err'] print "expected classification error: " + format(expectedErr) print "actual classification error: " + format(actualErr) if actualErr != expectedErr and abs( (expectedErr - actualErr) / expectedErr) > relTol: raise Exception( "Scored classification error of %s is not within %s %% relative error of %s" % (actualErr, float(relTol) * 100, expectedErr))
def test_GLM_basic_1(self): importFolderPath = "logreg" csvFilename = "benign.csv" hex_key = "benign.hex" csvPathname = importFolderPath + "/" + csvFilename parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=hex_key, check_header=1, timeoutSecs=180, doSummary=False) pA = h2o_cmd.ParseObj(parseResult) iA = h2o_cmd.InspectObj(pA.parse_key) parse_key = pA.parse_key numRows = iA.numRows numCols = iA.numCols labelList = iA.labelList expected = [] allowedDelta = 0 # loop, to see if we get same centers labelListUsed = list(labelList) labelListUsed.remove('STR') labelListUsed.remove('FNDX') # response removed also numColsUsed = numCols - 2 for trial in range(1): # family [u'gaussian', u'binomial', u'poisson', u'gamma', u'tweedie'] # link [u'family_default', u'identity', u'logit', u'log', u'inverse', u'tweedie'] # can we do classification with probabilities? # are only lambda and alpha grid searchable? # glm parameters: # model_id Key<Model> False None [] # training_frame Key<Frame> False None [] # validation_frame Key<Frame> False None [] # ignored_columns string[] False None [] # drop_na20_cols boolean False False [] # score_each_iteration boolean False False [] # response_column VecSpecifier False None [] # balance_classes boolean False False [] # class_sampling_factors float[] False None [] # max_after_balance_size float False 5.0 [] # max_confusion_matrix_size int False 20 [] # max_hit_ratio_k int False 10 [] # family enum False gaussian [u'gaussian', u'binomial', u'poisson', u'gamma'] # solver enum False IRLSM [u'AUTO', u'IRLSM', u'L_BFGS'] # alpha double[] False None [] # lambda double[] False None [] # lambda_search boolean False False [] # lambda_min_ratio double False -1.0 [] # nlambdas int False -1 [] # standardize boolean False True [] # max_iterations int False -1 [] # beta_epsilon double False 0.0001 [] # link enum False family_default [u'family_default', u'identity', u'logit', u'log', u'inverse', u'tweedie'] # prior double False -1.0 
[] # use_all_factor_levels boolean False False [] # beta_constraints Key<Frame> False None [] # max_active_predictors int False -1 [] parameters = { 'ignored_columns': '["STR"]', 'response_column': 'FNDX', # FIX! when is this needed? redundant for binomial? 'balance_classes': False, 'max_after_balance_size': None, 'standardize': False, 'family': 'binomial', 'link': None, 'alpha': '[1e-4]', 'lambda': '[0.5]', 'prior1': None, 'lambda_search': None, 'nlambdas': None, 'lambda_min_ratio': None, # 'use_all_factor_levels': False, } model_key = 'benign_glm.hex' bmResult = h2o.n0.build_model(algo='glm', model_id=model_key, training_frame=parse_key, parameters=parameters, timeoutSecs=10) bm = OutputObj(bmResult, 'bm') modelResult = h2o.n0.models(key=model_key) model = OutputObj(modelResult['models'][0]['output'], 'model') h2o_glm.simpleCheckGLM(self, model, parameters, labelList, labelListUsed) cmmResult = h2o.n0.compute_model_metrics(model=model_key, frame=parse_key, timeoutSecs=60) cmm = OutputObj(cmmResult, 'cmm') mcms = OutputObj({'data': cmm.max_criteria_and_metric_scores.data}, 'mcms') m1 = mcms.data[1:] h0 = mcms.data[0] print "\nmcms", tabulate(m1, headers=h0) thms = OutputObj(cmm.thresholds_and_metric_scores, 'thms') if 1 == 0: cmms = OutputObj({'cm': cmm.confusion_matrices}, 'cmms') print "" for i, c in enumerate(cmms.cm): print "\ncmms.cm[%s]" % i, tabulate(c) print "" mmResult = h2o.n0.model_metrics(model=model_key, frame=parse_key, timeoutSecs=60) mm = OutputObj(mmResult['model_metrics'][0], 'mm') prResult = h2o.n0.predict(model=model_key, frame=parse_key, timeoutSecs=60) pr = OutputObj(prResult['model_metrics'][0]['predictions'], 'pr')
def test_bayes_basic(self):
    """Build a naive Bayes model on shuffled covtype and call the endpoints.

    Parses the 90pct training file with column 54 forced to Enum, builds a
    naivebayes model for each parameter set (response column C55, validated
    on the training frame), then calls compute_model_metrics, model_metrics
    and predict on the parsed frame.
    """
    data_bucket = 'home-0xdiag-datasets'
    train_key = 'covtype.train.hex'
    # NOTE(review): the Key object is not used below; kept in case
    # constructing it matters - confirm
    b = Key(train_key)
    parse_timeout = 1800
    data_path = 'standard' + "/" + 'covtype.shuffled.90pct.data'

    # FIX! do I need to force enum for classification?
    # what if I do regression after this?
    column_types = {54: 'Enum'}
    parsed = h2i.import_parse(
        bucket=data_bucket,
        path=data_path,
        columnTypeDict=column_types,
        schema='local',
        chunk_size=4194304,
        hex_key=train_key,
        timeoutSecs=parse_timeout)

    # Alternative to the Enum column type (can't operate on enums like this):
    # remap the 1-7 response to 0/1 and set columnTypeDict to None above.
    #   Assign(b[:,54], b[:,54]-1)    # 1-7 -> 0-6; 0 isn't there
    #   Assign(b[:,54], b[:,54]!=0)   # 1 thru 6 -> 1; now just 0 and 1

    frame_key = h2o_cmd.ParseObj(parsed).parse_key
    h2o_cmd.InspectObj(frame_key)

    # parameter sets to run through; currently just the defaults
    param_sets = [
        {
            'response_column': 'C55',  # still 1-55 on colnames
        },
    ]

    bayes_key = 'covtype_bayes.hex'
    for params in param_sets:
        build_json = h2o.n0.build_model(
            algo='naivebayes',
            destination_key=bayes_key,
            training_frame=train_key,
            validation_frame=train_key,
            parameters=params,
            timeoutSecs=60)
        OutputObj(build_json, 'bm')

        models_json = h2o.n0.models(key=bayes_key)
        OutputObj(models_json['models'][0]['output'], 'model')

        metrics_json = h2o.n0.compute_model_metrics(
            model=bayes_key, frame=frame_key, timeoutSecs=60)
        OutputObj(metrics_json, 'cmm')

        mm_json = h2o.n0.model_metrics(
            model=bayes_key, frame=frame_key, timeoutSecs=60)
        first_metrics = mm_json['model_metrics'][0]
        del first_metrics['frame']  # too much!
        OutputObj(first_metrics, 'mm')

        predict_json = h2o.n0.predict(
            model=bayes_key, frame=frame_key, timeoutSecs=60)
        OutputObj(predict_json['model_metrics'][0]['predictions'], 'pr')
def test_GLM_error1(self): importFolderPath = "covtype" csvFilename = "covtype.20k.data" hex_key = "covtype20k.hex" binomial_key = "covtype20k.b.hex" b = Key(hex_key) csvPathname = importFolderPath + "/" + csvFilename parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=hex_key, check_header=1, timeoutSecs=180, doSummary=False) ## columnTypeDict = {54: 'Enum'} columnTypeDict = None parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=binomial_key, columnTypeDict=columnTypeDict, check_header=1, timeoutSecs=180, doSummary=False) # don't have to make it enum, if 0/1 (can't operate on enums like this) # make 1-7 go to 0-6. 0 isn't there. Assign(b[:, 54], b[:, 54] - 1) # make 1 thru 6 go to 1 Assign(b[:, 54], b[:, 54] != 0) # now we have just 0 and 1 pA = h2o_cmd.ParseObj(parseResult) iA = h2o_cmd.InspectObj(pA.parse_key) parse_key = pA.parse_key numRows = iA.numRows numCols = iA.numCols labelList = iA.labelList expected = [] allowedDelta = 0 # loop, to see if we get same centers labelListUsed = list(labelList) numColsUsed = numCols for trial in range(5): parameters = { 'response_column': 'C55', 'max_iterations': 3, 'solver': 'L_BFGS', 'ignored_columns': '["C1"]', 'alpha': '[0.1]', 'max_after_balance_size': 1000.0, 'class_sampling_factors': '[0.2]', # 'use_all_factor_levels': None, 'lambda': '[0]', } bHack = hex_key co = h2o_cmd.runSummary(key=binomial_key, column=54) print "binomial_key summary:", co.label, co.type, co.missing_count, co.domain, sum( co.histogram_bins) co = h2o_cmd.runSummary(key=hex_key, column=54) print "hex_key summary:", co.label, co.type, co.missing_count, co.domain, sum( co.histogram_bins) model_key = 'rand_glm.hex' bmResult = h2o.n0.build_model(algo='glm', model_id=model_key, training_frame=bHack, parameters=parameters, timeoutSecs=10) bm = OutputObj(bmResult, 'bm') modelResult = h2o.n0.models(key=model_key) model = OutputObj(modelResult['models'][0]['output'], 'model') h2o_glm.simpleCheckGLM(self, 
model, parameters, labelList, labelListUsed, allowNaN=True) cmmResult = h2o.n0.compute_model_metrics(model=model_key, frame=parse_key, timeoutSecs=60) cmm = OutputObj(cmmResult, 'cmm') # FIX! when is this legal doClassification = False if doClassification: mcms = OutputObj( {'data': cmm.max_criteria_and_metric_scores.data}, 'mcms') m1 = mcms.data[1:] h0 = mcms.data[0] print "\nmcms", tabulate(m1, headers=h0) if doClassification: thms = OutputObj(cmm.thresholds_and_metric_scores, 'thms') cmms = OutputObj({'cm': cmm.confusion_matrices}, 'cmms') if 1 == 0: print "" for i, c in enumerate(cmms.cm): print "\ncmms.cm[%s]" % i, tabulate(c) print "" mmResult = h2o.n0.model_metrics(model=model_key, frame=parse_key, timeoutSecs=60) mm = OutputObj(mmResult['model_metrics'][0], 'mm') prResult = h2o.n0.predict(model=model_key, frame=parse_key, timeoutSecs=60) pr = OutputObj(prResult['model_metrics'][0]['predictions'], 'pr')
def test_kmeans_prostate(self):
    """Run k=3 KMeans on prostate.csv five times and compare to known centers.

    Each trial builds a model with a fixed seed and Furthest init, compares
    the sorted cluster tuples to the expected ones with
    h2o_kmeans.compareResultsToExpected, and calls the metrics and predict
    endpoints on the parsed frame.
    """
    data_path = "logreg" + "/" + "prostate.csv"
    parsed = h2i.import_parse(
        bucket='smalldata',
        path=data_path,
        hex_key="prostate.hex",
        check_header=1,
        timeoutSecs=180,
        doSummary=False)

    frame_key = h2o_cmd.ParseObj(parsed).parse_key
    inspected = h2o_cmd.InspectObj(frame_key)
    row_count = inspected.numRows

    # the ID column is ignored by the model
    used_labels = list(inspected.labelList)
    used_labels.remove('ID')
    used_col_count = inspected.numCols - 1

    # reference clusters the trials are compared against
    expected = [
        (None, [0.37, 65.77, 1.07, 2.23, 1.11, 10.49, 4.24, 6.31], 215, 36955),
        (None, [0.36, 66.44, 1.09, 2.21, 1.06, 10.84, 34.16, 6.31], 136, 46045),
        (None, [0.83, 66.17, 1.21, 2.86, 1.34, 73.30, 15.57, 7.31], 29, 33412),
    ]
    # all are multipliers of expected tuple value
    allowedDelta = (0.02, 0.02, 0.02)

    # Fixed seed: this one gets the same result as scikit (at least in h2o1).
    # A different seed can actually get a slightly better error sum.
    # other seeds tried: random.randint(0, sys.maxint), 6655548259421773879
    kmeans_seed = 7037878434240420762

    for _ in range(5):
        km_params = {
            'validation_frame': frame_key,
            'ignored_columns': "['ID']",
            'k': 3,
            'max_iterations': 500,
            'standardize': False,
            'seed': kmeans_seed,
            # PlusPlus init seems bad here..should investigate
            'init': 'Furthest',
        }

        km_key = 'prostate_k.hex'
        build_json = h2o.n0.build_model(
            algo='kmeans',
            destination_key=km_key,
            training_frame=frame_key,
            parameters=km_params,
            timeoutSecs=10)
        OutputObj(build_json, 'bm')

        models_json = h2o.n0.models(key=km_key)
        km = h2o_kmeans.KMeansObj(models_json, km_params, row_count,
                                  used_col_count, used_labels)
        h2o_kmeans.compareResultsToExpected(km.tuplesSorted, expected,
                                            allowedDelta)

        metrics_json = h2o.n0.compute_model_metrics(
            model=km_key, frame=frame_key, timeoutSecs=60)
        OutputObj(metrics_json, 'cmm')

        mm_json = h2o.n0.model_metrics(
            model=km_key, frame=frame_key, timeoutSecs=60)
        OutputObj(mm_json['model_metrics'][0], 'mm')

        predict_json = h2o.n0.predict(
            model=km_key, frame=frame_key, timeoutSecs=60)
        OutputObj(predict_json['model_metrics'][0]['predictions'], 'pr')

        h2o_cmd.runStoreView()
def test_quant_cmp_uniform(self): SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ (5 * ROWS, 1, 'x.hex', 1, 20000, ['C1', 1.10, 5000.0, 10000.0, 15000.0, 20000.00]), (5 * ROWS, 1, 'x.hex', -5000, 0, ['C1', -5001.00, -3750.0, -2445, -1200.0, 99]), (1 * ROWS, 1, 'x.hex', -100000, 100000, ['C1', -100001.0, -50000.0, 1613.0, 50000.0, 100000.0]), (1 * ROWS, 1, 'x.hex', -1, 1, ['C1', -1.05, -0.48, 0.0087, 0.50, 1.00]), (1 * ROWS, 1, 'A.hex', 1, 100, ['C1', 1.05, 26.00, 51.00, 76.00, 100.0]), (1 * ROWS, 1, 'A.hex', -99, 99, ['C1', -99, -50.0, 0, 50.00, 99]), (1 * ROWS, 1, 'B.hex', 1, 10000, ['C1', 1.05, 2501.00, 5001.00, 7501.00, 10000.00]), (1 * ROWS, 1, 'B.hex', -100, 100, ['C1', -100.10, -50.0, 0.85, 51.7, 100, 00]), (1 * ROWS, 1, 'C.hex', 1, 100000, ['C1', 1.05, 25002.00, 50002.00, 75002.00, 100000.00]), (1 * ROWS, 1, 'C.hex', -101, 101, ['C1', -100.10, -50.45, -1.18, 49.28, 100.00]), ] timeoutSecs = 10 trial = 1 n = h2o.nodes[0] lenNodes = len(h2o.nodes) x = 0 timeoutSecs = 60 for (rowCount, colCount, hex_key, expectedMin, expectedMax, expected) in tryList: # max error = half the bin size? colname = expected[0] maxDelta = ((expectedMax - expectedMin) / 1000.0) / 2.0 # add 5% for fp errors? 
maxDelta = 1.05 * maxDelta SEEDPERFILE = random.randint(0, sys.maxint) x += 1 csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str( colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE) # need the full pathname when python parses the csv for numpy/sort csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True) #*************************** # Parse parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30, doSummary=False) pA = h2o_cmd.ParseObj(parseResult, expectedNumRows=rowCount, expectedNumCols=colCount) numRows = pA.numRows numCols = pA.numCols parse_key = pA.parse_key # this guy can take json object as first thing, or re-read with key iA = h2o_cmd.InspectObj(parse_key, expectedNumRows=rowCount, expectedNumCols=colCount, expectedMissinglist=[]) #*************************** # Summary co = h2o_cmd.runSummary(key=parse_key) default_pctiles = co.default_pctiles coList = [ co.base, len(co.bins), len(co.data), co.domain, co.label, co.maxs, co.mean, co.mins, co.missing, co.ninfs, co.pctiles, co.pinfs, co.precision, co.sigma, co.str_data, co.stride, co.type, co.zeros ] for c in coList: print c print "len(co.bins):", len(co.bins) print "co.label:", co.label, "mean (2 places):", h2o_util.twoDecimals( co.mean) print "co.label:", co.label, "std dev. (2 places):", h2o_util.twoDecimals( co.sigma) print "FIX! hacking the co.pctiles because it's short by two" summ_pctiles = [0] + co.pctiles + [0] pt = h2o_util.twoDecimals(summ_pctiles) mx = h2o_util.twoDecimals(co.maxs) mn = h2o_util.twoDecimals(co.mins) exp = h2o_util.twoDecimals(expected[1:]) print "co.label:", co.label, "co.pctiles (2 places):", pt print "default_pctiles:", default_pctiles print "co.label:", co.label, "co.maxs: (2 places):", mx print "co.label:", co.label, "co.mins: (2 places):", mn # FIX! 
we should do an exec and compare using the exec quantile too h2p.green_print("min/25/50/75/max co.label:", co.label, "(2 places):",\ mn[0], pt[3], pt[5], pt[7], mx[0]) h2p.green_print("min/25/50/75/max co.label:", co.label, "(2 places):",\ exp[0], exp[1], exp[2], exp[3], exp[4]) #*************************** # Quantile # the thresholds h2o used, should match what we expected # using + here seems to result in an odd tuple..doesn't look right to h2o param # so went with this. Could add '[' and ']' to the list first, before the join. probsStr = "[%s]" % ",".join(map(str, probsList)) parameters = { 'model_id': "a.hex", 'training_frame': parse_key, 'validation_frame': parse_key, 'ignored_columns': None, 'probs': probsStr, } model_key = 'qhex' bmResult = h2o.n0.build_model(algo='quantile', model_id=model_key, training_frame=parse_key, parameters=parameters, timeoutSecs=10) bm = OutputObj(bmResult, 'bm') msec = bm.jobs[0]['msec'] print "bm msec", msec # quantile result is just a job result to a key modelResult = h2o.n0.models(key=model_key) model = OutputObj(modelResult['models'][0], 'model') print "model.output:", model.output print "model.output:['quantiles']", model.output['quantiles'] print "model.output:['iterations']", model.output['iterations'] print "model.output:['names']", model.output['names'] quantiles = model.output['quantiles'][ 0] # why is this a double array iterations = model.output['iterations'] assert iterations == 11, iterations print "quantiles: ", quantiles print "iterations: ", iterations # cmmResult = h2o.n0.compute_model_metrics(model=model_key, frame=parse_key, timeoutSecs=60) # cmm = OutputObj(cmmResult, 'cmm') # mmResult = h2o.n0.model_metrics(model=model_key, frame=parse_key, timeoutSecs=60) # mm = OutputObj(mmResult, 'mm') # prResult = h2o.n0.predict(model=model_key, frame=parse_key, timeoutSecs=60) # pr = OutputObj(prResult['model_metrics'][0]['predictions'], 'pr') h2o_cmd.runStoreView() trial += 1 # compare the last threshold if colname != 
'': # don't do for enums # also get the median with a sort (h2o_summ.percentileOnSortedlist() h2o_summ.quantile_comparisons( csvPathnameFull, col=0, # what col to extract from the csv datatype='float', quantile=CHECK_PCTILE, # h2oSummary2=pctile[-1], # h2oQuantilesApprox=result, # from exec h2oExecQuantiles=quantiles[CHECK_PCTILE_INDEX], ) h2o.nodes[0].remove_all_keys()
def test_GLM_basic_2(self):
    """Fit one binomial GLM to prostate.csv and exercise the scoring calls.

    Parses smalldata/logreg/prostate.csv, builds a GLM with CAPSULE as the
    response and ID ignored, runs simpleCheckGLM on the model output, then
    calls compute_model_metrics, model_metrics and predict on the same frame.
    """
    importFolderPath = "logreg"
    csvFilename = "prostate.csv"
    hex_key = "prostate.hex"
    csvPathname = "%s/%s" % (importFolderPath, csvFilename)

    parseResult = h2i.import_parse(
        bucket='smalldata', path=csvPathname, hex_key=hex_key,
        checkHeader=1, timeoutSecs=180, doSummary=False)
    pA = h2o_cmd.ParseObj(parseResult)
    iA = h2o_cmd.InspectObj(pA.parse_key)
    parse_key = pA.parse_key
    numRows, numCols, labelList = iA.numRows, iA.numCols, iA.labelList

    expected = []
    allowedDelta = 0

    # Predictors are every column except the row id and the response.
    labelListUsed = list(labelList)
    for dropped in ('ID', 'CAPSULE'):
        labelListUsed.remove(dropped)
    numColsUsed = numCols - 2

    for trial in range(1):
        # family [u'gaussian', u'binomial', u'poisson', u'gamma', u'tweedie']
        # link [u'family_default', u'identity', u'logit', u'log', u'inverse', u'tweedie']
        # Open questions: classification with probabilities? are only
        # lambda and alpha grid searchable?
        parameters = {
            'validation_frame': parse_key,
            'ignored_columns': '[ID]',
            'score_each_iteration': True,
            'response_column': 'CAPSULE',
            # FIX! when is this needed? redundant for binomial?
            'do_classification': True,
            'balance_classes': False,
            'max_after_balance_size': None,
            'standardize': False,
            'family': 'binomial',
            'link': None,
            'tweedie_variance_power': None,
            'tweedie_link_power': None,
            'alpha': '[1e-4]',
            'lambda': '[0.5]',
            'prior1': None,
            'lambda_search': None,
            'nlambdas': None,
            'lambda_min_ratio': None,
            'higher_accuracy': True,
            'use_all_factor_levels': False,
            # NPE with n_folds 2?
            'n_folds': 1,
        }

        model_key = 'prostate_glm.hex'
        bmResult = h2o.n0.build_model(
            algo='glm', destination_key=model_key, training_frame=parse_key,
            parameters=parameters, timeoutSecs=10)
        bm = OutputObj(bmResult, 'bm')

        modelResult = h2o.n0.models(key=model_key)
        model = OutputObj(modelResult['models'][0]['output'], 'model')
        h2o_glm.simpleCheckGLM(self, model, parameters, labelList, labelListUsed)

        # The three scoring endpoints take the same model/frame/timeout.
        scoreArgs = dict(model=model_key, frame=parse_key, timeoutSecs=60)
        cmm = OutputObj(h2o.n0.compute_model_metrics(**scoreArgs), 'cmm')
        mm = OutputObj(h2o.n0.model_metrics(**scoreArgs), 'mm')
        prResult = h2o.n0.predict(**scoreArgs)
        pr = OutputObj(prResult['model_metrics'][0]['predictions'], 'pr')

        h2o_cmd.runStoreView()
def test_PCA_many_cols(self): SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ (10000, 10, 'cA', 300), (10000, 50, 'cB', 300), (10000, 100, 'cC', 300), # (10000, 500, 'cH', 300), # (10000, 1000, 'cI', 300), ] for (rowCount, colCount, hex_key, timeoutSecs) in tryList: print(rowCount, colCount, hex_key, timeoutSecs) SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str( colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE) # PARSE **************************************** modelKey = 'PCAModelKey' scoreKey = 'PCAScoreKey' # Parse **************************************** parseResult = h2i.import_parse(bucket=None, path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=timeoutSecs, doSummary=False) pA = h2o_cmd.ParseObj(parseResult) iA = h2o_cmd.InspectObj(pA.parse_key) parse_key = pA.parse_key numRows = iA.numRows numCols = iA.numCols labelList = iA.labelList print "\n" + csvPathname, \ " numRows:", "{:,}".format(numRows), \ " numCols:", "{:,}".format(numCols) # PCA(tolerance iterate)**************************************** for tolerance in [i / 10.0 for i in range(11)]: parameters = { # 'tolerance': tolerance, # 'standardize': 1, 'k': 1, } model_key = 'pca.hex' bmResult = h2o.n0.build_model(algo='pca', model_id=model_key, training_frame=parse_key, parameters=parameters, timeoutSecs=10) bm = OutputObj(bmResult, 'bm') modelResult = h2o.n0.models(key=model_key) model = OutputObj(modelResult['models'][0]['output'], 'model') cmmResult = h2o.n0.compute_model_metrics(model=model_key, frame=parse_key, timeoutSecs=60) cmm = OutputObj(cmmResult, 'cmm') mmResult = h2o.n0.model_metrics(model=model_key, frame=parse_key, timeoutSecs=60) mm = OutputObj(mmResult['model_metrics'][0], 'mm') prResult = h2o.n0.predict(model=model_key, frame=parse_key, timeoutSecs=60) pr = 
OutputObj(prResult['model_metrics'][0]['predictions'], 'pr') h2o_cmd.runStoreView()
def test_GBM_basic(self): bucket = 'home-0xdiag-datasets' importFolderPath = 'standard' trainFilename = 'covtype.shuffled.90pct.data' train_key = 'covtype.train.hex' model_key = 'GBMModelKey' timeoutSecs = 1800 csvPathname = importFolderPath + "/" + trainFilename # FIX! do I need to force enum for classification? what if I do regression after this? columnTypeDict = {54: 'Enum'} parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, columnTypeDict=columnTypeDict, schema='local', chunk_size=4194304, hex_key=train_key, timeoutSecs=timeoutSecs) pA = h2o_cmd.ParseObj(parseResult) iA = h2o_cmd.InspectObj(pA.parse_key) parse_key = pA.parse_key numRows = iA.numRows numCols = iA.numCols labelList = iA.labelList labelListUsed = list(labelList) numColsUsed = numCols # run through a couple of parameter sets parameters = [] parameters.append({ 'response_column': 'C55', 'ntrees': 2, 'max_depth': 10, 'min_rows': 3, 'nbins': 40, 'learn_rate': 0.2, 'loss': 'multinomial', # FIX! doesn't like it? # 'loss': 'Bernoulli', # FIX..no variable importance for GBM yet? # 'variable_importance': False, # 'seed': }) parameters.append({ 'response_column': 'C55', 'loss': 'multinomial', # This does nothing! 
intent is solely based on type of response col 'ntrees': 1, 'max_depth': 20, 'min_rows': 3, 'nbins': 40, 'learn_rate': 0.2, }) model_key = 'covtype_gbm.hex' for p in parameters: bmResult = h2o.n0.build_model( algo='gbm', model_id=model_key, training_frame=train_key, validation_frame=train_key, parameters=p, timeoutSecs=60) bm = OutputObj(bmResult, 'bm') modelResult = h2o.n0.models(key=model_key) model = OutputObj(modelResult['models'][0]['output'], 'model') cmmResult = h2o.n0.compute_model_metrics(model=model_key, frame=parse_key, timeoutSecs=60) cmm = OutputObj(cmmResult, 'cmm') print "\nLook!, can use dot notation: cmm.cm.confusion_matrix", cmm.cm.confusion_matrix, "\n" vis = OutputObj(model.variable_importances, 'vis') # just the first 10 visDataChopped = [v[0:9] for v in vis.data] names = visDataChopped[0] relativeImportance = visDataChopped[1] print "names:", names print "relativeImportance:", relativeImportance scaledImportance = visDataChopped[2] percentage = visDataChopped[3] print "\nvis\n", tabulate(visDataChopped[1:], headers=names) # print "\nrelativeImportance (10)\n", tabulate(relativeImportance, headers=names) # print "\nscaledImportance (10)\n", tabulate(scaledImportance, headers=names) # print "\npercentage (10)\n", tabulate(percentage, headers=names) print "will say Regression or Classification. no Multinomial?" print "model.model_category", model.model_category assert model.model_category=='Multinomial', model.model_category print "FIX! why is mse 0 and mes_train Nan?" print "model.mse:", model.mse print "model.mse_train:", model.mse_train if 1==1: print "" for i,c in enumerate(cmm.cm): print "\ncmms.cm[%s]" % i, tabulate(c) print "" mmResult = h2o.n0.model_metrics(model=model_key, frame=parse_key, timeoutSecs=60) mmResultShort = mmResult['model_metrics'][0] del mmResultShort['frame'] # too much! 
mm = OutputObj(mmResultShort, 'mm') prResult = h2o.n0.predict(model=model_key, frame=parse_key, timeoutSecs=60) pr = OutputObj(prResult['model_metrics'][0]['predictions'], 'pr')
def test_GBM_airlines(self):
    """Build a small GBM on an airlines sample read from HDFS and score it.

    Each entry of the file table is parsed with IsDepDelayed forced to Enum,
    a two-tree GBM is built with that column as the response, and the
    compute_model_metrics, model_metrics and predict endpoints are called on
    the parsed frame.
    """
    files = [
        ('datasets', 'airlines_all.05p.csv', 'airlines_all.05p.hex', 1800, 'IsDepDelayed'),
        # ('datasets', 'airlines_all.csv', 'airlines_all.hex', 1800, 'IsDepDelayed')
    ]

    for importFolderPath, csvFilename, trainKey, timeoutSecs, response in files:
        # PARSE train ****************************************
        csvPathname = "%s/%s" % (importFolderPath, csvFilename)
        model_key = 'GBMModelKey'

        # IsDepDelayed might already be enum, but just to be sure
        parseResult = h2i.import_parse(
            path=csvPathname, schema='hdfs', hex_key=trainKey,
            columnTypeDict={'IsDepDelayed': 'Enum'},
            timeoutSecs=timeoutSecs)
        pA = h2o_cmd.ParseObj(parseResult)
        iA = h2o_cmd.InspectObj(pA.parse_key)
        parse_key = pA.parse_key
        numRows, numCols, labelList = iA.numRows, iA.numCols, iA.labelList
        labelListUsed = list(labelList)
        numColsUsed = numCols

        parameters = {
            'validation_frame': trainKey,
            # 'ignored_columns': '[CRSDepTime,CRSArrTime,ActualElapsedTime,CRSElapsedTime,AirTime,ArrDelay,DepDelay,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,IsArrDelayed]',
            'response_column': response,
            # 'balance_classes':
            # 'max_after_balance_size':
            'ntrees': 2,
            'max_depth': 10,
            'min_rows': 3,
            'nbins': 40,
            'learn_rate': 0.2,
            # 'loss': 'multinomial',
            # FIX! doesn't like it?
            # 'loss': 'Bernoulli',
            # FIX..no variable importance for GBM yet?
            # 'variable_importance': False,
            # 'seed':
        }

        bmResult = h2o.n0.build_model(
            algo='gbm', model_id=model_key, training_frame=parse_key,
            parameters=parameters, timeoutSecs=360)
        bm = OutputObj(bmResult, 'bm')

        modelResult = h2o.n0.models(key=model_key)
        model = OutputObj(modelResult['models'][0]['output'], 'model')

        # Same model/frame/timeout for all three scoring endpoints.
        scoreArgs = dict(model=model_key, frame=parse_key, timeoutSecs=60)
        cmm = OutputObj(h2o.n0.compute_model_metrics(**scoreArgs), 'cmm')
        # print "\nLook!, can use dot notation: cmm.cm.confusion_matrix", cmm.cm.confusion_matrix, "\n"

        mmResultShort = h2o.n0.model_metrics(**scoreArgs)['model_metrics'][0]
        del mmResultShort['frame']  # too much!
        mm = OutputObj(mmResultShort, 'mm')

        prResult = h2o.n0.predict(**scoreArgs)
        pr = OutputObj(prResult['model_metrics'][0]['predictions'], 'pr')
def test_GLM_many_cols_4(self): SYNDATASETS_DIR = h2o.make_syn_dir() translateList = [ 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u' ] tryList = [ (100000, 10, 'cA', 600), (100000, 100, 'cA', 600), ] ### h2b.browseTheCloud() lenNodes = len(h2o.nodes) for (rowCount, colCount, hex_key, timeoutSecs) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str( rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, translateList) parseResult = h2i.import_parse(path=csvPathname, hex_key=hex_key, timeoutSecs=180, doSummary=False) pA = h2o_cmd.ParseObj(parseResult) iA = h2o_cmd.InspectObj(pA.parse_key) parse_key = pA.parse_key numRows = iA.numRows numCols = iA.numCols labelList = iA.labelList expected = [] allowedDelta = 0 labelListUsed = list(labelList) print "labelListUsed", labelListUsed response = labelListUsed[-1] labelListUsed.remove(response) numColsUsed = numCols - 1 for trial in range(1): # family [u'gaussian', u'binomial', u'poisson', u'gamma', u'tweedie'] # link [u'family_default', u'identity', u'logit', u'log', u'inverse', u'tweedie'] # can we do classification with probabilities? # are only lambda and alpha grid searchable? parameters = { 'validation_frame': parse_key, 'ignored_columns': None, # FIX! for now just use a column that's binomial 'response_column': response, # can't take index now? # FIX! when is this needed? redundant for binomial? 
'balance_classes': False, 'max_after_balance_size': None, 'standardize': False, 'family': 'binomial', 'link': None, 'tweedie_variance_power': None, 'tweedie_link_power': None, 'alpha': '[1e-4]', 'lambda': '[0.5,0.25, 0.1]', 'prior1': None, 'lambda_search': None, 'nlambdas': None, 'lambda_min_ratio': None, 'use_all_factor_levels': False, 'n_folds': 1, } model_key = 'many_cols_glm.hex' bmResult = h2o.n0.build_model(algo='glm', destination_key=model_key, training_frame=parse_key, parameters=parameters, timeoutSecs=60) bm = OutputObj(bmResult, 'bm') modelResult = h2o.n0.models(key=model_key) model = OutputObj(modelResult['models'][0]['output'], 'model') h2o_glm.simpleCheckGLM(self, model, parameters, labelList, labelListUsed) cmmResult = h2o.n0.compute_model_metrics(model=model_key, frame=parse_key, timeoutSecs=60) cmm = OutputObj(cmmResult, 'cmm') mmResult = h2o.n0.model_metrics(model=model_key, frame=parse_key, timeoutSecs=60) mm = OutputObj(mmResult, 'mm') prResult = h2o.n0.predict(model=model_key, frame=parse_key, timeoutSecs=60) pr = OutputObj(prResult['model_metrics'][0]['predictions'], 'pr')
def notest_kmeans_benign(self):
    """Run k=4 KMeans on benign.csv five times and compare to known clusters.

    Each trial uses a fresh random seed with 'Furthest' initialization; the
    sorted cluster tuples from KMeansObj are compared to the expected
    centers and row counts, then the scoring and predict endpoints are
    called on the training frame.
    """
    importFolderPath = "logreg"
    csvFilename = "benign.csv"
    hex_key = "benign.hex"
    csvPathname = "%s/%s" % (importFolderPath, csvFilename)

    parseResult = h2i.import_parse(
        bucket='smalldata', path=csvPathname, hex_key=hex_key,
        check_header=1, timeoutSecs=180, doSummary=False)
    pA = h2o_cmd.ParseObj(parseResult)
    iA = h2o_cmd.InspectObj(pA.parse_key)
    parse_key = pA.parse_key
    numRows, numCols, labelList = iA.numRows, iA.numCols, iA.labelList

    expected = [
        (None, [8.86, 2.43, 35.53, 0.31, 13.22, 1.47, 1.33, 20.06, 13.08, 0.53, 2.12, 128.61, 35.33, 1.57], 49, None),
        (None, [33.47, 2.29, 50.92, 0.34, 12.82, 1.33, 1.36, 21.43, 13.30, 0.37, 2.52, 125.40, 43.91, 1.79], 87, None),
        (None, [27.64, 2.87, 48.11, 0.09, 11.80, 0.98, 1.51, 21.02, 12.53, 0.58, 2.89, 171.27, 42.73, 1.53], 55, None),
        (None, [26.00, 2.67, 46.67, 0.00, 13.00, 1.33, 1.67, 21.56, 11.44, 0.22, 2.89, 234.56, 39.22, 1.56], 9, None),
    ]
    # all are multipliers of expected tuple value
    allowedDelta = (0.01, 0.01, 0.01, 0.01)

    # no cols ignored
    labelListUsed = list(labelList)
    numColsUsed = numCols

    # loop, to see if we get same centers
    for trial in range(5):
        kmeansSeed = random.randint(0, sys.maxint)
        # kmeansSeed = 6655548259421773879
        parameters = {
            'validation_frame': parse_key,
            'ignored_columns': None,
            'k': 4,
            'max_iterations': 50,
            'standardize': False,
            'seed': kmeansSeed,
            'init': 'Furthest',
        }

        model_key = 'benign_k.hex'
        kmeansResult = h2o.n0.build_model(
            algo='kmeans', destination_key=model_key,
            training_frame=parse_key, parameters=parameters,
            timeoutSecs=10)
        modelResult = h2o.n0.models(key=model_key)
        km = h2o_kmeans.KMeansObj(modelResult, parameters, numRows,
                                  numColsUsed, labelListUsed)

        # zip with * is its own inverse here. It's sorted by centers for
        # easy comparisons.
        # old order: ids, mses, rows, centers = zip(*km.tuplesSorted)
        # new order: ids, centers, rows, errors = zip(*km.tuplesSorted)
        # create a tuple for each cluster, then sort by row
        # old: this was going to do a predict and a summary (histogram)
        # (old h2o1 needed this for more info)
        # (centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeansResult, csvPathname, parseResult, 'd', parameters)
        h2o_kmeans.compareResultsToExpected(km.tuplesSorted, expected,
                                            allowedDelta)

        # Not seeing any scoring results yet?
        scoreArgs = dict(model=model_key, frame=parse_key, timeoutSecs=60)
        cmm = OutputObj(h2o.n0.compute_model_metrics(**scoreArgs), 'cmm')
        mmResult = h2o.n0.model_metrics(**scoreArgs)
        mm = OutputObj(mmResult['model_metrics'][0], 'mm')
        prResult = h2o.n0.predict(**scoreArgs)
        pr = OutputObj(prResult['model_metrics'][0]['predictions'], 'pr')

        h2o_cmd.runStoreView()
def glm_doit(self, csvFilename, bucket, csvPathname, timeoutSecs=30): print "\nStarting GLM of", csvFilename # we can force a col type to enum now? with param columnTypes # "Numeric" # make the last column enum # Instead of string for parse, make this a dictionary, with column index, value # that's used for updating the ColumnTypes array before making it a string for parse columnTypeDict = {10: 'Enum'} parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, columnTypeDict=columnTypeDict, hex_key=csvFilename + ".hex", schema='put', timeoutSecs=30) pA = h2o_cmd.ParseObj(parseResult) iA = h2o_cmd.InspectObj(pA.parse_key) parse_key = pA.parse_key numRows = iA.numRows numCols = iA.numCols labelList = iA.labelList for i in range(10): print "Summary on column", i # FIX! how come only 0 works here for column co = h2o_cmd.runSummary(key=parse_key, column=i) for k,v in co: print k, v expected = [] allowedDelta = 0 labelListUsed = list(labelList) labelListUsed.remove('C11') numColsUsed = numCols - 1 parameters = { 'validation_frame': parse_key, 'ignored_columns': None, # FIX! for now just use a column that's binomial 'response_column': 'C11', # FIX! when is this needed? redundant for binomial? 
'balance_classes': False, 'max_after_balance_size': None, 'standardize': False, 'family': 'binomial', 'link': None, 'tweedie_variance_power': None, 'tweedie_link_power': None, 'alpha': '[1e-4]', 'lambda': '[0.5,0.25, 0.1]', 'prior1': None, 'lambda_search': None, 'nlambdas': None, 'lambda_min_ratio': None, 'use_all_factor_levels': False, 'n_folds': 1, } start = time.time() model_key = 'hastie_glm.hex' bmResult = h2o.n0.build_model( algo='glm', destination_key=model_key, training_frame=parse_key, parameters=parameters, timeoutSecs=60) bm = OutputObj(bmResult, 'bm') modelResult = h2o.n0.models(key=model_key) model = OutputObj(modelResult['models'][0]['output'], 'model') h2o_glm.simpleCheckGLM(self, model, parameters, labelList, labelListUsed) cmmResult = h2o.n0.compute_model_metrics(model=model_key, frame=parse_key, timeoutSecs=60) cmm = OutputObj(cmmResult, 'cmm') mmResult = h2o.n0.model_metrics(model=model_key, frame=parse_key, timeoutSecs=60) mm = OutputObj(mmResult, 'mm') prResult = h2o.n0.predict(model=model_key, frame=parse_key, timeoutSecs=60) pr = OutputObj(prResult['model_metrics'][0]['predictions'], 'pr') # compare this glm to the first one. since the files are replications, the results # should be similar? if self.validation1: h2o_glm.compareToFirstGlm(self, 'AUC', validation, self.validation1) else: # self.validation1 = copy.deepcopy(validation) self.validation1 = None
def test_GLM_airlines(self): files = [ # ('airlines', 'airlines_all.csv', 'airlines_all.hex', 1800, 'IsDepDelayed') ('airlines', 'year2013.csv', 'airlines_all.hex', 1800, 'IsDepDelayed') ] for importFolderPath, csvFilename, trainKey, timeoutSecs, response in files: # PARSE train**************************************** csvPathname = importFolderPath + "/" + csvFilename model_key = 'GLMModelKey' parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='local', hex_key=trainKey, timeoutSecs=timeoutSecs) pA = h2o_cmd.ParseObj(parseResult) iA = h2o_cmd.InspectObj(pA.parse_key) parse_key = pA.parse_key numRows = iA.numRows numCols = iA.numCols labelList = iA.labelList labelListUsed = list(labelList) numColsUsed = numCols parameters = { 'validation_frame': parse_key, 'ignored_columns': '[CRSDepTime,CRSArrTime,ActualElapsedTime,CRSElapsedTime,AirTime,ArrDelay,DepDelay,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,IsArrDelayed]', 'response_column': response, # FIX! when is this needed? redundant for binomial? 'balance_classes': False, 'max_after_balance_size': None, 'standardize': False, 'family': 'binomial', 'link': None, 'tweedie_variance_power': None, 'tweedie_link_power': None, 'alpha': '[0]', 'lambda': '[0.5]', 'prior1': None, 'lambda_search': None, 'nlambdas': None, 'lambda_min_ratio': None, 'use_all_factor_levels': False, # NPE with n_folds 2? 'n_folds': 1, } model_key = 'airlines_glm.hex' bmResult = h2o.n0.build_model(algo='glm', destination_key=model_key, training_frame=parse_key, parameters=parameters, timeoutSecs=300) bm = OutputObj(bmResult, 'bm') modelResult = h2o.n0.models(key=model_key) model = OutputObj(modelResult['models'][0]['output'], 'model') t0 = len(model.coefficients_table.data[0]) t1 = len(model.coefficients_table.data[1]) # not sure what the exact number should be, but it's gotta be less than the cols in the dataset? # Whoa! 
forgot GLM expands enums to individiual coefficients. Would really need to look at all the domains and sum plus other cols? # assert t0 <= numColsUsed, "%s %s" % (t0, numColsUsed) # assert t1 <= numColsUsed, "%s %s" % (t1, numColsUsed) h2o_glm.simpleCheckGLM(self, model, parameters, labelList, labelListUsed) cmmResult = h2o.n0.compute_model_metrics(model=model_key, frame=parse_key, timeoutSecs=60) cmm = OutputObj(cmmResult, 'cmm') mcms = OutputObj({'data': cmm.max_criteria_and_metric_scores.data}, 'mcms') m1 = mcms.data[1:] h0 = mcms.data[0] print "\nmcms", tabulate(m1, headers=h0) thms = OutputObj(cmm.thresholds_and_metric_scores, 'thms') cmms = OutputObj({'cm': cmm.confusion_matrices}, 'cmms') if 1 == 0: print "" for i, c in enumerate(cmms.cm): print "\ncmms.cm[%s]" % i, tabulate(c) print "" mmResult = h2o.n0.model_metrics(model=model_key, frame=parse_key, timeoutSecs=60) mm = OutputObj(mmResult['model_metrics'][0], 'mm') prResult = h2o.n0.predict(model=model_key, frame=parse_key, timeoutSecs=60) pr = OutputObj(prResult['model_metrics'][0]['predictions'], 'pr')
def test_w2v_basic_1(self): global SYNDATASETS_DIR SYNDATASETS_DIR = h2o.make_syn_dir() n = 500000 tryList = [ (n, 1, 'cD', 300), (n, 2, 'cE', 300), (n, 3, 'cF', 300), (n, 4, 'cG', 300), (n, 5, 'cH', 300), (n, 6, 'cI', 300), (n, 7, 'cJ', 300), (n, 9, 'cK', 300), ] ### h2b.browseTheCloud() for (rowCount, colCount, hex_key, timeoutSecs) in tryList: csvPathname = create_file_with_seps(rowCount, colCount) # just parse to make sure it's good parseResult = h2i.import_parse(path=csvPathname, checkHeader=1, delete_on_done=0, timeoutSecs=180, doSummary=False) pA = h2o_cmd.ParseObj(parseResult) iA = h2o_cmd.InspectObj(pA.parse_key) cA = h2o_test.OutputObj(iA.columns[0], "inspect_column") parse_key = pA.parse_key numRows = iA.numRows numCols = iA.numCols labelList = iA.labelList for i in range(colCount): print cA.type, cA.missing self.assertEqual( 0, cA.missing, "Column %s Expected %s. missing: %s is incorrect" % (i, 0, cA.missing)) self.assertEqual( 'string', cA.type, "Column %s Expected %s. type: %s is incorrect" % (i, 0, cA.type)) if DO_SUMMARY: for i in range(colCount): co = h2o_cmd.runSummary(key=parse_key, column=i) print co.label, co.type, co.missing, co.domain, sum( co.bins) self.assertEqual( 0, co.missing, "Column %s Expected %s. missing: %s is incorrect" % (i, 0, co.missing)) self.assertEqual( 'String', co.type, "Column %s Expected %s. 
type: %s is incorrect" % (i, 0, co.type)) # no cols ignored labelListUsed = list(labelList) numColsUsed = numCols for trial in range(1): parameters = { 'validation_frame': parse_key, # KeyIndexed False [] 'ignored_columns': None, # string[] None [] 'score_each_iteration': None, # boolean false [] 'minWordFreq': 5, # int 5 [] 'wordModel': 'SkipGram', # enum [u'CBOW', u'SkipGram'] 'normModel': 'HSM', # enum # [u'HSM', u'NegSampling'] 'negSampleCnt': 5, # int 5 [] 'vecSize': 100, # int 100 'windowSize': 5, # int 5 'sentSampleRate': 0.001, # float 0.001 'initLearningRate': 0.05, # float 0.05 'epochs': 1, # int 5 } model_key = 'benign_w2v.hex' bmResult = h2o.n0.build_model(algo='word2vec', destination_key=model_key, training_frame=parse_key, parameters=parameters, timeoutSecs=60) bm = OutputObj(bmResult, 'bm') modelResult = h2o.n0.models(key=model_key) model = OutputObj(modelResult['models'][0]['output'], 'model') cmmResult = h2o.n0.compute_model_metrics(model=model_key, frame=parse_key, timeoutSecs=60) cmm = OutputObj(cmmResult, 'cmm') mmResult = h2o.n0.model_metrics(model=model_key, frame=parse_key, timeoutSecs=60) mm = OutputObj(mmResult['model_metrics'][0], 'mm') # not implemented? # prResult = h2o.n0.predict(model=model_key, frame=parse_key, timeoutSecs=60) # pr = OutputObj(prResult['model_metrics'][0]['predictions'], 'pr') h2o_cmd.runStoreView()
def test_GLM_params_rand2(self): importFolderPath = "covtype" csvFilename = "covtype.20k.data" hex_key = "covtype20k.hex" binomial_key = "covtype20k.b.hex" b = Key(hex_key) csvPathname = importFolderPath + "/" + csvFilename parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=hex_key, check_header=1, timeoutSecs=180, doSummary=False) ## columnTypeDict = {54: 'Enum'} columnTypeDict = None parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=binomial_key, columnTypeDict=columnTypeDict, check_header=1, timeoutSecs=180, doSummary=False) # don't have to make it enum, if 0/1 (can't operate on enums like this) # make 1-7 go to 0-6. 0 isn't there. Assign(b[:, 54], b[:, 54] - 1) # make 1 thru 6 go to 1 Assign(b[:, 54], b[:, 54] != 0) # now we have just 0 and 1 pA = h2o_cmd.ParseObj(parseResult) iA = h2o_cmd.InspectObj(pA.parse_key) parse_key = pA.parse_key numRows = iA.numRows numCols = iA.numCols labelList = iA.labelList expected = [] allowedDelta = 0 # loop, to see if we get same centers labelListUsed = list(labelList) numColsUsed = numCols paramDict = define_params() for trial in range(5): # family [u'gaussian', u'binomial', u'poisson', u'gamma', u'tweedie'] # link [u'family_default', u'identity', u'logit', u'log', u'inverse', u'tweedie'] # can we do classification with probabilities? # are only lambda and alpha grid searchable? # params is mutable. This is default. 
parameters = { 'response_column': 'C55', 'alpha': 0.1, # 'lambda': 1e-4, 'lambda': 0, } h2o_glm.pickRandGlmParams(paramDict, parameters) if 'family' not in parameters or parameters['family'] == 'binomial': bHack = binomial_key else: bHack = hex_key co = h2o_cmd.runSummary(key=binomial_key, column=54) print "binomial_key summary:", co.label, co.type, co.missing_count, co.domain, sum( co.histogram_bins) co = h2o_cmd.runSummary(key=hex_key, column=54) print "hex_key summary:", co.label, co.type, co.missing_count, co.domain, sum( co.histogram_bins) # fix stupid params fixList = [ 'alpha', 'lambda', 'ignored_columns', 'class_sampling_factors' ] for f in fixList: if f in parameters: parameters[f] = "[%s]" % parameters[f] model_key = 'rand_glm.hex' bmResult = h2o.n0.build_model(algo='glm', model_id=model_key, training_frame=bHack, parameters=parameters, timeoutSecs=10) bm = OutputObj(bmResult, 'bm') modelResult = h2o.n0.models(key=model_key) model = OutputObj(modelResult['models'][0]['output'], 'model') h2o_glm.simpleCheckGLM(self, model, parameters, labelList, labelListUsed, allowNaN=True) cmmResult = h2o.n0.compute_model_metrics(model=model_key, frame=parse_key, timeoutSecs=60) cmm = OutputObj(cmmResult, 'cmm') # FIX! when is this legal doClassification = False if doClassification: mcms = OutputObj( {'data': cmm.max_criteria_and_metric_scores.data}, 'mcms') m1 = mcms.data[1:] h0 = mcms.data[0] print "\nmcms", tabulate(m1, headers=h0) if doClassification: thms = OutputObj(cmm.thresholds_and_metric_scores, 'thms') cmms = OutputObj({'cm': cmm.confusion_matrices}, 'cmms') if 1 == 0: print "" for i, c in enumerate(cmms.cm): print "\ncmms.cm[%s]" % i, tabulate(c) print "" mmResult = h2o.n0.model_metrics(model=model_key, frame=parse_key, timeoutSecs=60) mm = OutputObj(mmResult['model_metrics'][0], 'mm') prResult = h2o.n0.predict(model=model_key, frame=parse_key, timeoutSecs=60) pr = OutputObj(prResult['model_metrics'][0]['predictions'], 'pr')
def test_GBM_basic(self): bucket = 'home-0xdiag-datasets' importFolderPath = 'standard' trainFilename = 'covtype.shuffled.90pct.data' train_key = 'covtype.train.hex' model_key = 'GBMModelKey' timeoutSecs = 1800 csvPathname = importFolderPath + "/" + trainFilename parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local', hex_key=train_key, timeoutSecs=timeoutSecs) pA = h2o_cmd.ParseObj(parseResult) iA = h2o_cmd.InspectObj(pA.parse_key) parse_key = pA.parse_key numRows = iA.numRows numCols = iA.numCols labelList = iA.labelList labelListUsed = list(labelList) numColsUsed = numCols parameters = { 'validation_frame': train_key, 'ignored_columns': None, 'score_each_iteration': True, 'response_column': 'C55', 'do_classification': True, # 'balance_classes': # 'max_after_balance_size': 'ntrees': 2, 'max_depth': 10, 'min_rows': 3, 'nbins': 40, 'learn_rate': 0.2, # FIX! doesn't like it? # 'loss': 'Bernoulli', # FIX..no variable importance for GBM yet? 'variable_importance': False, # 'seed': } model_key = 'covtype_gbm.hex' bmResult = h2o.n0.build_model(algo='gbm', destination_key=model_key, training_frame=parse_key, parameters=parameters, timeoutSecs=60) bm = OutputObj(bmResult, 'bm') modelResult = h2o.n0.models(key=model_key) model = OutputObj(modelResult['models'][0]['output'], 'model') cmmResult = h2o.n0.compute_model_metrics(model=model_key, frame=parse_key, timeoutSecs=60) cmm = OutputObj(cmmResult, 'cmm') print "\nLook!, can use dot notation: cmm.cm.confusion.matrix", cmm.cm.confusion_matrix, "\n" mmResult = h2o.n0.model_metrics(model=model_key, frame=parse_key, timeoutSecs=60) mmResultShort = mmResult['model_metrics'][0] del mmResultShort['frame'] # too much! mm = OutputObj(mmResultShort, 'mm') prResult = h2o.n0.predict(model=model_key, frame=parse_key, timeoutSecs=60) pr = OutputObj(prResult['model_metrics'][0]['predictions'], 'pr')
def test_DL_basic(self): h2o.nodes[0].remove_all_keys() importFolderPath = "logreg" csvFilename = "benign.csv" hex_key = "benign.hex" csvPathname = importFolderPath + "/" + csvFilename parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=hex_key, check_header=1, timeoutSecs=180, doSummary=False) pA = h2o_cmd.ParseObj(parseResult) iA = h2o_cmd.InspectObj(pA.parse_key) parse_key = pA.parse_key numRows = iA.numRows numCols = iA.numCols labelList = iA.labelList expected = [] allowedDelta = 0 # no cols ignored labelListUsed = list(labelList) labelListUsed.remove('STR') numColsUsed = numCols - 1 for trial in range(1): parameters = { # required now # loss enum True None [u'MeanSquare', u'CrossEntropy'] 'loss': 'CrossEntropy', 'validation_frame': parse_key, # KeyIndexed None 'ignored_columns': '["STR"]', # string[] None 'response_column': 'FNDX', # string None 'balance_classes': None, # boolean false 'max_after_balance_size': None, # float Infinity 'keep_cross_validation_splits': None, # boolean false 'checkpoint': None, # Key None 'overwrite_with_best_model': None, # boolean true 'expert_mode': None, # boolean false 'autoencoder': None, # boolean false # 'use_all_factor_levels': None, # boolean true # [u'Tanh', u'TanhWithDropout', u'Rectifier', u'RectifierWithDropout', u'Maxout', u'MaxoutWithDropout'] 'activation': None, # enum Rectifier 'hidden': None, # int[] [200, 200] 'epochs': None, # double 10.0 'train_samples_per_iteration': None, # long -2 'target_ratio_comm_to_comp': None, # double 0.02 'seed': None, # long 1679194146842485659 'adaptive_rate': None, # boolean true 'rho': None, # double 0.99 'epsilon': None, # double 1.0E-8 'rate': None, # double 0.005 'rate_annealing': None, # double 1.0E-6 'rate_decay': None, # double 1.0 'momentum_start': None, # double 0.0 'momentum_ramp': None, # double 1000000.0 'momentum_stable': None, # double 0.0 'nesterov_accelerated_gradient': None, # boolean true 'input_dropout_ratio': None, # double 0.0 
'hidden_dropout_ratios': None, # double[] None (this can grid?) 'l1': None, # double 0.0 'l2': None, # double 0.0 'max_w2': None, # float Infinity 'initial_weight_distribution': None, # enum UniformAdaptive [u'UniformAdaptive', u'Uniform', u'Normal'] 'initial_weight_scale': None, # double 1.0 'loss': None, # enum MeanSquare [u'Automatic', u'MeanSquare', u'CrossEntropy'] 'score_interval': None, # double 5.0 'score_training_samples': None, # long 10000 'score_validation_samples': None, # long 0 'score_duty_cycle': None, # double 0.1 'classification_stop': None, # double 0.0 'regression_stop': None, # double 1.0E-6 'quiet_mode': None, # boolean false 'max_confusion_matrix_size': None, # int 20 'max_hit_ratio_k': None, # int 10 'balance_classes': None, # boolean false 'class_sampling_factors': None, # float[] None 'max_after_balance_size': None, # float Infinity 'score_validation_sampling': None, # enum Uniform [u'Uniform', u'Stratified'] 'diagnostics': None, # boolean true 'variable_importances': None, # boolean false 'fast_mode': None, # boolean true 'ignore_const_cols': None, # boolean true 'force_load_balance': None, # boolean true 'replicate_training_data': None, # boolean false 'single_node_mode': None, # boolean false 'shuffle_training_data': None, # boolean false 'missing_values_handling': None, # enum MeanImputation [u'Skip', u'MeanImputation'] 'sparse': None, # boolean false 'col_major': None, # boolean false 'average_activation': None, # double 0.0 'sparsity_beta': None, # double 0.0 } model_key = 'benign_dl.hex' bmResult = h2o.n0.build_model(algo='deeplearning', model_id=model_key, training_frame=parse_key, parameters=parameters, timeoutSecs=10) print "bmResult:", dump_json(bmResult) bm = OutputObj(bmResult, 'bm') modelResult = h2o.n0.models(key=model_key) model = OutputObj(modelResult['models'][0]['output'], 'model') print "model:", dump_json(model) cmmResult = h2o.n0.compute_model_metrics(model=model_key, frame=parse_key, timeoutSecs=60) cmm = 
OutputObj(cmmResult, 'cmm') mmResult = h2o.n0.model_metrics(model=model_key, frame=parse_key, timeoutSecs=60) mm = OutputObj(mmResult['model_metrics'][0], 'mm') prResult = h2o.n0.predict(model=model_key, frame=parse_key, timeoutSecs=60) pr = OutputObj(prResult['model_metrics'][0]['predictions'], 'pr') h2o_cmd.runStoreView()
def test_DL_airlines_small(self): h2o.nodes[0].remove_all_keys() csvPathname_train = 'airlines/AirlinesTrain.csv.zip' csvPathname_test = 'airlines/AirlinesTest.csv.zip' hex_key = 'train.hex' validation_key = 'validation.hex' timeoutSecs = 60 parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname_train, hex_key=hex_key, timeoutSecs=timeoutSecs, doSummary=False) pA = h2o_cmd.ParseObj(parseResult) iA = h2o_cmd.InspectObj(pA.parse_key) parseResultV = h2i.import_parse(bucket='smalldata', path=csvPathname_test, hex_key=validation_key, timeoutSecs=timeoutSecs, doSummary=False) pAV = h2o_cmd.ParseObj(parseResultV) iAV = h2o_cmd.InspectObj(pAV.parse_key) #Making random id identifier = ''.join( random.sample(string.ascii_lowercase + string.digits, 10)) model_key = 'deeplearning_' + identifier + '.hex' parameters = { 'validation_frame': validation_key, # KeyIndexed None 'ignored_columns': "['IsDepDelayed_REC']", # string[] None 'response_column': 'IsDepDelayed', # string None 'loss': 'CrossEntropy' } expectedErr = 0.32 ## expected validation error for the above model relTol = 0.15 ## 15% rel. error tolerance due to Hogwild! 
timeoutSecs = 60 start = time.time() bmResult = h2o.n0.build_model(algo='deeplearning', model_id=model_key, training_frame=hex_key, parameters=parameters, timeoutSecs=timeoutSecs) bm = OutputObj(bmResult, 'bm') print 'deep learning took', time.time() - start, 'seconds' modelResult = h2o.n0.models(key=model_key) model = OutputObj(modelResult['models'][0]['output'], 'model') # print "model:", dump_json(model) cmmResult = h2o.n0.compute_model_metrics(model=model_key, frame=validation_key, timeoutSecs=60) cmm = OutputObj(cmmResult, 'cmm') mmResult = h2o.n0.model_metrics(model=model_key, frame=validation_key, timeoutSecs=60) mm = OutputObj(mmResult['model_metrics'][0], 'mm') prResult = h2o.n0.predict(model=model_key, frame=validation_key, timeoutSecs=60) pr = OutputObj(prResult['model_metrics'][0]['predictions'], 'pr') h2o_cmd.runStoreView() actualErr = model['errors']['valid_err'] print "expected classification error: " + format(expectedErr) print "actual classification error: " + format(actualErr) if actualErr != expectedErr and abs( (expectedErr - actualErr) / expectedErr) > relTol: raise Exception( "Scored classification error of %s is not within %s %% relative error of %s" % (actualErr, float(relTol) * 100, expectedErr))
def test_GBMGrid_basic_many(self): trainFilename = 'prostate.csv' train_key = 'prostate.hex' timeoutSecs = 300 csvPathname = "logreg/" + trainFilename parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=train_key, schema='put') pA = h2o_cmd.ParseObj(parseResult) iA = h2o_cmd.InspectObj(pA.parse_key) parse_key = pA.parse_key numRows = iA.numRows numCols = iA.numCols labelList = iA.labelList labelListUsed = list(labelList) numColsUsed = numCols parameters = { 'validation_frame': train_key, 'ignored_columns': "['ID']", # this has to have [] 'response_column': 'CAPSULE', # 'balance_classes': # 'max_after_balance_size': # ?? # 'ntrees': '[8, 10]', 'ntrees': 8, # 'max_depth': '[8, 9]', 'max_depth': 8, # ?? # 'min_rows': '[1, 2]', 'min_rows': 1, 'nbins': 40, # ?? # 'learn_rate': "[0.1, 0.2]", 'learn_rate': 0.1, # FIX! doesn't like it? # 'loss': 'Bernoulli', # FIX..no variable importance for GBM yet? # 'variable_importance': False, # 'seed': } jobs = [] # kick off 5 of these GBM grid jobs, with different tree choices start = time.time() totalGBMGridJobs = 0 for i in range(5): modelKey = 'GBMGrid_prostate_%s', i bmResult = h2o.n0.build_model( algo='gbm', destination_key=modelKey, training_frame=parse_key, parameters=parameters, timeoutSecs=60) bm = OutputObj(bmResult, 'bm') print "GBMResult:", h2o.dump_json(bm) # FIX! is this right for gridded? job_key = bm.jobs[0].key.name # FIX! this isn't a full formed name (%) model_key = bm.jobs[0].dest.name jobs.append( (job_key, model_key) ) totalGBMGridJobs += 1 h2o_jobs.pollWaitJobs(timeoutSecs=300) elapsed = time.time() - start print "All GBM jobs completed in", elapsed, "seconds." 
print "totalGBMGridJobs:", totalGBMGridJobs for job_key, model_key in jobs: modelResult = h2o.n0.models(key=model_key) model = OutputObj(modelResult['models'][0]['output'], 'model') cmmResult = h2o.n0.compute_model_metrics(model=model_key, frame=parse_key, timeoutSecs=60) cmm = OutputObj(cmmResult, 'cmm') print "\nLook!, can use dot notation: cmm.cm.confusion.matrix", cmm.cm.confusion_matrix, "\n" mmResult = h2o.n0.model_metrics(model=model_key, frame=parse_key, timeoutSecs=60) mmResultShort = mmResult['model_metrics'][0] del mmResultShort['frame'] # too much! mm = OutputObj(mmResultShort, 'mm') prResult = h2o.n0.predict(model=model_key, frame=parse_key, timeoutSecs=60) pr = OutputObj(prResult['model_metrics'][0]['predictions'], 'pr')