def test_loop_random_param_covtype(self): csvPathname = h2o.find_file('smalldata/poisson/Goalies.csv') print "\nParsing", csvPathname parseKey = h2o_cmd.parseFile(csvPathname=csvPathname) inspect = h2o_cmd.runInspect(None, parseKey['destination_key']) # need more info about the dataset for debug h2o_cmd.info_from_inspect(inspect, csvPathname) # for determinism, I guess we should spit out the seed? # random.seed(SEED) SEED = random.randint(0, sys.maxint) # if you have to force to redo a test # SEED = random.seed(SEED) paramDict = define_params() print "\nUsing random seed:", SEED for trial in range(20): # params is mutable. This is default. # FIX! does it never end if we don't have alpha specified? params = { 'y': 6, 'num_cross_validation_folds': 3, 'family': "binomial", 'case_mode': ['>'], 'case': ['20'], 'alpha': 0, 'lambda': 0, 'beta_epsilon': 0.001, 'max_iter': 8 } colX = h2o_glm.pickRandGlmParams(paramDict, params) kwargs = params.copy() # make timeout bigger with xvals timeoutSecs = 120 + (kwargs['num_cross_validation_folds']*30) # or double the 4 seconds per iteration (max_iter+1 worst case?) timeoutSecs = max(timeoutSecs, (8 * (kwargs['max_iter']+1))) start = time.time() print "May not solve. Expanded categorical columns causing a large # cols, small # of rows" glm = h2o_cmd.runGLMOnly(timeoutSecs=timeoutSecs, parseKey=parseKey, **kwargs) elapsed = time.time()-start print "glm end on ", csvPathname, "Trial #", trial, "completed in", elapsed, "seconds.",\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) start = time.time() h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) print "simpleCheckGLM end on ", csvPathname, 'took', time.time() - start, 'seconds' print "Trial #", trial, "completed\n"
def test_mixed_causes_NA(self):
    # Parse smalldata/mixed_causes_NA.csv, inspect the parsed key, and require
    # that the value returned by h2o_cmd.info_from_inspect() is 0. The
    # assertion message treats that value as the number of NAs found.
    csv_name = 'mixed_causes_NA.csv'
    csv_path = h2o.find_file('smalldata/' + csv_name)

    parsed = h2o_cmd.parseFile(csvPathname=csv_path, timeoutSecs=15)
    dest_key = parsed['destination_key']
    inspect_result = h2o_cmd.runInspect(None, dest_key)

    na_total = h2o_cmd.info_from_inspect(inspect_result, csv_path)
    self.assertEqual(
        na_total, 0,
        "Single column of data with mixed number/string should not have NAs")
def test_rf_covtype_train_oobe(self): if (1==0): csvFilename = 'train.csv' csvPathname = h2o.find_dataset('bench/covtype/h2o/' + csvFilename) print "\nUsing header=1 even though I shouldn't have to. Otherwise I get NA in first row and RF bad\n" parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex", header=1, timeoutSecs=180) # FIX! maybe try specifying column header with column name ### kwargs['response_variable'] = A55 else: csvFilename = 'covtype.data' print "\nUsing header=0 on the normal covtype.data" csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data') parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex", header=0, timeoutSecs=180) inspect = h2o_cmd.runInspect(None, parseKey['destination_key']) h2o_cmd.info_from_inspect(inspect, csvPathname) for trial in range(1): # params is mutable. This is default. kwargs = paramDict # adjust timeoutSecs with the number of trees # seems ec2 can be really slow timeoutSecs = 30 + kwargs['ntree'] * 20 start = time.time() rfView = h2o_cmd.runRFOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs) elapsed = time.time() - start print "RF end on ", csvPathname, 'took', elapsed, 'seconds.', \ "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100) classification_error = rfView['confusion_matrix']['classification_error'] self.assertGreater(classification_error, 0.01, "train.csv should have out of bag error estimate greater than 0.01") print "Trial #", trial, "completed"