def test_loop_random_param_covtype(self):
        csvPathname = h2o.find_file('smalldata/poisson/Goalies.csv')
        print "\nParsing", csvPathname
        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname)
        inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])

        # need more info about the dataset for debug
        h2o_cmd.info_from_inspect(inspect, csvPathname)

        # for determinism, I guess we should spit out the seed?
        # random.seed(SEED)
        SEED = random.randint(0, sys.maxint)
        # if you have to force to redo a test
        # SEED = 
        random.seed(SEED)
        paramDict = define_params()
        print "\nUsing random seed:", SEED
        for trial in range(20):
            # params is mutable. This is default.
            # FIX! does it never end if we don't have alpha specified?
            params = {
                'y': 6, 
                'num_cross_validation_folds': 3, 

                'family': "binomial", 
                'case_mode': ['>'],
                'case': ['20'],

                'alpha': 0, 
                'lambda': 0, 
                'beta_epsilon': 0.001, 
                'max_iter': 8
                }

            colX = h2o_glm.pickRandGlmParams(paramDict, params)
            kwargs = params.copy()

            # make timeout bigger with xvals
            timeoutSecs = 120 + (kwargs['num_cross_validation_folds']*30)
            # or double the 4 seconds per iteration (max_iter+1 worst case?)
            timeoutSecs = max(timeoutSecs, (8 * (kwargs['max_iter']+1)))

            start = time.time()
            print "May not solve. Expanded categorical columns causing a large # cols, small # of rows"
            glm = h2o_cmd.runGLMOnly(timeoutSecs=timeoutSecs, parseKey=parseKey, **kwargs)
            elapsed = time.time()-start
            print "glm end on ", csvPathname, "Trial #", trial, "completed in", elapsed, "seconds.",\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

            start = time.time()
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
            print "simpleCheckGLM end on ", csvPathname, 'took', time.time() - start, 'seconds'
            print "Trial #", trial, "completed\n"
Exemple #2
0
 def test_mixed_causes_NA(self):
     """Parse smalldata/mixed_causes_NA.csv and require that it yields no NAs.

     The value returned by h2o_cmd.info_from_inspect is treated here as the
     summed missing-value count and must equal 0.
     """
     name = 'mixed_causes_NA.csv'
     path = h2o.find_file('smalldata/' + name)
     parsed = h2o_cmd.parseFile(csvPathname=path, timeoutSecs=15)
     destination = parsed['destination_key']
     inspect_result = h2o_cmd.runInspect(None, destination)
     missing_total = h2o_cmd.info_from_inspect(inspect_result, path)
     self.assertEqual(
         missing_total,
         0,
         "Single column of data with mixed number/string should not have NAs")
    def test_rf_covtype_train_oobe(self):
        if (1==0):
            csvFilename = 'train.csv'
            csvPathname = h2o.find_dataset('bench/covtype/h2o/' + csvFilename)
            print "\nUsing header=1 even though I shouldn't have to. Otherwise I get NA in first row and RF bad\n"
            parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex", header=1, 
                timeoutSecs=180)
            # FIX! maybe try specifying column header with column name
            ### kwargs['response_variable'] = A55
        else:
            csvFilename = 'covtype.data'
            print "\nUsing header=0 on the normal covtype.data"
            csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data')
            parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex", header=0, 
                timeoutSecs=180)


        inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
        h2o_cmd.info_from_inspect(inspect, csvPathname)

        for trial in range(1):
            # params is mutable. This is default.
            kwargs = paramDict
            # adjust timeoutSecs with the number of trees
            # seems ec2 can be really slow
            timeoutSecs = 30 + kwargs['ntree'] * 20
            start = time.time()
            rfView = h2o_cmd.runRFOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs)
            elapsed = time.time() - start
            print "RF end on ", csvPathname, 'took', elapsed, 'seconds.', \
                "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)

            classification_error = rfView['confusion_matrix']['classification_error']
            self.assertGreater(classification_error, 0.01, 
                "train.csv should have out of bag error estimate greater than 0.01")

            print "Trial #", trial, "completed"