Example #1
    def test_NOPASS_GLM2_tweedie_rand2(self):
        h2o.beta_features = True
        if 1==1:
            csvPathname = 'standard/covtype.data'
            hex_key = 'covtype.hex'
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, hex_key=hex_key, schema='put')
        else:
            csvPathname = 'covtype/covtype.20k.data'
            hex_key = 'covtype.20k.hex'
            parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=hex_key, schema='put')


        paramDict = define_params()

        for trial in range(10):
            # params is mutable. This is default.
            params = {
                'response': 54, 
                'lambda': 0, 
                'alpha': 0, 
                'n_folds': 1, 
                'family': 'tweedie'
            }
            colX = h2o_glm.pickRandGlmParams(paramDict, params)
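            # pickRandGlmParams presumably mutates params in place, overlaying
            # random choices from paramDict (hence the "params is mutable" note)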
            kwargs = params.copy()
            start = time.time()
            glm = h2o_cmd.runGLM(timeoutSecs=180, parseResult=parseResult, **kwargs)
            # pass the kwargs with all the params, so we know what we asked for!
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
            h2o.check_sandbox_for_errors()
            print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds'
            print "Trial #", trial, "completed\n"
Example #2
    def test_GLM_tweedie_rand2(self):
        if 1 == 1:
            csvPathname = "standard/covtype.data"
            hex_key = "covtype.hex"
            parseResult = h2i.import_parse(
                bucket="home-0xdiag-datasets", path=csvPathname, hex_key=hex_key, schema="put"
            )
        else:
            csvPathname = "covtype/covtype.20k.data"
            hex_key = "covtype.20k.hex"
            parseResult = h2i.import_parse(bucket="smalldata", path=csvPathname, hex_key=hex_key, schema="put")

        paramDict = define_params()

        for trial in range(10):
            # params is mutable. This is default.
            params = {"y": 54, "case": 4, "case_mode": "=", "lambda": 0, "alpha": 0, "n_folds": 1, "family": "tweedie"}
            colX = h2o_glm.pickRandGlmParams(paramDict, params)
            kwargs = params.copy()
            start = time.time()
            glm = h2o_cmd.runGLM(timeoutSecs=180, parseResult=parseResult, **kwargs)
            # pass the kwargs with all the params, so we know what we asked for!
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
            h2o.check_sandbox_for_errors()
            print "glm end on ", csvPathname, "took", time.time() - start, "seconds"
            print "Trial #", trial, "completed\n"
Example #3
    def test_GLM2_airline(self):
        #############Train###############################
        csvFilename = 'AirlinesTrain.csv.zip'
        csvPathname = 'airlines'+'/' + csvFilename
        parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put', timeoutSecs=15)
        params = {'response': 'IsDepDelayed', 'ignored_cols': 'IsDepDelayed_REC', 'family': 'binomial'}
        kwargs = params.copy()
        starttime = time.time()
        glmTrain = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=15, **kwargs)
        elapsedtime = time.time() - starttime
        print "ELAPSED TIME TRAIN DATA ", elapsedtime
        h2o_glm.simpleCheckGLM(self, glmTrain, None, **kwargs)

        ######### Test ######################################
        csvFilename = 'AirlinesTest.csv.zip'
        csvPathname = 'airlines'+'/' + csvFilename
        parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put', timeoutSecs=15)
        params = {'response': 'IsDepDelayed', 'ignored_cols': 'IsDepDelayed_REC', 'family': 'binomial'}
        kwargs = params.copy()
        starttime = time.time()
        glmTest = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=15, **kwargs)
        elapsedtime = time.time() - starttime
        print "ELAPSED TIME TEST DATA ", elapsedtime
        h2o_glm.simpleCheckGLM(self, glmTest, None, **kwargs)
Example #4
 def test_prostate_poisson(self):
     errors = []
     # First try on small data (1 chunk)
     parseResult = h2i.import_parse(
         bucket="smalldata", path="logreg/prostate.csv", schema="put", hex_key="poisson_p"
     )
     # R results
     poisson_coefficients = {
         "Intercept": -4.107484,
         "ID": 0.000508,
         "AGE": -0.004357,
         "RACE": -0.149412,
         "DPROS": 0.230458,
         "DCAPS": 0.071546,
         "PSA": 0.002944,
         "VOL": -0.007488,
         "GLEASON": 0.441659,
     }
     poisson_nd = 278.4
     poisson_rd = 215.7
     poisson_aic = 539.7
     errors = self.process_dataset(
         parseResult, "CAPSULE", poisson_coefficients, poisson_nd, poisson_rd, poisson_aic, family="poisson"
     )
     if errors:
         self.fail(str(errors))
     # Now try on larger data (replicated), will be chunked this time, should produce same results
     parseResult = h2i.import_parse(
         bucket="smalldata", path="logreg/prostate_long.csv.gz", schema="put", hex_key="poisson_long_p"
     )
     errors = self.process_dataset(
         parseResult, "CAPSULE", poisson_coefficients, poisson_nd, poisson_rd, poisson_aic, family="poisson"
     )
     if errors:
         self.fail(str(errors))
Example #5
 def test_prostate_binomial(self):
     errors = []
     # First try on small data (1 chunk)
     parseResult = h2i.import_parse(
         bucket="smalldata", path="logreg/prostate.csv", schema="put", hex_key="prostate_b"
     )
     # R results
     binomial_coefficients = {
         "Intercept": -8.126278,
         "ID": 0.001609,
         "AGE": -0.008138,
         "RACE": -0.617597,
         "DPROS": 0.553065,
         "DCAPS": 0.546087,
         "PSA": 0.027297,
         "VOL": -0.011540,
         "GLEASON": 1.010125,
     }
     binomial_nd = 512.3
     binomial_rd = 376.9
     binomial_aic = 394.9
     errors = self.process_dataset(
         parseResult, "CAPSULE", binomial_coefficients, binomial_nd, binomial_rd, binomial_aic, family="binomial"
     )
     if errors:
         self.fail(str(errors))
     # Now try on larger data (replicated), will be chunked this time, should produce same results
     parseResult = h2i.import_parse(
         bucket="smalldata", path="logreg/prostate_long.csv.gz", schema="put", hex_key="prostate_long_b"
     )
     errors = self.process_dataset(
         parseResult, "CAPSULE", binomial_coefficients, binomial_nd, binomial_rd, binomial_aic, family="binomial"
     )
     if errors:
         self.fail(str(errors))
Example #6
    def test_parse_summary_c21(self):
        importFolderPath = '/mnt/0xcustomer-datasets/c21'
        timeoutSecs = 300

        csvPathname_train = importFolderPath + '/persona_clean_deep.tsv.zip'
        hex_key = 'train.hex'
        parseResult  = h2i.import_parse(path=csvPathname_train, hex_key=hex_key, timeoutSecs=timeoutSecs)

        inspect = h2o_cmd.runInspect(key=hex_key)
        missingValuesList = h2o_cmd.infoFromInspect(inspect, csvPathname_train)
        # self.assertEqual(missingValuesList, [], "%s should have 3 cols of NAs: %s" % (csvPathname_train, missingValuesList))
        numCols = inspect['numCols']
        numRows = inspect['numRows']
        rSummary = h2o_cmd.runSummary(key=hex_key, rows=numRows, cols=numCols)
        h2o_cmd.infoFromSummary(rSummary)

        csvPathname_test  = importFolderPath + '/persona_clean_deep.tsv.zip'
        validation_key = 'test.hex'
        parseResult = h2i.import_parse(path=csvPathname_test, hex_key=validation_key, timeoutSecs=timeoutSecs)

        inspect = h2o_cmd.runInspect(key=validation_key)
        missingValuesList = h2o_cmd.infoFromInspect(inspect, csvPathname_test)
        # self.assertEqual(missingValuesList, [], "%s should have 3 cols of NAs: %s" % (csvPathname_test, missingValuesList))

        numCols = inspect['numCols']
        numRows = inspect['numRows']
        rSummary = h2o_cmd.runSummary(key=validation_key, rows=numRows, cols=numCols)
        h2o_cmd.infoFromSummary(rSummary)
Example #7
    def test_GLM2grid_covtype_many(self):
        h2o.beta_features = True
        csvFilename = 'covtype.data'
        csvPathname = 'standard/' + csvFilename
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='put', timeoutSecs=10)
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        print "\n" + csvPathname, \
            "    numRows:", "{:,}".format(inspect['numRows']), \
            "    numCols:", "{:,}".format(inspect['numCols'])

        print "WARNING: max_iter set to 8 for benchmark comparisons"
        max_iter = 8

        y = "54"
        kwargs = {
            'response': y,
            'family': 'gaussian',
            'n_folds': 2,
            'max_iter': max_iter,
            'beta_epsilon': 1e-3,
            'lambda': '0,0.5,0.8',
            'alpha': '0,1e-8,1e-4',
        }

        start = time.time()
        jobs = []
        totalGLMGridJobs = 0
        for i in range(3):
            glmResult = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=300, noPoll=True, **kwargs)

            # print "glmResult:", h2o.dump_json(glmResult)
            # assuming it doesn't complete right away, this is the first response
            # it differs for the last response
            job_key = glmResult['job_key']
            grid_key = glmResult['destination_key']
            jobs.append( (job_key, grid_key) )
            totalGLMGridJobs += 1

        # do some parse work in parallel. Don't poll for parse completion
        # don't bother checking the parses when they are completed (pollWaitJobs looks at all)
        for i in range(4):
            time.sleep(3)
            hex_key = str(i) + ".hex"
            src_key = str(i) + ".src"
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='put', 
                src_key=src_key, hex_key=hex_key, 
                timeoutSecs=10, noPoll=True, doSummary=False)

        h2o_jobs.pollWaitJobs(timeoutSecs=300)
        elapsed = time.time() - start

        # 2/GLMGridView.html?grid_key=asd
        # 2/GLMModelView.html?_modelKey=asd_0&lambda=NaN
        # 2/SaveModel.html?model=GLMGridResults__9a29646b78dd988aacd4f88e4d864ccd_1&path=adfs&force=1
        for job_key, grid_key in jobs:
            gridResult = h2o.nodes[0].glm_grid_view(grid_key=grid_key)
            h2o_glm.simpleCheckGLMGrid(self, gridResult, **kwargs)

        print "All GLMGrid jobs completed in", elapsed, "seconds."
        print "totalGLMGridJobs:", totalGLMGridJobs
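Distilled, the async pattern in this test is: fire each request with noPoll=True, keep the returned job/destination keys, then make a single blocking pollWaitJobs call. A minimal sketch of that pattern, reusing the helper names and kwargs from above (not a separate test from the original):

    jobs = []
    for _ in range(3):
        r = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=300, noPoll=True, **kwargs)
        jobs.append((r['job_key'], r['destination_key']))  # first response carries both keys
    h2o_jobs.pollWaitJobs(timeoutSecs=300)  # one blocking wait for all outstanding jobs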
Example #8
    def test_exec2_cbind_like_R(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()


        SEEDPERFILE = random.randint(0, sys.maxint)
        rowCount = 30000
        colCount = 150
        timeoutSecs = 60
        hex_key = "df"
        csvPathname = SYNDATASETS_DIR + "/" + "df.csv"
        write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)
        parseResult = h2i.import_parse(path=csvPathname, schema='put', 
            hex_key=hex_key, timeoutSecs=3000, retryDelaySecs=2, doSummary=False)

        colCount = 1
        hex_key = "indx"
        csvPathname = SYNDATASETS_DIR + "/" + "indx.csv"
        write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

        parseResult = h2i.import_parse(path=csvPathname, schema='local', 
            hex_key=hex_key, timeoutSecs=3000, retryDelaySecs=2, doSummary=False)

        inspect = h2o_cmd.runInspect(key=hex_key)
        print "numRows:", inspect['numRows']
        print "numCols:", inspect['numCols']

        for trial in range(10):
            for execExpr in exprList:
                start = time.time()
                execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=300)
                execTime = time.time() - start
                print 'exec took', execTime, 'seconds'

        h2o.check_sandbox_for_errors()
Example #9
    def test_C_RF_poker100(self):
        parseResult = h2i.import_parse(bucket='smalldata', path='poker/poker100', schema='put')
        h2o_cmd.runRF(parseResult=parseResult, trees=6, timeoutSecs=10)

        SYNDATASETS_DIR = h2o.make_syn_dir()
        # always match the run below!
        for x in xrange (11,100,10):
            # Have to split the string out to list for pipe
            shCmdString = "perl " + h2o.find_file("syn_scripts/parity.pl") + " 128 4 "+ str(x) + " quad " + SYNDATASETS_DIR
            h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(),timeout=30)
            # the algorithm for creating the path and filename is hardwired in parity.pl..i.e
            csvFilename = "parity_128_4_" + str(x) + "_quad.data"  

        trees = 6
        timeoutSecs = 60
        # always match the gen above!
        # reduce to get intermittent failures to lessen, for now
        for x in xrange (11,60,10):
            sys.stdout.write('.')
            sys.stdout.flush()
            csvFilename = "parity_128_4_" + str(x) + "_quad.data"  
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            parseResult = h2i.import_parse(path=csvPathname, schema='put')
            h2o_cmd.runRF(parseResult=parseResult, ntrees=trees, timeoutSecs=timeoutSecs)
            trees += 10
Example #10
    def test_GLM_mnist_s3n_fvec(self):
        csvFilelist = [
            ("mnist_training.csv.gz", "mnist_testing.csv.gz",    600), 
            ("mnist_testing.csv.gz",  "mnist_training.csv.gz",    600), 
            ("mnist_training.csv.gz", "mnist_training.csv.gz",    600), 
        ]

        importFolderPath = "mnist"
        csvPathname = importFolderPath + "/*"
        (importHDFSResult, importPattern) = h2i.import_only(bucket='home-0xdiag-datasets', path=csvPathname, schema='s3n', timeoutSecs=120)

        trial = 0
        for (trainCsvFilename, testCsvFilename, timeoutSecs) in csvFilelist:
            # PARSE test****************************************
            csvPathname = importFolderPath + "/" + testCsvFilename
            testHexKey = testCsvFilename + "_" + str(trial) + ".hex"
            start = time.time()
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='s3n', hex_key=testHexKey,
                timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=120)
            elapsed = time.time() - start
            print "parse end on ", parseResult['destination_key'], 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

            # PARSE train****************************************
            csvPathname = importFolderPath + "/" + trainCsvFilename
            trainHexKey = trainCsvFilename + "_" + str(trial) + ".hex"
            start = time.time()
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='s3n', hex_key=trainHexKey,
                timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=120)
            elapsed = time.time() - start
            print "parse end on ", parseResult['destination_key'], 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

            # GLM****************************************
            y = 0 # first column is the label (response)
            print "y:", y
            # don't need the intermediate Dicts produced from columnInfoFromInspect
            x = h2o_glm.goodXFromColumnInfo(y, key=parseResult['destination_key'], timeoutSecs=300)
            print "x:", x

            kwargs = {
                'response': y,
                # 'case_mode': '>',
                # 'case': 0,
                'family': 'gaussian',
                'lambda': 1.0E-5,
                'alpha': 0.5,
                'max_iter': 5,
                'n_folds': 1,
                'beta_epsilon': 1.0E-4,
                }

            timeoutSecs = 1800
            start = time.time()
            glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=60, **kwargs)
            elapsed = time.time() - start
            print "GLM completed in", elapsed, "seconds.", \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
Example #11
    def notest_C_Basic(self):
        # this will do an import folder and parse. schema='local' is default. doesn't need to be specified
        # I guess this will be relative to current wd

        ## if os env variable H2O_BUCKETS_ROOT is set, it will start looking there for bucket, then path
        ## that covers the case where "walking upward" is not sufficient for where you put the bucket (locally)
        os.environ['H2O_BUCKETS_ROOT'] = '/home'
        h2i.import_parse(path='dir3/syn_sphere_gen3.csv', bucket='my-bucket3', schema='local')
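        # with H2O_BUCKETS_ROOT='/home', the lookup presumably resolves to
        # /home/my-bucket3/dir3/syn_sphere_gen3.csv instead of walking upward from cwd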
        del os.environ['H2O_BUCKETS_ROOT'] 
Example #12
    def test3(self):
        # h2i.import_parse(path='standard/covtype.data', bucket='home-0xdiag-datasets', schema="s3n", timeoutSecs=60)
        ## This will get it via import hdfs with s3n. The hdfs_name_node and hdfs_version for s3
        # will have been passed at build_cloud, either from the test or the <config>.json.
        h2i.import_parse(path='standard/benign.csv', bucket='home-0xdiag-datasets', schema='s3n', timeoutSecs=60)

        # h2i.import_parse(path='leads.csv', bucket='datasets', schema="hdfs", timeoutSecs=60)
        # h2i.import_parse(path='/datasets/leads.csv', schema="hdfs", timeoutSecs=60)
        # h2i.import_parse(path='datasets/leads.csv', schema="hdfs", timeoutSecs=60)

        ## This will get it from import s3.
        h2i.import_parse(path='standard/benign.csv', bucket='home-0xdiag-datasets', schema='s3', timeoutSecs=60)
Example #13
    def test_rf_iris(self):
        # Train RF
        trainParseResult = h2i.import_parse(bucket='smalldata', path='iris/iris2.csv', hex_key='train_iris2.hex', schema='put')
        kwargs = paramsTrainRF.copy()
        trainResult = h2o_rf.trainRF(trainParseResult, **kwargs)

        scoreParseResult = h2i.import_parse(bucket='smalldata', path='iris/iris2.csv', hex_key='score_iris2.hex', schema='put')
        kwargs = paramsTestRF.copy()
        scoreResult  = h2o_rf.scoreRF(scoreParseResult, trainResult, **kwargs)

        print "\nTrain\n=========={0}".format(h2o_rf.pp_rf_result(trainResult))
        print "\nScoring\n========={0}".format(h2o_rf.pp_rf_result(scoreResult))
Example #14
    def test_GBMScore(self):
        h2o.beta_features = True
        importFolderPath = "standard"
        csvTrainPath = importFolderPath + "/allyears2k.csv"
        csvTestPath = csvTrainPath
        #        importFolderPath = 'newairlines'
        #        csvTrainPath = importFolderPath + '/train/*train*'
        #        csvTestPath  = importFolderPath + '/train/*test*'
        trainhex = "train.hex"
        testhex = "test.hex"
        parseTrainResult = h2i.import_parse(
            bucket="home-0xdiag-datasets",
            path=csvTrainPath,
            schema="local",
            hex_key=trainhex,
            timeoutSecs=2400,
            doSummary=False,
        )
        parseTestResult = h2i.import_parse(
            bucket="home-0xdiag-datasets",
            path=csvTestPath,
            schema="local",
            hex_key=testhex,
            timeoutSecs=2400,
            doSummary=False,
        )
        inspect_test = h2o.nodes[0].inspect(testhex, timeoutSecs=8000)
        response = "IsDepDelayed"
        ignored_cols = "DepTime,ArrTime,FlightNum,TailNum,ActualElapsedTime,AirTime,ArrDelay,DepDelay,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,IsArrDelayed"

        params = {
            "destination_key": "GBMScore",
            "response": response,
            "ignored_cols_by_name": ignored_cols,
            "classification": 1,
            "validation": None,
            "ntrees": 100,
            "max_depth": 10,
            "learn_rate": 0.00005,
        }

        parseResult = {"destination_key": trainhex}
        kwargs = params.copy()
        gbm = h2o_cmd.runGBM(parseResult=parseResult, timeoutSecs=4800, **kwargs)

        scoreStart = time.time()
        h2o.nodes[0].generate_predictions(model_key="GBMScore", data_key=trainhex)
        scoreElapsed = time.time() - scoreStart

        print "It took ", scoreElapsed, " seconds to score ", inspect_test["numRows"], \
            " rows. Using a GBM with 100 10-deep trees."
        print "That's ", 1.0 * scoreElapsed / 100.0, " seconds per 10-deep tree."
Example #15
    def test_H_Basic(self):
        # maybe best to extract the key from an import first?
        # this isn't used much, maybe we don't care about this

        h2i.import_only(path="testdir_multi_jvm/syn_test/syn_header.csv")
        headerKey = h2i.find_key('syn_header.csv')
        # comma 44 is separator
        h2i.import_parse(path="testdir_multi_jvm/syn_test/syn[1-2].csv", header=1, header_from_file=headerKey, separator=44)
        # symbolic links work
        # ln -s /home/0xdiag/datasets home-0xdiag-datasets
        # lrwxrwxrwx 1 kevin kevin     21 Aug 26 22:05 home-0xdiag-datasets -> /home/0xdiag/datasets
        h2i.import_parse(path="standard/covtype.data", bucket="home-0xdiag-datasets")
Example #16
    def test_C_kmeans_prostate(self):
        h2o.beta_features = True
        csvFilename = "prostate.csv"
        print "\nStarting", csvFilename
        parseResult = h2i.import_parse(bucket='smalldata', path='logreg/'+csvFilename, schema='local', hex_key=csvFilename+".hex")
        h2o_jobs.pollWaitJobs(timeoutSecs=300, pollTimeoutSecs=300, retryDelaySecs=5)

        # loop, to see if we get same centers
        expected = [
            ([55.63235294117647], 68, 667.8088235294117) ,
            ([63.93984962406015], 133, 611.5187969924812) ,
            ([71.55307262569832], 179, 1474.2458100558654) ,
        ]

        # all are multipliers of expected tuple value
        allowedDelta = (0.01, 0.01, 0.01)
        for trial in range(2):
            params = {'k': 3, 
                     'initialization': 'Furthest', 
                     'ignored_cols': "ID",
                     'destination_key': 'prostate_k.hex',
                     'max_iter': 100,
                     'seed': 265211114317615310
                    }
            kwargs = params.copy()
            kmeans = h2o_cmd.runKMeans(parseResult=parseResult, timeoutSecs=5, **kwargs)
            (centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeans, csvFilename, parseResult, 'd', **kwargs)
            h2o_jobs.pollWaitJobs(timeoutSecs=300, pollTimeoutSecs=300, retryDelaySecs=5)
            h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, trial=trial)
Example #17
    def test_rapids_basic(self):
        bucket = 'home-0xdiag-datasets'
        csvPathname = 'standard/covtype.data'
        hexKey = 'p'
        parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey)

        keys = []
        for execExpr in exprList:
            r = re.match(r'\(= \!([a-zA-Z0-9_]+) ', execExpr)
            resultKey = r.group(1)
            execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=resultKey, timeoutSecs=4)
            if DO_ROLLUP:
                h2o_cmd.runInspect(key=resultKey)
            # rows might be zero!
            if execResult['num_rows'] or execResult['num_cols']:
                keys.append(execExpr)
            else:
                h2p.yellow_print("\nNo key created?\n", dump_json(execResult))

        print "\nExpressions that created keys. Shouldn't all of these expressions create keys?"

        for k in keys:
            print k

        h2o.check_sandbox_for_errors()
Example #18
    def test_GLM_poisson_rand2(self):
        csvPathname = 'standard/covtype.data'
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='put')
        paramDict = define_params()
        for trial in range(20):
            # params is mutable. This is default.
            # FIX! does it never end if we don't have alpha specified?
            params = {
                'y': 54, 
                'n_folds': 3, 
                'family': "poisson", 
                'alpha': 0.5, 
                'lambda': 1e-4, 
                'beta_epsilon': 0.001, 
                'max_iter': 15,
                }

            colX = h2o_glm.pickRandGlmParams(paramDict, params)
            kwargs = params.copy()

            # make timeout bigger with xvals
            timeoutSecs = 60 + (kwargs['n_folds']*40)
            # or double the 4 seconds per iteration (max_iter+1 worst case?)
            timeoutSecs = max(timeoutSecs, (8 * (kwargs['max_iter']+1)))
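            # worked example with the defaults above: n_folds=3, max_iter=15
            # gives max(60 + 3*40, 8*(15+1)) = max(180, 128) = 180 seconds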

            start = time.time()
            glm = h2o_cmd.runGLM(timeoutSecs=timeoutSecs, parseResult=parseResult, **kwargs)
            print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds'

            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
            print "Trial #", trial, "completed\n"
Example #19
    def test_rapids_ifelse_nested(self):
        bucket = 'smalldata'
        csvPathname = 'iris/iris_wheader.csv'

        hexKey = 'r1'
        parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey)

        keys = []
        for trial in range(2):
            for execObj, expected in zip(objList, resultList):
                freshObj = copy(execObj)
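                # copy so each trial presumably starts from a pristine object,
                # in case do() mutates state on the original execObj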
                result = freshObj.do()
                # do some scalar result checking
                if expected is not None:
                    # result is a string now??
                    print "result:", result
                    print "expected:", expected
                    assert float(result)==expected, "%s %s" % (result, expected)

                # rows might be zero!
                print "freshObj:", dump_json(freshObj.execResult)
                if 'key' in freshObj.execResult and freshObj.execResult['key']:
                    keys.append(freshObj.execExpr)

        print "\nExpressions that created keys"
        for k in keys:
            print k

        # for execExpr in exprList:
        #     h2e.exec_expr(execExpr=execExpr, resultKey=None, timeoutSecs=10)

        h2o.check_sandbox_for_errors()
Example #20
    def test_parse_1m_cols(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [(10, 65000, "cH", 30)]

        h2b.browseTheCloud()
        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)

            csvFilename = "syn_" + str(SEEDPERFILE) + "_" + str(rowCount) + "x" + str(colCount) + ".csv"
            csvPathname = SYNDATASETS_DIR + "/" + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

            start = time.time()
            print "Summary should work with 65k"
            parseResult = h2i.import_parse(
                path=csvPathname, schema="put", hex_key=hex_key, timeoutSecs=timeoutSecs, doSummary=True
            )
            print csvFilename, "parse time:", parseResult["response"]["time"]
            print "Parse and summary:", parseResult["destination_key"], "took", time.time() - start, "seconds"

            # We should be able to see the parse result?
            start = time.time()
            inspect = h2o_cmd.runInspect(None, parseResult["destination_key"], timeoutSecs=timeoutSecs)
            print "Inspect:", parseResult["destination_key"], "took", time.time() - start, "seconds"
            h2o_cmd.infoFromInspect(inspect, csvPathname)
            print "\n" + csvPathname, "    num_rows:", "{:,}".format(inspect["num_rows"]), \
                "    num_cols:", "{:,}".format(inspect["num_cols"])

            # should match # of cols in header or ??
            self.assertEqual(
                inspect["num_cols"],
                colCount,
                "parse created result with the wrong number of cols %s %s" % (inspect["num_cols"], colCount),
            )
            self.assertEqual(
                inspect["num_rows"],
                rowCount,
                "parse created result with the wrong number of rows (header shouldn't count) %s %s"
                % (inspect["num_rows"], rowCount),
            )

            # we should obey max_column_display
            column_limits = [25, 25000, 50000]
            for column_limit in column_limits:
                inspect = h2o_cmd.runInspect(
                    None, parseResult["destination_key"], max_column_display=column_limit, timeoutSecs=timeoutSecs
                )
                self.assertEqual(
                    len(inspect["cols"]), column_limit, "inspect obeys max_column_display = " + str(column_limit)
                )
                for r in range(0, len(inspect["rows"])):
                    # NB: +1 below because each row includes a row header row: #{row}
                    self.assertEqual(
                        len(inspect["rows"][r]),
                        column_limit + 1,
                        "inspect data rows obeys max_column_display = " + str(column_limit),
                    )
Example #21
    def test_GLM2_tweedie(self):
        csvFilename = "AutoClaim.csv"
        csvPathname = 'standard/' + csvFilename
        print "\nStarting", csvPathname
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='put')
        # columns start at 0
        # regress: glm(CLM_AMT ~ CAR_USE + REVOLKED + GENDER + AREA + MARRIED + CAR_TYPE, data=AutoClaim, family=tweedie(1.34))
        
        coefs = [7, 13, 20, 27, 21, 11]
        y = 4
        ignored_cols = h2o_cmd.createIgnoredCols(key=parseResult['destination_key'], cols=coefs, response=y)
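        # createIgnoredCols presumably builds the ignored_cols list from every
        # column except the listed predictors (coefs) and the response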

        # sapply(c('CLM_AMT', 'CAR_USE', 'REVOLKED', 'GENDER', 'AREA', 'MARRIED', 'CAR_TYPE'), function(x) which(x==colnames(AutoClaim)) - 1)
        kwargs = {
                'family': 'tweedie',
                'tweedie_variance_power': 1.36,
                'response': y, 
                'ignored_cols' : ignored_cols,
                'max_iter': 10, 
                'lambda': 0,
                'alpha': 0,
                'n_folds': 0,
                'beta_epsilon': 1e-4,
        }

        glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=15, **kwargs)

        coefficientsExpected = {
            'Intercept': 0,
            'GENDER.M': 0.0014842488782470984,
            'CAR_TYPE.Sports Car': 0.07786742314454961,
            'MARRIED.Yes': 0.0007748552195851079,
            'CAR_TYPE.SUV': 0.07267702940249621,
            'CAR_TYPE.Pickup': 0.04952083408742968,
            'CAR_TYPE.Van': 0.026422137690691405,
            'CAR_TYPE.Sedan': 0.05128350794060489,
            'CAR_USE.Private': -0.03050194832853935,
            'REVOLKED.Yes': -0.05095942737408699,
        }

        deltaExpected = 0.05
        (warnings, coefficients, intercept) = h2o_glm.simpleCheckGLM(self, glm, None,   
            coefficientsExpected=coefficientsExpected, deltaExpected=deltaExpected, **kwargs)
        print 'coefficients: %s' % (str(coefficients))
Example #22
    def test_GenParity1(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        parityPl = h2o.find_file('syn_scripts/parity.pl')

        # two row dataset gets this. Avoiding it for now
        # java.lang.ArrayIndexOutOfBoundsException: 1
        # at hex.rf.Data.sample_fair(Data.java:149)

        # always match the run below!
        print "\nAssuming two row dataset is illegal. avoiding"

        for x in xrange (10,100,10):
            shCmdString = "perl " + parityPl + " 128 4 "+ str(x) + " quad " + SYNDATASETS_DIR
            h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split())
            # algorithm for creating the path and filename is hardwired in parity.pl.
            csvFilename = "parity_128_4_" + str(x) + "_quad.data"  

        trees = 6
        timeoutSecs = 20
        # always match the gen above!
        # FIX! we fail if min is 3
        for x in xrange (10,100,10):
            sys.stdout.write('.')
            sys.stdout.flush()
            csvFilename = "parity_128_4_" + str(x) + "_quad.data"  
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            parseResult = h2i.import_parse(path=csvPathname, schema='put')
            h2o_cmd.runRF(parseResult=parseResult, trees=trees, timeoutSecs=timeoutSecs)

            trees += 10
            timeoutSecs += 2
Example #23
    def import_frame(self, target_key, bucket, csvFilename, csvPathname, expected_rows, expected_cols):
        path = csvPathname + '/' + csvFilename
        parseResult = h2i.import_parse(bucket=bucket, path=path, hex_key=target_key, schema='put') # upload the file
        destination_key = parseResult['destination_key']  # we block until it's actually ready

        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        h2o_cmd.infoFromInspect(inspect, csvPathname)
        actual_rows = inspect['numRows']
        actual_cols = inspect['numCols']

        print 'loaded frame "' + target_key +'" from path: ' + path
        print 'rows: ', actual_rows
        print 'cols: ', actual_cols

        # Don't have access to the testCase assert methods here because they aren't class methods. :-(
        assert expected_rows == actual_rows, "Expected " + str(expected_rows) + " but got " + str(actual_rows) + " for path: " + path
        assert expected_cols == actual_cols, "Expected " + str(expected_cols) + " but got " + str(actual_cols) + " for path: " + path

        # TODO: other info we could check
        # (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \
        #     h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=True)
        # 
        # summaryResult = h2o_cmd.runSummary(key=parseResult['destination_key'])
        # h2o_cmd.infoFromSummary(summaryResult) # , noPrint=True
        return destination_key
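A hypothetical call to the helper above (the dataset name and expected shape are illustrative assumptions, not taken from the original):

        # hypothetical usage; assumes smalldata/iris/iris2.csv parses to 150 rows x 5 cols
        key = self.import_frame('iris_train.hex', 'smalldata', 'iris2.csv', 'iris', 150, 5)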
Example #24
    def test_B_kmeans_benign(self):
        h2o.beta_features = True
        csvPathname = "logreg"
        csvFilename = "benign.csv"
        print "\nStarting", csvFilename
        
        parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname + "/"+csvFilename, schema='local', hex_key=csvFilename+".hex", noPoll=True, doSummary=False)
        h2o_jobs.pollWaitJobs(timeoutSecs=300, pollTimeoutSecs=300, retryDelaySecs=5)

        expected = [
            ([24.538961038961038, 2.772727272727273, 46.89032467532467, 0.1266233766233766, 12.012142857142857, 1.0105194805194804, 1.5222727272727272, 22.26039690646432, 12.582467532467534, 0.5275062016635049, 2.9477601050634767, 162.52136363636365, 41.94558441558441, 1.661883116883117], 77, 46889.32010560476) ,
            ([25.587719298245613, 2.2719298245614037, 45.64035087719298, 0.35964912280701755, 13.026315789473685, 1.4298245614035088, 1.3070175438596492, 24.393307707470925, 13.333333333333334, 0.5244431302976542, 2.7326039818647745, 122.46491228070175, 40.973684210526315, 1.6754385964912282], 114, 64011.20272144667) ,
            ([30.833333333333332, 2.9166666666666665, 46.833333333333336, 0.0, 13.083333333333334, 1.4166666666666667, 1.5833333333333333, 24.298220973782772, 11.666666666666666, 0.37640449438202245, 3.404494382022472, 224.91666666666666, 39.75, 1.4166666666666667], 12, 13000.485226507595) ,

        ]
        # all are multipliers of expected tuple value
        allowedDelta = (0.01, 0.01, 0.01)

        # loop, to see if we get same centers
        for trial in range(2):
            params = {'k': 3, 
                      'initialization': 'Furthest', 
                      'ignored_cols' : None, 
                      'destination_key': 'benign_k.hex',
                      'max_iter': 50,
                      'seed': 265211114317615310,
                     }
            kwargs = params.copy()
            kmeans = h2o_cmd.runKMeans(parseResult=parseResult, timeoutSecs=5, **kwargs)
            (centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeans, csvFilename, parseResult, 'd', **kwargs)
            h2o_jobs.pollWaitJobs(timeoutSecs=300, pollTimeoutSecs=300, retryDelaySecs=5)
            h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, trial=trial)
Example #25
    def test_exec2_poppush2_fail(self):
        bucket = 'smalldata'
        csvPathname = 'iris/iris2.csv'
        hexKey = 'i.hex'
        parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey)

        exprList = []
        while len(exprList) != 20:
            exprs = [random.choice(phrases) for j in range(random.randint(1,2))]
            # check if the phrase includes a function defn (e.g. mean2())
            # h2o has problems with assigns after functions, but the skip for that
            # case was disabled, so both branches append the same expression
            functionFound = any('function' in e for e in exprs)
            if functionFound and len(exprs) > 1:
                exprList.append("".join(exprs))
            else:
                exprList.append("".join(exprs))


        # add this one for good measure (known fail)
        # exprList += "crunk=function(x){x+98};r.hex[,3]=4;"
        exprList += ["function(x){x+98};r.hex[,3]=4;"]

        for resultKey, execExpr in initList:
            h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=resultKey, timeoutSecs=4)

        for execExpr in exprList:
            h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=4)
Example #26
    def test_exec2_ddply_phrases(self):
        h2o.beta_features = True
        bucket = 'home-0xdiag-datasets'
        # csvPathname = 'standard/covtype.data'
        csvPathname = "standard/covtype.shuffled.10pct.data"

        hexKey = 'i.hex'
        parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local', hex_key=hexKey)


        for col in range(1,10):
            initList = [
                ('r.hex', 'r.hex=i.hex'),
                (None, "func1=function(x){max(x[,%s])}" % col),
                (None, "func2=function(x){a=3;nrow(x[,%s])*a}" % col),
                (None, "func3=function(x){apply(x[,%s],2,sum)/nrow(x[,%s])}" % (col, col) ),
                # (None, "function(x) { cbind( mean(x[,1]), mean(x[,%s]) ) }" % col),
                (None, "func4=function(x) { mean( x[,%s]) }" % col), 
                (None, "func5=function(x) { sd( x[,%s]) }" % col), 
                # (None, "func6=function(x) { quantile(x[,%s] , c(0.9) ) }" % col),
            ]
            for resultKey, execExpr in initList:
                h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=resultKey, timeoutSecs=60)

            for p in phrases:
                execExpr = "ddply(r.hex, c(2), " + p + ")" 
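                # e.g. this builds "ddply(r.hex, c(2), func4)" when p names one of the
                # functions defined above (contents of phrases are assumed)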
                h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=60)
Example #27
    def test_GenParity1(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        # always match the run below!
        for x in [10000]:
            # Have to split the string out to list for pipe
            shCmdString = "perl " + h2o.find_file("syn_scripts/parity.pl") + " 128 4 " + str(x) + " quad " + SYNDATASETS_DIR
            h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(), timeout=4)
            # the algorithm for creating the path and filename is hardwired in parity.pl..i.e
            csvFilename = "parity_128_4_" + str(x) + "_quad.data"  

        # always match the gen above!
        trial = 1
        for x in xrange (1,10,1):
            sys.stdout.write('.')
            sys.stdout.flush()

            # just use one file for now
            csvFilename = "parity_128_4_" + str(10000) + "_quad.data"  
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            # broke out the put separately so we can iterate a test just on the RF
            parseResult = h2i.import_parse(path=csvPathname, schema='put')

            h2o.verboseprint("Trial", trial)
            h2o_cmd.runRF(parseResult=parseResult, trees=237, depth=45, timeoutSecs=480)

            # don't change tree count yet
            ## trees += 10
            ### timeoutSecs += 2
            trial += 1
Example #28
    def test_NOPASS_GLM2_weight_nan_fail(self):
        h2o.beta_features = True
        csvPathname = 'covtype/covtype.20k.data'
        hex_key = 'covtype.20k.hex'
        parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=hex_key, schema='put')
        kwargs = {
            'destination_key': 'GLM_model_python_0_default_0', 
            'family': 'tweedie', 
            'tweedie_variance_power': 1.9999999, 
            'max_iter': 10, 
            'alpha': 0, 
            'lambda': 0,
            'response': 54, 
        }

        for trial in range(3):
            # params is mutable. This is default.
            start = time.time()
            glm = h2o_cmd.runGLM(timeoutSecs=70, parseResult=parseResult, **kwargs)
            h2o.check_sandbox_for_errors()
            # pass the kwargs with all the params, so we know what we asked for!
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)

            print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds'
            print "Trial #", trial, "completed\n"
Example #29
    def test_parse_bad_30rows_fvec(self):
        # h2b.browseTheCloud()
        SYNDATASETS_DIR = h2o.make_syn_dir()
        csvPathname = SYNDATASETS_DIR + "/bad.data"
        dsf = open(csvPathname, "w+")
        dsf.write(datalines)
        dsf.close()

        for i in range(20):
            # single_quotes is fixed at 1 (the "every other one" alternation
            # is not implemented here)
            single_quotes = 1

            # force header=1 to make it not fail (doesn't deduce correctly)
            parseResult = h2i.import_parse(
                path=csvPathname, schema="put", single_quotes=single_quotes, header=1, hex_key="trial" + str(i) + ".hex"
            )
            inspect = h2o_cmd.runInspect(key=parseResult["destination_key"])
            print "\n" + csvPathname, "    numRows:", "{:,}".format(inspect["numRows"]), \
                "    numCols:", "{:,}".format(inspect["numCols"])
            numRows = inspect["numRows"]
            numCols = inspect["numCols"]
            self.assertEqual(numCols, 4, "Parsed wrong number of cols: %s" % numCols)
            self.assertNotEqual(
                numRows,
                30,
                "Parsed wrong number of rows. Should be 29. Didn't deduce header?: %s" % numRows,
            )
            self.assertEqual(numRows, 29, "Parsed wrong number of rows: %s" % numRows)
Example #30
    def test_parse_many_cols(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (100, 5000, 'cA', 60),
            (100, 6000, 'cB', 60),
            (100, 7000, 'cC', 60),
            (100, 8000, 'cD', 60),
            (100, 8200, 'cE', 60),
            (100, 8500, 'cF', 60),
            (100, 9000, 'cG', 60),
            (100, 10000, 'cI', 60),
            (100, 11000, 'cH', 60),
            ]

        ### h2b.browseTheCloud()
        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "\nCreating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)
            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=timeoutSecs)
            print csvFilename, 'parse time:', parseResult['response']['time']
            print "Parse result['destination_key']:", parseResult['destination_key']
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=60)
            print "\n" + csvFilename

            if not h2o.browse_disable:
                h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
                time.sleep(5)
Example #31
    def test_GLM2_covtype_train_predict_all_all(self):
        h2o.beta_features = True
        importFolderPath = "standard"
        csvFilename = 'covtype.shuffled.data'
        csvPathname = importFolderPath + "/" + csvFilename
        hex_key = csvFilename + ".hex"

        # Parse and Exec************************************************
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets',
                                       path=csvPathname,
                                       schema='put',
                                       hex_key=hex_key,
                                       timeoutSecs=180)

        execExpr = "A.hex=%s" % parseResult['destination_key']
        h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)

        # use exec to change the output col to binary, case_mode/case_val doesn't work if we use predict
        # will have to live with random extract. will create variance
        # class 1 = 1, everything else 0
        y = 54
        execExpr = "A.hex[,%s]=(A.hex[,%s]==%s)" % (y + 1, y + 1, 1)  # class 1
        h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)
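        # e.g. rows whose class label in col 55 was 1 now hold 1; labels 2..7 become 0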

        inspect = h2o_cmd.runInspect(key="A.hex")
        print "\n" + csvPathname, \
            "    numRows:", "{:,}".format(inspect['numRows']), \
            "    numCols:", "{:,}".format(inspect['numCols'])

        print "Use same data (full) for train and test"
        trainDataKey = "A.hex"
        testDataKey = "A.hex"
        # start at 90% rows + 1

        # GLM, predict, CM *******************************************************
        kwargs = {
            'response': 'C' + str(y + 1),
            'max_iter': 20,
            'n_folds': 0,
            # 'alpha': 0.1,
            # 'lambda': 1e-5,
            'alpha': 0.0,
            'lambda': None,
            'family': 'binomial',
        }
        timeoutSecs = 60

        for trial in range(1):
            # test/train split **********************************************
            aHack = {'destination_key': trainDataKey}

            # GLM **********************************************
            start = time.time()
            glm = h2o_cmd.runGLM(parseResult=aHack,
                                 timeoutSecs=timeoutSecs,
                                 pollTimeoutSecs=180,
                                 **kwargs)
            print "glm end on ", parseResult['destination_key'], 'took', time.time() - start, 'seconds'
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)

            modelKey = glm['glm_model']['_key']
            submodels = glm['glm_model']['submodels']
            # hackery to make it work when there's just one
            validation = submodels[-1]['validation']
            best_threshold = validation['best_threshold']
            thresholds = validation['thresholds']
            # have to look up the index for the cm, from the thresholds list
            best_index = None
            for i, t in enumerate(thresholds):
                if t == best_threshold:
                    best_index = i
                    break
            cms = validation['_cms']
            cm = cms[best_index]
            trainPctWrong = h2o_gbm.pp_cm_summary(cm['_arr'])

            # Score **********************************************
            predictKey = 'Predict.hex'
            start = time.time()

            predictResult = h2o_cmd.runPredict(data_key=testDataKey,
                                               model_key=modelKey,
                                               destination_key=predictKey,
                                               timeoutSecs=timeoutSecs)

            predictCMResult = h2o.nodes[0].predict_confusion_matrix(
                actual=testDataKey,
                vactual='C' + str(y + 1),
                predict=predictKey,
                vpredict='predict',
            )

            cm = predictCMResult['cm']

            # These will move into the h2o_gbm.py
            pctWrong = h2o_gbm.pp_cm_summary(cm)
            self.assertEqual(
                pctWrong, trainPctWrong,
                "Should see the same error rate on train and predict? (same data set)"
            )

            print "\nTest\n==========\n"
            print h2o_gbm.pp_cm(cm)

            print "Trial #", trial, "completed"
Example #32
    def test_GBM_basic(self):
        bucket = 'home-0xdiag-datasets'
        importFolderPath = 'standard'
        trainFilename = 'covtype.shuffled.90pct.data'
        train_key = 'covtype.train.hex'
        model_key = 'GBMModelKey'
        timeoutSecs = 1800
        csvPathname = importFolderPath + "/" + trainFilename

        # FIX! do I need to force enum for classification? what if I do regression after this?
        columnTypeDict = {54: 'Enum'}
        parseResult = h2i.import_parse(bucket=bucket,
                                       path=csvPathname,
                                       columnTypeDict=columnTypeDict,
                                       schema='local',
                                       chunk_size=4194304,
                                       hex_key=train_key,
                                       timeoutSecs=timeoutSecs)

        pA = h2o_cmd.ParseObj(parseResult)
        iA = h2o_cmd.InspectObj(pA.parse_key)
        parse_key = pA.parse_key
        numRows = iA.numRows
        numCols = iA.numCols
        labelList = iA.labelList

        labelListUsed = list(labelList)
        numColsUsed = numCols

        # run through a couple of parameter sets
        parameters = []
        parameters.append({
            'response_column': 'C55',
            'ntrees': 2,
            'max_depth': 10,
            'min_rows': 3,
            'nbins': 40,
            'learn_rate': 0.2,
            'loss': 'multinomial',
            # FIX! doesn't like it?
            # 'loss': 'Bernoulli',
            # FIX..no variable importance for GBM yet?
            # 'variable_importance': False,
            # 'seed':
        })

        parameters.append({
            'response_column': 'C55',
            'loss': 'multinomial',
            # This does nothing! intent is solely based on type of response col
            'ntrees': 1,
            'max_depth': 20,
            'min_rows': 3,
            'nbins': 40,
            'learn_rate': 0.2,
        })

        model_key = 'covtype_gbm.hex'

        for p in parameters:
            bmResult = h2o.n0.build_model(algo='gbm',
                                          destination_key=model_key,
                                          training_frame=train_key,
                                          validation_frame=train_key,
                                          parameters=p,
                                          timeoutSecs=60)
            bm = OutputObj(bmResult, 'bm')

            modelResult = h2o.n0.models(key=model_key)
            model = OutputObj(modelResult['models'][0]['output'], 'model')

            cmmResult = h2o.n0.compute_model_metrics(model=model_key,
                                                     frame=parse_key,
                                                     timeoutSecs=60)
            cmm = OutputObj(cmmResult, 'cmm')
            print "\nLook!, can use dot notation: cmm.cm.confusion_matrix", cmm.cm.confusion_matrix, "\n"

            vis = OutputObj(model.variable_importances, 'vis')

            # just the first 10
            visDataChopped = [v[0:10] for v in vis.data]
            names = visDataChopped[0]
            relativeImportance = visDataChopped[1]
            print "names:", names
            print "relativeImportance:", relativeImportance
            scaledImportance = visDataChopped[2]
            percentage = visDataChopped[3]
            print "\nvis\n", tabulate(visDataChopped[1:], headers=names)
            # print "\nrelativeImportance (10)\n", tabulate(relativeImportance, headers=names)
            # print "\nscaledImportance (10)\n", tabulate(scaledImportance, headers=names)
            # print "\npercentage (10)\n", tabulate(percentage, headers=names)

            print "will say Regression or Classification. no Multinomial?"
            print "model.model_category", model.model_category
            assert model.model_category == 'Multinomial', model.model_category

            print "FIX! why is mse 0 and mse_train Nan?"
            print "model.mse:", model.mse
            print "model.mse_train:", model.mse_train

            if 1 == 1:
                print ""
                for i, c in enumerate(cmm.cm):
                    print "\ncmms.cm[%s]" % i, tabulate(c)
                print ""

            mmResult = h2o.n0.model_metrics(model=model_key,
                                            frame=parse_key,
                                            timeoutSecs=60)
            mmResultShort = mmResult['model_metrics'][0]
            del mmResultShort['frame']  # too much!
            mm = OutputObj(mmResultShort, 'mm')

            prResult = h2o.n0.predict(model=model_key,
                                      frame=parse_key,
                                      timeoutSecs=60)
            pr = OutputObj(prResult['model_metrics'][0]['predictions'], 'pr')
Example #33
    def test_GLM2_basic(self):
        importFolderPath = "logreg"
        csvFilename = 'prostate.csv'
        csvPathname = importFolderPath + "/" + csvFilename
        hex_key = csvFilename + ".hex"

        parseResult = h2i.import_parse(bucket='smalldata',
                                       path=csvPathname,
                                       schema='local',
                                       hex_key=hex_key,
                                       timeoutSecs=180)
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        print inspect
        print "\n" + csvPathname, \
            "    numRows:", "{:,}".format(inspect['numRows']), \
            "    numCols:", "{:,}".format(inspect['numCols'])

        x = 'ID'
        y = 'CAPSULE'
        family = 'binomial'
        alpha = '0.5'
        lambda_ = '1E-4'
        nfolds = '0'
        f = 'prostate'
        modelKey = 'GLM_' + f

        kwargs = {
            'response': y,
            'ignored_cols': x,
            'family': family,
            'lambda': lambda_,
            'alpha': alpha,
            'n_folds': nfolds,  # passes if 0, fails otherwise
            'destination_key': modelKey,
        }

        timeoutSecs = 60
        start = time.time()
        glmResult = h2o_cmd.runGLM(parseResult=parseResult,
                                   timeoutSecs=timeoutSecs,
                                   retryDelaySecs=0.25,
                                   pollTimeoutSecs=180,
                                   **kwargs)

        # this stuff was left over from when we got the result after polling the jobs list
        # okay to do it again
        # GLM2: when it redirects to the model view, we no longer have the job_key! (unlike the first response and polling)
        if 1 == 0:
            job_key = glmResult['job_key']
            # is the job finishing before polling would say it's done?
            params = {'job_key': job_key, 'destination_key': modelKey}
            glm = h2o.nodes[0].completion_redirect(
                jsonRequest="2/GLMProgressPage2.json", params=params)
            print "GLM result from completion_redirect:", h2o.dump_json(glm)
        if 1 == 1:
            glm = h2o.nodes[0].glm_view(_modelKey=modelKey)
            ### print "GLM result from glm_view:", h2o.dump_json(glm)

        h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)

        glm_model = glm['glm_model']
        _names = glm_model['_names']
        coefficients_names = glm_model['coefficients_names']
        submodels = glm_model['submodels'][0]

        beta = submodels['beta']
        norm_beta = submodels['norm_beta']
        iteration = submodels['iteration']

        validation = submodels['validation']
        auc = validation['auc']
        aic = validation['aic']
        null_deviance = validation['null_deviance']
        residual_deviance = validation['residual_deviance']

        print '_names', _names
        print 'coefficients_names', coefficients_names
        # did beta get shortened? the simple check confirms names/beta/norm_beta are same length
        print 'beta', beta
        print 'iteration', iteration
        print 'auc', auc
Ejemplo n.º 34
0
    def test_KMeans_predict3(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        timeoutSecs = 600
        predictCsv = 'predict_0.csv'
        actualCsv = 'actual_0.csv'

        if 1 == 1:
            outputClasses = 3
            y = 4  # last col
            response = 'response'
            skipSrcOutputHeader = 1
            skipPredictHeader = 1
            trees = 40
            bucket = 'smalldata'
            csvPathname = 'iris/iris2.csv'
            hexKey = 'iris2.csv.hex'
            # Huh...now we apparently need the translate. Used to be:
            # No translate because we're using an Exec to get the data out?, and that loses the encoding?
            #  translate = None
            # FIX! how do we know what the translate should be, when we predict?
            translate = {'setosa': 0.0, 'versicolor': 1.0, 'virginica': 2.0}
            # one wrong will be 0.66667. I guess with random, that can happen?
            expectedPctWrong = 0.7

        elif 1 == 0:
            outputClasses = 6
            y = 54  # last col
            response = 'C55'
            skipSrcOutputHeader = 1
            skipPredictHeader = 1
            trees = 6
            # try smaller data set compared to covtype
            bucket = 'home-0xdiag-datasets'
            csvPathname = 'standard/covtype.shuffled.10pct.data'
            hexKey = 'covtype.shuffled.10pct.data.hex'
            translate = {
                '1': 1,
                '2': 2,
                '3': 3,
                '4': 4,
                '5': 5,
                '6': 6,
                '7': 7
            }
            expectedPctWrong = 0.7
        elif 1 == 0:
            outputClasses = 6
            y = 54  # last col
            response = 'C55'
            skipSrcOutputHeader = 1
            skipPredictHeader = 1
            trees = 40
            # try smaller data set compared to covtype
            bucket = 'home-0xdiag-datasets'
            csvPathname = 'standard/covtype.shuffled.10pct.data'
            hexKey = 'covtype.shuffled.10pct.data.hex'
            # translate = {1: 0.0, 2: 1.0, 3: 1.0, 4: 1.0, 5: 1.0, 6: 1.0, 7: 1.0}
            translate = {
                '1': 1,
                '2': 2,
                '3': 3,
                '4': 4,
                '5': 5,
                '6': 6,
                '7': 7
            }
            expectedPctWrong = 0.7
        elif 1 == 0:
            outputClasses = 6
            y = 54  # last col
            response = 'C55'
            skipSrcOutputHeader = 1
            skipPredictHeader = 1
            trees = 6
            bucket = 'home-0xdiag-datasets'
            csvPathname = 'standard/covtype.data'
            hexKey = 'covtype.data.hex'
            translate = {
                '1': 1,
                '2': 2,
                '3': 3,
                '4': 4,
                '5': 5,
                '6': 6,
                '7': 7
            }
            expectedPctWrong = 0.7
        else:
            outputClasses = 10
            y = 0  # first col
            response = 'C1'
            skipSrcOutputHeader = 1
            skipPredictHeader = 1
            trees = 6
            bucket = 'home-0xdiag-datasets'
            csvPathname = 'mnist/mnist_training.csv.gz'
            hexKey = 'mnist_training.hex'
            translate = {
                '0': 0, '1': 1, '2': 2, '3': 3, '4': 4,
                '5': 5, '6': 6, '7': 7, '8': 8, '9': 9}
            expectedPctWrong = 0.7

        csvPredictPathname = SYNDATASETS_DIR + "/" + predictCsv
        csvSrcOutputPathname = SYNDATASETS_DIR + "/" + actualCsv
        # for using below in csv reader
        csvFullname = h2i.find_folder_and_filename(bucket,
                                                   csvPathname,
                                                   schema='put',
                                                   returnFullPath=True)

        def predict_and_compare_csvs(model_key,
                                     hex_key,
                                     predictHexKey,
                                     translate=None,
                                     y=0):
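            # slices the actual response col out to Z.hex, runs predict into
            # predictHexKey, downloads both as csv, and returns the percent of
            # rows where actual and predicted labels differ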
            # have to slice out col 0 (the output) and feed result to predict
            # cols are 0:784 (1 output plus 784 input features)
            # h2e.exec_expr(execExpr="P.hex="+hex_key+"[1:784]", timeoutSecs=30)
            dataKey = "P.hex"
            if skipSrcOutputHeader:
                print "Has header in dataset, so should be able to chop out col 0 for predict and get right answer"
                print "hack for now, can't chop out col 0 in Exec currently"
                dataKey = hex_key
            else:
                print "No header in dataset, can't chop out cols, since col numbers are used for names"
                dataKey = hex_key

            # +1 col index because R-like
            # FIX! apparently we lose the enum mapping when we slice out, and then csv download? we just get the number?
            # OH NO..it looks like we actually preserve the enum..it's in the downloaded csv
            # the prediction is the one that doesn't have it, because it's related to clusters, which have no
            # notion of output classes
            h2e.exec_expr(execExpr="Z.hex=" + hex_key + "[," + str(y + 1) +
                          "]",
                          timeoutSecs=30)

            start = time.time()
            predictResult = h2o.nodes[0].generate_predictions(
                model_key=model_key,
                data_key=hex_key,
                destination_key=predictHexKey)
            print "generate_predictions end on ", hex_key, " took", time.time() - start, 'seconds'
            print "predictResult:", h2o.dump_json(predictResult)

            h2o.check_sandbox_for_errors()
            inspect = h2o_cmd.runInspect(key=predictHexKey)
            h2o_cmd.infoFromInspect(inspect, 'predict.hex')

            h2o.nodes[0].csv_download(src_key="Z.hex",
                                      csvPathname=csvSrcOutputPathname)
            h2o.nodes[0].csv_download(src_key=predictHexKey,
                                      csvPathname=csvPredictPathname)
            h2o.check_sandbox_for_errors()

            print "Do a check of the original output col against predicted output"
            (rowNum1, originalOutput) = compare_csv_at_one_col(
                csvSrcOutputPathname,
                msg="Original",
                colIndex=0,
                translate=translate,
                skipHeader=skipSrcOutputHeader)
            (rowNum2, predictOutput) = compare_csv_at_one_col(
                csvPredictPathname,
                msg="Predicted",
                colIndex=0,
                skipHeader=skipPredictHeader)

            # no header on source
            if ((rowNum1 - skipSrcOutputHeader) !=
                (rowNum2 - skipPredictHeader)):
                raise Exception(
                    "original rowNum1: %s - %d not same as downloaded predict rowNum2: %s - %d" %
                    (rowNum1, skipSrcOutputHeader, rowNum2, skipPredictHeader))

            wrong = 0
            for rowNum, (o, p) in enumerate(zip(originalOutput,
                                                predictOutput)):
                # if float(o)!=float(p):
                if str(o) != str(p):
                    if wrong == 10:
                        print "Not printing any more mismatches\n"
                    elif wrong < 10:
                        msg = "Comparing original output col vs predicted. row %s differs. \
                            original: %s predicted: %s" % (rowNum, o, p)
                        print msg
                    wrong += 1

            print "\nTotal wrong:", wrong
            print "Total:", len(originalOutput)
            pctWrong = (100.0 * wrong) / len(originalOutput)
            print "wrong/Total * 100 ", pctWrong
            # I looked at what h2o can do for modelling with binomial and it should get better than 25% error?

            # hack..need to fix this
            if 1 == 0:
                if pctWrong > 2.0:
                    raise Exception(
                        "pctWrong too high. Expect < 2% error because it's reusing training data"
                    )
            return pctWrong

        #*****************************************************************************

        parseResult = h2i.import_parse(bucket=bucket,
                                       path=csvPathname,
                                       schema='put',
                                       hex_key=hexKey)
        inspect = h2o_cmd.runInspect(key=parseResult['destination_key'])
        numCols = inspect["numCols"]
        numRows = inspect["numRows"]

        seed = random.randint(0, sys.maxint)
        # should pass seed
        # want to ignore the response col? we compare that to predicted

        # if we tell kmeans to ignore a column here, and then use the model on the same dataset to predict,
        # does the column get ignored? (this is the last col; trickier if it's the first col. are the centers "right"?)
        kwargs = {
            'ignored_cols_by_name': response,
            'seed': seed,
            # "seed": 4294494033083512223,
            'k': outputClasses,
            'initialization': 'PlusPlus',
            # sometimes get [24, 29, 97] result with PlusPlus.
            # change to Furthest. maybe have to fix the seed above, but we'll see
            # I provide two legal results below
            'destination_key': 'kmeans_model',
            'max_iter': 1000
        }
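        # the response col is ignored so clustering is unsupervised on the
        # features only; k matches the number of known classes so cluster
        # sizes can be compared against the class distribution later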

        kmeans = h2o_cmd.runKMeans(parseResult=parseResult,
                                   timeoutSecs=60,
                                   **kwargs)
        # this is what the size of each cluster was, when reported by training
        size = kmeans['model']['size']

        # tupleResultList is created like this: ( (centers[i], rows_per_cluster[i], sqr_error_per_cluster[i]) )
        # THIS DOES A PREDICT in it (we used to have to do the predict to get more training result info?)
        (centers, tupleResultList) = h2o_kmeans.bigCheckResults(
            self, kmeans, csvPathname, parseResult, 'd', **kwargs)

        # the tupleResultList has the size during predict? compare it to the sizes during training
        # I assume they're in the same order.
        size2 = [t[1] for t in tupleResultList]
        if size != size2:
            raise Exception(
                "training cluster sizes: %s are not the same as what we got from predict on same data: %s" %
                (size, size2))

        # hack...hardwire for iris here
        # keep this with sizes sorted
        expectedSizes = [
            [39, 50, 61],
            [38, 50, 62],
            # these are bad results that we get once in a while
            [22, 31, 97],
            [24, 29, 97],
            [24, 30, 96],
            [23, 31, 96],
        ]
        sortedSize = sorted(size)
        if sortedSize not in expectedSizes:
            raise Exception(
                "I got cluster sizes %s but expected one of these: %s " %
                (sortedSize, expectedSizes))

        # check center list (first center) has same number of cols as source data
        print "centers:", centers

        # we said to ignore the output so subtract one from expected
        self.assertEqual(
            numCols - 1, len(centers[0]),
            "kmeans first center doesn't have same # of values as dataset row %s %s"
            % (numCols - 1, len(centers[0])))
        # FIX! add expected
        # h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, trial=trial)

        error = kmeans['model']['total_within_SS']
        within_cluster_variances = kmeans['model']['within_cluster_variances']
        print "within_cluster_variances:", within_cluster_variances

        print "Use H2O GeneratePredictionsPage with a H2O generated model and the same data key."
        print "Does this work? (feeding in same data key)if you're predicting, "
        print "don't you need one less column (the last is output?)"
        print "WARNING: max_iter set to 8 for benchmark comparisons"
        print "y=", y  # zero-based index matches response col name

        print ""
        print "oh I see why I can't compare predict to actual, in kmeans"
        print "the cluster order doesn't have to match the output class enum order"
        print "so I don't know what cluster, each output class will be (kmeans)"
        print "all I can say is that the prediction distribution should match the original source distribution"
        print "have to figure out what to do"
        predictHexKey = 'predict_0.hex'
        pctWrong = predict_and_compare_csvs(model_key='kmeans_model',
                                            hex_key=hexKey,
                                            predictHexKey=predictHexKey,
                                            translate=translate,
                                            y=y)

        # we are predicting using training data...so error is really low
        # self.assertAlmostEqual(pctWrong, classification_error, delta = 0.2,
        #     msg="predicted pctWrong: %s should be close to training classification error %s" % (pctWrong, classification_error))
        # can be zero if memorized (iris is either 0 or 0.667?)
        # just make delta 0.7 for now

        # HACK ignoring error for now
        if 1 == 0:
            self.assertAlmostEqual(
                pctWrong,
                expectedPctWrong,
                delta=0.7,
                msg=
                "predicted pctWrong: %s should be small because we're predicting with training data"
                % pctWrong)
Ejemplo n.º 35
0
    def rf_covtype_train_oobe(self,
                              csvFilename,
                              checkExpectedResults=True,
                              expectedAuc=0.5):
        # the expected results are only for the shuffled version
        # since getting 10% samples etc of the smallish dataset will vary between
        # shuffled and non-shuffled datasets
        importFolderPath = "standard"
        csvPathname = importFolderPath + "/" + csvFilename
        hex_key = csvFilename + ".hex"
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets',
                                       path=csvPathname,
                                       hex_key=hex_key,
                                       timeoutSecs=180)
        inspect = h2o_cmd.runInspect(key=parseResult['destination_key'])
        print "\n" + csvPathname, \
            "    numRows:", "{:,}".format(inspect['numRows']), \
            "    numCols:", "{:,}".format(inspect['numCols'])

        numCols = inspect['numCols']
        numRows = inspect['numRows']
        pct10 = int(numRows * .1)
        rowsForPct = [i * pct10 for i in range(0, 11)]
        # this can be slightly less than 10%
        last10 = numRows - rowsForPct[9]
        rowsForPct[10] = numRows
        # use mod below for picking "rows-to-do" in case we do more than 9 trials
        # use 10 if 0 just to see (we copied 10 to 0 above)
        rowsForPct[0] = rowsForPct[10]

        # 0 isn't used
        expectTrainPctRightList = [
            0, 85.16, 88.45, 90.24, 91.27, 92.03, 92.64, 93.11, 93.48, 93.79
        ]
        expectScorePctRightList = [
            0, 88.81, 91.72, 93.06, 94.02, 94.52, 95.09, 95.41, 95.77, 95.78
        ]

        # keep the 0 entry empty
        actualTrainPctRightList = [0]
        actualScorePctRightList = [0]

        trial = 0
        for rowPct in [0.9]:
            trial += 1
            # Not using this now (did use it for slicing)
            rowsToUse = rowsForPct[trial % 10]
            resultKey = "r_" + csvFilename + "_" + str(trial)

            # just do random split for now
            dataKeyTrain = 'rTrain.hex'
            dataKeyTest = 'rTest.hex'

            response = "C55"
            h2o_cmd.createTestTrain(hex_key,
                                    dataKeyTrain,
                                    dataKeyTest,
                                    trainPercent=90,
                                    outputClass=4,
                                    outputCol=numCols - 1,
                                    changeToBinomial=not DO_MULTINOMIAL)
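            # createTestTrain splits hex_key ~90/10 into the train/test keys;
            # changeToBinomial presumably rewrites the response to
            # outputClass-vs-rest when DO_MULTINOMIAL is off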
            sliceResult = {'destination_key': dataKeyTrain}

            # adjust timeoutSecs with the number of trees
            kwargs = paramDict.copy()
            kwargs['destination_key'] = "model_" + csvFilename + "_" + str(
                trial)
            timeoutSecs = 30 + kwargs['ntrees'] * 20
            start = time.time()
            # have to pass validation= param to avoid getting no error results (since 100% sample..DRF2 doesn't like that)
            rfv = h2o_cmd.runRF(parseResult=sliceResult,
                                timeoutSecs=timeoutSecs,
                                validation=dataKeyTest,
                                **kwargs)

            elapsed = time.time() - start
            print "RF end on ", csvPathname, 'took', elapsed, 'seconds.', \
                "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)

            (error, classErrorPctList,
             totalScores) = h2o_rf.simpleCheckRFView(rfv=rfv, **kwargs)
            # oobeTrainPctRight = 100 * (1.0 - error)
            oobeTrainPctRight = 100 - error
            if checkExpectedResults:
                self.assertAlmostEqual(oobeTrainPctRight, expectTrainPctRightList[trial],
                    msg="OOBE: pct. right for %s pct. training not close enough %6.2f %6.2f"% \
                        ((trial*10), oobeTrainPctRight, expectTrainPctRightList[trial]), delta=ALLOWED_DELTA)
            actualTrainPctRightList.append(oobeTrainPctRight)

            print "Now score on the last 10%. Note this is silly if we trained on 100% of the data"
            print "Or sorted by output class, so that the last 10% is the last few classes"
            rf_model = rfv['drf_model']
            used_trees = rf_model['N']
            data_key = rf_model['_dataKey']
            model_key = rf_model['_key']

            rfvScoring = h2o_cmd.runScore(dataKey=dataKeyTest,
                                          modelKey=model_key,
                                          vactual=response,
                                          vpredict=1,
                                          expectedAuc=expectedAuc)
            print h2o.dump_json(rfvScoring)
            h2o_rf.simpleCheckRFScore(rfv=rfvScoring, **kwargs)
            print "hello7"
            (error, classErrorPctList,
             totalScores) = h2o_rf.simpleCheckRFScore(rfv=rfvScoring, **kwargs)
            fullScorePctRight = 100 - error

            h2o.nodes[0].generate_predictions(model_key=model_key,
                                              data_key=dataKeyTest)

            if checkExpectedResults:
                self.assertAlmostEqual(fullScorePctRight,expectScorePctRightList[trial],
                    msg="Full: pct. right for scoring after %s pct. training not close enough %6.2f %6.2f"% \
                        ((trial*10), fullScorePctRight, expectScorePctRightList[trial]), delta=ALLOWED_DELTA)
            actualScorePctRightList.append(fullScorePctRight)

            print "Trial #", trial, "completed", "using %6.2f" % (
                rowsToUse * 100.0 / numRows), "pct. of all rows"

        actualDelta = [
            abs(a - b)
            for a, b in zip(expectTrainPctRightList, actualTrainPctRightList)
        ]
        niceFp = ["{0:0.2f}".format(i) for i in actualTrainPctRightList]
        print "maybe should update with actual. Remove single quotes"
        print "actualTrainPctRightList =", niceFp
        niceFp = ["{0:0.2f}".format(i) for i in actualDelta]
        print "actualDelta =", niceFp

        actualDelta = [
            abs(a - b)
            for a, b in zip(expectScorePctRightList, actualScorePctRightList)
        ]
        niceFp = ["{0:0.2f}".format(i) for i in actualScorePctRightList]
        print "maybe should update with actual. Remove single quotes"
        print "actualScorePctRightList =", niceFp
        niceFp = ["{0:0.2f}".format(i) for i in actualDelta]
        print "actualDelta =", niceFp

        return rfvScoring
Ejemplo n.º 36
0
    def test_parse_covtype_2_maprfs(self):
        csvFilenameAll = [
            # from an 'hdfs dfs -ls /datasets' listing: (filename, bytes)
            ("covtype.data", 75169317),
            ("TEST-poker1000.csv", 23582),
            ("WU_100KRows3KCols.csv", 1120591148),
            ("airlines_all.05p.csv", 607774430),
            ("and-testing.data", 23538333),
            ("arcene2_train.both", 2715738),
            ("arcene_train.both", 2715838),
            # ("bestbuy_test.csv", 152488777),
            # ("bestbuy_train.csv", 243806953),
            ("billion_rows.csv.gz", 1758523515),
            ("covtype.13x.data", 977210917),
            ("covtype.13x.shuffle.data", 977210917),
            ("covtype.4x.shuffle.data", 300678693),
            ("covtype4x.shuffle.data", 300678693),
            ("hhp.unbalanced.012.1x11.data.gz", 6566953),
            ("hhp.unbalanced.012.data.gz", 4233715),
            ("hhp.unbalanced.data.gz", 4235293),
            ("hhp2.os.noisy.0_1.data", 48381802),
            ("hhp2.os.noisy.9_4.data", 48397103),
            ("leads.csv", 2755),
            ("prostate_long_1G.csv", 1115287100),
            # ("3G_poker_shuffle", 3145728000),
            # ("covtype.169x.data", 12703751717),
        ]

        # pick 8 randomly!
        if 1 == 0:
            csvFilenameList = random.sample(csvFilenameAll, 8)
        # Alternatively: do the list in order! Note the order is easy to hard
        else:
            csvFilenameList = csvFilenameAll

        for (csvFilename, totalBytes) in csvFilenameList:
            totalBytes = float(totalBytes)
            timeoutSecs = 900
            multiplyExpected = 1

            # import_result = a_node.import_files(path=find_file("smalldata/logreg/prostate.csv"))
            importFolderPath = "datasets"
            csvPathname = importFolderPath + "/" + csvFilename

            start = time.time()
            parseResult  = h2i.import_parse(path=csvPathname, schema='maprfs', timeoutSecs=timeoutSecs, doSummary=False)
            elapsed = time.time() - start

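            # parse throughput in MB/sec, using the raw byte counts listed in
            # csvFilenameAll above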
            fileMBS = (totalBytes/1e6)/elapsed
            l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f}MB {:6.2f} MB/sec for {:.2f} secs'.format(
                len(h2o.nodes), h2o.nodes[0].java_heap_GB, 'Parse', csvPathname, (totalBytes+0.0)/1e6, fileMBS, elapsed)
            print "\n"+l
            # h2o.cloudPerfH2O.message(l)

            # chunk_size=4194304*2
            pA = h2o_cmd.ParseObj(parseResult)
            iA = h2o_cmd.InspectObj(pA.parse_key)

            parse_key = pA.parse_key
            numRows = iA.numRows
            numCols = iA.numCols
            labelList = iA.labelList

            print iA.missingList, iA.labelList, iA.numRows, iA.numCols

            for i in range(1):
                print "Summary on column", i
                co = h2o_cmd.runSummary(key=parse_key, column=i)

            k = parseResult['frames'][0]['frame_id']['name']
            # print "parseResult:", dump_json(parseResult)
            a_node = h2o.nodes[0]
            frames_result = a_node.frames(key=k, row_count=5)
            # print "frames_result from the first parseResult key", dump_json(frames_result)

            # FIX! switch this to look at the summary result
            parseKeyIndexedCheck(frames_result, multiplyExpected)
            # don't want to spill keys
            h2o.nodes[0].remove_all_keys()
Ejemplo n.º 37
0
    def test_GLM2_convergence_1(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (100, 50,  'cD', 300),
            (100, 100, 'cE', 300),
            (100, 200, 'cF', 300),
            (100, 300, 'cG', 300),
            (100, 400, 'cH', 300),
            (100, 500, 'cI', 300),
        ]

        ### h2b.browseTheCloud()
        lenNodes = len(h2o.nodes)

        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = 'syn_%s_%sx%s.csv' % (SEEDPERFILE,rowCount,colCount)
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            print "\nCreating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

            parseResult = h2i.import_parse(path=csvPathname, hex_key=hex_key, timeoutSecs=10, schema='put')
            print "Parse result['destination_key']:", parseResult['destination_key']
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvFilename

            y = colCount
            kwargs = {
                    'max_iter': 10, 
                    'lambda': 1e-8,
                    'alpha': 0,
                    'n_folds': 0,
                    'beta_epsilon': 1e-4,
                    }
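            # alpha=0 is ridge; with lambda this tiny the fit is nearly
            # unpenalized, and max_iter=10 is low enough that the wider
            # datasets can hit the "failed to converge" warning checked below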

            kwargs['response'] = y
            emsg = None
            # FIX! how much should we loop here. 
            for i in range(3):
                start = time.time()
                glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
                print 'glm #', i, 'end on', csvPathname, 'took', time.time() - start, 'seconds'
                # we can pass the warning, without stopping in the test, so we can 
                # redo it in the browser for comparison
                (warnings, coefficients, intercept) = h2o_glm.simpleCheckGLM(self, 
                    glm, None, allowFailWarning=True, **kwargs)

                if 1==0:
                    print "\n", "\ncoefficients in col order:"
                    # since we're loading the x50 file all the time..the real colCount 
                    # should be 50 (0 to 49)
                    showCols = colCount
                    for c in range(showCols):
                        print "%s:\t%.6e" % (c, coefficients[c])
                    print "intercept:\t %.6e" % intercept

                # gets the failed to converge, here, after we see it in the browser too
                x = re.compile("[Ff]ailed")
                if warnings:
                    for w in warnings:
                        if (re.search(x,w)): 
                            # first
                            if emsg is None: emsg = w
                            print w
                if emsg: break
        
            if not h2o.browse_disable:
                h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
                time.sleep(5)
                h2b.browseJsonHistoryAsUrlLastMatch("GLM")
                time.sleep(5)

            # gets the failed to converge, here, after we see it in the browser too
            if emsg is not None:
                raise Exception(emsg)
Ejemplo n.º 38
0
    def test_ddply_plot_multi(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()

        tryList = [
            (1000000, 5, 'cD', 0, 10, 30),
            (1000000, 5, 'cD', 0, 20, 30),
            (1000000, 5, 'cD', 0, 30, 30),
            (1000000, 5, 'cD', 0, 40, 30),
            (1000000, 5, 'cD', 0, 50, 30),
            (1000000, 5, 'cD', 0, 70, 30),
            (1000000, 5, 'cD', 0, 100, 30),
            (1000000, 5, 'cD', 0, 130, 30),
            (1000000, 5, 'cD', 0, 160, 30),
            # (1000000, 5, 'cD', 0, 320, 30),
            # starts to fail here. too many groups?
            # (1000000, 5, 'cD', 0, 640, 30),
            # (1000000, 5, 'cD', 0, 1280, 30),
        ]

        ### h2b.browseTheCloud()
        xList = []
        eList = []
        fList = []
        trial = 0
        for (rowCount, colCount, hex_key, minInt, maxInt,
             timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            # csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(
                colCount) + '.csv'

            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            print "Creating random", csvPathname, "with range", (maxInt -
                                                                 minInt) + 1
            write_syn_dataset(csvPathname, rowCount, colCount, minInt, maxInt,
                              SEEDPERFILE)

            # PARSE train****************************************
            hexKey = 'r.hex'
            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='put',
                                           hex_key=hexKey)

            for resultKey, execExpr in initList:
                h2e.exec_expr(h2o.nodes[0],
                              execExpr,
                              resultKey=resultKey,
                              timeoutSecs=60)

            # do it twice..the second run benefits from caching, so its time is the interesting one
            execExpr = "a1 = ddply(r.hex, c(1,2), " + PHRASE + ")"
            start = time.time()
            h2e.exec_expr(h2o.nodes[0],
                          execExpr,
                          resultKey=None,
                          timeoutSecs=60)
            ddplyElapsed = time.time() - start
            print "ddplyElapsed:", ddplyElapsed

            execExpr = "a2 = ddply(r.hex, c(1,2), " + PHRASE + ")"
            start = time.time()
            (execResult, result) = h2e.exec_expr(h2o.nodes[0],
                                                 execExpr,
                                                 resultKey=None,
                                                 timeoutSecs=60)
            groups = execResult['num_rows']
            maxExpectedGroups = ((maxInt - minInt) + 1)**2
            h2o_util.assertApproxEqual(
                groups,
                maxExpectedGroups,
                rel=0.2,
                msg="groups %s isn't close to expected amount %s" %
                (groups, maxExpectedGroups))

            ddplyElapsed = time.time() - start
            print "ddplyElapsed:", ddplyElapsed
            print "execResult", h2o.dump_json(execResult)

            # should be same answer in both cases
            execExpr = "d=sum(a1!=a2)==0"
            (execResult, result) = h2e.exec_expr(h2o.nodes[0],
                                                 execExpr,
                                                 resultKey=None,
                                                 timeoutSecs=60)
            print "execResult", h2o.dump_json(execResult)
            self.assertEqual(result, 1, "a1 and a2 weren't equal? %s" % result)

            # xList.append(ntrees)
            trial += 1
            # this is the biggest it might be ..depends on the random combinations
            # groups = ((maxInt - minInt) + 1) ** 2
            xList.append(groups)
            eList.append(ddplyElapsed)
            fList.append(ddplyElapsed)

        if DO_PLOT:
            xLabel = 'groups'
            eLabel = 'ddplyElapsed'
            fLabel = 'ddplyElapsed'
            eListTitle = ""
            fListTitle = ""
            h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel,
                              fListTitle, fList, fLabel)
Ejemplo n.º 39
0
    def test_c5_KMeans_sphere_26GB(self):
        h2o.beta_features = False
        # a kludge
        h2o.setup_benchmark_log()

        csvFilename = 'syn_sphere_gen.csv'
        totalBytes = 183538602156
        if FROM_HDFS:
            importFolderPath = "datasets/kmeans_big"
            csvPathname = importFolderPath + '/' + csvFilename
        else:
            importFolderPath = "/home3/0xdiag/datasets/kmeans_big"
            csvPathname = importFolderPath + '/' + csvFilename

        # FIX! put right values in
        # will there be different expected for random vs the other inits?
        expected = [
            ([
                0.0, -113.00566692375459, -89.99595447985321,
                -455.9970643424373, 4732.0, 49791778.0, 36800.0
            ], 248846122, 1308149283316.2988),
            ([
                0.0, 1.0, 1.0, -525.0093818313685, 2015.001629398412,
                25654042.00592703, 28304.0
            ], 276924291, 1800760152555.98),
            ([
                0.0, 5.0, 2.0, 340.0, 1817.995920197288, 33970406.992053084,
                31319.99486705394
            ], 235089554, 375419158808.3253),
            ([
                0.0, 10.0, -72.00113070337981, -171.0198611715457,
                4430.00952228909, 37007399.0, 29894.0
            ], 166180630, 525423632323.6474),
            ([
                0.0, 11.0, 3.0, 578.0043558141306, 1483.0163188052604,
                22865824.99639042, 5335.0
            ], 167234179, 1845362026223.1094),
            ([
                0.0, 12.0, 3.0, 168.0, -4066.995950679284, 41077063.00269915,
                -47537.998050740985
            ], 195420925, 197941282992.43475),
            ([
                0.0, 19.00092954923767, -10.999565572612255, 90.00028669073289,
                1928.0, 39967190.0, 27202.0
            ], 214401768, 11868360232.658035),
            ([
                0.0, 20.0, 0.0, 141.0, -3263.0030236302937, 6163210.990273981,
                30712.99115201907
            ], 258853406, 598863991074.3276),
            ([
                0.0, 21.0, 114.01584574295777, 242.99690338815898,
                1674.0029079209912, 33089556.0, 36415.0
            ], 190979054, 1505088759456.314),
            ([
                0.0, 25.0, 1.0, 614.0032787274755, -2275.9931284021022,
                -48473733.04122273, 47343.0
            ], 87794427, 1124697008162.3955),
            ([
                0.0, 39.0, 3.0, 470.0, -3337.9880599007597, 28768057.98852736,
                16716.003410920028
            ], 78226988, 1151439441529.0215),
            ([
                0.0, 40.0, 1.0, 145.0, 950.9990795199593, 14602680.991458317,
                -14930.007919032574
            ], 167273589, 693036940951.0249),
            ([
                0.0, 42.0, 4.0, 479.0, -3678.0033024834297, 8209673.001421165,
                11767.998552236539
            ], 148426180, 35942838893.32379),
            ([
                0.0, 48.0, 4.0, 71.0, -951.0035145455234, 49882273.00063991,
                -23336.998167498707
            ], 157533313, 88431531357.62982),
            ([
                0.0, 147.00394564757505, 122.98729664236723, 311.0047920137008,
                2320.0, 46602185.0, 11212.0
            ], 118361306, 1111537045743.7646),
        ]

        benchmarkLogging = ['cpu', 'disk', 'network', 'iostats', 'jstack']
        benchmarkLogging = ['cpu', 'disk', 'network', 'iostats']
        # IOStatus can hang?
        benchmarkLogging = ['cpu', 'disk', 'network']
        benchmarkLogging = []

        for trial in range(6):
            # IMPORT**********************************************
            # since H2O deletes the source key, re-import every iteration.
            # PARSE ****************************************
            print "Parse starting: " + csvFilename
            hex_key = csvFilename + "_" + str(trial) + ".hex"
            start = time.time()
            timeoutSecs = 2 * 3600
            kwargs = {}
            if FROM_HDFS:
                parseResult = h2i.import_parse(
                    path=csvPathname,
                    schema='hdfs',
                    hex_key=hex_key,
                    timeoutSecs=timeoutSecs,
                    pollTimeoutSecs=60,
                    retryDelaySecs=2,
                    benchmarkLogging=benchmarkLogging,
                    **kwargs)
            else:
                parseResult = h2i.import_parse(
                    path=csvPathname,
                    schema='local',
                    hex_key=hex_key,
                    timeoutSecs=timeoutSecs,
                    pollTimeoutSecs=60,
                    retryDelaySecs=2,
                    benchmarkLogging=benchmarkLogging,
                    **kwargs)

            elapsed = time.time() - start
            fileMBS = (totalBytes / 1e6) / elapsed
            l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format(
                len(h2o.nodes), h2o.nodes[0].java_heap_GB, 'Parse',
                csvPathname, fileMBS, elapsed)
            print "\n" + l
            h2o.cloudPerfH2O.message(l)

            # KMeans ****************************************
            if not DO_KMEANS:
                continue

            print "col 0 is enum in " + csvFilename + " but KMeans should skip that automatically?? or no?"
            kwargs = {
                'k': 15,
                'max_iter': 3,
                'normalize': 1,
                'initialization': 'Furthest',
                'destination_key': 'junk.hex',
                # we get NaNs if whole col is NA
                'cols': 'C1, C2, C3, C4, C5, C6, C7',
                # reuse the same seed, to get deterministic results
                'seed': 265211114317615310,
            }

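            # rotate the initialization mode each trial:
            # PlusPlus, Furthest, then H2O's default (None)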
            if (trial % 3) == 0:
                kwargs['initialization'] = 'PlusPlus'
            elif (trial % 3) == 1:
                kwargs['initialization'] = 'Furthest'
            else:
                kwargs['initialization'] = None

            timeoutSecs = 4 * 3600
            params = kwargs
            paramsString = json.dumps(params)

            start = time.time()
            kmeans = h2o_cmd.runKMeans(parseResult=parseResult,
                                       timeoutSecs=timeoutSecs,
                                       benchmarkLogging=benchmarkLogging,
                                       **kwargs)
            elapsed = time.time() - start
            print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', "%d pct. of timeout" % (
                (elapsed / timeoutSecs) * 100)
            print "kmeans result:", h2o.dump_json(kmeans)

            l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:s} for {:.2f} secs {:s}'.format(
                len(h2o.nodes), h2o.nodes[0].java_heap_GB, "KMeans",
                "trial " + str(trial), csvFilename, elapsed, paramsString)
            print l
            h2o.cloudPerfH2O.message(l)

            (centers, tupleResultList) = h2o_kmeans.bigCheckResults(
                self, kmeans, csvPathname, parseResult, 'd', **kwargs)
            # all are multipliers of expected tuple value
            allowedDelta = (0.01, 0.01, 0.01)
            h2o_kmeans.compareResultsToExpected(self,
                                                tupleResultList,
                                                expected,
                                                allowedDelta,
                                                allowError=True,
                                                trial=trial)
            h2i.delete_keys_at_all_nodes()
Ejemplo n.º 40
0
    def test_kmeans_benign(self):
        h2o.beta_features = True  # fvec
        importFolderPath = "logreg"
        csvFilename = "benign.csv"
        hex_key = "benign.hex"

        csvPathname = importFolderPath + "/" + csvFilename
        # FIX! hex_key isn't working with Parse2 ? parseResult['destination_key'] not right?
        print "\nStarting", csvFilename
        parseResult = h2i.import_parse(bucket='smalldata',
                                       path=csvPathname,
                                       hex_key=hex_key,
                                       header=1,
                                       timeoutSecs=180,
                                       doSummary=False)

        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        print "\nStarting", csvFilename

        expected = [
            ([
                8.86, 2.43, 35.53, 0.31, 13.22, 1.47, 1.33, 20.06, 13.08, 0.53,
                2.12, 128.61, 35.33, 1.57
            ], 49, None),
            ([
                33.47, 2.29, 50.92, 0.34, 12.82, 1.33, 1.36, 21.43, 13.30,
                0.37, 2.52, 125.40, 43.91, 1.79
            ], 87, None),
            ([
                27.64, 2.87, 48.11, 0.09, 11.80, 0.98, 1.51, 21.02, 12.53,
                0.58, 2.89, 171.27, 42.73, 1.53
            ], 55, None),
            ([
                26.00, 2.67, 46.67, 0.00, 13.00, 1.33, 1.67, 21.56, 11.44,
                0.22, 2.89, 234.56, 39.22, 1.56
            ], 9, None),
        ]

        # all are multipliers of expected tuple value
        allowedDelta = (0.01, 0.01, 0.01, 0.01)

        # loop, to see if we get same centers

        for trial in range(1):
            kmeansSeed = random.randint(0, sys.maxint)
            # kmeansSeed = 6655548259421773879

            kwargs = {
                'k': 4,
                'initialization': 'PlusPlus',
                'destination_key': 'benign_k.hex',
                # 'seed': 265211114317615310,
                'max_iter': 50,
                'seed': kmeansSeed,
            }
            kmeans = h2o_cmd.runKMeans(parseResult=parseResult,
                                       timeoutSecs=5,
                                       **kwargs)

            ## h2o.verboseprint("kmeans result:", h2o.dump_json(kmeans))
            modelView = h2o.nodes[0].kmeans_view(model='benign_k.hex')
            h2o.verboseprint("KMeans2ModelView:", h2o.dump_json(modelView))
            model = modelView['model']
            clusters = model['centers']
            within_cluster_variances = model['within_cluster_variances']
            total_within_SS = model['total_within_SS']
            print "within_cluster_variances:", within_cluster_variances
            print "total_within_SS:", total_within_SS

            # make this fvec legal?
            (centers, tupleResultList) = h2o_kmeans.bigCheckResults(
                self, kmeans, csvPathname, parseResult, 'd', **kwargs)
Ejemplo n.º 41
0
    def test_kmeans_prostate(self):
        h2o.beta_features = True  # fvec

        importFolderPath = "logreg"
        csvFilename = "prostate.csv"
        hex_key = "prostate.hex"
        csvPathname = importFolderPath + "/" + csvFilename
        parseResult = h2i.import_parse(bucket='smalldata',
                                       path=csvPathname,
                                       hex_key=hex_key,
                                       header=1,
                                       timeoutSecs=180)
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        print "\nStarting", csvFilename

        # loop, to see if we get same centers

        expected = [
            ([0.37, 65.77, 1.07, 2.23, 1.11, 10.49, 4.24, 6.31], 215, 36955),
            ([0.36, 66.44, 1.09, 2.21, 1.06, 10.84, 34.16, 6.31], 136, 46045),
            ([0.83, 66.17, 1.21, 2.86, 1.34, 73.30, 15.57, 7.31], 29, 33412),
        ]

        # all are multipliers of expected tuple value
        allowedDelta = (0.01, 0.01, 0.01)
        for trial in range(1):
            # kmeansSeed = random.randint(0, sys.maxint)
            # actually can get a slightly better error sum with a different seed
            # this seed gets the same result as scikit
            kmeansSeed = 6655548259421773879

            kwargs = {
                'ignored_cols': 'ID',
                'k': 3,
                # 'initialization': 'Furthest',
                'initialization': 'PlusPlus',
                'destination_key': 'prostate_k.hex',
                'max_iter': 500,
                'seed': kmeansSeed,
                # reuse the same seed, to get deterministic results (otherwise sometimes fails)
                # 'seed': 265211114317615310,
            }

            # for fvec only?
            kmeans = h2o_cmd.runKMeans(parseResult=parseResult,
                                       timeoutSecs=5,
                                       **kwargs)
            # FIX! how do I get the kmeans result?
            ### print "kmeans result:", h2o.dump_json(kmeans)
            # can't do this
            # inspect = h2o_cmd.runInspect(key='prostate_k.hex')
            modelView = h2o.nodes[0].kmeans_view(model='prostate_k.hex')
            h2o.verboseprint("KMeans2ModelView:", h2o.dump_json(modelView))

            model = modelView['model']
            clusters = model['centers']
            within_cluster_variances = model['within_cluster_variances']
            total_within_SS = model['total_within_SS']
            print "within_cluster_variances:", within_cluster_variances
            print "total_within_SS:", total_within_SS
            (centers, tupleResultList) = h2o_kmeans.bigCheckResults(
                self, kmeans, csvPathname, parseResult, 'd', **kwargs)
            h2o_kmeans.compareResultsToExpected(self,
                                                tupleResultList,
                                                expected,
                                                allowedDelta,
                                                trial=trial)
Ejemplo n.º 42
0
    def test_summary2_NY0(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        choicesList = [
            ('N', 'Y', '0'),
            ('n', 'y', '0'),
            ('F', 'T', '0'),
            ('f', 't', '0'),
            (' N', ' Y', ' 0'),
            (' n', ' y', ' 0'),
            (' F', ' T', ' 0'),
            (' f', ' t', ' 0'),
        ]

        # white space is stripped
        expectedList = [
            ('N', 'Y', '0'),
            ('n', 'y', '0'),
            ('F', 'T', '0'),
            ('f', 't', '0'),
            ('N', 'Y', '0'),
            ('n', 'y', '0'),
            ('F', 'T', '0'),
            ('f', 't', '0'),
        ]

        tryList = [
            # (rowCount, colCount, hex_key, choices, expected)
            (100, 200, 'x.hex', choicesList[4], expectedList[4]),
            (100, 200, 'x.hex', choicesList[5], expectedList[5]),
            (100, 200, 'x.hex', choicesList[6], expectedList[6]),
            (100, 200, 'x.hex', choicesList[7], expectedList[7]),
            (100, 200, 'x.hex', choicesList[3], expectedList[3]),
            (1000, 200, 'x.hex', choicesList[2], expectedList[2]),
            (10000, 200, 'x.hex', choicesList[1], expectedList[1]),
            (100000, 200, 'x.hex', choicesList[0], expectedList[0]),
        ]

        timeoutSecs = 10
        trial = 1
        n = h2o.nodes[0]
        lenNodes = len(h2o.nodes)

        x = 0
        timeoutSecs = 60
        for (rowCount, colCount, hex_key, choices, expected) in tryList:
            # max error = half the bin size?

            SEEDPERFILE = random.randint(0, sys.maxint)
            x += 1

            csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(
                colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            csvPathnameFull = h2i.find_folder_and_filename(None,
                                                           csvPathname,
                                                           returnFullPath=True)

            print "Creating random", csvPathname
            expectedNaCnt = write_syn_dataset(csvPathname, rowCount, colCount,
                                              SEEDPERFILE, choices)
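            # write_syn_dataset returns the per-column NA counts it injected;
            # they're checked against each column's summary missing count below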

            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='put',
                                           hex_key=hex_key,
                                           timeoutSecs=10,
                                           doSummary=False)
            pA = h2o_cmd.ParseObj(parseResult,
                                  expectedNumRows=rowCount,
                                  expectedNumCols=colCount)
            print pA.numRows, pA.numCols, pA.parse_key

            iA = h2o_cmd.InspectObj(pA.parse_key,
                                    expectedNumRows=rowCount,
                                    expectedNumCols=colCount,
                                    expectedMissinglist=[])
            print iA.missingList, iA.labelList, iA.numRows, iA.numCols

            for i in range(colCount):
                # walks across the columns triggering a summary on the col desired
                # runSummary returns a column object now. inspect and parse don't. They return json.
                # maybe eventually will make them return object? But I also pass expected stuff to them
                # should I pass expected to summary? no, more complex?
                co = h2o_cmd.runSummary(key=hex_key, column=i)
                print co.label, co.type, co.missing, co.domain, sum(co.bins)

                print "\nComparing column %s to expected" % i
                self.assertEqual(expectedNaCnt[i], co.missing, "Column %s Expected %s. missing: %s is incorrect" % \
                    (i, expectedNaCnt[i], co.missing))
                self.assertEqual(rowCount - expectedNaCnt[i], sum(co.bins))

            h2p.green_print("\nDone with trial", trial)
            trial += 1

            h2i.delete_keys_at_all_nodes()
Ejemplo n.º 43
0
    def test_parse_summary_manyfiles_1_fvec(self):
        # these will be used as directory imports/parse
        csvDirlist = [
            ("manyfiles-nflx-gz", 600),
        ]
        trial = 0
        for (csvDirname, timeoutSecs) in csvDirlist:

            csvPathname = csvDirname + "/file_1.dat.gz"
            (importResult,
             importPattern) = h2i.import_only(bucket='home-0xdiag-datasets',
                                              path=csvPathname,
                                              schema='local',
                                              timeoutSecs=timeoutSecs)
            print "\nTrying StoreView after the import hdfs"
            h2o_cmd.runStoreView(timeoutSecs=120)

            trialStart = time.time()
            # PARSE****************************************
            hex_key = csvDirname + "_" + str(trial) + ".hex"
            start = time.time()
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets',
                                           path=csvPathname,
                                           schema='local',
                                           hex_key=hex_key,
                                           timeoutSecs=timeoutSecs,
                                           retryDelaySecs=10,
                                           pollTimeoutSecs=120,
                                           doSummary=False)
            elapsed = time.time() - start
            print "parse end on ", parseResult['destination_key'], 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

            # INSPECT******************************************
            # We should be able to see the parse result?
            start = time.time()
            inspect = h2o_cmd.runInspect(None,
                                         parseResult['destination_key'],
                                         timeoutSecs=360)
            print "Inspect:", parseResult[
                'destination_key'], "took", time.time() - start, "seconds"
            h2o_cmd.infoFromInspect(inspect, csvPathname)
            numRows = inspect['numRows']
            numCols = inspect['numCols']
            self.assertEqual(numCols, 542)
            self.assertEqual(numRows, 100000)

            # gives us some reporting on missing values, constant values, to see if we have x specified well
            # figures out everything from parseResult['destination_key']
            # needs y to avoid output column (which can be index or name)
            # assume all the configs have the same y..just check with the first one
            goodX = h2o_glm.goodXFromColumnInfo(
                y=54, key=parseResult['destination_key'], timeoutSecs=300)

            # SUMMARY****************************************
            # pass numRows, so we know when na cnt means row is all na's
            summaryResult = h2o_cmd.runSummary(key=hex_key,
                                               timeoutSecs=360,
                                               numCols=numCols,
                                               numRows=numRows)

            # STOREVIEW***************************************
            print "\nTrying StoreView after the parse"
            h2o_cmd.runStoreView(timeoutSecs=120)

            print "Trial #", trial, "completed in", time.time(
            ) - trialStart, "seconds."
            trial += 1
Ejemplo n.º 44
0
    def test_parse_time(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        csvFilename = "syn_time.csv"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        headerData = None
        colCount = COLS
        # rowCount = 1000
        rowCount = ROWS
        write_syn_dataset(csvPathname, rowCount, colCount, headerData)

        for trial in range(20):
            rowData = rand_rowData()
            # make sure all key names are unique, when we re-put and re-parse (h2o caching issues)
            src_key = csvFilename + "_" + str(trial)
            hex_key = csvFilename + "_" + str(trial) + ".hex"

            start = time.time()
            parseResultA = h2i.import_parse(path=csvPathname, schema='put', src_key=src_key, hex_key=hex_key)
            print "\nA trial #", trial, "parse end on ", csvFilename, 'took', time.time() - start, 'seconds'

            inspect = h2o_cmd.runInspect(key=hex_key)
            missingValuesListA = h2o_cmd.infoFromInspect(inspect, csvPathname)
            print "missingValuesListA", missingValuesListA

            numColsA = inspect['numCols']
            numRowsA = inspect['numRows']
            byteSizeA = inspect['byteSize']

            self.assertEqual(missingValuesListA, [], "missingValuesList should be empty")
            self.assertEqual(numColsA, colCount)
            self.assertEqual(numRowsA, rowCount)

            # do a little testing of saving the key as a csv
            csvDownloadPathname = SYNDATASETS_DIR + "/csvDownload.csv"
            h2o.nodes[0].csv_download(src_key=hex_key, csvPathname=csvDownloadPathname)

            # remove the original parsed key. source was already removed by h2o
            h2o.nodes[0].remove_key(hex_key)
            # interesting. what happens when we do csv download with time data?
            start = time.time()
            parseResultB = h2i.import_parse(path=csvDownloadPathname, schema='put', src_key=src_key, hex_key=hex_key)
            print "B trial #", trial, "parse end on ", csvFilename, 'took', time.time() - start, 'seconds'
            inspect = h2o_cmd.runInspect(key=hex_key)
            missingValuesListB = h2o_cmd.infoFromInspect(inspect, csvPathname)
            print "missingValuesListB", missingValuesListB

            numColsB = inspect['numCols']
            numRowsB = inspect['numRows']
            byteSizeB = inspect['byteSize']

            self.assertEqual(missingValuesListA, missingValuesListB,
                "missingValuesList mismatches after re-parse of downloadCsv result")
            self.assertEqual(numColsA, numColsB,
                "numCols mismatches after re-parse of downloadCsv result")
            # H2O adds a header to the csv created. It puts quotes around the col numbers if no header
            # so I guess that's okay. So allow for an extra row here.
            self.assertEqual(numRowsA, numRowsB,
                "numRowsA: %s numRowsB: %s mismatch after re-parse of downloadCsv result" % (numRowsA, numRowsB) )
            print "H2O writes the internal format (number) out for time."

            # ==> syn_time.csv <==
            # 31-Oct-49, 25-NOV-10, 08-MAR-44, 23-Nov-34, 19-Feb-96, 23-JUN-30
            # 31-Oct-49, 25-NOV-10, 08-MAR-44, 23-Nov-34, 19-Feb-96, 23-JUN-30

            # ==> csvDownload.csv <==
            # "0","1","2","3","4","5"
            # 2.5219584E12,1.293264E12,2.3437116E12,2.0504736E12,3.9829788E12,1.9110204E12

            if 1==0:
                # extra line for column headers?
                self.assertEqual(byteSizeA, byteSizeB,
                    "byteSize mismatches after re-parse of downloadCsv result %d %d" % (byteSizeA, byteSizeB) )

            # FIX! should do some comparison of values? 
            # maybe can use exec to checksum the columns and compare column list.
            # or compare to expected values? (what are the expected values for the number for time inside h2o?)

            # FIX! should compare the results of the two parses. The infoFromInspect result?
            ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
            h2o.check_sandbox_for_errors()
Example No. 45
    def sub_c2_fvec_long(self):
        # a kludge
        h2o.setup_benchmark_log()

        avgMichalSize = 116561140 
        bucket = 'home-0xdiag-datasets'
        ### importFolderPath = 'more1_1200_link'
        importFolderPath = 'manyfiles-nflx-gz'
        print "Using .gz'ed files in", importFolderPath
        csvFilenameList= [
            ("*[1][0-4][0-9].dat.gz", "file_50_A.dat.gz", 50 * avgMichalSize, 1800),
            # ("*[1][0-9][0-9].dat.gz", "file_100_A.dat.gz", 100 * avgMichalSize, 3600),
        ]

        if LOG_MACHINE_STATS:
            benchmarkLogging = ['cpu', 'disk', 'network']
        else:
            benchmarkLogging = []

        pollTimeoutSecs = 120
        retryDelaySecs = 10

        for trial, (csvFilepattern, csvFilename, totalBytes, timeoutSecs) in enumerate(csvFilenameList):
                csvPathname = importFolderPath + "/" + csvFilepattern


                # double import still causing problems?
                # (importResult, importPattern) = h2i.import_only(bucket=bucket, path=csvPathname, schema='local')
                # importFullList = importResult['files']
                # importFailList = importResult['fails']
                # print "\n Problem if this is not empty: importFailList:", h2o.dump_json(importFailList)

                # this accumulates performance stats into a benchmark log over multiple runs 
                # good for tracking whether we're getting slower or faster
                h2o.cloudPerfH2O.change_logfile(csvFilename)
                h2o.cloudPerfH2O.message("")
                h2o.cloudPerfH2O.message("Parse " + csvFilename + " Start--------------------------------")

                start = time.time()
                parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local',
                    hex_key=csvFilename + ".hex", timeoutSecs=timeoutSecs, 
                    retryDelaySecs=retryDelaySecs,
                    pollTimeoutSecs=pollTimeoutSecs,
                    benchmarkLogging=benchmarkLogging)
                elapsed = time.time() - start
                print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \
                    "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

                print "Parse result['destination_key']:", parseResult['destination_key']
                h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=False)

                if totalBytes is not None:
                    fileMBS = (totalBytes/1e6)/elapsed
                    msg = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format(
                        len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern, csvFilename, fileMBS, elapsed)
                    print msg
                    h2o.cloudPerfH2O.message(msg)

                if DO_GLM:
                    # these are all the columns that are enums in the dataset...too many for GLM!
                    x = range(542)  # all column indices; the response (index 378, C379) is passed separately below
                    ignore_x = []
                    for i in [3,4,5,6,7,8,9,10,11,14,16,17,18,19,20,424,425,426,540,541]:
                        x.remove(i)
                        ignore_x.append(i)

                    # column indices are 0-based but the generated names are 1-based (C1..C542)
                    x = ",".join(map(lambda i: "C" + str(i+1), x))
                    ignore_x = ",".join(map(lambda i: "C" + str(i+1), ignore_x))

                    GLMkwargs = {
                        'ignored_cols': ignore_x,
                        'family': 'binomial',
                        'response': 'C379',
                        'max_iter': 4,
                        'n_folds': 1,
                        'alpha': 0.2,
                        'lambda': 1e-5
                    }

                    # convert to binomial: copy the frame, then recode column 379
                    # as a 0/1 indicator of (value > 15)
                    execExpr="A.hex=%s" % parseResult['destination_key']
                    h2e.exec_expr(execExpr=execExpr, timeoutSecs=180)
                    execExpr="A.hex[,%s]=(A.hex[,%s]>%s)" % ('379', '379', 15)
                    h2e.exec_expr(execExpr=execExpr, timeoutSecs=180)
                    aHack = {'destination_key': "A.hex"}

                    start = time.time()
                    glm = h2o_cmd.runGLM(parseResult=aHack, timeoutSecs=timeoutSecs, **GLMkwargs)
                    elapsed = time.time() - start
                    h2o.check_sandbox_for_errors()

                    h2o_glm.simpleCheckGLM(self, glm, None, **GLMkwargs)
                    msg = '{:d} jvms, {:d}GB heap, {:s} {:s} GLM: {:6.2f} secs'.format(
                        len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern, csvFilename, elapsed)
                    print msg
                    h2o.cloudPerfH2O.message(msg)

                h2o_cmd.checkKeyDistribution()
Example No. 46
    def test_GLM_many_cols(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        tryList = [
            # (2, 100, 'cA', 300),
            # (4, 200, 'cA', 300),
            (10000, 1000, 'cB', 300),
            (10000, 3000, 'cC', 500),
        ]

        ### h2b.browseTheCloud()
        lenNodes = len(h2o.nodes)

        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            # csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvFilename = 'syn_binary_' + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='put',
                                           hex_key=hex_key,
                                           timeoutSecs=180,
                                           doSummary=False)
            pA = h2o_cmd.ParseObj(parseResult)
            iA = h2o_cmd.InspectObj(pA.parse_key)
            parse_key = pA.parse_key
            numRows = iA.numRows
            numCols = iA.numCols
            labelList = iA.labelList

            expected = []
            allowedDelta = 0

            labelListUsed = list(labelList)
            response = 'C' + str(len(labelListUsed) - 1)  # last column
            labelListUsed.remove(response)
            numColsUsed = numCols - 1
            for trial in range(1):
                # family [u'gaussian', u'binomial', u'poisson', u'gamma', u'tweedie']
                # link [u'family_default', u'identity', u'logit', u'log', u'inverse', u'tweedie']
                # can we do classification with probabilities?
                # are only lambda and alpha grid searchable?
                parameters = {
                    'validation_frame': parse_key,
                    'ignored_columns': None,
                    # FIX! for now just use a column that's binomial
                    'response_column': response,  # can't take index now?
                    # FIX! when is this needed? redundant for binomial?
                    'balance_classes': False,
                    'max_after_balance_size': None,
                    'standardize': False,
                    'family': 'binomial',
                    'link': None,
                    'alpha': '[1e-4]',
                    'lambda': '[0.5, 0.25, 0.1]',
                    'prior1': None,
                    'lambda_search': None,
                    'nlambdas': None,
                    'lambda_min_ratio': None,
                    # 'use_all_factor_levels': False,
                }
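                # Note (assumption): the list-valued 'lambda' above presumably asks
                # GLM to fit a path over those three penalties, which would answer
                # the grid-search question in the comments above.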
                model_key = 'many_cols_glm.hex'
                bmResult = h2o.n0.build_model(algo='glm',
                                              model_id=model_key,
                                              training_frame=parse_key,
                                              parameters=parameters,
                                              timeoutSecs=300)
                bm = OutputObj(bmResult, 'bm')

                modelResult = h2o.n0.models(key=model_key)
                model = OutputObj(modelResult['models'][0]['output'], 'model')

                h2o_glm.simpleCheckGLM(self, model, parameters, labelList,
                                       labelListUsed)

                cmmResult = h2o.n0.compute_model_metrics(model=model_key,
                                                         frame=parse_key,
                                                         timeoutSecs=60)
                cmm = OutputObj(cmmResult, 'cmm')

                mmResult = h2o.n0.model_metrics(model=model_key,
                                                frame=parse_key,
                                                timeoutSecs=60)
                mm = OutputObj(mmResult, 'mm')

                prResult = h2o.n0.predict(model=model_key,
                                          frame=parse_key,
                                          timeoutSecs=60)
                pr = OutputObj(prResult['model_metrics'][0]['predictions'],
                               'pr')
Example No. 47
    def test_RF_many_cols_enum(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        translateList = [
            'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
            'n', 'o', 'p', 'q', 'r', 's', 't', 'u'
        ]

        tryList = [
            (10000, 100, 'cA', 300),
            (10000, 300, 'cB', 500),
            # (10000,  500, 'cC', 700),
            # (10000,  700, 'cD', 3600),
            # (10000,  900, 'cE', 3600),
            # (10000,  1000, 'cF', 3600),
            # (10000,  1300, 'cG', 3600),
            # (10000,  1700, 'cH', 3600),
            # (10000,  2000, 'cI', 3600),
            # (10000,  2500, 'cJ', 3600),
            (10000, 3000, 'cK', 3600),
        ]

        ### h2b.browseTheCloud()
        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            # csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvFilename = 'syn_binary_' + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE,
                              translateList)

            # PARSE train****************************************
            start = time.time()
            xList = []
            eList = []
            fList = []

            modelKey = 'RFModelKey'

            # Parse (train)****************************************
            start = time.time()
            parseTrainResult = h2i.import_parse(bucket=None,
                                                path=csvPathname,
                                                schema='put',
                                                header=0,
                                                hex_key=hex_key,
                                                timeoutSecs=timeoutSecs,
                                                doSummary=False)
            elapsed = time.time() - start
            print "train parse end on ", csvPathname, 'took', elapsed, 'seconds', \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "train parse result:", parseTrainResult['destination_key']

            # Logging to a benchmark file
            algo = "Parse"
            l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format(
                len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename,
                elapsed)
            print l
            h2o.cloudPerfH2O.message(l)

            inspect = h2o_cmd.runInspect(
                key=parseTrainResult['destination_key'])
            print "\n" + csvPathname, \
                "    numRows:", "{:,}".format(inspect['numRows']), \
                "    numCols:", "{:,}".format(inspect['numCols'])
            numRows = inspect['numRows']
            numCols = inspect['numCols']
            ### h2o_cmd.runSummary(key=parseTrainResult['destination_key'])

            # RF(train iterate)****************************************
            ntrees = 10
            for max_depth in [5, 10, 20, 40]:
                params = {
                    'nbins': 1024,
                    'classification': 1,
                    'ntrees': ntrees,
                    'max_depth': max_depth,
                    'response': 'C' + str(numCols - 1),
                    'ignored_cols_by_name': None,
                }

                print "Using these parameters for RF: ", params
                kwargs = params.copy()

                trainStart = time.time()
                rfResult = h2o_cmd.runSpeeDRF(parseResult=parseTrainResult,
                                              timeoutSecs=timeoutSecs,
                                              destination_key=modelKey,
                                              **kwargs)
                trainElapsed = time.time() - trainStart
                print "RF training completed in", trainElapsed, "seconds. On dataset: ", csvPathname

                # Logging to a benchmark file
                algo = "RF " + " ntrees=" + str(ntrees) + " max_depth=" + str(
                    max_depth)
                l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format(
                    len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo,
                    csvFilename, trainElapsed)
                print l
                h2o.cloudPerfH2O.message(l)
                rfResult["drf_model"] = rfResult.pop("speedrf_model")
                errsLast = rfResult['drf_model']['errs'][-1]
                print "RF 'errsLast'", errsLast

                cm = rfResult['drf_model']['cms'][-1]['_arr']  # use the last one
                pctWrongTrain = h2o_gbm.pp_cm_summary(cm)
                print "\nTrain\n==========\n"
                print h2o_gbm.pp_cm(cm)

                # xList.append(ntrees)
                xList.append(max_depth)
                eList.append(pctWrongTrain)
                fList.append(trainElapsed)

        # just plot the last one
        if 1 == 1:
            xLabel = 'max_depth'
            eLabel = 'pctWrong'
            fLabel = 'trainElapsed'
            eListTitle = ""
            fListTitle = ""
            h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel,
                              fListTitle, fList, fLabel)
Example No. 48
    def test_c9_GLM_airlines_fvec(self):
        files = [
                 ('airlines', 'airlines_all.csv', 'airlines_all.hex', 1800, 'IsDepDelayed')
                ]

        for importFolderPath, csvFilename, trainKey, timeoutSecs, response in files:
            # PARSE train****************************************
            csvPathname = importFolderPath + "/" + csvFilename
            
            start = time.time()
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='local', hex_key=trainKey, 
                timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "parse end on ", csvFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseResult['destination_key']

            # GLM (train)****************************************
            params = {
                # 'lambda': 1e-4,
                # 'alpha': 0.5,
                'lambda': 1e-8,
                'alpha': 0.0,
                'max_iter': 10,
                'n_folds': 3,
                'family': 'binomial',
                'destination_key': "GLMKEY",
                'response': response,
                'ignored_cols': 'CRSDepTime,CRSArrTime,ActualElapsedTime,CRSElapsedTime,AirTime,ArrDelay,DepDelay,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,IsArrDelayed'
            }
            kwargs = params.copy()
            timeoutSecs = 1800
            start = time.time()
            glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs,**kwargs)
            elapsed = time.time() - start
            print "GLM training completed in", elapsed, "seconds. On dataset: ", csvFilename
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)

            modelKey = glm['glm_model']['_key']

            submodels = glm['glm_model']['submodels']
            # hackery to make it work when there's just one
            validation = submodels[-1]['validation']
            best_threshold = validation['best_threshold']
            thresholds = validation['thresholds']
            # have to look up the index for the cm, from the thresholds list
            best_index = None
            for i,t in enumerate(thresholds):
                if t == best_threshold:
                    best_index = i
                    break
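            # A terser alternative to the loop above (assumes best_threshold occurs
            # verbatim in thresholds; list.index raises ValueError otherwise):
            #   best_index = thresholds.index(best_threshold)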
            cms = validation['_cms']
            cm = cms[best_index]
            pctWrong = h2o_gbm.pp_cm_summary(cm['_arr'])
            # FIX! should look at prediction error/class error?
            # self.assertLess(pctWrong, 9, "Should see less than 9% error")

            print "\nTrain\n==========\n"
            print h2o_gbm.pp_cm(cm['_arr'])

            # Score *******************************
            # this messes up if you use case_mode/case_value above
            predictKey = 'Predict.hex'
            start = time.time()

            predictResult = h2o_cmd.runPredict(
                data_key=trainKey,
                model_key=modelKey,
                destination_key=predictKey,
                timeoutSecs=timeoutSecs)

            predictCMResult = h2o.nodes[0].predict_confusion_matrix(
                actual=trainKey,
                vactual=response,
                predict=predictKey,
                vpredict='predict',
                )

            cm = predictCMResult['cm']
            # These will move into the h2o_gbm.py
            pctWrong = h2o_gbm.pp_cm_summary(cm)
            # self.assertLess(pctWrong, 40,"Should see less than 40% error")

            print "\nTest\n==========\n"
            print h2o_gbm.pp_cm(cm)


        h2i.delete_keys_at_all_nodes(timeoutSecs=600)
Example No. 49
    def test_GLM_allstate_s3n_thru_hdfs(self):
        bucket = 'home-0xdiag-datasets'
        importFolderPath = 'allstate'
        csvFilename = "train_set.csv"
        csvPathname = importFolderPath + "/" + csvFilename
        timeoutSecs = 500
        trialMax = 3
        for trial in range(trialMax):
            trialStart = time.time()
            hex_key = csvFilename + "_" + str(trial) + ".hex"

            start = time.time()
            parseResult = h2i.import_parse(bucket=bucket,
                                           path=csvPathname,
                                           schema='s3n',
                                           hex_key=hex_key,
                                           timeoutSecs=timeoutSecs,
                                           retryDelaySecs=10,
                                           pollTimeoutSecs=60)
            elapsed = time.time() - start
            print "parse end on ", hex_key, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseResult['destination_key']

            kwargs = {
                # allstate claim last col
                'y': 34,
                'case_mode': '>',
                'case': 0,
                'family': 'binomial',
                'link': 'logit',
                'n_folds': 2,
                'max_iter': 8,
                'beta_epsilon': 1e-3
            }

            timeoutSecs = 500
            # L2
            kwargs.update({'alpha': 0, 'lambda': 0})
            start = time.time()
            glm = h2o_cmd.runGLM(parseResult=parseResult,
                                 timeoutSecs=timeoutSecs,
                                 pollTimeoutSecs=60,
                                 noise=('JStack', None),
                                 **kwargs)
            elapsed = time.time() - start
            print "glm (L2) end on ", csvFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            h2o_glm.simpleCheckGLM(self, glm, None, noPrint=True, **kwargs)
            h2o.check_sandbox_for_errors()

            # Elastic
            kwargs.update({'alpha': 0.5, 'lambda': 1e-4})
            start = time.time()
            glm = h2o_cmd.runGLM(parseResult=parseResult,
                                 timeoutSecs=timeoutSecs,
                                 pollTimeoutSecs=60,
                                 noise=('JStack', None),
                                 **kwargs)
            elapsed = time.time() - start
            print "glm (Elastic) end on ", csvFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            h2o_glm.simpleCheckGLM(self, glm, None, noPrint=True, **kwargs)
            h2o.check_sandbox_for_errors()

            # L1
            kwargs.update({'alpha': 1.0, 'lambda': 1e-4})
            start = time.time()
            glm = h2o_cmd.runGLM(parseResult=parseResult,
                                 timeoutSecs=timeoutSecs,
                                 pollTimeoutSecs=60,
                                 noise=('JStack', None),
                                 **kwargs)
            elapsed = time.time() - start
            print "glm (L1) end on ", csvFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            h2o_glm.simpleCheckGLM(self, glm, None, noPrint=True, **kwargs)
            h2o.check_sandbox_for_errors()

            print "Trial #", trial, "completed in", time.time() - trialStart, "seconds.", \
Example No. 50
    def test_four_billion_rows_fvec(self):
        h2o.beta_features = True
        timeoutSecs = 1500

        importFolderPath = "billions"
        csvFilenameList = [
            "four_billion_rows.csv",
        ]
        for csvFilename in csvFilenameList:
            csvPathname = importFolderPath + "/" + csvFilename
            start = time.time()

            # Parse*********************************
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets',
                                           path=csvPathname,
                                           schema='local',
                                           timeoutSecs=timeoutSecs,
                                           pollTimeoutSecs=180)
            elapsed = time.time() - start
            print "Parse result['destination_key']:", parseResult[
                'destination_key']
            print csvFilename, "completed in", elapsed, "seconds.", "%d pct. of timeout" % (
                (elapsed * 100) / timeoutSecs)

            # Inspect*********************************
            # We should be able to see the parse result?
            inspect = h2o_cmd.runInspect(key=parseResult['destination_key'])
            numCols = inspect['numCols']
            numRows = inspect['numRows']
            byteSize = inspect['byteSize']
            print "\n" + csvFilename, \
                "    numRows:", "{:,}".format(numRows), \
                "    numCols:", "{:,}".format(numCols), \
                "    byteSize:", "{:,}".format(byteSize)

            expectedRowSize = numCols * 1  # plus output
            # expectedValueSize = expectedRowSize * numRows
            expectedValueSize = 8001271520
            self.assertEqual(byteSize, expectedValueSize,
                msg='byteSize %s is not expected: %s' % \
                (byteSize, expectedValueSize))
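            # Rough arithmetic behind the hardcoded expectation: 2 cols at ~1 byte
            # per compressed value over 4e9 rows is 8e9 bytes; the extra ~1.27 MB
            # is presumably chunk/metadata overhead. Treat that breakdown as an
            # assumption; only the total is asserted.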

            summaryResult = h2o_cmd.runSummary(
                key=parseResult['destination_key'], timeoutSecs=timeoutSecs)
            h2o_cmd.infoFromSummary(summaryResult, noPrint=True)

            self.assertEqual(
                2,
                numCols,
                msg="generated %s cols (including output).  parsed to %s cols"
                % (2, numCols))
            self.assertEqual(4 * 1000000000,
                             numRows,
                             msg="generated %s rows, parsed to %s rows" %
                             (4 * 1000000000, numRows))

            # KMeans*********************************
            kwargs = {
                'k': 3,
                'initialization': 'Furthest',
                'max_iter': 4,
                'normalize': 0,
                'destination_key': 'junk.hex',
                'seed': 265211114317615310,
            }

            timeoutSecs = 900
            start = time.time()
            kmeans = h2o_cmd.runKMeans(parseResult=parseResult,
                                       timeoutSecs=timeoutSecs,
                                       **kwargs)

            # GLM*********************************
            print "\n" + csvFilename
            kwargs = {
                'response': 'C1',
                'n_folds': 0,
                'family': 'binomial',
            }
            # one coefficient is checked a little more
            colX = 1

            # convert to binomial
            execExpr = "A.hex=%s" % parseResult['destination_key']
            h2e.exec_expr(execExpr=execExpr, timeoutSecs=60)
            execExpr = "A.hex[,%s]=(A.hex[,%s]==%s)" % ('C1', 'C1', 1)
            h2e.exec_expr(execExpr=execExpr, timeoutSecs=60)
            aHack = {'destination_key': "A.hex"}
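            # A.hex now has C1 recoded to a 0/1 indicator of (C1 == 1), so the
            # binomial GLM below is effectively class-1-vs-rest.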

            # L2
            timeoutSecs = 900
            kwargs.update({'alpha': 0, 'lambda': 0})
            start = time.time()
            glm = h2o_cmd.runGLM(parseResult=aHack,
                                 timeoutSecs=timeoutSecs,
                                 **kwargs)
            elapsed = time.time() - start
            print "glm (L2) end on ", csvFilename, 'took', elapsed, 'seconds.', "%d pct. of timeout" % (
                (elapsed / timeoutSecs) * 100)
            h2o_glm.simpleCheckGLM(self, glm, "C" + str(colX), **kwargs)
Example No. 51
    def test_many_fp_formats_libsvm_fvec(self):
        h2o.beta_features = True
        # h2b.browseTheCloud()
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (10, 10, 'cA', 30, 'sparse50'),
            (100, 10, 'cB', 30, 'sparse'),
            (100000, 100, 'cC', 30, 'sparse'),
            (1000, 10, 'cD', 30, 'sparse50'),
            (100, 100, 'cE', 30, 'sparse'),
            (100, 100, 'cF', 30, 'sparse50'),
        ]

        # h2b.browseTheCloud()
        for (rowCount, colCount, hex_key, timeoutSecs,
             distribution) in tryList:
            NUM_CASES = h2o_util.fp_format()
            for sel in [random.randint(0, NUM_CASES - 1)]:  # len(caseList)

                SEEDPERFILE = random.randint(0, sys.maxint)
                csvFilename = "syn_%s_%s_%s_%s.csv" % (SEEDPERFILE, sel,
                                                       rowCount, colCount)
                csvPathname = SYNDATASETS_DIR + '/' + csvFilename

                print "Creating random", csvPathname
                # dict of col sums for comparison to exec col sums below
                (synColSumDict,
                 colNumberMax) = write_syn_dataset(csvPathname, rowCount,
                                                   colCount, SEEDPERFILE, sel,
                                                   distribution)

                selKey2 = hex_key + "_" + str(sel)
                parseResult = h2i.import_parse(path=csvPathname,
                                               schema='put',
                                               hex_key=selKey2,
                                               timeoutSecs=timeoutSecs)
                print "Parse result['destination_key']:", parseResult[
                    'destination_key']
                inspect = h2o_cmd.runInspect(None,
                                             parseResult['destination_key'])
                numCols = inspect['numCols']
                numRows = inspect['numRows']
                print "\n" + csvFilename

                # SUMMARY****************************************
                # gives us some reporting on missing values, constant values,
                # to see if we have x specified well
                # figures out everything from parseResult['destination_key']
                # needs y to avoid output column (which can be index or name)
                # assume all the configs have the same y... just check with the first one
                goodX = h2o_glm.goodXFromColumnInfo(
                    y=0, key=parseResult['destination_key'], timeoutSecs=300)

                if DO_SUMMARY:
                    summaryResult = h2o_cmd.runSummary(key=selKey2,
                                                       timeoutSecs=360)
                    h2o_cmd.infoFromSummary(summaryResult, noPrint=True)

                # we might have added some zeros at the end, that our colNumberMax won't include
                print synColSumDict.keys(), colNumberMax
                self.assertEqual(
                    colNumberMax + 1,
                    numCols,
                    msg="generated %s cols (including output). parsed to %s cols"
                        % (colNumberMax + 1, numCols))

                # Exec (column sums)*************************************************
                h2e.exec_zero_list(zeroList)
                # how do we know the max dimension (synthetic may not generate anything for the last col)
                # use numCols? numCols should be <= colCount.

                colSumList = h2e.exec_expr_list_across_cols(
                    None,
                    exprList,
                    selKey2,
                    maxCol=colNumberMax + 1,
                    timeoutSecs=timeoutSecs)

                self.assertEqual(rowCount,
                                 numRows,
                                 msg="generated %s rows, parsed to %s rows" %
                                 (rowCount, numRows))
                # need to fix this for compare to expected
                # we should be able to keep the list of fp sums per col above
                # when we generate the dataset
                print "\ncolSumList:", colSumList
                print "\nsynColSumDict:", synColSumDict

                for k, v in synColSumDict.iteritems():
                    if k > colNumberMax:  # ignore any extra 0 cols at the end
                        continue

                    # k should be an integer index within the parsed column range
                    self.assertTrue(
                        k >= 0 and k < len(colSumList),
                        msg="k: %s len(colSumList): %s numCols: %s" %
                        (k, len(colSumList), numCols))

                    syn = {}
                    if k == 0:
                        syn['name'] = "C1"
                        syn['type'] = {'Int'}
                        syn['min'] = classMin
                        syn['max'] = classMax
                        # don't check these for the col 0 'Target'
                        # syn['scale'] = {1}
                    elif k == 1:  # we forced this to always be 0
                        syn['name'] = "C2"
                        syn['type'] = {'Int'}
                        syn['min'] = 0
                        syn['max'] = 0
                        # syn['scale'] = {1}
                    else:
                        syn['name'] = "C" + str(k + 1)
                        syn['type'] = {'Int', 'Real'}
                        syn['min'] = valMin
                        syn['max'] = valMax
                        # syn['scale'] = {1,10,100,1000}

                    syn['naCnt'] = 0
                    syn['cardinality'] = -1
                    # syn['min'] = 0
                    # syn['max'] = 0
                    # syn['mean'] = 0

                    cols = inspect['cols'][k]
                    for synKey in syn:
                        # we may not see the min/max range of values that was bounded by our gen, but
                        # we can check that it's a subset of the allowed range
                        if synKey == 'min':
                            self.assertTrue(
                                syn[synKey] <= cols[synKey],
                                msg='col %s %s %s should be <= %s' %
                                (k, synKey, cols[synKey], syn[synKey]))
                        elif synKey == 'max':
                            self.assertTrue(
                                syn[synKey] >= cols[synKey],
                                msg='col %s %s %s should be >= %s' %
                                (k, synKey, cols[synKey], syn[synKey]))
                        elif synKey == 'type':
                            if cols[synKey] not in syn[synKey]:
                                print "cols min/max:", cols['min'], cols['max']
                                print "syn min/max:", syn['min'], syn['max']
                                raise Exception(
                                    'col %s %s %s should be in this allowed %s'
                                    % (k, synKey, cols[synKey], syn[synKey]))
                        else:
                            self.assertEqual(
                                syn[synKey],
                                cols[synKey],
                                msg='col %s %s %s should be %s' %
                                (k, synKey, cols[synKey], syn[synKey]))

                    colSum = colSumList[k]
                    print "\nComparing col", k, "sums:", v, colSum
                    # The sums are floating point, but both sides should have
                    # accumulated in the same order, so they ought to agree closely.
                    # places=0 means the difference must round to zero (< ~0.5).
                    self.assertAlmostEqual(
                        float(v),
                        colSum,
                        places=0,
                        msg='col sum %0.6f is not equal to expected %0.6f' %
                        (colSum, v))
Example No. 52
    def GLM_syn_eqns_data(self,
                          ALGO='binomial',
                          DATA_VALUE_MIN=-1,
                          DATA_VALUE_MAX=1,
                          COEFF_VALUE_MIN=-1,
                          COEFF_VALUE_MAX=1,
                          INTCPT_VALUE_MIN=-1,
                          INTCPT_VALUE_MAX=1,
                          DATA_DISTS='unique_pos_neg'):

        SYNDATASETS_DIR = h2o.make_syn_dir()

        if ALGO == 'poisson':
            tryList = [
                (50000, 5, 'cD', 300),
            ]
        else:
            tryList = [
                # (100, 1, 'cA', 300),
                # (100, 25, 'cB', 300),
                # (1000, 25, 'cC', 300),
                # 50 fails, 40 fails
                # (10000, 50, 'cD', 300),
                # 30 passes
                # (10000, 30, 'cD', 300),
                # 200 passed
                (500, 30, 'cD', 300),
                (500, 30, 'cD', 300),
            ]

        ### h2b.browseTheCloud()
        lenNodes = len(h2o.nodes)

        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            modeString = \
                "_Bins" + str(BINS) + \
                "_Dmin" + str(DATA_VALUE_MIN) + \
                "_Dmax" + str(DATA_VALUE_MAX) + \
                "_Cmin" + str(COEFF_VALUE_MIN) + \
                "_Cmax" + str(COEFF_VALUE_MAX) + \
                "_Imin" + str(INTCPT_VALUE_MIN) + \
                "_Imax" + str(INTCPT_VALUE_MAX) + \
                "_Ddist" + str(DATA_DISTS)
            print "modeString:", modeString

            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = 'syn_' + modeString + "_" + str(
                SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(
                    colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname, \
                "using random coefficients and intercept and logit eqn. for output"
            (coefficientsGen, interceptGen) = gen_rand_equation(
                colCount, INTCPT_VALUE_MIN, INTCPT_VALUE_MAX, COEFF_VALUE_MIN,
                COEFF_VALUE_MAX, SEEDPERFILE)
            print coefficientsGen, interceptGen

            write_syn_dataset(csvPathname, rowCount, colCount, coefficientsGen,
                              interceptGen, DATA_VALUE_MIN, DATA_VALUE_MAX,
                              DATA_DISTS, ALGO, SEED)
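            # Hedged sketch of the generating model assumed here (binomial case):
            # for each row x drawn from [DATA_VALUE_MIN, DATA_VALUE_MAX]^colCount,
            #   z = dot(coefficientsGen, x) + interceptGen
            #   p = 1.0 / (1.0 + math.exp(-z))
            #   label = 1 if p > 0.5 else 0    # equivalently, 1 if z > 0
            # so recovering coefficientsGen/interceptGen to within deltaCoeff and
            # deltaIntcpt below is the pass criterion.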

            parseResult = h2i.import_parse(path=csvPathname,
                                           hex_key=hex_key,
                                           schema='put',
                                           timeoutSecs=60)
            print csvFilename, 'parse time:', parseResult['response']['time']
            print "Parse result['destination_key']:", parseResult[
                'destination_key']

            # We should be able to see the parse result?
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvFilename

            y = colCount
            print "GLM is ignoring the thresholds I give it? deciding what's best?"
            kwargs = {
                'family': ALGO,
                'y': y,
                'max_iter': 10,
                'lambda': 0,
                'alpha': 0,
                'n_folds': 0,
                'beta_epsilon': 1e-4,
                # 'thresholds': 0.5,
            }

            start = time.time()
            glm = h2o_cmd.runGLM(parseResult=parseResult,
                                 timeoutSecs=timeoutSecs,
                                 **kwargs)
            (warnings, coefficients,
             intercept) = h2o_glm.simpleCheckGLM(self, glm, 'C1', **kwargs)
            print "glm end on ", csvPathname, 'took', time.time(
            ) - start, 'seconds'

            if ALGO == 'binomial':
                deltaCoeff = 0.1
                deltaIntcpt = 0.2
            else:  # poisson needs more?
                deltaCoeff = 0.4
                deltaIntcpt = 1.0

            for i, c in enumerate(coefficients):
                g = coefficientsGen[i]  # generated
                print "coefficient[%d]: %8.4f,    generated: %8.4f,    delta: %8.4f" % (
                    i, c, g, abs(g - c))
                self.assertAlmostEqual(
                    c,
                    g,
                    delta=deltaCoeff,
                    msg="not close enough. coefficient[%d]: %s,    generated %s"
                    % (i, c, g))

            c = intercept
            g = interceptGen
            print "intercept: %8.4f,    generated: %8.4f,    delta: %8.4f" % (
                c, g, abs(g - c))
            print "need a larger delta compare for intercept?"
            self.assertAlmostEqual(
                c,
                g,
                delta=deltaIntcpt,
                msg="not close enough. intercept: %s,    generated %s" %
                (c, g))
Example No. 53
    def test_DL_airlines_small(self):
        h2o.nodes[0].remove_all_keys()
        csvPathname_train = 'airlines/AirlinesTrain.csv.zip'
        csvPathname_test = 'airlines/AirlinesTest.csv.zip'
        hex_key = 'train.hex'
        validation_key = 'validation.hex'
        timeoutSecs = 60
        parseResult = h2i.import_parse(bucket='smalldata',
                                       path=csvPathname_train,
                                       hex_key=hex_key,
                                       timeoutSecs=timeoutSecs,
                                       doSummary=False)
        pA = h2o_cmd.ParseObj(parseResult)
        iA = h2o_cmd.InspectObj(pA.parse_key)

        parseResultV = h2i.import_parse(bucket='smalldata',
                                        path=csvPathname_test,
                                        hex_key=validation_key,
                                        timeoutSecs=timeoutSecs,
                                        doSummary=False)
        pAV = h2o_cmd.ParseObj(parseResultV)
        iAV = h2o_cmd.InspectObj(pAV.parse_key)

        # Make a random id so each run gets a unique model key
        identifier = ''.join(
            random.sample(string.ascii_lowercase + string.digits, 10))
        model_key = 'deeplearning_' + identifier + '.hex'

        parameters = {
            'validation_frame': validation_key,  # KeyIndexed None
            'ignored_columns': "['IsDepDelayed_REC']",  # string[] None
            'response_column': 'IsDepDelayed',  # string None
            'loss': 'CrossEntropy'
        }
        expectedErr = 0.32  ## expected validation error for the above model
        relTol = 0.15  ## 15% rel. error tolerance due to Hogwild!
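        # Worked numbers: with expectedErr = 0.32 and relTol = 0.15, any actualErr
        # in roughly [0.272, 0.368] passes the relative-error check at the end.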

        timeoutSecs = 60
        start = time.time()

        bmResult = h2o.n0.build_model(algo='deeplearning',
                                      destination_key=model_key,
                                      training_frame=hex_key,
                                      parameters=parameters,
                                      timeoutSecs=timeoutSecs)
        bm = OutputObj(bmResult, 'bm')

        print 'deep learning took', time.time() - start, 'seconds'

        modelResult = h2o.n0.models(key=model_key)
        model = OutputObj(modelResult['models'][0]['output'], 'model')
        #        print "model:", dump_json(model)

        cmmResult = h2o.n0.compute_model_metrics(model=model_key,
                                                 frame=validation_key,
                                                 timeoutSecs=60)
        cmm = OutputObj(cmmResult, 'cmm')

        mmResult = h2o.n0.model_metrics(model=model_key,
                                        frame=validation_key,
                                        timeoutSecs=60)
        mm = OutputObj(mmResult['model_metrics'][0], 'mm')

        prResult = h2o.n0.predict(model=model_key,
                                  frame=validation_key,
                                  timeoutSecs=60)
        pr = OutputObj(prResult['model_metrics'][0]['predictions'], 'pr')

        h2o_cmd.runStoreView()

        actualErr = model['errors']['valid_err']
        print "expected classification error: " + format(expectedErr)
        print "actual   classification error: " + format(actualErr)

        if actualErr != expectedErr and abs(
            (expectedErr - actualErr) / expectedErr) > relTol:
            raise Exception(
                "Scored classification error of %s is not within %s %% relative error of %s"
                % (actualErr, float(relTol) * 100, expectedErr))
Example No. 54
    def test_hdfs_cdh5(self):
        print "\nLoad a list of files from HDFS, parse and do 1 RF tree"
        print "\nYou can try running as hduser/hduser if fail"
        # larger set in my local dir
        # fails because classes aren't integers
        #    "allstate_claim_prediction_train_set.zip",
        csvFilenameAll = [
            # "3G_poker_shuffle"
            ("and-testing.data", 60),
            ### "arcene2_train.both",
            ### "arcene_train.both",
            ### "bestbuy_test.csv",
            ("covtype.data", 60),
            ("covtype4x.shuffle.data", 60),
            # "four_billion_rows.csv",
            ("hhp.unbalanced.012.data.gz", 60),
            ("hhp.unbalanced.data.gz", 60),
            ("leads.csv", 60),
            # ("covtype.169x.data", 1200),
            ("prostate_long_1G.csv", 200),
            ("airlines_all.csv", 1200),
        ]

        # pick 8 randomly!
        if 1 == 0:
            csvFilenameList = random.sample(csvFilenameAll, 8)
        # Alternatively: do the list in order! Note the order is easy to hard
        else:
            csvFilenameList = csvFilenameAll

        # pop open a browser on the cloud
        # h2b.browseTheCloud()

        trial = 0
        print "try importing /tmp2"
        d = h2i.import_only(path="tmp2/*", schema='hdfs', timeoutSecs=1000)
        for (csvFilename, timeoutSecs) in csvFilenameList:
            # creates csvFilename.hex from file in hdfs dir
            print "Loading", csvFilename, 'from HDFS'
            start = time.time()
            hex_key = "a.hex"
            csvPathname = "datasets/" + csvFilename

            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='hdfs',
                                           hex_key=hex_key,
                                           timeoutSecs=1000)
            print "hdfs parse of", csvPathname, "took", time.time(
            ) - start, 'secs'
            pA = h2o_cmd.ParseObj(parseResult)
            iA = h2o_cmd.InspectObj(pA.parse_key)
            parse_key = pA.parse_key
            numRows = iA.numRows
            numCols = iA.numCols
            labelList = iA.labelList

            if DO_EXPORT:
                start = time.time()
                print "Saving", csvFilename, 'to HDFS'
                print "Using /tmp2 to avoid the '.' prefixed files in /tmp2 (kills import)"
                print "Unique per-user to avoid permission issues"
                username = getpass.getuser()
                csvPathname = "tmp2/a%s.%s.csv" % (trial, username)
                # reuse the file name to avoid running out of space
                csvPathname = "tmp2/a%s.%s.csv" % ('_h2o_export_files',
                                                   username)

                path = "hdfs://" + h2o.nodes[
                    0].hdfs_name_node + "/" + csvPathname
                h2o.nodes[0].export_files(src_key=hex_key,
                                          path=path,
                                          force=1,
                                          timeoutSecs=timeoutSecs)
                print "export_files of", hex_key, "to", path, "took", time.time(
                ) - start, 'secs'
                trial += 1

                print "Re-Loading", csvFilename, 'from HDFS'
                start = time.time()
                hex_key = "a2.hex"
                time.sleep(2)
                d = h2i.import_only(path=csvPathname,
                                    schema='hdfs',
                                    timeoutSecs=1000)
                print h2o.dump_json(d)
                parseResult = h2i.import_parse(path=csvPathname,
                                               schema='hdfs',
                                               hex_key=hex_key,
                                               timeoutSecs=1000)
                print "hdfs re-parse of", csvPathname, "took", time.time(
                ) - start, 'secs'
Example No. 55
    def test_GLM_mnist_reals(self):
        importFolderPath = "mnist"
        csvFilelist = [
            ("mnist_reals_training.csv.gz", "mnist_reals_testing.csv.gz", 600),
        ]
        trial = 0
        for (trainCsvFilename, testCsvFilename, timeoutSecs) in csvFilelist:
            trialStart = time.time()

            # PARSE test****************************************
            csvPathname = importFolderPath + "/" + testCsvFilename
            testKey = testCsvFilename + "_" + str(trial) + ".hex"
            start = time.time()
            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='hdfs',
                                           hex_key=testKey,
                                           timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "parse end on ", testCsvFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseResult['destination_key']

            print "We won't use this pruning of x on test data. See if it prunes the same as the training"
            y = 0  # first column is the digit label (0-9); the rest are pixel values
            print "y:"
            x = h2o_glm.goodXFromColumnInfo(y,
                                            key=parseResult['destination_key'],
                                            timeoutSecs=300)

            # PARSE train****************************************
            trainKey = trainCsvFilename + "_" + str(trial) + ".hex"
            start = time.time()
            csvPathname = importFolderPath + "/" + trainCsvFilename
            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='hdfs',
                                           hex_key=trainKey,
                                           timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "parse end on ", trainCsvFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseResult['destination_key']

            # GLM****************************************
            print "This is the pruned x we'll use"
            x = h2o_glm.goodXFromColumnInfo(y,
                                            key=parseResult['destination_key'],
                                            timeoutSecs=300)
            print "x:", x

            params = {
                'x': x,
                'y': y,
                'case_mode': '=',
                'case': 0,
                'family': 'binomial',
                'lambda': 1.0E-5,
                'alpha': 0.0,
                'max_iter': 5,
                'thresholds': 0.5,
                'n_folds': 1,
                'weight': 1,
                'beta_epsilon': 1.0E-4,
            }
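            # Assumption about the old GLM API: case_mode '=' with case=c recodes
            # the response as (y == c), so the loop below fits ten one-vs-rest
            # binomial models, one per digit.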

            for c in [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]:
                kwargs = params.copy()
                print "Trying binomial with case:", c
                kwargs['case'] = c

                timeoutSecs = 1800
                start = time.time()
                glm = h2o_cmd.runGLM(parseResult=parseResult,
                                     timeoutSecs=timeoutSecs,
                                     pollTimeoutSecs=60,
                                     **kwargs)
                elapsed = time.time() - start
                print "GLM completed in", elapsed, "seconds.", \
                    "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

                h2o_glm.simpleCheckGLM(self, glm, None, noPrint=True, **kwargs)
                GLMModel = glm['GLMModel']
                modelKey = GLMModel['model_key']

                start = time.time()
                glmScore = h2o_cmd.runGLMScore(key=testKey,
                                               model_key=modelKey,
                                               thresholds="0.5",
                                               timeoutSecs=60)
                elapsed = time.time() - start
                print "GLMScore in",  elapsed, "secs", \
                    "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
                h2o_glm.simpleCheckGLMScore(self, glmScore, **kwargs)
Example No. 56
    def test_fp_many_cols_fvec(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        if H2O_SUPPORTS_OVER_500K_COLS:
            tryList = [
                (100, 200000, 'cG', 120, 120),
                (100, 300000, 'cH', 120, 120),
                (100, 400000, 'cI', 120, 120),
                (100, 500000, 'cJ', 120, 120),
                (100, 700000, 'cL', 120, 120),
                (100, 800000, 'cM', 120, 120),
                (100, 900000, 'cN', 120, 120),
                (100, 1000000, 'cO', 120, 120),
                (100, 1200000, 'cK', 120, 120),
            ]
        else:
            print "Restricting number of columns tested to <=500,000"
            tryList = [
                (100, 200000, 'cG', 400, 400),
                (100, 300000, 'cH', 400, 400),
                (100, 400000, 'cI', 400, 400),
                (100, 500000, 'cJ', 400, 400),
            ]

        for (rowCount, colCount, hex_key, timeoutSecs,
             timeoutSecs2) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            sel = 0
            csvFilename = "syn_%s_%s_%s_%s.csv" % (SEEDPERFILE, sel, rowCount,
                                                   colCount)
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE,
                              sel)

            start = time.time()
            print csvFilename, "parse starting"
            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='put',
                                           hex_key=hex_key,
                                           timeoutSecs=timeoutSecs,
                                           doSummary=False)
            h2o.check_sandbox_for_errors()
            print "Parse and summary:", parseResult[
                'destination_key'], "took", time.time() - start, "seconds"

            # We should be able to see the parse result?
            start = time.time()
            inspect = h2o_cmd.runInspect(None,
                                         parseResult['destination_key'],
                                         timeoutSecs=timeoutSecs2)
            print "Inspect:", parseResult[
                'destination_key'], "took", time.time() - start, "seconds"
            h2o_cmd.infoFromInspect(inspect, csvPathname)
            print "\n" + csvPathname, \
                "    numRows:", "{:,}".format(inspect['numRows']), \
                "    numCols:", "{:,}".format(inspect['numCols'])

            # should match # of cols in header or ??
            self.assertEqual(
                inspect['numCols'], colCount,
                "parse created result with the wrong number of cols %s %s" %
                (inspect['numCols'], colCount))
            self.assertEqual(inspect['numRows'], rowCount,
                "parse created result with the wrong number of rows (header shouldn't count) %s %s" % \
                (inspect['numRows'], rowCount))
Example No. 57
    def test_GBM_basic_regress(self):
        bucket = 'home-0xdiag-datasets'
        importFolderPath = 'standard'
        trainFilename = 'covtype.shuffled.90pct.data'
        train_key = 'covtype.train.hex'
        model_key = 'GBMModelKey'
        timeoutSecs = 1800
        csvPathname = importFolderPath + "/" + trainFilename

        parseResult = h2i.import_parse(bucket=bucket,
                                       path=csvPathname,
                                       schema='local',
                                       hex_key=train_key,
                                       timeoutSecs=timeoutSecs)
        pA = h2o_cmd.ParseObj(parseResult)
        iA = h2o_cmd.InspectObj(pA.parse_key)
        parse_key = pA.parse_key
        numRows = iA.numRows
        numCols = iA.numCols
        labelList = iA.labelList

        labelListUsed = list(labelList)
        numColsUsed = numCols

        parameters = {
            'validation_frame': train_key,
            'ignored_columns': None,
            'response_column': 'C55',
            # 'balance_classes':
            # 'max_after_balance_size':
            'ntrees': 2,
            'max_depth': 10,
            'min_rows': 3,
            'nbins': 40,
            'learn_rate': 0.2,
            # FIX! doesn't like it?
            # 'loss': 'Bernoulli',
            # FIX..no variable importance for GBM yet?
            # 'variable_importance': False,
            # 'seed':
        }

        model_key = 'covtype_gbm.hex'
        bmResult = h2o.n0.build_model(algo='gbm',
                                      destination_key=model_key,
                                      training_frame=parse_key,
                                      parameters=parameters,
                                      timeoutSecs=60)
        bm = OutputObj(bmResult, 'bm')

        modelResult = h2o.n0.models(key=model_key)
        model = OutputObj(modelResult['models'][0]['output'], 'model')

        cmmResult = h2o.n0.compute_model_metrics(model=model_key,
                                                 frame=parse_key,
                                                 timeoutSecs=60)
        cmm = OutputObj(cmmResult, 'cmm')
        # just check that it's something non-zero
        # assert cmm.cm['prediction_error']!=0.0

        mmResult = h2o.n0.model_metrics(model=model_key,
                                        frame=parse_key,
                                        timeoutSecs=60)
        mmResultShort = mmResult['model_metrics'][0]
        del mmResultShort['frame']  # too much!
        mm = OutputObj(mmResultShort, 'mm')

        prResult = h2o.n0.predict(model=model_key,
                                  frame=parse_key,
                                  timeoutSecs=60)
        pr = OutputObj(prResult['model_metrics'][0]['predictions'], 'pr')
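
For readers on current H2O, a minimal sketch of the same flow using the modern h2o Python package (assumed installed; the estimator API below is a rough mapping of the legacy build_model / compute_model_metrics / predict REST calls, not the harness used above):

import h2o
from h2o.estimators.gbm import H2OGradientBoostingEstimator

h2o.init()
# hypothetical local copy of the training file
train = h2o.import_file("covtype.shuffled.90pct.data")
gbm = H2OGradientBoostingEstimator(ntrees=2, max_depth=10, min_rows=3,
                                   nbins=40, learn_rate=0.2)
gbm.train(y="C55", training_frame=train)   # ~ build_model
perf = gbm.model_performance(train)        # ~ compute_model_metrics
preds = gbm.predict(train)                 # ~ predict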
Example No. 58
    def test_csv_download_libsvm(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (5000, 10000, 'cK', 60),
            (10000, 10000, 'cL', 60),
            (50000, 10000, 'cM', 60),
        ]

        ### h2b.browseTheCloud()
        lenNodes = len(h2o.nodes)

        trial = 0
        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            trial += 1
            csvFilename = 'syn_' + str(SEED) + "_" + str(rowCount) + 'x' + str(
                colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEED)

            start = time.time()
            parseResultA = h2i.import_parse(path=csvPathname,
                                            schema='put',
                                            hex_key=hex_key,
                                            timeoutSecs=timeoutSecs)
            print "\nA Trial #", trial, "rowCount:", rowCount, "colCount:", colCount, "parse end on ", \
                csvFilename, 'took', time.time() - start, 'seconds'

            inspect = h2o_cmd.runInspect(key=hex_key, timeoutSecs=timeoutSecs)
            missingValuesListA = h2o_cmd.infoFromInspect(inspect, csvPathname)
            num_colsA = inspect['num_cols']
            num_rowsA = inspect['num_rows']
            row_sizeA = inspect['row_size']
            value_size_bytesA = inspect['value_size_bytes']

            # do a little testing of saving the key as a csv
            csvDownloadPathname = SYNDATASETS_DIR + "/csvDownload.csv"
            print "\nStarting csv download to", csvDownloadPathname, "rowCount:", rowCount, "colCount:", colCount
            start = time.time()
            h2o.nodes[0].csv_download(src_key=hex_key,
                                      csvPathname=csvDownloadPathname)
            print "csv_download end.", 'took', time.time(
            ) - start, 'seconds. Originally from:', csvFilename

            # remove the original parsed key. source was already removed by h2o
            h2o.nodes[0].remove_key(hex_key)
            start = time.time()
            parseResultB = h2i.import_parse(path=csvDownloadPathname,
                                            schema='put',
                                            hex_key=hex_key,
                                            timeoutSecs=timeoutSecs)
            print "\nB Trial #", trial, "rowCount:", rowCount, "colCount:", colCount, "parse end on ", \
                csvFilename, 'took', time.time() - start, 'seconds'
            inspect = h2o_cmd.runInspect(key=hex_key, timeoutSecs=timeoutSecs)
            missingValuesListB = h2o_cmd.infoFromInspect(inspect, csvPathname)
            num_colsB = inspect['num_cols']
            num_rowsB = inspect['num_rows']
            row_sizeB = inspect['row_size']
            value_size_bytesB = inspect['value_size_bytes']

            self.assertEqual(
                missingValuesListA, missingValuesListB,
                "missingValuesList mismatches after re-parse of downloadCsv result"
            )
            self.assertEqual(
                num_colsA, num_colsB,
                "num_cols mismatches after re-parse of downloadCsv result %d %d"
                % (num_colsA, num_colsB))
            self.assertEqual(
                num_rowsA, num_rowsB,
                "num_rows mismatches after re-parse of downloadCsv result %d %d"
                % (num_rowsA, num_rowsB))
            self.assertEqual(
                row_sizeA, row_sizeB,
                "row_size mismatches after re-parse of downloadCsv result %d %d"
                % (row_sizeA, row_sizeB))
            self.assertEqual(
                value_size_bytesA, value_size_bytesB,
                "value_size_bytes mismatches after re-parse of downloadCsv result %d %d"
                % (value_size_bytesA, value_size_bytesB))

            h2o.check_sandbox_for_errors()
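
The invariant under test is that downloading a parsed key back to CSV and re-parsing it preserves the frame's shape and missing-value profile. A stdlib sketch of the shape half of that round trip (hypothetical paths; no H2O harness):

import csv

def csv_shape(path):
    # (rows, cols) of a rectangular CSV file with no header
    with open(path) as f:
        rows = list(csv.reader(f))
    return len(rows), (len(rows[0]) if rows else 0)

def check_roundtrip(original_path, downloaded_path):
    assert csv_shape(original_path) == csv_shape(downloaded_path), \
        "shape changed across the download/re-parse round trip"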
Example No. 59
    def test_rf_parity_cmp(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        # always match the run below!
        # just using one file for now
        for x in [50000]:
            shCmdString = "perl " + h2o.find_file(
                "syn_scripts/parity.pl") + " 128 4 " + str(
                    x) + " quad " + SYNDATASETS_DIR
            h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(), 4)
            csvFilename = "parity_128_4_" + str(x) + "_quad.data"

        def doBoth():
            h2o.verboseprint("Trial", trial)
            start = time.time()
            # make sure ntrees and max_depth are the same for both
            rfView = h2o_cmd.runRF(parseResult=parseResult,
                                   ntrees=ntrees,
                                   max_depth=40,
                                   response=response,
                                   timeoutSecs=600,
                                   retryDelaySecs=3)
            elapsed1 = time.time() - start
            (totalError1, classErrorPctList1,
             totalScores1) = h2o_rf.simpleCheckRFView(rfv=rfView)

            # reset the timer so elapsed2 measures only the SpeeDRF run
            start = time.time()
            rfView = h2o_cmd.runSpeeDRF(parseResult=parseResult,
                                        ntrees=ntrees,
                                        max_depth=40,
                                        response=response,
                                        timeoutSecs=600,
                                        retryDelaySecs=3)
            elapsed2 = time.time() - start
            (totalError2, classErrorPctList2,
             totalScores2) = h2o_rf.simpleCheckRFView(rfv=rfView)

            print "Checking that results are similar (within 20%)"
            print "DRF2 then SpeeDRF"
            print "per-class variance is large..basically we can't check very well for this dataset"
            for i, (j,
                    k) in enumerate(zip(classErrorPctList1,
                                        classErrorPctList2)):
                print "classErrorPctList[%s]:i %s %s" % (i, j, k)
                # self.assertAlmostEqual(classErrorPctList1[i], classErrorPctList2[i],
                #    delta=1 * classErrorPctList2[i], msg="Comparing RF class %s errors for DRF2 and SpeeDRF" % i)

            print "totalError: %s %s" % (totalError1, totalError2)
            self.assertAlmostEqual(
                totalError1,
                totalError2,
                delta=.2 * totalError2,
                msg="Comparing RF total error for DRF2 and SpeeDRF")
            print "elapsed: %s %s" % (elapsed1, elapsed2)
            self.assertAlmostEqual(
                elapsed1,
                elapsed2,
                delta=.5 * elapsed2,
                msg="Comparing RF times for DRF2 and SpeeDRF")

        # always match the gen above!
        for trial in range(1):
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            hex_key = csvFilename + "_" + str(trial) + ".hex"
            parseResult = h2o_cmd.parseResult = h2i.import_parse(
                path=csvPathname,
                schema='put',
                hex_key=hex_key,
                timeoutSecs=30,
                doSummary=False)

            inspect = h2o_cmd.runInspect(key=hex_key)
            numCols = inspect['numCols']
            numRows = inspect['numRows']
            response = "C" + str(numCols)
            ntrees = 30

            doBoth()
            print "*****************************"
            print "end # %s RF compare" % trial,
            print "*****************************"

            print "Now change all cols to enums"
            for e in range(numCols):
                enumResult = h2o.nodes[0].to_enum(src_key=hex_key,
                                                  column_index=(e + 1))

            doBoth()
            print "*********************************"
            print "end # %s RF compare, with enums #" % trial,
            print "*********************************"
Example No. 60
    def test_impute_with_na(self):
        h2b.browseTheCloud()

        csvFilename = 'covtype.data'
        csvPathname = 'standard/' + csvFilename
        hex_key = "covtype.hex"
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets',
                                       path=csvPathname,
                                       hex_key=hex_key,
                                       schema='local',
                                       timeoutSecs=20)

        print "Just insert some NAs and see what happens"
        inspect = h2o_cmd.runInspect(key=hex_key)
        origNumRows = inspect['numRows']
        origNumCols = inspect['numCols']
        missing_fraction = 0.5

        # NOT ALLOWED TO SET AN ENUM COL?
        if 1 == 0:
            # since insert missing values (below) doesn't insert NA into enum rows, make it NA with exec?
            # just one in row 1
            for enumCol in enumColList:
                print "hack: Putting NA in row 0 of col %s" % enumCol
                execExpr = '%s[1, %s+1] = NA' % (hex_key, enumCol)
                h2e.exec_expr(execExpr=execExpr, timeoutSecs=10)

            inspect = h2o_cmd.runInspect(key=hex_key)
            missingValuesList = h2o_cmd.infoFromInspect(inspect)
            print "missingValuesList after exec:", missingValuesList
            if len(missingValuesList) != len(enumColList):
                raise Exception(
                    "Didn't get missing values in expected number of cols: %s %s"
                    % (enumColList, missingValuesList))

        for trial in range(1):
            # copy the dataset
            hex_key2 = 'c.hex'
            execExpr = '%s = %s' % (hex_key2, hex_key)
            h2e.exec_expr(execExpr=execExpr, timeoutSecs=10)

            imvResult = h2o.nodes[0].insert_missing_values(
                key=hex_key2, missing_fraction=missing_fraction, seed=SEED)
            print "imvResult", h2o.dump_json(imvResult)

            # maybe make the output col a factor column
            # maybe one of the 0,1 cols too?
            # java.lang.IllegalArgumentException: Method `mode` only applicable to factor columns.
            # ugh. ToEnum2 and ToInt2 take 1-based column indexing. This should really change back to 0 based for h2o-dev? (like Exec3)

            print "Doing the ToEnum2 AFTER the NA injection, because h2o doesn't work right if we do it before"
            expectedMissing = missing_fraction * origNumRows  # per col
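            # Worked example, assuming the full covtype dataset (581,012 rows):
            # expectedMissing = 0.5 * 581012 = 290506 NAs per column; the
            # assertApproxEqual below allows roughly a 10% band around that,
            # since insert_missing_values draws the misses randomly per cell.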
            enumColList = [49, 50, 51, 52, 53, 54]
            for e in enumColList:
                enumResult = h2o.nodes[0].to_enum(src_key=hex_key2,
                                                  column_index=(e + 1))

            inspect = h2o_cmd.runInspect(key=hex_key2)
            numRows = inspect['numRows']
            numCols = inspect['numCols']
            self.assertEqual(origNumRows, numRows)
            self.assertEqual(origNumCols, numCols)

            missingValuesList = h2o_cmd.infoFromInspect(inspect)
            print "missingValuesList", missingValuesList

            # this is an approximation because we can't force an exact # of missing using insert_missing_values
            if len(missingValuesList) != numCols:
                raise Exception(
                    "Why is missingValuesList not right afer ToEnum2?: %s %s" %
                    (enumColList, missingValuesList))
            for mv in missingValuesList:
                h2o_util.assertApproxEqual(
                    mv,
                    expectedMissing,
                    rel=0.1 * mv,
                    msg='mv %s is not approx. expected %s' %
                    (mv, expectedMissing))

            summaryResult = h2o_cmd.runSummary(key=hex_key2)
            h2o_cmd.infoFromSummary(summaryResult)

            print "I don't understand why the values don't increase every iteration. It seems to stay stuck with the first effect"
            print "trial", trial
            print "expectedMissing:", expectedMissing

            print "Now get rid of all the missing values, by imputing means. We know all columns should have NAs from above"
            print "Do the columns in random order"

            # don't do the enum cols ..impute doesn't support right?
            if AVOID_BUG:
                shuffledColList = range(0, 49)  # 0 to 48
                execExpr = '%s = %s[,1:49]' % (hex_key2, hex_key2)
                h2e.exec_expr(execExpr=execExpr, timeoutSecs=10)
                # summaryResult = h2o_cmd.runSummary(key=hex_key2)
                # h2o_cmd.infoFromSummary(summaryResult)
                inspect = h2o_cmd.runInspect(key=hex_key2)
                numCols = inspect['numCols']
                missingValuesList = h2o_cmd.infoFromInspect(inspect)
                print "missingValuesList after impute:", missingValuesList
                if len(missingValuesList) != 49:
                    raise Exception(
                        "expected missing values in all cols after pruning enum cols: %s"
                        % missingValuesList)
            else:
                shuffledColList = range(0, 55)  # 0 to 54

            origInspect = inspect
            random.shuffle(shuffledColList)

            for column in shuffledColList:
                # pick a random set of group-by columns: no duplicates, random
                # order; an empty sample ([]) is allowed
                groupBy = random.sample(range(55), random.randint(0, 54))
                # header names start with 1, not 0. Empty string if []
                groupByNames = ",".join(
                    map(lambda x: "C" + str(x + 1), groupBy))
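                # NOTE: groupBy/groupByNames are built here but never passed
                # to the impute() call below, so the grouping choice is not
                # actually exercised.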

                # what happens if column and groupByNames overlap?? Do we loop here and choose until no overlap
                columnName = "C%s" % (column + 1)
                print "don't use mode if col isn't enum"
                badChoices = True
                while badChoices:
                    method = random.choice(["mean", "median", "mode"])
                    badChoices = column not in enumColList and method == "mode"

                NEWSEED = random.randint(0, sys.maxint)  # generated but not currently passed to impute()
                print "does impute modify the source key?"
                # we get h2o error (argument exception) if no NAs
                impResult = h2o.nodes[0].impute(source=hex_key2,
                                                column=column,
                                                method=method)

            print "Now check that there are no missing values"
            print "FIX! broken..insert missing values doesn't insert NAs in enum cols"

            inspect = h2o_cmd.runInspect(key=hex_key2)
            numRows2 = inspect['numRows']
            numCols2 = inspect['numCols']
            self.assertEqual(
                numRows, numRows2,
                "impute shouldn't have changed frame numRows: %s %s" %
                (numRows, numRows2))
            self.assertEqual(
                numCols, numCols2,
                "impute shouldn't have changed frame numCols: %s %s" %
                (numCols, numCols2))

            # check that the mean didn't change for the col
            # the enum cols with mode, we'll have to think of something else
            missingValuesList = h2o_cmd.infoFromInspect(inspect)
            print "missingValuesList after impute:", missingValuesList
            if missingValuesList:
                raise Exception(
                    "Not expecting any missing values after imputing all cols: %s"
                    % missingValuesList)

            cols = inspect['cols']
            origCols = origInspect['cols']

            print "\nFIX! ignoring these errors. have to figure out why."
            for i, (c, oc) in enumerate(zip(cols, origCols)):
                # since we impute to either the median or the mean, we can't assume the mean stays the same,
                # but at this tolerance it's okay (with a different dataset, that might not hold)
                ### h2o_util.assertApproxEqual(c['mean'], oc['mean'], tol=0.000000001,
                ###    msg="col %i original mean: %s not equal to mean after impute: %s" % (i, c['mean'], oc['mean']))
                if not h2o_util.approxEqual(
                        oc['mean'], c['mean'], tol=0.000000001):
                    msg = "col %i original mean: %s not equal to mean after impute: %s" % (
                        i, oc['mean'], c['mean'])
                    print msg