Example #1
0
def runScore(node=None,
             dataKey=None,
             modelKey=None,
             predictKey='Predict.hex',
             vactual='C1',
             vpredict=1,
             expectedAuc=None,
             expectedAucTol=0.15,
             doAUC=True,
             timeoutSecs=200):
    # Score *******************************
    # this messes up if you use case_mode/case_vale above
    predictKey = 'Predict.hex'
    start = time.time()

    predictResult = runPredict(data_key=dataKey,
                               model_key=modelKey,
                               destination_key=predictKey,
                               timeoutSecs=timeoutSecs)

    # inspect = runInspect(key=dataKey)
    # print dataKey, dump_json(inspect)

    # just get a predict and AUC on the same data. has to be binomial result
    if doAUC:
        resultAUC = h2o_nodes.nodes[0].generate_auc(thresholds=None,
                                                    actual=dataKey,
                                                    predict='Predict.hex',
                                                    vactual=vactual,
                                                    vpredict=vpredict)

        auc = resultAUC['aucdata']['AUC']

        if expectedAuc:
            h2o_util.assertApproxEqual(
                auc,
                expectedAuc,
                tol=expectedAucTol,
                msg="actual auc: %s not close enough to %s" %
                (auc, expectedAuc))

    # don't do this unless binomial
    predictCMResult = h2o_nodes.nodes[0].predict_confusion_matrix(
        actual=dataKey,
        predict=predictKey,
        vactual=vactual,
        vpredict='predict',
    )

    # print "cm", dump_json(predictCMResult)

    # These will move into the h2o_gbm.py
    # if doAUC=False, means we're not binomial, and the cm is not what we expect
    if doAUC:
        cm = predictCMResult['cm']
        pctWrong = h2o_gbm.pp_cm_summary(cm)
        print h2o_gbm.pp_cm(cm)

    return predictCMResult
Example #2
0
def scoreRF(scoreParseResult,
            trainResult,
            vactual=None,
            timeoutSecs=120,
            **kwargs):
    # Run validation on dataset

    parseKey = scoreParseResult['destination_key']
    if h2o.beta_features:
        # this is how we're supposed to do scorin?
        rfModelKey = trainResult['drf_model']['_key']
        predictKey = 'Predict.hex'
        start = time.time()
        predictResult = h2o_cmd.runPredict(data_key=parseKey,
                                           model_key=rfModelKey,
                                           destination_key=predictKey,
                                           timeoutSecs=timeoutSecs,
                                           **kwargs)

        h2o_cmd.runInspect(key='Predict.hex', verbose=True)

        predictCMResult = h2o.nodes[0].predict_confusion_matrix(
            actual=parseKey,
            vactual=vactual,
            predict=predictKey,
            vpredict='predict',
            timeoutSecs=timeoutSecs,
            **kwargs)

        rftime = time.time() - start

        cm = predictCMResult['cm']

        # These will move into the h2o_gbm.py
        pctWrong = h2o_gbm.pp_cm_summary(cm)
        print "\nTest\n==========\n"
        print h2o_gbm.pp_cm(cm)
        scoreResult = predictCMResult

    else:
        ntree = trainResult['ntree']
        rfModelKey = trainResult['model_key']
        start = time.time()
        # NOTE: response_variable is required, and passed from kwargs here
        # out_of_bag_error_estimate=0 is required for scoring. H2O will assert if 1 and different data set
        # compared to training
        kwargs['out_of_bag_error_estimate'] = 0
        scoreResult = h2o_cmd.runRFView(None,
                                        parseKey,
                                        rfModelKey,
                                        ntree=ntree,
                                        timeoutSecs=timeoutSecs,
                                        **kwargs)

    rftime = time.time() - start
    h2o.verboseprint("RF score results: ", scoreResult)
    h2o.verboseprint("RF computation took {0} sec".format(rftime))
    scoreResult['python_call_timer'] = rftime
    return scoreResult
    def test_GBM_mnist_fvec(self):
        h2o.beta_features = True
        importFolderPath = "mnist"
        csvFilename = "mnist_training.csv.gz"
        timeoutSecs = 1800
        trialStart = time.time()

        # PARSE train****************************************
        trainKey = csvFilename + "_" + ".hex"
        start = time.time()
        parseResult = h2i.import_parse(
            bucket="home-0xdiag-datasets",
            path=importFolderPath + "/" + csvFilename,
            schema="put",
            hex_key=trainKey,
            timeoutSecs=timeoutSecs,
        )

        elapsed = time.time() - start
        print "parse end on ", csvFilename, "took", elapsed, "seconds", "%d pct. of timeout" % (
            (elapsed * 100) / timeoutSecs
        )
        print "parse result:", parseResult["destination_key"]

        # GBM (train)****************************************
        modelKey = "GBM_model"
        params = {
            "classification": 1,  # faster?
            "destination_key": modelKey,
            "learn_rate": 0.1,
            "ntrees": 3,
            "max_depth": 8,
            "min_rows": 1,
            "response": 0,  # this dataset has the response in the last col (0-9 to check)
            # 'ignored_cols_by_name': range(200,784) # only use the first 200 for speed?
        }

        kwargs = params.copy()
        timeoutSecs = 1800
        # noPoll -> False when GBM finished
        start = time.time()
        GBMFirstResult = h2o_cmd.runGBM(parseResult=parseResult, noPoll=True, **kwargs)
        h2o_jobs.pollStatsWhileBusy(timeoutSecs=1200, pollTimeoutSecs=120, retryDelaySecs=5)
        elapsed = time.time() - start

        print "GBM training completed in", elapsed, "seconds.", "%d pct. of timeout" % ((elapsed * 100) / timeoutSecs)

        gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey)
        errsLast = gbmTrainView["gbm_model"]["errs"][-1]

        print "GBM 'errsLast'", errsLast
        if DO_CLASSIFICATION:
            cm = gbmTrainView["gbm_model"]["cms"][-1]  # use the last one
            pctWrongTrain = h2o_gbm.pp_cm_summary(cm)
            print "\nTrain\n==========\n"
            print h2o_gbm.pp_cm(cm)
        else:
            print "GBMTrainView:", h2o.dump_json(gbmTrainView["gbm_model"]["errs"])
Example #4
0
    def test_GBM_mnist_fvec(self):
        h2o.beta_features = True
        importFolderPath = "mnist"
        csvFilename = "mnist_training.csv.gz"
        timeoutSecs=1800
        trialStart = time.time()

        # PARSE train****************************************
        trainKey = csvFilename + "_" + ".hex"
        start = time.time()
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets',  path=importFolderPath + "/" + csvFilename, schema='put',
            hex_key=trainKey, timeoutSecs=timeoutSecs)

        elapsed = time.time() - start
        print "parse end on ", csvFilename, 'took', elapsed, 'seconds',\
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
        print "parse result:", parseResult['destination_key']

        # GBM (train)****************************************
        modelKey = "GBM_model"
        params = { 
            'classification': 1, # faster? 
            'destination_key': modelKey,
            'learn_rate': .1,
            'ntrees': 3,
            'max_depth': 8,
            'min_rows': 1,
            'response': 0, # this dataset has the response in the last col (0-9 to check)
            # 'ignored_cols_by_name': range(200,784) # only use the first 200 for speed?
            }

        kwargs = params.copy()
        timeoutSecs = 1800
        #noPoll -> False when GBM finished
        start = time.time()
        GBMFirstResult = h2o_cmd.runGBM(parseResult=parseResult, noPoll=True, **kwargs)
        h2o_jobs.pollStatsWhileBusy(timeoutSecs=1200, pollTimeoutSecs=120, retryDelaySecs=5)
        elapsed = time.time() - start

        print "GBM training completed in", elapsed, "seconds.", \
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

        gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey)
        errsLast = gbmTrainView['gbm_model']['errs'][-1]

        print "GBM 'errsLast'", errsLast
        if DO_CLASSIFICATION:
            cms = gbmTrainView['gbm_model']['cms']
            cm = cms[-1]['_arr'] # use the last one
            print "GBM cms[-1]['_predErr']:", cms[-1]['_predErr']
            print "GBM cms[-1]['_classErr']:", cms[-1]['_classErr']
            pctWrongTrain = h2o_gbm.pp_cm_summary(cm);
            print "\nTrain\n==========\n"
            print h2o_gbm.pp_cm(cm)
        else:
            print "GBMTrainView:", h2o.dump_json(gbmTrainView['gbm_model']['errs'])
Example #5
0
    def test_GBM_basic_prostate(self):
        csvFilename = "prostate.csv"
        print "\nStarting", csvFilename
        # columns start at 0
        csvPathname = 'logreg/' + csvFilename
        parseResult = h2i.import_parse(bucket='smalldata',
                                       path=csvPathname,
                                       hex_key=csvFilename + ".hex",
                                       schema='put')
        colNames = [
            'ID', 'CAPSULE', 'AGE', 'RACE', 'DPROS', 'DCAPS', 'PSA', 'VOL',
            'GLEASON'
        ]

        modelKey = 'GBM_prostate'
        # 'cols', 'ignored_cols_by_name', and 'ignored_cols' have to be exclusive
        params = {
            'destination_key': modelKey,
            'validation': parseResult['destination_key'],
            'ignored_cols_by_name': 'ID',
            'learn_rate': .1,
            'ntrees': 10,
            'max_depth': 20,
            'min_rows': 1,
            'response': 'CAPSULE',
            'classification': 1 if DO_CLASSIFICATION else 0,
        }

        kwargs = params.copy()
        timeoutSecs = 1800
        start = time.time()
        GBMFirstResult = h2o_cmd.runGBM(parseResult=parseResult,
                                        noPoll=True,
                                        **kwargs)
        print "\nGBMFirstResult:", h2o.dump_json(GBMFirstResult)
        # no pattern waits for all
        h2o_jobs.pollWaitJobs(pattern=None,
                              timeoutSecs=300,
                              pollTimeoutSecs=10,
                              retryDelaySecs=5)
        elapsed = time.time() - start
        print "GBM training completed in", elapsed, "seconds."

        gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey)
        errsLast = gbmTrainView['gbm_model']['errs'][-1]

        print "GBM 'errsLast'", errsLast
        if DO_CLASSIFICATION:
            cm = gbmTrainView['gbm_model']['cms'][-1][
                '_arr']  # use the last one
            pctWrongTrain = h2o_gbm.pp_cm_summary(cm)
            print "\nTrain\n==========\n"
            print h2o_gbm.pp_cm(cm)
        else:
            print "GBMTrainView:", h2o.dump_json(
                gbmTrainView['gbm_model']['errs'])
Example #6
0
    def test_GBM_basic_benign(self):
        h2o.beta_features = True
        csvFilename = "benign.csv"
        print "\nStarting", csvFilename 
        csvPathname = 'logreg/' + csvFilename
        parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=csvFilename + ".hex", schema='put')
        # columns start at 0
        # cols 0-13. 3 is output
        # no member id in this one
        
        # fails with n_folds
        # check the first in the models list. It should be the best
        colNames = [ 'STR','OBS','AGMT','FNDX','HIGD','DEG','CHK', 'AGP1','AGMN','NLV','LIV','WT','AGLP','MST' ]
        modelKey = 'GBM_benign'

        # 'cols', 'ignored_cols_by_name', and 'ignored_cols' have to be exclusive
        params = {
            'destination_key': modelKey,
            'validation': parseResult['destination_key'],
            'ignored_cols_by_name': 'STR',
            'learn_rate': .1,
            'ntrees': 10,
            'max_depth': 20,
            'min_rows': 1,
            'response': 'FNDX',
            'classification': 1 if DO_CLASSIFICATION else 0,
            }

        kwargs = params.copy()
        timeoutSecs = 1800
        start = time.time()
        GBMFirstResult = h2o_cmd.runGBM(parseResult=parseResult, noPoll=True,**kwargs)
        print "\nGBMFirstResult:", h2o.dump_json(GBMFirstResult)
        # no pattern waits for all
        h2o_jobs.pollWaitJobs(pattern=None, timeoutSecs=300, pollTimeoutSecs=10, retryDelaySecs=5)
        elapsed = time.time() - start
        print "GBM training completed in", elapsed, "seconds."

        gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey)
        # errrs from end of list? is that the last tree?
        errsLast = gbmTrainView['gbm_model']['errs'][-1]

        print "GBM 'errsLast'", errsLast
        if DO_CLASSIFICATION:
            print h2o.dump_json(gbmTrainView['gbm_model']['cms'][-1])
            cm = gbmTrainView['gbm_model']['cms'][-1]['_arr']# use the last one
            pctWrongTrain = h2o_gbm.pp_cm_summary(cm);
            print "\nTrain\n==========\n"
            print h2o_gbm.pp_cm(cm)
        else:
            print "GBMTrainView:", h2o.dump_json(gbmTrainView['gbm_model']['errs'])
Example #7
0
File: h2o_rf.py Project: Jfeng3/h2o
def scoreRF(scoreParseResult, trainResult, vactual=None, timeoutSecs=120, **kwargs):
    # Run validation on dataset

    parseKey = scoreParseResult['destination_key']
    if h2o.beta_features:
        # this is how we're supposed to do scorin?
        rfModelKey  = trainResult['drf_model']['_key']
        predictKey = 'Predict.hex'
        start = time.time()
        predictResult = h2o_cmd.runPredict(
            data_key=parseKey,
            model_key=rfModelKey,
            destination_key=predictKey,
            timeoutSecs=timeoutSecs, **kwargs)

        h2o_cmd.runInspect(key='Predict.hex', verbose=True)

        predictCMResult = h2o.nodes[0].predict_confusion_matrix(
            actual=parseKey,
            vactual=vactual,
            predict=predictKey,
            vpredict='predict', 
            timeoutSecs=timeoutSecs, **kwargs)
            
        rftime      = time.time()-start 

        cm = predictCMResult['cm']

        # These will move into the h2o_gbm.py
        pctWrong = h2o_gbm.pp_cm_summary(cm);
        print "\nTest\n==========\n"
        print h2o_gbm.pp_cm(cm)
        scoreResult = predictCMResult

    else:
        ntree = trainResult['ntree']
        rfModelKey  = trainResult['model_key']
        start = time.time()
        # NOTE: response_variable is required, and passed from kwargs here
        # out_of_bag_error_estimate=0 is required for scoring. H2O will assert if 1 and different data set
        # compared to training
        kwargs['out_of_bag_error_estimate'] = 0
        scoreResult = h2o_cmd.runRFView(None, parseKey, rfModelKey, ntree=ntree, timeoutSecs=timeoutSecs, **kwargs)

    rftime      = time.time()-start 
    h2o.verboseprint("RF score results: ", scoreResult)
    h2o.verboseprint("RF computation took {0} sec".format(rftime))
    scoreResult['python_call_timer'] = rftime
    return scoreResult
Example #8
0
def runScore(node=None, dataKey=None, modelKey=None, predictKey='Predict.hex', 
    vactual='C1', vpredict=1, expectedAuc=None, doAUC=True, timeoutSecs=200):
    # Score *******************************
    # this messes up if you use case_mode/case_vale above
    predictKey = 'Predict.hex'
    start = time.time()

    predictResult = runPredict(
        data_key=dataKey,
        model_key=modelKey,
        destination_key=predictKey,
        timeoutSecs=timeoutSecs)

    # inspect = runInspect(key=dataKey)
    # print dataKey, dump_json(inspect)

    # just get a predict and AUC on the same data. has to be binomial result
    if doAUC:
        resultAUC = h2o_nodes.nodes[0].generate_auc(
            thresholds=None,
            actual=dataKey,
            predict='Predict.hex',
            vactual=vactual,
            vpredict=vpredict)

        auc = resultAUC['aucdata']['AUC']

        if expectedAuc:
            h2o_util.assertApproxEqual(auc, expectedAuc, tol=0.15,
                msg="actual auc: %s not close enough to %s" % (auc, expectedAuc))

    # don't do this unless binomial
    predictCMResult = h2o_nodes.nodes[0].predict_confusion_matrix(
        actual=dataKey,
        predict=predictKey,
        vactual=vactual,
        vpredict='predict',
        )

    # print "cm", dump_json(predictCMResult)

    # These will move into the h2o_gbm.py
    # if doAUC=False, means we're not binomial, and the cm is not what we expect
    if doAUC:
        cm = predictCMResult['cm']
        pctWrong = h2o_gbm.pp_cm_summary(cm);
        print h2o_gbm.pp_cm(cm)

    return predictCMResult
Example #9
0
def scoreRF(scoreParseResult,
            trainResult,
            vactual=None,
            timeoutSecs=120,
            **kwargs):
    # Run validation on dataset

    parseKey = scoreParseResult['destination_key']
    # this is how we're supposed to do scorin?
    rfModelKey = trainResult['drf_model']['_key']
    predictKey = 'Predict.hex'
    start = time.time()
    predictResult = h2o_cmd.runPredict(data_key=parseKey,
                                       model_key=rfModelKey,
                                       destination_key=predictKey,
                                       timeoutSecs=timeoutSecs,
                                       **kwargs)

    h2o_cmd.runInspect(key='Predict.hex', verbose=True)

    predictCMResult = h2o.nodes[0].predict_confusion_matrix(
        actual=parseKey,
        vactual=vactual,
        predict=predictKey,
        vpredict='predict',
        timeoutSecs=timeoutSecs,
        **kwargs)

    rftime = time.time() - start

    cm = predictCMResult['cm']

    # These will move into the h2o_gbm.py
    pctWrong = h2o_gbm.pp_cm_summary(cm)
    print "\nTest\n==========\n"
    print h2o_gbm.pp_cm(cm)
    scoreResult = predictCMResult

    rftime = time.time() - start
    h2o.verboseprint("RF score results: ", scoreResult)
    h2o.verboseprint("RF computation took {0} sec".format(rftime))
    scoreResult['python_call_timer'] = rftime
    return scoreResult
Example #10
0
    def test_GBM_basic_prostate(self):
        csvFilename = "prostate.csv"
        print "\nStarting", csvFilename
        # columns start at 0
        csvPathname = 'logreg/' + csvFilename
        parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=csvFilename + ".hex", schema='put')
        colNames = ['ID','CAPSULE','AGE','RACE','DPROS','DCAPS','PSA','VOL','GLEASON']

        modelKey = 'GBM_prostate'
        # 'cols', 'ignored_cols_by_name', and 'ignored_cols' have to be exclusive
        params = {
            'destination_key': modelKey,
            'validation': parseResult['destination_key'],
            'ignored_cols_by_name': 'ID',
            'learn_rate': .1,
            'ntrees': 10,
            'max_depth': 20,
            'min_rows': 1,
            'response': 'CAPSULE',
            'classification': 1 if DO_CLASSIFICATION else 0,
            }

        kwargs = params.copy()
        timeoutSecs = 1800
        start = time.time()
        GBMFirstResult = h2o_cmd.runGBM(parseResult=parseResult, noPoll=True,**kwargs)
        print "\nGBMFirstResult:", h2o.dump_json(GBMFirstResult)
        # no pattern waits for all
        h2o_jobs.pollWaitJobs(pattern=None, timeoutSecs=300, pollTimeoutSecs=10, retryDelaySecs=5)
        elapsed = time.time() - start
        print "GBM training completed in", elapsed, "seconds."

        gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey)
        errsLast = gbmTrainView['gbm_model']['errs'][-1]

        print "GBM 'errsLast'", errsLast
        if DO_CLASSIFICATION:
            cm = gbmTrainView['gbm_model']['cms'][-1]['_arr'] # use the last one
            pctWrongTrain = h2o_gbm.pp_cm_summary(cm);
            print "\nTrain\n==========\n"
            print h2o_gbm.pp_cm(cm)
        else:
            print "GBMTrainView:", h2o.dump_json(gbmTrainView['gbm_model']['errs'])
Example #11
0
    def test_GLM2_big1_nopoll(self):
        h2o.beta_features = True
        csvPathname = 'hhp_107_01.data.gz'
        print "\n" + csvPathname

        y = "106"
        x = ""
        parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put', timeoutSecs=15)

        glmInitial = []
        # dispatch multiple jobs back to back
        start = time.time()
        for jobDispatch in range(5):
            kwargs = {'response': y, 'n_folds': 1, 'family': 'binomial'}
            # FIX! what model keys do these get?
            glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=300, noPoll=True, **kwargs)
            glmInitial.append(glm)
            print "glm job dispatch end on ", csvPathname, 'took', time.time() - start, 'seconds'
            print "\njobDispatch #", jobDispatch

            timeoutSecs = 200
        h2o_jobs.pollWaitJobs(pattern='GLM', timeoutSecs=timeoutSecs, retryDelaySecs=10)
        elapsed = time.time() - start
        print "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)

        # we saved the initial response?
        # if we do another poll they should be done now, and better to get it that 
        # way rather than the inspect (to match what simpleCheckGLM is expected
        for g in glmInitial:
            print "Checking completed job, with no polling using initial response:"
            # this format is only in the first glm response (race?)
            modelKey = g['destination_key']
            glm = h2o.nodes[0].glm_view(_modelKey=modelKey)
            h2o_glm.simpleCheckGLM(self, glm, None, noPrint=True, **kwargs)

            cm = glm['glm_model']['submodels'][0]['validation']['_cms'][-1]['_arr']
            print "cm:", cm
            pctWrong = h2o_gbm.pp_cm_summary(cm);
            # self.assertLess(pctWrong, 9,"Should see less than 9% error (class = 4)")

            print "\nTrain\n==========\n"
            print h2o_gbm.pp_cm(cm)
Example #12
0
def scoreRF(scoreParseResult, trainResult, vactual=None, timeoutSecs=120, **kwargs):
    # Run validation on dataset

    parseKey = scoreParseResult['destination_key']
    # this is how we're supposed to do scorin?
    rfModelKey  = trainResult['drf_model']['_key']
    predictKey = 'Predict.hex'
    start = time.time()
    predictResult = h2o_cmd.runPredict(
        data_key=parseKey,
        model_key=rfModelKey,
        destination_key=predictKey,
        timeoutSecs=timeoutSecs, **kwargs)

    h2o_cmd.runInspect(key='Predict.hex', verbose=True)

    predictCMResult = h2o.nodes[0].predict_confusion_matrix(
        actual=parseKey,
        vactual=vactual,
        predict=predictKey,
        vpredict='predict', 
        timeoutSecs=timeoutSecs, **kwargs)
        
    rftime      = time.time()-start 

    cm = predictCMResult['cm']

    # These will move into the h2o_gbm.py
    pctWrong = h2o_gbm.pp_cm_summary(cm);
    print "\nTest\n==========\n"
    print h2o_gbm.pp_cm(cm)
    scoreResult = predictCMResult

    rftime      = time.time()-start 
    h2o.verboseprint("RF score results: ", scoreResult)
    h2o.verboseprint("RF computation took {0} sec".format(rftime))
    scoreResult['python_call_timer'] = rftime
    return scoreResult
Example #13
0
    def test_GBM_many_cols(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        if localhost:
            tryList = [(10000, 100, "cA", 300)]
        else:
            tryList = [
                # (10000, 10, 'cB', 300),
                # (10000, 50, 'cC', 300),
                (10000, 100, "cD", 300),
                (10000, 200, "cE", 300),
                (10000, 300, "cF", 300),
                (10000, 400, "cG", 300),
                (10000, 500, "cH", 300),
                (10000, 1000, "cI", 300),
            ]

        ### h2b.browseTheCloud()
        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            # csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvFilename = "syn_" + "binary" + "_" + str(rowCount) + "x" + str(colCount) + ".csv"
            hdrFilename = "hdr_" + "binary" + "_" + str(rowCount) + "x" + str(colCount) + ".csv"

            csvPathname = SYNDATASETS_DIR + "/" + csvFilename
            hdrPathname = SYNDATASETS_DIR + "/" + hdrFilename
            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

            # PARSE train****************************************
            h2o.beta_features = False  # turn off beta_features
            start = time.time()
            xList = []
            eList = []
            fList = []

            modelKey = "GBMModelKey"

            # Parse (train)****************************************
            if h2o.beta_features:
                print "Parsing to fvec directly! Have to noPoll=true!, and doSummary=False!"
            parseTrainResult = h2i.import_parse(
                bucket=None,
                path=csvPathname,
                schema="put",
                hex_key=hex_key,
                timeoutSecs=timeoutSecs,
                noPoll=h2o.beta_features,
                doSummary=False,
            )
            # hack
            if h2o.beta_features:
                h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs)
                print "Filling in the parseTrainResult['destination_key'] for h2o"
                parseTrainResult["destination_key"] = trainKey

            elapsed = time.time() - start
            print "train parse end on ", csvPathname, "took", elapsed, "seconds", "%d pct. of timeout" % (
                (elapsed * 100) / timeoutSecs
            )
            print "train parse result:", parseTrainResult["destination_key"]

            # Logging to a benchmark file
            algo = "Parse"
            l = "{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs".format(
                len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename, elapsed
            )
            print l
            h2o.cloudPerfH2O.message(l)

            # if you set beta_features here, the fvec translate will happen with the Inspect not the GBM
            # h2o.beta_features = True
            inspect = h2o_cmd.runInspect(key=parseTrainResult["destination_key"])
            print "\n" + csvPathname, "    num_rows:", "{:,}".format(
                inspect["num_rows"]
            ), "    num_cols:", "{:,}".format(inspect["num_cols"])
            num_rows = inspect["num_rows"]
            num_cols = inspect["num_cols"]
            ### h2o_cmd.runSummary(key=parsTraineResult['destination_key'])

            # GBM(train iterate)****************************************
            ntrees = 5
            prefixList = ["A", "B", "C", "D", "E", "F", "G", "H"]
            # for max_depth in [5,10,20,40]:
            for max_depth in [5, 10, 20]:

                # PARSE a new header****************************************
                print "Creating new header", hdrPathname
                prefix = prefixList.pop(0)
                write_syn_header(hdrPathname, rowCount, colCount, prefix)

                # upload and parse the header to a hex

                h2o.beta_features = False  # can't put with fvec yet
                hdr_hex_key = prefix + "_hdr.hex"
                parseHdrResult = h2i.import_parse(
                    bucket=None,
                    path=hdrPathname,
                    schema="put",
                    header=1,  # REQUIRED! otherwise will interpret as enums
                    hex_key=hdr_hex_key,
                    timeoutSecs=timeoutSecs,
                    noPoll=h2o.beta_features,
                    doSummary=False,
                )
                # Set Column Names (before autoframe is created)
                h2o.nodes[0].set_column_names(target=hex_key, copy_from=hdr_hex_key)

                # GBM
                print "The response col name is changing each iteration, since we're parsing a new header"
                params = {
                    "learn_rate": 0.2,
                    "nbins": 1024,
                    "ntrees": ntrees,
                    "max_depth": max_depth,
                    "min_rows": 10,
                    "response": prefix + "_response",
                    "ignored_cols_by_name": None,
                }

                print "Using these parameters for GBM: ", params
                kwargs = params.copy()
                h2o.beta_features = True

                trainStart = time.time()
                gbmTrainResult = h2o_cmd.runGBM(
                    parseResult=parseTrainResult,
                    noPoll=h2o.beta_features,
                    timeoutSecs=timeoutSecs,
                    destination_key=modelKey,
                    **kwargs
                )
                # hack
                if h2o.beta_features:
                    h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs)
                trainElapsed = time.time() - trainStart
                print "GBM training completed in", trainElapsed, "seconds. On dataset: ", csvPathname

                # Logging to a benchmark file
                algo = "GBM " + " ntrees=" + str(ntrees) + " max_depth=" + str(max_depth)
                l = "{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs".format(
                    len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename, trainElapsed
                )
                print l
                h2o.cloudPerfH2O.message(l)

                gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey)
                # errrs from end of list? is that the last tree?
                errsLast = gbmTrainView["gbm_model"]["errs"][-1]
                print "GBM 'errsLast'", errsLast

                cm = gbmTrainView["gbm_model"]["cms"][-1]["_arr"]  # use the last one
                pctWrongTrain = h2o_gbm.pp_cm_summary(cm)
                print "\nTrain\n==========\n"
                print h2o_gbm.pp_cm(cm)

                # xList.append(ntrees)
                xList.append(max_depth)
                eList.append(pctWrongTrain)
                fList.append(trainElapsed)

                # works if you delete the autoframe
                ### h2o_import.delete_keys_at_all_nodes(pattern='autoframe')

        h2o.beta_features = False
        # just plot the last one
        if DO_PLOT:
            xLabel = "max_depth"
            eLabel = "pctWrong"
            fLabel = "trainElapsed"
            eListTitle = ""
            fListTitle = ""
            h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
Example #14
0
    def test_GBM_manyfiles_train_test(self):
        bucket = 'home-0xdiag-datasets'
        modelKey = 'GBMModelKey'
        if h2o.localhost:
            files = [
                # None forces num_cols to be used. assumes you set it from Inspect
                ('manyfiles-nflx-gz', 'file_1[0-9][0-9].dat.gz', 'file_100.hex', 1800, None, 'file_1.dat.gz', 'file_1_test.hex')
                ]
        else:
            files = [
                # None forces num_cols to be used. assumes you set it from Inspect
                ('manyfiles-nflx-gz', 'file_[0-9].dat.gz', 'file_10.hex', 1800, None, 'file_1[0-9].dat.gz', 'file_10_test.hex')
                ]

        # if I got to hdfs, it's here
        # hdfs://172.16.2.176/datasets/manyfiles-nflx-gz/file_99.dat.gz

        # h2b.browseTheCloud()
        for (importFolderPath, trainFilename, trainKey, timeoutSecs, response, testFilename, testKey) in files:
            # PARSE train****************************************
            start = time.time()
            xList = []
            eList = []
            fList = []

            # Parse (train)****************************************
            csvPathname = importFolderPath + "/" + trainFilename
            parseTrainResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='s3n',
                hex_key=trainKey, timeoutSecs=timeoutSecs, doSummary=False)

            elapsed = time.time() - start
            print "train parse end on ", trainFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "train parse result:", parseTrainResult['destination_key']

            ### h2o_cmd.runSummary(key=parsTraineResult['destination_key'])
            inspect = h2o_cmd.runInspect(key=parseTrainResult['destination_key'])
            print "\n" + csvPathname, \
                "    num_rows:", "{:,}".format(inspect['num_rows']), \
                "    num_cols:", "{:,}".format(inspect['num_cols'])
            num_rows = inspect['num_rows']
            num_cols = inspect['num_cols']

            # Make col 378 it something we can do binomial regression on!
            execExpr = '%s[,378] = %s[,378]>15 ? 1 : 0' % (trainKey, trainKey)
            resultExec = h2o_cmd.runExec(str=execExpr, timeoutSecs=500)

            # Parse (test)****************************************
            parseTestResult = h2i.import_parse(bucket=bucket, path=importFolderPath + "/" + testFilename, schema='local',
                hex_key=testKey, timeoutSecs=timeoutSecs, doSummary=False)

            elapsed = time.time() - start
            print "test parse end on ", testFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "test parse result:", parseTestResult['destination_key']

            # Make col 378 it something we can do binomial regression on!
            print "Slow! exec is converting all imported keys?, not just what was parsed"
            execExpr = '%s[,378] = %s[,378]>15 ? 1 : 0' % (testKey, testKey, testKey)
            resultExec = h2o_cmd.runExec(str=execExpr, timeoutSecs=300)

            # Note ..no inspect of test data here..so translate happens later?

            # GBM (train iterate)****************************************
            # if not response:
            #     response = num_cols - 1
            response = 378
            print "Using the same response %s for train and test (which should have a output value too)" % response

            ntrees = 10
            for max_depth in [5,10,20,40]:
                params = {
                    'learn_rate': .2,
                    'nbins': 1024,
                    'ntrees': ntrees,
                    'max_depth': max_depth,
                    'min_rows': 10,
                    'response': response,
                    # 'ignored_cols': 
                }
                print "Using these parameters for GBM: ", params
                kwargs = params.copy()

                # GBM train****************************************
                trainStart = time.time()
                gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult,
                    timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs)
                trainElapsed = time.time() - trainStart
                print "GBM training completed in", trainElapsed, "seconds. On dataset: ", trainFilename

                gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey)
                # errrs from end of list? is that the last tree?
                errsLast = gbmTrainView['gbm_model']['errs'][-1]
                print "GBM 'errsLast'", errsLast

                cm = gbmTrainView['gbm_model']['cm']
                pctWrongTrain = h2o_gbm.pp_cm_summary(cm);
                print "Last line of this cm might be NAs, not CM"
                print "\nTrain\n==========\n"
                print h2o_gbm.pp_cm(cm)

                # GBM test****************************************
                if doPredict:
                    predictKey = 'Predict.hex'
                    ### h2o_cmd.runInspect(key=parseTestResult['destination_key'])
                    start = time.time()
                    gbmTestResult = h2o_cmd.runPredict(
                        data_key=parseTestResult['destination_key'], 
                        model_key=modelKey,
                        destination_key=predictKey,
                        timeoutSecs=timeoutSecs)
                    elapsed = time.time() - start
                    print "GBM predict completed in", elapsed, "seconds. On dataset: ", testFilename

                    print "This is crazy!"
                    gbmPredictCMResult =h2o.nodes[0].predict_confusion_matrix(
                        actual=parseTestResult['destination_key'],
                        vactual=response,
                        predict=predictKey,
                        vpredict='predict', # choices are 0 and 'predict'
                        )

                    # errrs from end of list? is that the last tree?
                    # all we get is cm
                    cm = gbmPredictCMResult['cm']

                    # These will move into the h2o_gbm.py
                    pctWrong = h2o_gbm.pp_cm_summary(cm);
                    print "Last line of this cm is really NAs, not CM"
                    print "\nTest\n==========\n"
                    print h2o_gbm.pp_cm(cm)

                    # xList.append(ntrees)
                    xList.append(max_depth)
                    eList.append(pctWrong)
                    fList.append(trainElapsed)


            if doPredict:
                xLabel = 'max_depth'
                eLabel = 'pctWrong'
                fLabel = 'trainElapsed'
                eListTitle = ""
                fListTitle = ""
                h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
Example #15
0
def simpleCheckGLM(self, glm, colX, allowFailWarning=False, allowZeroCoeff=False,
    prettyPrint=False, noPrint=False, maxExpectedIterations=None, doNormalized=False, **kwargs):
    # if we hit the max_iter, that means it probably didn't converge. should be 1-maxExpectedIter

    # h2o GLM will verboseprint the result and print errors. 
    # so don't have to do that
    # different when cross validation  is used? No trainingErrorDetails?
    GLMModel = glm['glm_model']
    if not GLMModel:
        raise Exception("GLMModel didn't exist in the glm response? %s" % h2o.dump_json(glm))

    warnings = None
    if 'warnings' in GLMModel and GLMModel['warnings']:
        warnings = GLMModel['warnings']
        # stop on failed
        x = re.compile("failed", re.IGNORECASE)
        # don't stop if fail to converge
        c = re.compile("converge", re.IGNORECASE)
        for w in warnings:
            print "\nwarning:", w
            if re.search(x,w) and not allowFailWarning: 
                if re.search(c,w):
                    # ignore the fail to converge warning now
                    pass
                else: 
                    # stop on other 'fail' warnings (are there any? fail to solve?
                    raise Exception(w)

    # for key, value in glm.iteritems(): print key
    # not in GLMGrid?

    # FIX! don't get GLMParams if it can't solve?
    GLMParams = GLMModel['glm']
    family = GLMParams["family"]

    # number of submodels = number of lambda
    # min of 2. lambda_max is first
    submodels = GLMModel['submodels']
    # since all our tests?? only use one lambda, the best_lamda_idx should = 1
    best_lambda_idx = GLMModel['best_lambda_idx']
    print "best_lambda_idx:", best_lambda_idx
    lambda_max = GLMModel['lambda_max']
    print "lambda_max:", lambda_max

    # currently lambda_max is not set by tomas. ..i.e.not valid
    if 1==0 and (lambda_max <= submodels[best_lambda_idx].lambda_value):
        raise Exception("lambda_max %s should always be > the lambda result %s we're checking" % (lambda_max, submodels[best_lambda_idx].lambda_value))

    # submodels0 = submodels[0]
    # submodels1 = submodels[-1] # hackery to make it work when there's just one

    if (best_lambda_idx >= len(submodels)) or (best_lambda_idx < 0):
        raise Exception("best_lambda_idx: %s should point to one of lambdas (which has len %s)" % (best_lambda_idx, len(submodels)))

    if (best_lambda_idx >= len(submodels)) or (best_lambda_idx < 0):
        raise Exception("best_lambda_idx: %s should point to one of submodels (which has len %s)" % (best_lambda_idx, len(submodels)))

    submodels1 = submodels[best_lambda_idx] # hackery to make it work when there's just one
    iterations = submodels1['iteration']


    print "GLMModel/iterations:", iterations

            # if we hit the max_iter, that means it probably didn't converge. should be 1-maxExpectedIter
    if maxExpectedIterations is not None and iterations  > maxExpectedIterations:
            raise Exception("Convergence issue? GLM did iterations: %d which is greater than expected: %d" % (iterations, maxExpectedIterations) )

    if 'validation' not in submodels1:
        raise Exception("Should be a 'validation' key in submodels1: %s" % h2o.dump_json(submodels1))
    validationsList = submodels1['validation']
    validations = validationsList
        
    # xval. compare what we asked for and what we got.
    n_folds = kwargs.setdefault('n_folds', None)

    print "GLMModel/validations"        
    validations['null_deviance'] = h2o_util.cleanseInfNan(validations['null_deviance'])
    validations['residual_deviance'] = h2o_util.cleanseInfNan(validations['residual_deviance'])        
    print "%15s %s" % ("null_deviance:\t", validations['null_deviance'])
    print "%15s %s" % ("residual_deviance:\t", validations['residual_deviance'])

    # threshold only there if binomial?
    # auc only for binomial
    if family=="binomial":
        print "%15s %s" % ("auc:\t", validations['auc'])
        best_threshold = validations['best_threshold']
        thresholds = validations['thresholds']
        print "%15s %s" % ("best_threshold:\t", best_threshold)

        # have to look up the index for the cm, from the thresholds list
        best_index = None

        # FIX! best_threshold isn't necessarily in the list. jump out if >=
        for i,t in enumerate(thresholds):
            if t >= best_threshold: # ends up using next one if not present
                best_index = i
                break
            
        assert best_index!=None, "%s %s" % (best_threshold, thresholds)
        print "Now printing the right 'best_threshold' %s from '_cms" % best_threshold

        # cm = glm['glm_model']['submodels'][0]['validation']['_cms'][-1]
        submodels = glm['glm_model']['submodels']
        cms = submodels[0]['validation']['_cms']
        assert best_index<len(cms), "%s %s" % (best_index, len(cms))
        # if we want 0.5..rounds to int
        # mid = len(cms)/2
        # cm = cms[mid]
        cm = cms[best_index]

        print "cm:", h2o.dump_json(cm['_arr'])
        predErr = cm['_predErr']
        classErr = cm['_classErr']
        # compare to predErr
        pctWrong = h2o_gbm.pp_cm_summary(cm['_arr']);
        print "predErr:", predErr
        print "calculated pctWrong from cm:", pctWrong
        print "classErr:", classErr

        # self.assertLess(pctWrong, 9,"Should see less than 9% error (class = 4)")

        print "\nTrain\n==========\n"
        print h2o_gbm.pp_cm(cm['_arr'])


    if family=="poisson" or family=="gaussian":
        print "%15s %s" % ("aic:\t", validations['aic'])

    coefficients_names = GLMModel['coefficients_names']
    # print "coefficients_names:", coefficients_names
    idxs = submodels1['idxs']
    print "idxs:", idxs
    coefficients_names = coefficients_names

    # always check both normalized and normal coefficients
    norm_beta = submodels1['norm_beta']
    # if norm_beta and len(coefficients_names)!=len(norm_beta):
    #    print len(coefficients_names), len(norm_beta)
    #    raise Exception("coefficients_names and normalized_norm_beta from h2o json not same length. coefficients_names: %s normalized_norm_beta: %s" % (coefficients_names, norm_beta))
#
    beta = submodels1['beta']
    # print "beta:", beta
    # if len(coefficients_names)!=len(beta):
    #    print len(coefficients_names), len(beta)
    #    raise Exception("coefficients_names and beta from h2o json not same length. coefficients_names: %s beta: %s" % (coefficients_names, beta))


    # test wants to use normalized?
    if doNormalized:
        beta_used = norm_beta
    else:
        beta_used = beta

    coefficients = {}
    # create a dictionary with name, beta (including intercept) just like v1

    for i,b in zip(idxs, beta_used[:-1]):
        name = coefficients_names[i]
        coefficients[name] = b

    print "len(idxs)", len(idxs), "len(beta_used)", len(beta_used)
    print  "coefficients:", coefficients
    print  "beta:", beta
    print  "norm_beta:", norm_beta

    coefficients['Intercept'] = beta_used[-1]
    print "len(coefficients_names)", len(coefficients_names)
    print "len(idxs)", len(idxs)
    print "idxs[-1]", idxs[-1]
    print "intercept demapping info:", \
        "coefficients_names[-i]:", coefficients_names[-1], \
        "idxs[-1]:", idxs[-1], \
        "coefficients_names[idxs[-1]]:", coefficients_names[idxs[-1]], \
        "beta_used[-1]:", beta_used[-1], \
        "coefficients['Intercept']", coefficients['Intercept']

    # last one is intercept
    interceptName = coefficients_names[idxs[-1]]
    if interceptName != "Intercept" or abs(beta_used[-1])<1e-26:
        raise Exception("'Intercept' should be last in coefficients_names and beta %s %s %s" %\
            (idxs[-1], beta_used[-1], "-"+interceptName+"-"))

    # idxs has the order for non-zero coefficients, it's shorter than beta_used and coefficients_names
    # new 5/28/14. glm can point to zero coefficients
    # for i in idxs:
    #     if beta_used[i]==0.0:
    ##        raise Exception("idxs shouldn't point to any 0 coefficients i: %s %s:" % (i, beta_used[i]))
    if len(idxs) > len(beta_used):
        raise Exception("idxs shouldn't be longer than beta_used %s %s" % (len(idxs), len(beta_used)))
    intercept = coefficients.pop('Intercept', None)

    # intercept demapping info: idxs[-1]: 54 coefficients_names[[idxs[-1]]: Intercept beta_used[-1]: -6.6866753099
    # the last one shoudl be 'Intercept' ?
    coefficients_names.pop()

    # have to skip the output col! get it from kwargs
    # better always be there!
    y = kwargs['response']

    # the dict keys are column headers if they exist...how to order those? new: use the 'coefficients_names'
    # from the response
    # Tomas created 'coefficients_names which is the coefficient list in order.
    # Just use it to index coefficients! works for header or no-header cases
    # I guess now we won't print the "None" cases for dropped columns (constant columns!)
    # Because Tomas doesn't get everything in 'coefficients_names' if dropped by GLMQuery before
    # he gets it? 
    def add_to_coefficient_list_and_string(c, cList, cString):
        if c in coefficients:
            cValue = coefficients[c]
            cValueString = "%s: %.5e   " % (c, cValue)
        else:
            print "Warning: didn't see '" + c + "' in json coefficient response.",\
                  "Inserting 'None' with assumption it was dropped due to constant column)"
            cValue = None
            cValueString = "%s: %s   " % (c, cValue)

        cList.append(cValue)
        # we put each on newline for easy comparison to R..otherwise keep condensed
        if prettyPrint: 
            cValueString = "H2O coefficient " + cValueString + "\n"
        # not mutable?
        return cString + cValueString

    # creating both a string for printing and a list of values
    cString = ""
    cList = []
    # print in order using col_names
    # coefficients_names is input only now..same for header or no header, or expanded enums
    for c in coefficients_names:
        cString = add_to_coefficient_list_and_string(c, cList, cString)

    if prettyPrint: 
        print "\nH2O intercept:\t\t%.5e" % intercept
        print cString
    else:
        if not noPrint:
            print "\nintercept:", intercept, cString

    print "\nTotal # of coefficients:", len(coefficients_names)

    # pick out the coefficent for the column we enabled for enhanced checking. Can be None.
    # FIX! temporary hack to deal with disappearing/renaming columns in GLM
    if (not allowZeroCoeff) and (colX is not None):
        absXCoeff = abs(float(coefficients[str(colX)]))
        self.assertGreater(absXCoeff, 1e-26, (
            "abs. value of GLM coefficients['" + str(colX) + "'] is " +
            str(absXCoeff) + ", not >= 1e-26 for X=" + str(colX)
            ))

    # intercept is buried in there too
    absIntercept = abs(float(intercept))
    self.assertGreater(absIntercept, 1e-26, (
        "abs. value of GLM coefficients['Intercept'] is " +
        str(absIntercept) + ", not >= 1e-26 for Intercept"
                ))

    # this is good if we just want min or max
    # maxCoeff = max(coefficients, key=coefficients.get)
    # for more, just invert the dictionary and ...
    if (len(coefficients)>0):
        maxKey = max([(abs(coefficients[x]),x) for x in coefficients])[1]
        print "H2O Largest abs. coefficient value:", maxKey, coefficients[maxKey]
        minKey = min([(abs(coefficients[x]),x) for x in coefficients])[1]
        print "H2O Smallest abs. coefficient value:", minKey, coefficients[minKey]
    else: 
        print "Warning, no coefficients returned. Must be intercept only?"

    # many of the GLM tests aren't single column though.
    # quick and dirty check: if all the coefficients are zero, 
    # something is broken
    # intercept is in there too, but this will get it okay
    # just sum the abs value  up..look for greater than 0

    # skip this test if there is just one coefficient. Maybe pointing to a non-important coeff?
    if (not allowZeroCoeff) and (len(coefficients)>1):
        s = 0.0
        for c in coefficients:
            v = coefficients[c]
            s += abs(float(v))

        self.assertGreater(s, 1e-26, (
            "sum of abs. value of GLM coefficients/intercept is " + str(s) + ", not >= 1e-26"
            ))

    print "submodels1, run_time (milliseconds):", submodels1['run_time']

    # shouldn't have any errors
    h2o.check_sandbox_for_errors()

    return (warnings, cList, intercept)
Example #16
0
    def test_GBM_with_cancels(self):
        print "do import/parse with VA"
        h2o.beta_features = False

        importFolderPath = 'standard'
        timeoutSecs = 500
        csvFilenameAll = [
            # have to use col name for response?
            # ("manyfiles-nflx-gz", "file_1.dat.gz", 378),
            # ("manyfiles-nflx-gz", "file_1.dat.gz", 378),
            # ("manyfiles-nflx-gz", "file_[1-9].dat.gz", 378),
            ("standard", "covtype.data", 54),
            # ("standard", "covtype20x.data", 54),
        ]
        # csvFilenameList = random.sample(csvFilenameAll,1)
        csvFilenameList = csvFilenameAll

        # pop open a browser on the cloud
        # h2b.browseTheCloud()

        for (importFolderPath, csvFilename, response) in csvFilenameList:
            # creates csvFilename.hex from file in importFolder dir
            csvPathname = importFolderPath + "/" + csvFilename

            ### h2o.beta_features = False

            (importResult,
             importPattern) = h2i.import_only(bucket='home-0xdiag-datasets',
                                              path=csvPathname,
                                              schema='local',
                                              timeoutSecs=50)
            parseResult = h2i.import_parse(
                bucket='home-0xdiag-datasets',
                path=csvPathname,
                schema='local',
                hex_key='c.hex',
                timeoutSecs=500,
                noPoll=False,
                doSummary=False
            )  # can't do summary until parse result is correct json

            h2o.check_sandbox_for_errors()

            # wait for it to show up in jobs?
            ## time.sleep(2)
            # no pattern waits for all
            ## h2o_jobs.pollWaitJobs(pattern=None, timeoutSecs=300, pollTimeoutSecs=10, retryDelaySecs=5)

            # hack it because no response from Parse2
            if h2o.beta_features:
                parseResult = {'destination_key': 'c.hex'}

            print "\nparseResult", h2o.dump_json(parseResult)

            print "Parse result['destination_key']:", parseResult[
                'destination_key']
            ## What's wrong here? too big?
            ### inspect = h2o_cmd.runInspect(key=parseResult['destination_key'], timeoutSecs=30, verbose=True)

            h2o.check_sandbox_for_errors()

            # have to avoid this on nflx data. colswap with exec
            # Exception: rjson error in gbm: Argument 'response' error: Only integer or enum/factor columns can be classified

            if importFolderPath == 'manyfiles-nflx-gz':
                if DO_CLASSIFICATION:
                    # need to flip the right col! (R wise)
                    execExpr = 'c.hex[,%s]=c.hex[,%s]>15' % (response + 1,
                                                             response + 1)
                    kwargs = {'str': execExpr}
                    resultExec = h2o_cmd.runExec(**kwargs)

                # lets look at the response column now
                h2o.beta_features = True
                s = h2o_cmd.runSummary(key="c.hex", cols=response, max_ncols=1)
                # x = range(542)
                # remove the output too! (378)
                xIgnore = []
                # BUG if you add unsorted 378 to end. remove for now
                for i in [
                        3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 16, 17, 18, 19, 20,
                        424, 425, 426, 540, 541, response
                ]:
                    # have to add 1 for col start with 1, now. plus the C
                    xIgnore.append("C" + str(i + 1))
            else:
                # leave one col ignored, just to see?
                xIgnore = 'C1'

            modelKey = "GBMGood"
            params = {
                'destination_key': modelKey,
                'ignored_cols_by_name': xIgnore,
                'learn_rate': .1,
                'ntrees': 2,
                'max_depth': 8,
                'min_rows': 1,
                'response': "C" + str(response + 1),
                'classification': 1 if DO_CLASSIFICATION else 0,
                'grid_parallelism': 4,
            }

            kwargs = params.copy()
            timeoutSecs = 1800
            start = time.time()
            GBMFirstResult = h2o_cmd.runGBM(parseResult=parseResult,
                                            noPoll=True,
                                            **kwargs)
            print "\nGBMFirstResult:", h2o.dump_json(GBMFirstResult)
            # no pattern waits for all

            for i in range(15):
                # now issue a couple background GBM jobs that we'll kill
                jobids = []
                for j in range(5):
                    # FIX! apparently we can't reuse a model key after a cancel
                    kwargs['destination_key'] = 'GBMBad' + str(i) + str(j)
                    GBMFirstResult = h2o_cmd.runGBM(parseResult=parseResult,
                                                    noPoll=True,
                                                    **kwargs)
                    jobids.append(GBMFirstResult['job_key'])

                # have to pass the job id
                for j in jobids:
                    h2o.nodes[0].jobs_cancel(key=j)

            h2o_jobs.pollWaitJobs(pattern='GBMGood',
                                  timeoutSecs=300,
                                  pollTimeoutSecs=10,
                                  retryDelaySecs=5)
            elapsed = time.time() - start
            print "GBM training completed in", elapsed, "seconds."

            gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey)
            # errrs from end of list? is that the last tree?
            errsLast = gbmTrainView['gbm_model']['errs'][-1]

            print "GBM 'errsLast'", errsLast
            if DO_CLASSIFICATION:
                cm = gbmTrainView['gbm_model']['cms'][-1][
                    '_arr']  # use the last one
                pctWrongTrain = h2o_gbm.pp_cm_summary(cm)
                print "Last line of this cm might be NAs, not CM"
                print "\nTrain\n==========\n"
                print h2o_gbm.pp_cm(cm)
            else:
                print "GBMTrainView:", h2o.dump_json(
                    gbmTrainView['gbm_model']['errs'])

            h2o.check_sandbox_for_errors()

            if DELETE_KEYS:
                h2i.delete_keys_from_import_result(pattern=csvFilename,
                                                   importResult=importResult)
    def test_GBM_params_rand2(self):
        h2o.beta_features = True
        bucket = 'home-0xdiag-datasets'
        modelKey = 'GBMModelKey'
        files = [
                # ('standard', 'covtype.shuffled.90pct.data', 'covtype.train.hex', 1800, 54, 'covtype.shuffled.10pct.data', 'covtype.test.hex')
                ('standard', 'covtype.shuffled.10pct.sorted.data', 'covtype.train.hex', 1800, 'C54', 'covtype.shuffled.10pct.data', 'covtype.test.hex')
                ]

        for (importFolderPath, trainFilename, trainKey, timeoutSecs, response, testFilename, testKey) in files:
            # PARSE train****************************************
            start = time.time()
            xList = []
            eList = []
            fList = []

            # Parse (train)****************************************
            parseTrainResult = h2i.import_parse(bucket=bucket, path=importFolderPath + "/" + trainFilename, schema='local',
                hex_key=trainKey, timeoutSecs=timeoutSecs, doSummary=False)
            elapsed = time.time() - start
            print "train parse end on ", trainFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "train parse result:", trainKey

            # Parse (test)****************************************
            parseTestResult = h2i.import_parse(bucket=bucket, path=importFolderPath + "/" + testFilename, schema='local',
                hex_key=testKey, timeoutSecs=timeoutSecs, doSummary=False)
            elapsed = time.time() - start
            print "test parse end on ", testFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "test parse result:", testKey

            paramsDict = define_gbm_params()
            for trial in range(3):
                # use this to set any defaults you want if the pick doesn't set
                print "Regression!"
                params = {'response': 'C54', 'ignored_cols_by_name': 'C5,C6,C7,C8,C9', 'ntrees': 2, 'classification': 0}
                h2o_gbm.pickRandGbmParams(paramsDict, params)
                print "Using these parameters for GBM: ", params
                kwargs = params.copy()

                # GBM train****************************************
                trainStart = time.time()
                gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult,
                    timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs)
                trainElapsed = time.time() - trainStart
                print "GBM training completed in", trainElapsed, "seconds. On dataset: ", trainFilename

                gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey)
                # errrs from end of list? is that the last tree?
                errsLast = gbmTrainView['gbm_model']['errs'][-1]
                print "GBM 'errsLast'", errsLast

                cm = gbmTrainView['gbm_model']['cms'][-1] # use the last one
                pctWrongTrain = h2o_gbm.pp_cm_summary(cm);
                print "\nTrain\n==========\n"
                print h2o_gbm.pp_cm(cm)

                # GBM test****************************************
                predictKey = 'Predict.hex'
                start = time.time()
                gbmTestResult = h2o_cmd.runPredict(
                    data_key=testKey,
                    model_key=modelKey,
                    destination_key=predictKey,
                    timeoutSecs=timeoutSecs)
                elapsed = time.time() - start
                print "GBM predict completed in", elapsed, "seconds. On dataset: ", testFilename
                print "FIX! where do we get the summary info on the test data after predict?"
Example #18
0
    def test_c10_rel_gbm(self):
        print "not necessarily run as user=0xcust..., can't use a  schema='put' here"
        print "Want to be able to run python as jenkins"
        print "I guess for big 0xcust files, we don't need schema='put'"
        print "For files that we want to put (for testing put), we can get non-private files"

        # Parse Test***********************************************************
        importFolderPath = '/mnt/0xcustomer-datasets/c3'
        testFilename = 'classification1Test.txt'
        testPathname = importFolderPath + "/" + testFilename

        start = time.time()
        parseTestResult = h2i.import_parse(path=testPathname, schema='local', timeoutSecs=500, doSummary=True)
        print "Parse of", parseTestResult['destination_key'], "took", time.time() - start, "seconds"

        # Parse Train***********************************************************
        importFolderPath = '/mnt/0xcustomer-datasets/c3'
        trainFilename = 'classification1Train.txt'
        trainPathname = importFolderPath + "/" + trainFilename

        start = time.time()
        parseTrainResult = h2i.import_parse(path=trainPathname, schema='local', 
            timeoutSecs=500, doSummary=True)
        print "Parse of", parseTrainResult['destination_key'], "took", time.time() - start, "seconds"

        start = time.time()
        inspect = h2o_cmd.runInspect(None, parseTrainResult['destination_key'], timeoutSecs=500)
        print "Inspect:", parseTrainResult['destination_key'], "took", time.time() - start, "seconds"
        h2o_cmd.infoFromInspect(inspect, trainPathname)
        # num_rows = inspect['num_rows']
        # num_cols = inspect['num_cols']
        # do summary of the parsed dataset last, since we know it fails on this dataset
        summaryResult = h2o_cmd.runSummary(key=parseTrainResult['destination_key'])
        h2o_cmd.infoFromSummary(summaryResult, noPrint=False)

        # GBM Train***********************************************************
        x = [6,7,8,10,12,31,32,33,34,35,36,37,40,41,42,43,44,45,46,47,49,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70]
        # response = 0
        # doesn't work if index is used?
        response = 'outcome'

        # x = range(inspect['num_cols'])
        # del x[response]
        ntrees = 10
        # fails with 40
        params = {
            'learn_rate': .2,
            'nbins': 1024,
            'ntrees': ntrees,
            'max_depth': 20,
            'min_rows': 2,
            'response': response,
            'cols': x,
            # 'ignored_cols_by_name': None,
        }
        print "Using these parameters for GBM: ", params
        kwargs = params.copy()
        modelKey = 'GBMModelKey'

        timeoutSecs = 900

        trainStart = time.time()
        gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult,
            timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs)
        trainElapsed = time.time() - trainStart
        print "GBM training completed in", trainElapsed, "seconds. On dataset: ", trainFilename

        gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey)
        # errrs from end of list? is that the last tree?
        errsLast = gbmTrainView['gbm_model']['errs'][-1]
        print "GBM 'errsLast'", errsLast

        # get the last cm
        cm = gbmTrainView['gbm_model']['cms'][-1]['_arr']
        pctWrongTrain = h2o_gbm.pp_cm_summary(cm);
        print "Last line of this cm might be NAs, not CM"
        print "\nTrain\n==========\n"
        print h2o_gbm.pp_cm(cm)

        # GBM test****************************************
        predictKey = 'Predict.hex'
        h2o_cmd.runInspect(key=parseTestResult['destination_key'])
        start = time.time()
        gbmTestResult = h2o_cmd.runPredict(
            data_key=parseTestResult['destination_key'],
            model_key=modelKey,
            destination_key=predictKey,
            timeoutSecs=timeoutSecs)
        elapsed = time.time() - start
        print "GBM predict completed in", elapsed, "seconds. On dataset: ", testFilename


        if DO_PREDICT_CM:
            gbmPredictCMResult =h2o.nodes[0].predict_confusion_matrix(
                actual=parseTestResult['destination_key'],
                vactual='predict',
                predict=predictKey,
                vpredict='predict', # choices are 7 (now) and 'predict'
                )

            # errrs from end of list? is that the last tree?
            # all we get is cm
            cm = gbmPredictCMResult['cm']

            # These will move into the h2o_gbm.py
            pctWrong = h2o_gbm.pp_cm_summary(cm);
            print "Last line of this cm is really NAs, not CM"
            print "\nTest\n==========\n"
            print h2o_gbm.pp_cm(cm)
Example #19
0
    def test_GBM_params_rand2(self):
        h2o.beta_features = False
        bucket = 'home-0xdiag-datasets'
        modelKey = 'GBMModelKey'
        files = [
                # ('standard', 'covtype.shuffled.90pct.data', 'covtype.train.hex', 1800, 54, 'covtype.shuffled.10pct.data', 'covtype.test.hex')
                ('standard', 'covtype.shuffled.10pct.sorted.data', 'covtype.train.hex', 1800, 54, 'covtype.shuffled.10pct.data', 'covtype.test.hex')
                ]

        for (importFolderPath, trainFilename, trainKey, timeoutSecs, response, testFilename, testKey) in files:
            h2o.beta_features = False #turn off beta_features
            # PARSE train****************************************
            start = time.time()
            xList = []
            eList = []
            fList = []

            # Parse (train)****************************************
            if h2o.beta_features:
                print "Parsing to fvec directly! Have to noPoll=true!, and doSummary=False!"
            parseTrainResult = h2i.import_parse(bucket=bucket, path=importFolderPath + "/" + trainFilename, schema='local',
                hex_key=trainKey, timeoutSecs=timeoutSecs, noPoll=h2o.beta_features, doSummary=False)
            # hack
            if h2o.beta_features:
                h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs)
                print "Filling in the parseTrainResult['destination_key'] for h2o"
                parseTrainResult['destination_key'] = trainKey

            elapsed = time.time() - start
            print "train parse end on ", trainFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "train parse result:", parseTrainResult['destination_key']

            # Parse (test)****************************************
            if h2o.beta_features:
                print "Parsing to fvec directly! Have to noPoll=true!, and doSummary=False!"
            parseTestResult = h2i.import_parse(bucket=bucket, path=importFolderPath + "/" + testFilename, schema='local',
                hex_key=testKey, timeoutSecs=timeoutSecs, noPoll=h2o.beta_features, doSummary=False)
            # hack
            if h2o.beta_features:
                h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs)
                print "Filling in the parseTestResult['destination_key'] for h2o"
                parseTestResult['destination_key'] = testKey

            elapsed = time.time() - start
            print "test parse end on ", testFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "test parse result:", parseTestResult['destination_key']

            # GBM (train iterate)****************************************
            inspect = h2o_cmd.runInspect(key=parseTestResult['destination_key'])
            paramsDict = define_gbm_params()
            for trial in range(3):
                h2o.beta_features = True
                # translate it (only really need to do once . out of loop?
                h2o_cmd.runInspect(key=parseTrainResult['destination_key'])
                ### h2o_cmd.runSummary(key=parsTraineResult['destination_key'])

                # use this to set any defaults you want if the pick doesn't set
                params = {
                    'response': 54, 
                    'ignored_cols_by_name': 
                    '0,1,2,3,4', 
                    'ntrees': 2,
                    'validation': parseTestResult['destination_key'],
                }
                h2o_gbm.pickRandGbmParams(paramsDict, params)
                print "Using these parameters for GBM: ", params
                kwargs = params.copy()

                # GBM train****************************************
                trainStart = time.time()
                gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult,
                    noPoll=True, timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs)
                # hack
                if h2o.beta_features:
                    h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs)
                trainElapsed = time.time() - trainStart
                print "GBM training completed in", trainElapsed, "seconds. On dataset: ", trainFilename

                gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey)
                # errrs from end of list? is that the last tree?
                errsLast = gbmTrainView['gbm_model']['errs'][-1]
                print "GBM 'errsLast'", errsLast

                cm = gbmTrainView['gbm_model']['cms'][-1] # use the last one
                pctWrongTrain = h2o_gbm.pp_cm_summary(cm);
                print "Last line of this cm might be NAs, not CM"
                print "\nTrain\n==========\n"
                print h2o_gbm.pp_cm(cm)

                # GBM test****************************************
                predictKey = 'Predict.hex'
                h2o_cmd.runInspect(key=parseTestResult['destination_key'])
                start = time.time()
                gbmTestResult = h2o_cmd.runPredict(
                    data_key=parseTestResult['destination_key'], 
                    model_key=modelKey,
                    destination_key=predictKey,
                    timeoutSecs=timeoutSecs)
                # hack
                if h2o.beta_features:
                    h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs)
                elapsed = time.time() - start
                print "GBM predict completed in", elapsed, "seconds. On dataset: ", testFilename

                if DO_PREDICT_CM:
                    gbmPredictCMResult = h2o.nodes[0].predict_confusion_matrix(
                        actual=parseTestResult['destination_key'],
                        vactual='predict',
                        predict=predictKey,
                        vpredict='predict', # choices are 7 (now) and 'predict'
                        )

                    # errrs from end of list? is that the last tree?
                    # all we get is cm
                    cm = gbmPredictCMResult['cms'][-1] # use the last one

                    # These will move into the h2o_gbm.py
                    pctWrong = h2o_gbm.pp_cm_summary(cm);
                    print "Last line of this cm is really NAs, not CM"
                    print "\nTest\n==========\n"
                    print h2o_gbm.pp_cm(cm)

                # xList.append(ntrees)
                if 'max_depth' in params and params['max_depth']:
                    xList.append(params['max_depth'])
                    eList.append(pctWrongTrain)
                    fList.append(trainElapsed)

            h2o.beta_features = False
            xLabel = 'max_depth'
            eLabel = 'pctWrongTrain'
            fLabel = 'trainElapsed'
            eListTitle = ""
            fListTitle = ""
            h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
Example #20
0
    def test_GLM2_mnist(self):
        if not SCIPY_INSTALLED:
            pass

        else:    
            h2o.beta_features = True
            SYNDATASETS_DIR = h2o.make_syn_dir()

            csvFilelist = [
                (10000, 500, 'cA', 60),
            ]

            trial = 0
            for (rowCount, colCount, hex_key, timeoutSecs) in csvFilelist:
                trialStart = time.time()

                # PARSE test****************************************
                csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
                csvPathname = SYNDATASETS_DIR + "/" + csvFilename
                write_syn_dataset(csvPathname, rowCount, colCount)

                start = time.time()
                parseResult = h2i.import_parse(path=csvPathname, schema='put', 
                    hex_key=hex_key, timeoutSecs=timeoutSecs)
                elapsed = time.time() - start
                print "parse end on ", csvFilename, 'took', elapsed, 'seconds',\
                    "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

                # GLM****************************************
                modelKey = 'GLM_model'
                y = colCount 
                kwargs = {
                    'response': 'C' + str(y+1),
                    'family': 'binomial',
                    'lambda': 1e-4, 
                    'alpha': 0,
                    'max_iter': 15,
                    'n_folds': 1,
                    'beta_epsilon': 1.0E-4,
                    'destination_key': modelKey,
                    }

                # GLM wants the output col to be strictly 0,1 integer
                execExpr = "aHack=%s; aHack[,%s] = aHack[,%s]==1" % (hex_key, y+1, y+1)
                h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)
                aHack = {'destination_key': 'aHack'}

                
                timeoutSecs = 1800
                start = time.time()
                glm = h2o_cmd.runGLM(parseResult=aHack, timeoutSecs=timeoutSecs, pollTimeoutSecs=60, **kwargs)
                elapsed = time.time() - start
                print "GLM completed in", elapsed, "seconds.", \
                    "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

                h2o_glm.simpleCheckGLM(self, glm, None, noPrint=True, **kwargs)
                modelKey = glm['glm_model']['_key']

                # This seems wrong..what's the format of the cm?
                lambdaMax = glm['glm_model']['lambda_max']
                print "lambdaMax:", lambdaMax

                best_threshold= glm['glm_model']['submodels'][0]['validation']['best_threshold']
                print "best_threshold", best_threshold

                # pick the middle one?
                cm = glm['glm_model']['submodels'][0]['validation']['_cms'][5]['_arr']
                print "cm:", cm
                pctWrong = h2o_gbm.pp_cm_summary(cm);
                # self.assertLess(pctWrong, 9,"Should see less than 9% error (class = 4)")

                print "\nTrain\n==========\n"
                print h2o_gbm.pp_cm(cm)

                # Score *******************************
                # this messes up if you use case_mode/case_vale above
                print "\nPredict\n==========\n"
                predictKey = 'Predict.hex'
                start = time.time()

                predictResult = h2o_cmd.runPredict(
                    data_key='aHack',
                    model_key=modelKey,
                    destination_key=predictKey,
                    timeoutSecs=timeoutSecs)

                predictCMResult = h2o.nodes[0].predict_confusion_matrix(
                    actual='aHack',
                    vactual='C' + str(y+1),
                    predict=predictKey,
                    vpredict='predict',
                    )

                cm = predictCMResult['cm']

                # These will move into the h2o_gbm.py
                pctWrong = h2o_gbm.pp_cm_summary(cm);
                self.assertLess(pctWrong, 50,"Should see less than 50% error")

                print "\nTest\n==========\n"
                print h2o_gbm.pp_cm(cm)
    def test_c9_GLM_airlines_hdfs(self):
        files = [
                 ('datasets', 'airlines_all.csv', 'airlines_all.hex', 1800, 'IsDepDelayed')
                ]

        for importFolderPath, csvFilename, trainKey, timeoutSecs, response in files:
            # PARSE train****************************************
            csvPathname = importFolderPath + "/" + csvFilename
            
            start = time.time()
            parseResult = h2i.import_parse(path=csvPathname, schema='hdfs', hex_key=trainKey, 
                timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "parse end on ", csvFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseResult['destination_key']

            # GLM (train)****************************************
            params = {
                # 'lambda': 1e-4,
                # 'alpha': 0.5,
                'lambda': 1e-8,
                'alpha': 0.0,
                'max_iter': 10,
                'n_folds': 3,
                'family': 'binomial',
                'destination_key': "GLMKEY",
                'response': response,
                'ignored_cols': 'CRSDepTime,CRSArrTime,ActualElapsedTime,CRSElapsedTime,AirTime,ArrDelay,DepDelay,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,IsArrDelayed'
            }
            kwargs = params.copy()
            timeoutSecs = 1800
            start = time.time()
            glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs,**kwargs)
            elapsed = time.time() - start
            print "GLM training completed in", elapsed, "seconds. On dataset: ", csvFilename
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)

            modelKey = glm['glm_model']['_key']

            submodels = glm['glm_model']['submodels']
            # hackery to make it work when there's just one
            validation = submodels[-1]['validation']
            best_threshold = validation['best_threshold']
            thresholds = validation['thresholds']
            # have to look up the index for the cm, from the thresholds list
            best_index = None
            for i,t in enumerate(thresholds):
                if t == best_threshold:
                    best_index = i
                    break
            cms = validation['_cms']
            cm = cms[best_index]
            pctWrong = h2o_gbm.pp_cm_summary(cm['_arr']);
            # FIX! should look at prediction error/class error?
            # self.assertLess(pctWrong, 9,"Should see less than 40% error")

            print "\nTrain\n==========\n"
            print h2o_gbm.pp_cm(cm['_arr'])

            # Score *******************************
            # this messes up if you use case_mode/case_vale above
            predictKey = 'Predict.hex'
            start = time.time()

            predictResult = h2o_cmd.runPredict(
                data_key=trainKey,
                model_key=modelKey,
                destination_key=predictKey,
                timeoutSecs=timeoutSecs)

            predictCMResult = h2o.nodes[0].predict_confusion_matrix(
                actual=trainKey,
                vactual=response,
                predict=predictKey,
                vpredict='predict',
                )

            cm = predictCMResult['cm']
            # These will move into the h2o_gbm.py
            pctWrong = h2o_gbm.pp_cm_summary(cm);
            # self.assertLess(pctWrong, 40,"Should see less than 40% error")

            print "\nTest\n==========\n"
            print h2o_gbm.pp_cm(cm)


        h2i.delete_keys_at_all_nodes(timeoutSecs=600)
Example #22
0
    def test_GBM_manyfiles_train_test(self):
        h2o.beta_features = True
        bucket = 'home-0xdiag-datasets'
        modelKey = 'GBMModelKey'
        if localhost:
            files = [
                # None forces numCols to be used. assumes you set it from Inspect
                # problems with categoricals not in the train data set? (warnings in h2o stdout)
                ## ('manyfiles-nflx-gz', 'file_1.dat.gz', 'file_1.hex', 1800, None, 'file_11.dat.gz', 'test.hex')
                # just use matching
                ('manyfiles-nflx-gz', 'file_1.dat.gz', 'train.hex', 1800, None, 'file_1.dat.gz', 'test.hex')
                ]
        else:
            files = [
                # None forces numCols to be used. assumes you set it from Inspect
                ('manyfiles-nflx-gz', 'file_[0-9].dat.gz', 'train.hex', 1800, None, 'file_1[0-9].dat.gz', 'test.hex')
                ]

        # if I got to hdfs, it's here
        # hdfs://172.16.2.176/datasets/manyfiles-nflx-gz/file_99.dat.gz

        h2b.browseTheCloud()
        for (importFolderPath, trainFilename, trainKey, timeoutSecs, response, testFilename, testKey) in files:
            # PARSE train****************************************
            start = time.time()
            xList = []
            eList = []
            fList = []

            # Parse (train)****************************************
            csvPathname = importFolderPath + "/" + trainFilename
            parseTrainResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local',
                hex_key=trainKey, timeoutSecs=timeoutSecs, doSummary=False)
            elapsed = time.time() - start
            print "train parse end on ", trainFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "train parse result:", parseTrainResult['destination_key']

            ### h2o_cmd.runSummary(key=parsTraineResult['destination_key'])

            inspect = h2o_cmd.runInspect(key=parseTrainResult['destination_key'])
            print "\n" + csvPathname, \
                "    numRows:", "{:,}".format(inspect['numRows']), \
                "    numCols:", "{:,}".format(inspect['numCols'])
            numRows = inspect['numRows']
            numCols = inspect['numCols']

            # Make col 378 it something we can do binomial regression on!
            execExpr = '%s[,378+1]=%s[,378+1]>15' % (trainKey, trainKey)
            resultExec = h2o_cmd.runExec(str=execExpr, timeoutSecs=300)

            # Parse (test)****************************************
            parseTestResult = h2i.import_parse(bucket=bucket, path=importFolderPath + "/" + testFilename, schema='local',
                hex_key=testKey, timeoutSecs=timeoutSecs, doSummary=False)
            elapsed = time.time() - start
            print "test parse end on ", testFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "test parse result:", parseTestResult['destination_key']

            # Make col 378 it something we can do binomial regression on!
            execExpr = '%s[,378+1]=%s[,378+1]>15' % (testKey, testKey)
            resultExec = h2o_cmd.runExec(str=execExpr, timeoutSecs=300)

            # Note ..no inspect of test data here..so translate happens later?

            # GBM (train iterate)****************************************
            # if not response:
            #     response = numCols - 1
            response = 378

            # randomly ignore a bunch of cols, just to make it go faster
            x = range(numCols)
            del x[response]
            ignored_cols_by_name = ",".join(map(lambda x: 'C' + str(x+1), random.sample(x, 300)))

            print "Using the same response %s for train and test (which should have a output value too)" % "C" + str(response+1)

            ntrees = 10
            # ignore 200 random cols (not the response)
            for max_depth in [5, 40]:
                params = {
                    'learn_rate': .2,
                    'nbins': 1024,
                    'ntrees': ntrees,
                    'max_depth': max_depth,
                    'min_rows': 10,
                    'response': 'C' + str(response+1),
                    'ignored_cols_by_name': ignored_cols_by_name,
                }
            



                ### print "Using these parameters for GBM: ", params
                kwargs = params.copy()

                # GBM train****************************************
                trainStart = time.time()
                gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult,
                    timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs)
                # hack
                trainElapsed = time.time() - trainStart
                print "GBM training completed in", trainElapsed, "seconds. On dataset: ", trainFilename

                gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey)
                # errrs from end of list? is that the last tree?
                errsLast = gbmTrainView['gbm_model']['errs'][-1]
                print "GBM 'errsLast'", errsLast

                cm = gbmTrainView['gbm_model']['cms'][-1]['_arr'] # use the last one
                pctWrongTrain = h2o_gbm.pp_cm_summary(cm);
                print "Last line of this cm might be NAs, not CM"
                print "\nTrain\n==========\n"
                print h2o_gbm.pp_cm(cm)

                # GBM test****************************************
                predictKey = 'Predict.hex'
                ### h2o_cmd.runInspect(key=parseTestResult['destination_key'])
                start = time.time()
                gbmTestResult = h2o_cmd.runPredict(
                    data_key=parseTestResult['destination_key'], 
                    model_key=modelKey,
                    destination_key=predictKey,
                    timeoutSecs=timeoutSecs)
                elapsed = time.time() - start
                print "GBM predict completed in", elapsed, "seconds. On dataset: ", testFilename

                gbmPredictCMResult =h2o.nodes[0].predict_confusion_matrix(
                    actual=parseTestResult['destination_key'],
                    vactual='C' + str(response+1),
                    predict=predictKey,
                    vpredict='predict', # choices are 0 and 'predict'
                    )

                # errrs from end of list? is that the last tree?
                # all we get is cm
                cm = gbmPredictCMResult['cm']

                # These will move into the h2o_gbm.py
                pctWrong = h2o_gbm.pp_cm_summary(cm);
                print "Last line of this cm is really NAs, not CM"
                print "\nTest\n==========\n"
                print h2o_gbm.pp_cm(cm)

                # xList.append(ntrees)
                xList.append(max_depth)
                eList.append(pctWrong)
                fList.append(trainElapsed)

            xLabel = 'max_depth'
            eLabel = 'pctWrong'
            fLabel = 'trainElapsed'
            eListTitle = ""
            fListTitle = ""
            h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
Example #23
0
    def test_GLM2_covtype_train_predict_all_all(self):
        h2o.beta_features = True
        importFolderPath = "standard"
        csvFilename = 'covtype.shuffled.data'
        csvPathname = importFolderPath + "/" + csvFilename
        hex_key = csvFilename + ".hex"

        # Parse and Exec************************************************
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets',
                                       path=csvPathname,
                                       schema='put',
                                       hex_key=hex_key,
                                       timeoutSecs=180)

        execExpr = "A.hex=%s" % parseResult['destination_key']
        h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)

        # use exec to change the output col to binary, case_mode/case_val doesn't work if we use predict
        # will have to live with random extract. will create variance
        # class 4 = 1, everything else 0
        y = 54
        execExpr = "A.hex[,%s]=(A.hex[,%s]==%s)" % (y + 1, y + 1, 1)  # class 1
        h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)

        inspect = h2o_cmd.runInspect(key="A.hex")
        print "\n" + csvPathname, \
            "    numRows:", "{:,}".format(inspect['numRows']), \
            "    numCols:", "{:,}".format(inspect['numCols'])

        print "Use same data (full) for train and test"
        trainDataKey = "A.hex"
        testDataKey = "A.hex"
        # start at 90% rows + 1

        # GLM, predict, CM*******************************************************8
        kwargs = {
            'response': 'C' + str(y + 1),
            'max_iter': 20,
            'n_folds': 0,
            # 'alpha': 0.1,
            # 'lambda': 1e-5,
            'alpha': 0.0,
            'lambda': None,
            'family': 'binomial',
        }
        timeoutSecs = 60

        for trial in range(1):
            # test/train split **********************************************8
            aHack = {'destination_key': trainDataKey}

            # GLM **********************************************8
            start = time.time()
            glm = h2o_cmd.runGLM(parseResult=aHack,
                                 timeoutSecs=timeoutSecs,
                                 pollTimeoutSecs=180,
                                 **kwargs)
            print "glm end on ", parseResult[
                'destination_key'], 'took', time.time() - start, 'seconds'
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)

            modelKey = glm['glm_model']['_key']
            submodels = glm['glm_model']['submodels']
            # hackery to make it work when there's just one
            validation = submodels[-1]['validation']
            best_threshold = validation['best_threshold']
            thresholds = validation['thresholds']

            # have to look up the index for the cm, from the thresholds list
            best_index = None
            for i, t in enumerate(thresholds):
                if t == best_threshold:
                    best_index = i
                    break
            cms = validation['_cms']
            cm = cms[best_index]
            trainPctWrong = h2o_gbm.pp_cm_summary(cm['_arr'])

            # Score **********************************************
            predictKey = 'Predict.hex'
            start = time.time()

            predictResult = h2o_cmd.runPredict(data_key=testDataKey,
                                               model_key=modelKey,
                                               destination_key=predictKey,
                                               timeoutSecs=timeoutSecs)

            predictCMResult = h2o.nodes[0].predict_confusion_matrix(
                actual=testDataKey,
                vactual='C' + str(y + 1),
                predict=predictKey,
                vpredict='predict',
            )

            cm = predictCMResult['cm']

            # These will move into the h2o_gbm.py
            pctWrong = h2o_gbm.pp_cm_summary(cm)
            self.assertEqual(
                pctWrong, trainPctWrong,
                "Should see the same error rate on train and predict? (same data set)"
            )

            print "\nTest\n==========\n"
            print h2o_gbm.pp_cm(cm)

            print "Trial #", trial, "completed"
Example #24
0
    def test_GBM_with_cancels(self):
        print "do import/parse with VA"
        h2o.beta_features = False

        importFolderPath = "standard"
        timeoutSecs = 500
        csvFilenameAll = [
            # have to use col name for response?
            # ("manyfiles-nflx-gz", "file_1.dat.gz", 378),
            # ("manyfiles-nflx-gz", "file_1.dat.gz", 378),
            # ("manyfiles-nflx-gz", "file_[1-9].dat.gz", 378),
            ("standard", "covtype.data", 54),
            # ("standard", "covtype20x.data", 54),
        ]
        # csvFilenameList = random.sample(csvFilenameAll,1)
        csvFilenameList = csvFilenameAll

        # pop open a browser on the cloud
        # h2b.browseTheCloud()

        for (importFolderPath, csvFilename, response) in csvFilenameList:
            # creates csvFilename.hex from file in importFolder dir
            csvPathname = importFolderPath + "/" + csvFilename

            ### h2o.beta_features = False

            (importResult, importPattern) = h2i.import_only(
                bucket="home-0xdiag-datasets", path=csvPathname, schema="local", timeoutSecs=50
            )
            parseResult = h2i.import_parse(
                bucket="home-0xdiag-datasets",
                path=csvPathname,
                schema="local",
                hex_key="c.hex",
                timeoutSecs=500,
                noPoll=False,
                doSummary=False,
            )  # can't do summary until parse result is correct json

            h2o.check_sandbox_for_errors()

            # wait for it to show up in jobs?
            ## time.sleep(2)
            # no pattern waits for all
            ## h2o_jobs.pollWaitJobs(pattern=None, timeoutSecs=300, pollTimeoutSecs=10, retryDelaySecs=5)

            # hack it because no response from Parse2
            if h2o.beta_features:
                parseResult = {"destination_key": "c.hex"}

            print "\nparseResult", h2o.dump_json(parseResult)

            print "Parse result['destination_key']:", parseResult["destination_key"]
            ## What's wrong here? too big?
            ### inspect = h2o_cmd.runInspect(key=parseResult['destination_key'], timeoutSecs=30, verbose=True)

            h2o.check_sandbox_for_errors()

            # have to avoid this on nflx data. colswap with exec
            # Exception: rjson error in gbm: Argument 'response' error: Only integer or enum/factor columns can be classified

            if importFolderPath == "manyfiles-nflx-gz":
                if DO_CLASSIFICATION:
                    # need to flip the right col! (R wise)
                    execExpr = "c.hex[,%s]=c.hex[,%s]>15" % (response + 1, response + 1)
                    kwargs = {"str": execExpr}
                    resultExec = h2o_cmd.runExec(**kwargs)

                # lets look at the response column now
                h2o.beta_features = True
                s = h2o_cmd.runSummary(key="c.hex", cols=response, max_ncols=1)
                # x = range(542)
                # remove the output too! (378)
                xIgnore = []
                # BUG if you add unsorted 378 to end. remove for now
                for i in [3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 16, 17, 18, 19, 20, 424, 425, 426, 540, 541, response]:
                    # have to add 1 for col start with 1, now. plus the C
                    xIgnore.append("C" + str(i + 1))
            else:
                # leave one col ignored, just to see?
                xIgnore = "C1"

            modelKey = "GBMGood"
            params = {
                "destination_key": modelKey,
                "ignored_cols_by_name": xIgnore,
                "learn_rate": 0.1,
                "ntrees": 2,
                "max_depth": 8,
                "min_rows": 1,
                "response": "C" + str(response + 1),
                "classification": 1 if DO_CLASSIFICATION else 0,
                "grid_parallelism": 4,
            }

            kwargs = params.copy()
            timeoutSecs = 1800
            start = time.time()
            GBMFirstResult = h2o_cmd.runGBM(parseResult=parseResult, noPoll=True, **kwargs)
            print "\nGBMFirstResult:", h2o.dump_json(GBMFirstResult)
            # no pattern waits for all

            for i in range(20):
                # now issue a couple background GBM jobs that we'll kill
                jobids = []
                for j in range(5):
                    # FIX! apparently we can't reuse a model key after a cancel
                    kwargs["destination_key"] = "GBMBad" + str(i) + str(j)
                    GBMFirstResult = h2o_cmd.runGBM(parseResult=parseResult, noPoll=True, **kwargs)
                    jobids.append(GBMFirstResult["job_key"])

                # have to pass the job id
                for j in jobids:
                    h2o.nodes[0].jobs_cancel(key=j)

            h2o_jobs.pollWaitJobs(pattern="GBMGood", timeoutSecs=300, pollTimeoutSecs=10, retryDelaySecs=5)
            elapsed = time.time() - start
            print "GBM training completed in", elapsed, "seconds."

            gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey)
            # errrs from end of list? is that the last tree?
            errsLast = gbmTrainView["gbm_model"]["errs"][-1]

            print "GBM 'errsLast'", errsLast
            if DO_CLASSIFICATION:
                cm = gbmTrainView["gbm_model"]["cms"][-1]["_arr"]  # use the last one
                pctWrongTrain = h2o_gbm.pp_cm_summary(cm)
                print "Last line of this cm might be NAs, not CM"
                print "\nTrain\n==========\n"
                print h2o_gbm.pp_cm(cm)
            else:
                print "GBMTrainView:", h2o.dump_json(gbmTrainView["gbm_model"]["errs"])

            h2o.check_sandbox_for_errors()

            if DELETE_KEYS:
                h2i.delete_keys_from_import_result(pattern=csvFilename, importResult=importResult)
Example #25
0
    def test_RF_many_cols_enum(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        translateList = [
            'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
            'n', 'o', 'p', 'q', 'r', 's', 't', 'u'
        ]

        tryList = [
            (10000, 100, 'cA', 300),
            (10000, 300, 'cB', 500),
            # (10000,  500, 'cC', 700),
            # (10000,  700, 'cD', 3600),
            # (10000,  900, 'cE', 3600),
            # (10000,  1000, 'cF', 3600),
            # (10000,  1300, 'cG', 3600),
            # (10000,  1700, 'cH', 3600),
            # (10000,  2000, 'cI', 3600),
            # (10000,  2500, 'cJ', 3600),
            # (10000,  3000, 'cK', 3600),
        ]

        ### h2b.browseTheCloud()
        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            # csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(
                colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE,
                              translateList)

            # PARSE train****************************************
            start = time.time()
            xList = []
            eList = []
            fList = []

            modelKey = 'RFModelKey'

            # Parse (train)****************************************
            start = time.time()
            parseTrainResult = h2i.import_parse(bucket=None,
                                                path=csvPathname,
                                                schema='put',
                                                header=0,
                                                hex_key=hex_key,
                                                timeoutSecs=timeoutSecs,
                                                doSummary=False)
            elapsed = time.time() - start
            print "train parse end on ", csvPathname, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "train parse result:", parseTrainResult['destination_key']

            # Logging to a benchmark file
            algo = "Parse"
            l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format(
                len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename,
                elapsed)
            print l
            h2o.cloudPerfH2O.message(l)

            inspect = h2o_cmd.runInspect(
                key=parseTrainResult['destination_key'])
            print "\n" + csvPathname, \
                "    numRows:", "{:,}".format(inspect['numRows']), \
                "    numCols:", "{:,}".format(inspect['numCols'])
            numRows = inspect['numRows']
            numCols = inspect['numCols']
            ### h2o_cmd.runSummary(key=parsTraineResult['destination_key'])

            # RF(train iterate)****************************************
            ntrees = 10
            for max_depth in [5, 10, 20, 40]:
                params = {
                    'nbins': 1024,
                    'classification': 1,
                    'ntrees': ntrees,
                    'max_depth': max_depth,
                    'min_rows': 10,
                    'response': 'C' + str(numCols - 1),
                    'ignored_cols_by_name': None,
                }

                print "Using these parameters for RF: ", params
                kwargs = params.copy()

                trainStart = time.time()
                rfResult = h2o_cmd.runRF(parseResult=parseTrainResult,
                                         timeoutSecs=timeoutSecs,
                                         destination_key=modelKey,
                                         **kwargs)
                trainElapsed = time.time() - trainStart
                print "RF training completed in", trainElapsed, "seconds. On dataset: ", csvPathname

                # Logging to a benchmark file
                algo = "RF " + " ntrees=" + str(ntrees) + " max_depth=" + str(
                    max_depth)
                l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format(
                    len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo,
                    csvFilename, trainElapsed)
                print l
                h2o.cloudPerfH2O.message(l)

                errsLast = rfResult['drf_model']['errs'][-1]
                print "RF 'errsLast'", errsLast

                cm = rfResult['drf_model']['cms'][-1][
                    '_arr']  # use the last one
                pctWrongTrain = h2o_gbm.pp_cm_summary(cm)
                print "\nTrain\n==========\n"
                print h2o_gbm.pp_cm(cm)

                # xList.append(ntrees)
                xList.append(max_depth)
                eList.append(pctWrongTrain)
                fList.append(trainElapsed)

        # just plot the last one
        if 1 == 1:
            xLabel = 'max_depth'
            eLabel = 'pctWrong'
            fLabel = 'trainElapsed'
            eListTitle = ""
            fListTitle = ""
            h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel,
                              fListTitle, fList, fLabel)
Example #26
0
    def test_NN_mnist(self):
        #h2b.browseTheCloud()
        csvPathname_train = 'standard/covtype.shuffled.90pct.data'
        csvPathname_test = 'standard/covtype.shuffled.10pct.data'
        hex_key = 'covtype.hex'
        validation_key = 'covtype.hex'
        timeoutSecs = 30
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets',
                                       path=csvPathname_train,
                                       schema='local',
                                       hex_key=hex_key,
                                       timeoutSecs=timeoutSecs)
        parseResultV = h2i.import_parse(bucket='home-0xdiag-datasets',
                                        path=csvPathname_test,
                                        schema='local',
                                        hex_key=validation_key,
                                        timeoutSecs=timeoutSecs)

        inspect = h2o_cmd.runInspect(None, hex_key)
        print "\n" + csvPathname_train, \
            "    numRows:", "{:,}".format(inspect['numRows']), \
            "    numCols:", "{:,}".format(inspect['numCols'])
        response = inspect['numCols'] - 1

        #Making random id
        identifier = ''.join(
            random.sample(string.ascii_lowercase + string.digits, 10))
        model_key = 'nn_' + identifier + '.hex'

        kwargs = {
            'ignored_cols': None,
            'response': response,
            'classification': 1,
            'activation': 'RectifierWithDropout',
            'input_dropout_ratio': 0.2,
            'hidden': '117,131,129',
            'adaptive_rate': 0,
            'rate': 0.005,
            'rate_annealing': 1e-6,
            'momentum_start': 0.5,
            'momentum_ramp': 100000,
            'momentum_stable': 0.9,
            'l1': 0.00001,
            'l2': 0.0000001,
            'seed': 98037452452,
            'loss': 'CrossEntropy',
            'max_w2': 15,
            'initial_weight_distribution': 'UniformAdaptive',
            #'initial_weight_scale'         : 0.01,
            'epochs': 96.0,
            'destination_key': model_key,
            'validation': validation_key,
            'score_interval': 10000
        }
        expectedErr = 0.24  ## expected validation error for the above model
        relTol = 0.20  ## 20% rel. error tolerance due to Hogwild!

        timeoutSecs = 600
        start = time.time()
        nn = h2o_cmd.runDeepLearning(parseResult=parseResult,
                                     timeoutSecs=timeoutSecs,
                                     **kwargs)
        print "neural net end on ", csvPathname_train, " and ", csvPathname_test, 'took', time.time(
        ) - start, 'seconds'

        predict_key = 'score_' + identifier + '.hex'

        kwargs = {
            'data_key': validation_key,
            'destination_key': predict_key,
            'model_key': model_key
        }

        predictResult = h2o_cmd.runPredict(timeoutSecs=timeoutSecs, **kwargs)

        h2o_cmd.runInspect(key=predict_key, verbose=True)

        kwargs = {}

        predictCMResult = h2o.nodes[0].predict_confusion_matrix(
            actual=validation_key,
            vactual=response,
            predict=predict_key,
            vpredict='predict',
            timeoutSecs=timeoutSecs,
            **kwargs)

        cm = predictCMResult['cm']

        print h2o_gbm.pp_cm(cm)
        actualErr = h2o_gbm.pp_cm_summary(cm) / 100.

        print "actual   classification error:" + format(actualErr)
        print "expected classification error:" + format(expectedErr)
        if actualErr != expectedErr and abs(
            (expectedErr - actualErr) / expectedErr) > relTol:
            raise Exception(
                "Scored classification error of %s is not within %s %% relative error of %s"
                % (actualErr, float(relTol) * 100, expectedErr))
Example #27
0
    def test_DeepLearning_mnist(self):
        #h2b.browseTheCloud()
        h2o.beta_features = True
        csvPathname_train = 'mnist/train.csv.gz'
        csvPathname_test  = 'mnist/test.csv.gz'
        hex_key = 'mnist_train.hex'
        validation_key = 'mnist_test.hex'
        timeoutSecs = 300
        parseResult  = h2i.import_parse(bucket='smalldata', path=csvPathname_train, schema='put', hex_key=hex_key, timeoutSecs=timeoutSecs)
        parseResultV = h2i.import_parse(bucket='smalldata', path=csvPathname_test, schema='put', hex_key=validation_key, timeoutSecs=timeoutSecs)

        inspect = h2o_cmd.runInspect(None, hex_key)
        print "\n" + csvPathname_train, \
            "    numRows:", "{:,}".format(inspect['numRows']), \
            "    numCols:", "{:,}".format(inspect['numCols'])
        response = inspect['numCols'] - 1

        #Making random id
        identifier = ''.join(random.sample(string.ascii_lowercase + string.digits, 10))
        model_key = 'deeplearning_' + identifier + '.hex'

        kwargs = {
            'ignored_cols'                 : None,
            'response'                     : response,
            'classification'               : 1,
            'activation'                   : 'RectifierWithDropout',
            'input_dropout_ratio'          : 0.2,
            'hidden'                       : '1024,1024,2048',
            'adaptive_rate'                : 1,
            'rho'                          : 0.99,
            'epsilon'                      : 1e-8,
            'train_samples_per_iteration'  : -1, ## 0: better accuracy!  -1: best scalability!  10000: best accuracy?
#            'rate'                         : 0.01,
#            'rate_annealing'               : 1e-6,
#            'momentum_start'               : 0.5,
#            'momentum_ramp'                : 1800000,
#            'momentum_stable'              : 0.99,
            'l1'                           : 1e-5,
            'l2'                           : 0.0,
            'seed'                         : 98037452452,
            'loss'                         : 'CrossEntropy',
            'max_w2'                       : 15,
            'initial_weight_distribution'  : 'UniformAdaptive',
            'epochs'                       : 128, #enough for 64 nodes
            'destination_key'              : model_key,
            'validation'                   : validation_key,
            'score_interval'               : 10000 #don't score until the end
            }

        timeoutSecs = 7200
        start = time.time()
        deeplearning = h2o_cmd.runDeepLearning(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
        print "neural net end on ", csvPathname_train, " and ", csvPathname_test, 'took', time.time() - start, 'seconds'

        predict_key = 'score_' + identifier + '.hex'

        kwargs = {
            'data_key': validation_key,
            'destination_key': predict_key,
            'model_key': model_key
            }

        h2o.beta_features = True
        predictResult = h2o_cmd.runPredict(timeoutSecs=timeoutSecs, **kwargs)

        h2o_cmd.runInspect(key=predict_key, verbose=True)

        kwargs = {
        }

        predictCMResult = h2o.nodes[0].predict_confusion_matrix(
            actual=validation_key,
            vactual=response,
            predict=predict_key,
            vpredict='predict',
            timeoutSecs=timeoutSecs, **kwargs)

        cm = predictCMResult['cm']

        print h2o_gbm.pp_cm(cm)
        actualErr = h2o_gbm.pp_cm_summary(cm)/100.;

        print "actual   classification error:" + format(actualErr)

        h2o.beta_features = False
    def test_RF_many_cols_enum(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        translateList = ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u']

        tryList = [
            (10000, 100, 'cA', 300),
            (10000, 300, 'cB', 500),
            # (10000,  500, 'cC', 700),
            # (10000,  700, 'cD', 3600),
            # (10000,  900, 'cE', 3600),
            # (10000,  1000, 'cF', 3600),
            # (10000,  1300, 'cG', 3600),
            # (10000,  1700, 'cH', 3600),
            # (10000,  2000, 'cI', 3600),
            # (10000,  2500, 'cJ', 3600),
            (10000, 3000, 'cK', 3600),
        ]

        ### h2b.browseTheCloud()
        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            # csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, translateList)

            # PARSE train****************************************
            start = time.time()
            xList = []
            eList = []
            fList = []

            modelKey = 'RFModelKey'

            # Parse (train)****************************************
            start = time.time()
            parseTrainResult = h2i.import_parse(bucket=None, path=csvPathname, schema='put', header=0,
                                                hex_key=hex_key, timeoutSecs=timeoutSecs, doSummary=False)
            elapsed = time.time() - start
            print "train parse end on ", csvPathname, 'took', elapsed, 'seconds', \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "train parse result:", parseTrainResult['destination_key']

            # Logging to a benchmark file
            algo = "Parse"
            l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format(
                len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename, elapsed)
            print l
            h2o.cloudPerfH2O.message(l)

            inspect = h2o_cmd.runInspect(key=parseTrainResult['destination_key'])
            print "\n" + csvPathname, \
                "    numRows:", "{:,}".format(inspect['numRows']), \
                "    numCols:", "{:,}".format(inspect['numCols'])
            numRows = inspect['numRows']
            numCols = inspect['numCols']
            ### h2o_cmd.runSummary(key=parsTraineResult['destination_key'])

            # RF(train iterate)****************************************
            ntrees = 10
            for max_depth in [5,10,20,40]:
                params = {
                    'nbins': 1024,
                    'classification': 1,
                    'ntrees': ntrees,
                    'max_depth': max_depth,
                    'response': 'C' + str(numCols-1),
                    'ignored_cols_by_name': None,
                    }

                print "Using these parameters for RF: ", params
                kwargs = params.copy()

                trainStart = time.time()
                rfResult = h2o_cmd.runSpeeDRF(parseResult=parseTrainResult,
                                         timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs)
                trainElapsed = time.time() - trainStart
                print "RF training completed in", trainElapsed, "seconds. On dataset: ", csvPathname

                # Logging to a benchmark file
                algo = "RF " + " ntrees=" + str(ntrees) + " max_depth=" + str(max_depth)
                l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format(
                    len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename, trainElapsed)
                print l
                h2o.cloudPerfH2O.message(l)
                rfResult["drf_model"] = rfResult.pop("speedrf_model")
                errsLast = rfResult['drf_model']['errs'][-1]
                print "RF 'errsLast'", errsLast

                cm = rfResult['drf_model']['cms'][-1]['_arr'] # use the last one
                pctWrongTrain = h2o_gbm.pp_cm_summary(cm);
                print "\nTrain\n==========\n"
                print h2o_gbm.pp_cm(cm)

                # xList.append(ntrees)
                xList.append(max_depth)
                eList.append(pctWrongTrain)
                fList.append(trainElapsed)

        # just plot the last one
        if 1==1:
            xLabel = 'max_depth'
            eLabel = 'pctWrong'
            fLabel = 'trainElapsed'
            eListTitle = ""
            fListTitle = ""
            h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
Example #29
0
    def test_c9_GLM_rc_fvec(self):
        h2o.beta_features = True

        files = [
                 ('c16', '140k_train_anonymised.csv', 'rc.hex', 1800,  None)
                ]

        for importFolderPath, csvFilename, trainKey, timeoutSecs, response in files:
            # PARSE train****************************************
            csvPathname = importFolderPath + "/" + csvFilename
            
            start = time.time()
            # avoid printing the coefficient names in jenkins output
            # the last col is the response, so we use a number to point to it below
            parseResult = h2i.import_parse(bucket='0xcustomer-datasets', path=csvPathname, schema='local', hex_key=trainKey, 
                header=0, timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "parse end on ", csvFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

            inspect = h2o_cmd.runInspect(key=parseResult['destination_key'])
            numRows = inspect['numRows']
            numCols = inspect['numCols']
            response = numCols-1

            # GLM (train)****************************************
            params = {
                # 'lambda': 1e-4,
                # 'alpha': 0.5,
                'lambda': 1e-8,
                'alpha': 0.0,
                'max_iter': 10,
                'n_folds': 0,
                'family': 'binomial',
                'destination_key': "GLMKEY",
                'response': response,
            }
            kwargs = params.copy()
            timeoutSecs = 1800
            start = time.time()
            glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs,**kwargs)
            elapsed = time.time() - start
            print "GLM training completed in", elapsed, "seconds. On dataset: ", csvFilename
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)

            if h2o.beta_features:
                modelKey = glm['glm_model']['_key']

                submodels = glm['glm_model']['submodels']
                # hackery to make it work when there's just one
                validation = submodels[-1]['validation']
                best_threshold = validation['best_threshold']
                thresholds = validation['thresholds']
                # have to look up the index for the cm, from the thresholds list
                best_index = None
                for i,t in enumerate(thresholds):
                    if t == best_threshold:
                        best_index = i
                        break
                cms = validation['_cms']
                cm = cms[best_index]
                pctWrong = h2o_gbm.pp_cm_summary(cm['_arr']);
                # FIX! should look at prediction error/class error?
                # self.assertLess(pctWrong, 9,"Should see less than 40% error")

                print "\nTrain\n==========\n"
                print h2o_gbm.pp_cm(cm['_arr'])

                # Score *******************************
                # this messes up if you use case_mode/case_vale above
                predictKey = 'Predict.hex'
                start = time.time()

                predictResult = h2o_cmd.runPredict(
                    data_key=trainKey,
                    model_key=modelKey,
                    destination_key=predictKey,
                    timeoutSecs=timeoutSecs)

                predictCMResult = h2o.nodes[0].predict_confusion_matrix(
                    actual=trainKey,
                    vactual=response,
                    predict=predictKey,
                    vpredict='predict',
                    )

                cm = predictCMResult['cm']
                # These will move into the h2o_gbm.py
                pctWrong = h2o_gbm.pp_cm_summary(cm);
                # self.assertLess(pctWrong, 40,"Should see less than 40% error")

                print "\nTest\n==========\n"
                print h2o_gbm.pp_cm(cm)


        h2i.delete_keys_at_all_nodes(timeoutSecs=600)
Example #30
0
                h2o_cmd.runInspect(key=parseTrainResult['destination_key'])
                ### h2o_cmd.runSummary(key=parsTraineResult['destination_key'])

                # GBM train****************************************
                trainStart = time.time()
                gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult, timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs)
                trainElapsed = time.time() - trainStart
                print "GBM training completed in", trainElapsed, "seconds. On dataset: ", trainFilename

                gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey)
                # errrs from end of list? is that the last tree?
                errsLast = gbmTrainView['gbm_model']['errs'][-1]
                print "GBM 'errsLast'", errsLast

                cm = gbmTrainView['gbm_model']['cms'][-1]['_arr']
                pctWrongTrain = h2o_gbm.pp_cm_summary(cm);
                print "Last line of this cm might be NAs, not CM"
                print "\nTrain\n==========\n"
                print h2o_gbm.pp_cm(cm)

                # GBM test****************************************
                predictKey = 'Predict.hex'
                h2o_cmd.runInspect(key=parseTestResult['destination_key'])
                start = time.time()
                gbmTestResult = h2o_cmd.runPredict(
                    data_key=parseTestResult['destination_key'], 
                    model_key=modelKey,
                    destination_key=predictKey,
                    timeoutSecs=timeoutSecs)
                elapsed = time.time() - start
                print "GBM predict completed in", elapsed, "seconds. On dataset: ", testFilename
    def test_GLM2_covtype20x_train(self):
        h2o.beta_features = True
        importFolderPath = "standard"
        csvFilename = 'covtype20x.data'
        csvPathname = importFolderPath + "/" + csvFilename
        hex_key = csvFilename + ".hex"

        # Parse and Exec************************************************
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=180)

        execExpr="A.hex=%s" % parseResult['destination_key']
        h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)

        # use exec to change the output col to binary, case_mode/case_val doesn't work if we use predict
        # will have to live with random extract. will create variance
        # class 4 = 1, everything else 0
        y = 54
        execExpr="A.hex[,%s]=(A.hex[,%s]==%s)" % (y+1, y+1, 4)
        h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)

        inspect = h2o_cmd.runInspect(key="A.hex")
        print "\n" + csvPathname, \
            "    numRows:", "{:,}".format(inspect['numRows']), \
            "    numCols:", "{:,}".format(inspect['numCols'])

        # Split Test/Train************************************************
        # how many rows for each pct?
        numRows = inspect['numRows']
        pct10 = int(numRows * .1)
        rowsForPct = [i * pct10 for i in range(0,11)]
        # this can be slightly less than 10%
        last10 = numRows - rowsForPct[9]
        rowsForPct[10] = last10
        # use mod below for picking "rows-to-do" in case we do more than 9 trials
        # use 10 if 0 just to see (we copied 10 to 0 above)
        rowsForPct[0] = rowsForPct[10]

        print "Creating the key of the last 10% data, for scoring"
        trainDataKey = "rTrain"
        testDataKey = "rTest"
        # start at 90% rows + 1
        
        # GLM, predict, CM*******************************************************8
        kwargs = {
            'response': 'C' + str(y),
            'max_iter': 20, 
            'n_folds': 0, 
            'alpha': 0.1, 
            'lambda': 1e-5, 
            'family': 'binomial',
            'classification': 1,
        }
        timeoutSecs = 60

        for trial in range(100):
            # always slice from the beginning
            rowsToUse = rowsForPct[trial%10] 

            # test/train split **********************************************8
            h2o_cmd.createTestTrain(srcKey='A.hex', trainDstKey=trainDataKey, testDstKey=testDataKey, trainPercent=90)
            aHack = {'destination_key': trainDataKey}
            parseKey = trainDataKey

            # GLM **********************************************8
            start = time.time()
            glm = h2o_cmd.runGLM(parseResult=aHack, timeoutSecs=timeoutSecs, pollTimeoutSecs=180, **kwargs)
            print "glm end on ", parseResult['destination_key'], 'took', time.time() - start, 'seconds'
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
            modelKey = glm['glm_model']['_key']

            # Score **********************************************
            predictKey = 'Predict.hex'
            start = time.time()

            predictResult = h2o_cmd.runPredict(
                data_key=testDataKey,
                model_key=modelKey,
                destination_key=predictKey,
                timeoutSecs=timeoutSecs)

            predictCMResult = h2o.nodes[0].predict_confusion_matrix(
                actual=testDataKey,
                vactual='C' + str(y),
                predict=predictKey,
                vpredict='predict',
                )

            cm = predictCMResult['cm']

            # These will move into the h2o_gbm.py
            pctWrong = h2o_gbm.pp_cm_summary(cm);
            self.assertLess(pctWrong, 8,"Should see less than 7% error (class = 4)")

            print "\nTest\n==========\n"
            print h2o_gbm.pp_cm(cm)

            print "Trial #", trial, "completed", "using %6.2f" % (rowsToUse*100.0/numRows), "pct. of all rows"
Example #32
0
    def test_DeepLearning_twovalues(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        csvFilename = "syn_twovalues.csv"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        rowDataTrue    = "1, 0, 65, 1, 2, 1, 1, 4, 1, 4, 1, 4"
        rowDataFalse   = "0, 1, 0, -1, -2, -1, -1, -4, -1, -4, -1, -4" 

        twoValueList = [
            ('A','B',0, 14),
            ('A','B',1, 14),
            (0,1,0, 12),
            (0,1,1, 12),
            (0,1,'NaN', 12),
            (1,0,'NaN', 12),
            (-1,1,0, 12),
            (-1,1,1, 12),
            (-1e1,1e1,1e1, 12),
            (-1e1,1e1,-1e1, 12),
            ]

        trial = 0
        for (outputTrue, outputFalse, case, coeffNum) in twoValueList:
            write_syn_dataset(csvPathname, 20, 
                rowDataTrue, rowDataFalse, str(outputTrue), str(outputFalse))

            start = time.time()
            hex_key = csvFilename + "_" + str(trial)
            model_key = 'trial_' + str(trial) + '.hex'
            validation_key = hex_key

            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key)
            print "using outputTrue: %s outputFalse: %s" % (outputTrue, outputFalse)

            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvPathname, \
                "    numRows:", "{:,}".format(inspect['numRows']), \
                "    numCols:", "{:,}".format(inspect['numCols'])
            response = inspect['numCols']
            response = 'C' + str(response)

            kwargs = {
                'ignored_cols'                 : None,
                'response'                     : response,
                'classification'               : 1,
                'activation'                   : 'Tanh',
                #'input_dropout_ratio'          : 0.2,
                'hidden'                       : '113,71,54',
                'rate'                         : 0.01,
                'rate_annealing'               : 1e-6,
                'momentum_start'               : 0,
                'momentum_stable'              : 0,
                'l1'                           : 0.0,
                'l2'                           : 1e-6,
                'seed'                         : 80023842348,
                'loss'                         : 'CrossEntropy',
                #'max_w2'                       : 15,
                'initial_weight_distribution'  : 'UniformAdaptive',
                #'initial_weight_scale'         : 0.01,
                'epochs'                       : 100,
                'destination_key'              : model_key,
                'validation'                   : hex_key,
            }

            timeoutSecs = 60
            start = time.time()
            h2o_cmd.runDeepLearning(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
            print "trial #", trial, "Deep Learning end on ", csvFilename, ' took', time.time() - start, 'seconds'

            #### Now score using the model, and check the validation error
            expectedErr = 0.00
            relTol = 0.01
            predict_key = 'Predict.hex'

            kwargs = {
                'data_key': validation_key,
                'destination_key': predict_key,
                'model_key': model_key
            }
            predictResult = h2o_cmd.runPredict(timeoutSecs=timeoutSecs, **kwargs)
            h2o_cmd.runInspect(key=predict_key, verbose=True)

            kwargs = {
            }

            predictCMResult = h2o.nodes[0].predict_confusion_matrix(
                actual=validation_key,
                vactual=response,
                predict=predict_key,
                vpredict='predict',
                timeoutSecs=timeoutSecs, **kwargs)

            cm = predictCMResult['cm']

            print h2o_gbm.pp_cm(cm)
            actualErr = h2o_gbm.pp_cm_summary(cm)/100.

            print "actual   classification error:" + format(actualErr)
            print "expected classification error:" + format(expectedErr)
            if actualErr != expectedErr and abs((expectedErr - actualErr)/expectedErr) > relTol:
                raise Exception("Scored classification error of %s is not within %s %% relative error of %s" %
                                (actualErr, float(relTol)*100, expectedErr))


            trial += 1
    def test_GBM_manyfiles_train_test(self):
        bucket = 'home-0xdiag-datasets'
        modelKey = 'GBMModelKey'
        if localhost:
            files = [
                # None forces num_cols to be used. assumes you set it from Inspect
                # problems with categoricals not in the train data set? (warnings in h2o stdout)
                ## ('manyfiles-nflx-gz', 'file_1.dat.gz', 'file_1.hex', 1800, None, 'file_11.dat.gz', 'test.hex')
                # just use matching
                ('manyfiles-nflx-gz', 'file_1.dat.gz', 'train.hex', 1800, None, 'file_1.dat.gz', 'test.hex')
                ]
        else:
            files = [
                # None forces num_cols to be used. assumes you set it from Inspect
                ('manyfiles-nflx-gz', 'file_[0-9].dat.gz', 'train.hex', 1800, None, 'file_1[0-9].dat.gz', 'test.hex')
                ]

        # if I got to hdfs, it's here
        # hdfs://192.168.1.176/datasets/manyfiles-nflx-gz/file_99.dat.gz

        # h2b.browseTheCloud()
        for (importFolderPath, trainFilename, trainKey, timeoutSecs, response, testFilename, testKey) in files:
            h2o.beta_features = False #turn off beta_features
            # PARSE train****************************************
            start = time.time()
            xList = []
            eList = []
            fList = []

            # Parse (train)****************************************
            if h2o.beta_features:
                print "Parsing to fvec directly! Have to noPoll=true!, and doSummary=False!"
            csvPathname = importFolderPath + "/" + trainFilename
            parseTrainResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local',
                hex_key=trainKey, timeoutSecs=timeoutSecs, noPoll=h2o.beta_features, doSummary=False)
            # hack
            if h2o.beta_features:
                h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs)
                print "Filling in the parseTrainResult['destination_key'] for h2o"
                parseTrainResult['destination_key'] = trainKey

            elapsed = time.time() - start
            print "train parse end on ", trainFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "train parse result:", parseTrainResult['destination_key']

            ### h2o_cmd.runSummary(key=parsTraineResult['destination_key'])

            # if you set beta_features here, the fvec translate will happen with the Inspect not the GBM
            # h2o.beta_features = True
            inspect = h2o_cmd.runInspect(key=parseTrainResult['destination_key'])
            print "\n" + csvPathname, \
                "    num_rows:", "{:,}".format(inspect['num_rows']), \
                "    num_cols:", "{:,}".format(inspect['num_cols'])
            num_rows = inspect['num_rows']
            num_cols = inspect['num_cols']

            # Make col 378 it something we can do binomial regression on!
            execExpr = '%s=colSwap(%s,378,(%s[378]>15 ? 1 : 0))' % (trainKey, trainKey, trainKey)
            resultExec = h2o_cmd.runExec(expression=execExpr, timeoutSecs=60)

            # Parse (test)****************************************
            if h2o.beta_features:
                print "Parsing to fvec directly! Have to noPoll=true!, and doSummary=False!"

            parseTestResult = h2i.import_parse(bucket=bucket, path=importFolderPath + "/" + testFilename, schema='local',
                hex_key=testKey, timeoutSecs=timeoutSecs, noPoll=h2o.beta_features, doSummary=False)
            # hack
            if h2o.beta_features:
                h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs)
                print "Filling in the parseTestResult['destination_key'] for h2o"
                parseTestResult['destination_key'] = testKey

            elapsed = time.time() - start
            print "test parse end on ", testFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "test parse result:", parseTestResult['destination_key']

            # Make col 378 it something we can do binomial regression on!
            execExpr = '%s=colSwap(%s,378,(%s[378]>15 ? 1 : 0))' % (testKey, testKey, testKey)
            resultExec = h2o_cmd.runExec(expression=execExpr, timeoutSecs=60)

            # Note ..no inspect of test data here..so translate happens later?

            # GBM (train iterate)****************************************
            # if not response:
            #     response = num_cols - 1
            response = 378

            # randomly ignore a bunch of cols, just to make it go faster
            x = range(num_cols)
            del x[response]
            ignored_cols_by_name = ",".join(map(str,random.sample(x, 300)))

            print "Using the same response %s for train and test (which should have a output value too)" % response

            ntrees = 10
            # ignore 200 random cols (not the response)
            for max_depth in [5, 40]:
                params = {
                    'learn_rate': .2,
                    'nbins': 1024,
                    'ntrees': ntrees,
                    'max_depth': max_depth,
                    'min_rows': 10,
                    'response': response,
                    'ignored_cols_by_name': ignored_cols_by_name,
                }
            


                if FORCE_FAIL_CASE:
                    params = {'learn_rate': 0.2, 'classification': None, 'min_rows': 10, 'ntrees': 10, 'response': 378, 'nbins': 1024, 'ignored_cols_by_name': '256, 382, 399, 50, 176, 407, 375, 113, 170, 313, 364, 33, 361, 426, 121, 371, 232, 327, 480, 75, 37, 312, 225, 195, 244, 406, 268, 230, 321, 257, 274, 197, 35, 501, 360, 72, 213, 79, 1, 466, 362, 160, 444, 437, 5, 59, 108, 454, 73, 374, 509, 337, 183, 252, 21, 314, 100, 200, 159, 379, 405, 367, 432, 181, 8, 420, 118, 284, 281, 465, 456, 359, 291, 330, 258, 523, 243, 487, 408, 392, 15, 231, 482, 481, 70, 171, 182, 31, 409, 492, 471, 53, 45, 448, 83, 527, 452, 350, 423, 93, 447, 130, 126, 54, 354, 169, 253, 49, 42, 431, 305, 498, 216, 189, 508, 122, 308, 228, 190, 293, 451, 63, 133, 304, 397, 425, 333, 19, 158, 391, 153, 282, 112, 64, 502, 7, 16, 469, 163, 136, 40, 99, 302, 264, 325, 434, 187, 311, 286, 278, 179, 109, 348, 287, 467, 400, 164, 384, 422, 43, 117, 91, 276, 211, 175, 329, 541, 438, 145, 534, 218, 177, 317, 222, 210, 162, 402, 98, 299, 245, 385, 233, 188, 516, 143, 13, 532, 429, 172, 455, 470, 518, 236, 296, 388, 468, 110, 395, 185, 25, 489, 196, 120, 435, 165, 168, 271, 74, 510, 36, 76, 208, 223, 270, 515, 421, 87, 66, 473, 220, 46, 486, 102, 38, 156, 48, 132, 331, 51, 403, 234, 23, 449, 341, 303, 410, 479, 203, 413, 512, 513, 9, 446, 511, 55, 6, 339, 418, 476, 178, 266, 22, 141, 259, 349, 86, 144, 34, 290, 326, 318, 519, 424, 127, 174, 472, 116, 17, 152, 280, 215, 514, 103, 377, 537, 373, 238, 47, 353, 428, 94, 214, 61, 123, 386, 351, 246, 411, 101, 249, 240, 520, 307, 288, 199, 147, 436, 77, 464, 414', 'source': u'test.hex', 'validation': u'test.hex', 'max_depth': 5} 

                ### print "Using these parameters for GBM: ", params
                kwargs = params.copy()
                h2o.beta_features = True

                # GBM train****************************************
                trainStart = time.time()
                gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult,
                    noPoll=True, timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs)
                # hack
                if h2o.beta_features:
                    h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs)
                trainElapsed = time.time() - trainStart
                print "GBM training completed in", trainElapsed, "seconds. On dataset: ", trainFilename

                gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey)
                # errrs from end of list? is that the last tree?
                errsLast = gbmTrainView['gbm_model']['errs'][-1]
                print "GBM 'errsLast'", errsLast

                cm = gbmTrainView['gbm_model']['cm']
                pctWrongTrain = h2o_gbm.pp_cm_summary(cm);
                print "Last line of this cm might be NAs, not CM"
                print "\nTrain\n==========\n"
                print h2o_gbm.pp_cm(cm)

                # GBM test****************************************
                predictKey = 'Predict.hex'
                ### h2o_cmd.runInspect(key=parseTestResult['destination_key'])
                start = time.time()
                gbmTestResult = h2o_cmd.runPredict(
                    data_key=parseTestResult['destination_key'], 
                    model_key=modelKey,
                    destination_key=predictKey,
                    timeoutSecs=timeoutSecs)
                # hack
                if h2o.beta_features:
                    h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs)
                elapsed = time.time() - start
                print "GBM predict completed in", elapsed, "seconds. On dataset: ", testFilename

                print "This is crazy!"
                gbmPredictCMResult =h2o.nodes[0].predict_confusion_matrix(
                    actual=parseTestResult['destination_key'],
                    vactual=response,
                    predict=predictKey,
                    vpredict='predict', # choices are 0 and 'predict'
                    )

                # errrs from end of list? is that the last tree?
                # all we get is cm
                cm = gbmPredictCMResult['cm']

                # These will move into the h2o_gbm.py
                pctWrong = h2o_gbm.pp_cm_summary(cm);
                print "Last line of this cm is really NAs, not CM"
                print "\nTest\n==========\n"
                print h2o_gbm.pp_cm(cm)

                # xList.append(ntrees)
                xList.append(max_depth)
                eList.append(pctWrong)
                fList.append(trainElapsed)

            h2o.beta_features = False
            xLabel = 'max_depth'
            eLabel = 'pctWrong'
            fLabel = 'trainElapsed'
            eListTitle = ""
            fListTitle = ""
            h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
Example #34
0
    def test_GLM2_mnist(self):
        if DO_HDFS:
            importFolderPath = "mnist"
            bucket = None
            schema = 'hdfs'
        else:
            importFolderPath = "mnist"
            bucket = 'home-0xdiag-datasets'
            schema = 'local'

        csvFilelist = [
            ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600),
        ]

        trial = 0
        for (trainCsvFilename, testCsvFilename, timeoutSecs) in csvFilelist:
            trialStart = time.time()

            # PARSE test****************************************
            testKey = testCsvFilename + "_" + str(trial) + ".hex"
            csvPathname = importFolderPath + "/" + testCsvFilename
            start = time.time()

            parseTestResult = h2i.import_parse(bucket=bucket,
                                               path=csvPathname,
                                               schema=schema,
                                               hex_key=testKey,
                                               timeoutSecs=timeoutSecs)

            elapsed = time.time() - start
            print "parse end on ", testCsvFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseTestResult['destination_key']

            print "We won't use this pruning of x on test data. See if it prunes the same as the training"
            y = 0  # first column is pixel value
            print "y:"
            ignoreX = h2o_glm.goodXFromColumnInfo(
                y,
                key=parseTestResult['destination_key'],
                timeoutSecs=300,
                returnIgnoreX=True)

            # PARSE train****************************************
            trainKey = trainCsvFilename + "_" + str(trial) + ".hex"
            csvPathname = importFolderPath + "/" + testCsvFilename
            start = time.time()
            parseTrainResult = h2i.import_parse(bucket=bucket,
                                                path=csvPathname,
                                                schema=schema,
                                                hex_key=trainKey,
                                                timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "parse end on ", trainCsvFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseTrainResult['destination_key']

            # GLM****************************************
            print "This is the pruned x we'll use"
            ignoreX = h2o_glm.goodXFromColumnInfo(
                y,
                key=parseTrainResult['destination_key'],
                timeoutSecs=300,
                returnIgnoreX=True)
            print "ignoreX:", ignoreX

            modelKey = 'GLM_model'
            params = {
                'ignored_cols': ignoreX,
                'response': 'C' + str(y + 1),
                'family': 'binomial',
                'lambda': 0.5,
                'alpha': 1e-4,
                'max_iter': 15,
                ## 'thresholds': 0.5,
                'n_folds': 1,
                'beta_epsilon': 1.0E-4,
                'destination_key': modelKey,
            }

            if DO_ALL_DIGITS:
                cases = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
            else:
                cases = [8]

            for c in cases:
                kwargs = params.copy()
                print "Trying binomial with case:", c
                # kwargs['case_val'] = c

                # do the binomial conversion with Exec2, for both training and test (h2o won't work otherwise)
                if DO_BUG:
                    execExpr = "A.hex=%s;A.hex[,%s]=(A.hex[,%s]==%s)" % (
                        trainKey, y + 1, y + 1, c)
                    h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)
                else:
                    execExpr = "A.hex=%s" % (trainKey)
                    h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)
                    execExpr = "A.hex[,%s]=(A.hex[,%s]==%s)" % (y + 1, y + 1,
                                                                c)
                    h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)

                if DO_BUG:
                    execExpr = "B.hex=%s;B.hex[,%s]=(B.hex[,%s]==%s)" % (
                        testKey, y + 1, y + 1, c)
                    h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)
                else:
                    execExpr = "B.hex=%s" % (testKey)
                    h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)

                    execExpr = "B.hex[,%s]=(B.hex[,%s]==%s)" % (y + 1, y + 1,
                                                                c)
                    h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)

                timeoutSecs = 1800
                start = time.time()
                aHack = {'destination_key': 'A.hex'}
                glmFirstResult = h2o_cmd.runGLM(parseResult=aHack,
                                                timeoutSecs=timeoutSecs,
                                                pollTimeoutSecs=60,
                                                noPoll=True,
                                                **kwargs)
                print "\nglmFirstResult:", h2o.dump_json(glmFirstResult)
                job_key = glmFirstResult['job_key']
                h2o_jobs.pollStatsWhileBusy(timeoutSecs=timeoutSecs,
                                            pollTimeoutSecs=60,
                                            retryDelaySecs=5)

                # double check...how come the model is bogus?
                h2o_jobs.pollWaitJobs()
                glm = h2o.nodes[0].glm_view(_modelKey=modelKey)

                elapsed = time.time() - start
                print "GLM completed in", elapsed, "seconds.", \
                    "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

                h2o_glm.simpleCheckGLM(self, glm, None, noPrint=True, **kwargs)
                modelKey = glm['glm_model']['_key']

                # This seems wrong..what's the format of the cm?
                cm = glm['glm_model']['submodels'][0]['validation']['_cms'][
                    -1]['_arr']
                print "cm:", cm
                pctWrong = h2o_gbm.pp_cm_summary(cm)
                # self.assertLess(pctWrong, 9,"Should see less than 9% error (class = 4)")

                print "\nTrain\n==========\n"
                print h2o_gbm.pp_cm(cm)

                # Score *******************************
                # this messes up if you use case_mode/case_vale above
                predictKey = 'Predict.hex'
                start = time.time()

                predictResult = h2o_cmd.runPredict(data_key='B.hex',
                                                   model_key=modelKey,
                                                   destination_key=predictKey,
                                                   timeoutSecs=timeoutSecs)

                predictCMResult = h2o.nodes[0].predict_confusion_matrix(
                    actual='B.hex',
                    vactual='C' + str(y + 1),
                    predict=predictKey,
                    vpredict='predict',
                )

                cm = predictCMResult['cm']

                # These will move into the h2o_gbm.py
                pctWrong = h2o_gbm.pp_cm_summary(cm)
                self.assertLess(pctWrong, 9,
                                "Should see less than 9% error (class = 4)")

                print "\nTest\n==========\n"
                print h2o_gbm.pp_cm(cm)
Example #35
0
    def test_GBM_manyfiles_train_test(self):
        bucket = 'home-0xdiag-datasets'
        modelKey = 'GBMModelKey'
        if localhost:
            files = [
                # None forces num_cols to be used. assumes you set it from Inspect
                ('manyfiles-nflx-gz', 'file_1[0-9][0-9].dat.gz',
                 'file_100.hex', 1800, None, 'file_1.dat.gz', 'file_1_test.hex'
                 )
            ]
        else:
            files = [
                # None forces num_cols to be used. assumes you set it from Inspect
                ('manyfiles-nflx-gz', 'file_[0-9].dat.gz', 'file_10.hex', 1800,
                 None, 'file_1[0-9].dat.gz', 'file_10_test.hex')
            ]

        # if I got to hdfs, it's here
        # hdfs://192.168.1.176/datasets/manyfiles-nflx-gz/file_99.dat.gz

        # h2b.browseTheCloud()
        for (importFolderPath, trainFilename, trainKey, timeoutSecs, response,
             testFilename, testKey) in files:
            h2o.beta_features = False  #turn off beta_features
            # PARSE train****************************************
            start = time.time()
            xList = []
            eList = []
            fList = []

            # Parse (train)****************************************
            if h2o.beta_features:
                print "Parsing to fvec directly! Have to noPoll=true!, and doSummary=False!"
            csvPathname = importFolderPath + "/" + trainFilename
            parseTrainResult = h2i.import_parse(bucket=bucket,
                                                path=csvPathname,
                                                schema='s3n',
                                                hex_key=trainKey,
                                                timeoutSecs=timeoutSecs,
                                                noPoll=h2o.beta_features,
                                                doSummary=False)
            # hack
            if h2o.beta_features:
                h2j.pollWaitJobs(timeoutSecs=timeoutSecs,
                                 pollTimeoutSecs=timeoutSecs)
                print "Filling in the parseTrainResult['destination_key'] for h2o"
                parseTrainResult['destination_key'] = trainKey

            elapsed = time.time() - start
            print "train parse end on ", trainFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "train parse result:", parseTrainResult['destination_key']

            ### h2o_cmd.runSummary(key=parsTraineResult['destination_key'])

            # if you set beta_features here, the fvec translate will happen with the Inspect not the GBM
            # h2o.beta_features = True
            inspect = h2o_cmd.runInspect(
                key=parseTrainResult['destination_key'])
            print "\n" + csvPathname, \
                "    num_rows:", "{:,}".format(inspect['num_rows']), \
                "    num_cols:", "{:,}".format(inspect['num_cols'])
            num_rows = inspect['num_rows']
            num_cols = inspect['num_cols']

            # Make col 378 it something we can do binomial regression on!
            execExpr = '%s[,378] = %s[,378]>15 ? 1 : 0' % (trainKey, trainKey)
            resultExec = h2o_cmd.runExec(str=execExpr, timeoutSecs=500)

            # Parse (test)****************************************
            if h2o.beta_features:
                print "Parsing to fvec directly! Have to noPoll=true!, and doSummary=False!"

            parseTestResult = h2i.import_parse(bucket=bucket,
                                               path=importFolderPath + "/" +
                                               testFilename,
                                               schema='local',
                                               hex_key=testKey,
                                               timeoutSecs=timeoutSecs,
                                               noPoll=h2o.beta_features,
                                               doSummary=False)
            # hack
            if h2o.beta_features:
                h2j.pollWaitJobs(timeoutSecs=timeoutSecs,
                                 pollTimeoutSecs=timeoutSecs)
                print "Filling in the parseTestResult['destination_key'] for h2o"
                parseTestResult['destination_key'] = testKey

            elapsed = time.time() - start
            print "test parse end on ", testFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "test parse result:", parseTestResult['destination_key']

            # Make col 378 it something we can do binomial regression on!
            print "Slow! exec is converting all imported keys?, not just what was parsed"
            execExpr = '%s[,378] = %s[,378]>15 ? 1 : 0' % (testKey, testKey,
                                                           testKey)
            resultExec = h2o_cmd.runExec(str=execExpr, timeoutSecs=300)

            # Note ..no inspect of test data here..so translate happens later?

            # GBM (train iterate)****************************************
            # if not response:
            #     response = num_cols - 1
            response = 378
            print "Using the same response %s for train and test (which should have a output value too)" % response

            ntrees = 10
            for max_depth in [5, 10, 20, 40]:
                params = {
                    'learn_rate': .2,
                    'nbins': 1024,
                    'ntrees': ntrees,
                    'max_depth': max_depth,
                    'min_rows': 10,
                    'response': response,
                    # 'ignored_cols':
                }
                print "Using these parameters for GBM: ", params
                kwargs = params.copy()
                h2o.beta_features = True

                # GBM train****************************************
                trainStart = time.time()
                gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult,
                                                noPoll=True,
                                                timeoutSecs=timeoutSecs,
                                                destination_key=modelKey,
                                                **kwargs)
                # hack
                if h2o.beta_features:
                    h2j.pollWaitJobs(timeoutSecs=timeoutSecs,
                                     pollTimeoutSecs=timeoutSecs)
                trainElapsed = time.time() - trainStart
                print "GBM training completed in", trainElapsed, "seconds. On dataset: ", trainFilename

                gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey)
                # errrs from end of list? is that the last tree?
                errsLast = gbmTrainView['gbm_model']['errs'][-1]
                print "GBM 'errsLast'", errsLast

                cm = gbmTrainView['gbm_model']['cm']
                pctWrongTrain = h2o_gbm.pp_cm_summary(cm)
                print "Last line of this cm might be NAs, not CM"
                print "\nTrain\n==========\n"
                print h2o_gbm.pp_cm(cm)

                # GBM test****************************************
                if doPredict:
                    predictKey = 'Predict.hex'
                    ### h2o_cmd.runInspect(key=parseTestResult['destination_key'])
                    start = time.time()
                    gbmTestResult = h2o_cmd.runPredict(
                        data_key=parseTestResult['destination_key'],
                        model_key=modelKey,
                        destination_key=predictKey,
                        timeoutSecs=timeoutSecs)
                    # hack
                    if h2o.beta_features:
                        h2j.pollWaitJobs(timeoutSecs=timeoutSecs,
                                         pollTimeoutSecs=timeoutSecs)
                    elapsed = time.time() - start
                    print "GBM predict completed in", elapsed, "seconds. On dataset: ", testFilename

                    print "This is crazy!"
                    gbmPredictCMResult = h2o.nodes[0].predict_confusion_matrix(
                        actual=parseTestResult['destination_key'],
                        vactual=response,
                        predict=predictKey,
                        vpredict='predict',  # choices are 0 and 'predict'
                    )

                    # errrs from end of list? is that the last tree?
                    # all we get is cm
                    cm = gbmPredictCMResult['cm']

                    # These will move into the h2o_gbm.py
                    pctWrong = h2o_gbm.pp_cm_summary(cm)
                    print "Last line of this cm is really NAs, not CM"
                    print "\nTest\n==========\n"
                    print h2o_gbm.pp_cm(cm)

                    # xList.append(ntrees)
                    xList.append(max_depth)
                    eList.append(pctWrong)
                    fList.append(trainElapsed)

            h2o.beta_features = False

            if doPredict:
                xLabel = 'max_depth'
                eLabel = 'pctWrong'
                fLabel = 'trainElapsed'
                eListTitle = ""
                fListTitle = ""
                h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel,
                                  fListTitle, fList, fLabel)
Example #36
0
def simpleCheckGLM(self,
                   glm,
                   colX,
                   allowFailWarning=False,
                   allowZeroCoeff=False,
                   prettyPrint=False,
                   noPrint=False,
                   maxExpectedIterations=None,
                   doNormalized=False,
                   **kwargs):
    # if we hit the max_iter, that means it probably didn't converge. should be 1-maxExpectedIter

    # h2o GLM will verboseprint the result and print errors.
    # so don't have to do that
    # different when cross validation  is used? No trainingErrorDetails?
    if h2o.beta_features:
        GLMModel = glm['glm_model']
    else:
        GLMModel = glm['GLMModel']

    if not GLMModel:
        raise Exception("GLMModel didn't exist in the glm response? %s" %
                        h2o.dump_json(glm))

    warnings = None
    if 'warnings' in GLMModel and GLMModel['warnings']:
        warnings = GLMModel['warnings']
        # stop on failed
        x = re.compile("failed", re.IGNORECASE)
        # don't stop if fail to converge
        c = re.compile("converge", re.IGNORECASE)
        for w in warnings:
            print "\nwarning:", w
            if re.search(x, w) and not allowFailWarning:
                if re.search(c, w):
                    # ignore the fail to converge warning now
                    pass
                else:
                    # stop on other 'fail' warnings (are there any? fail to solve?
                    raise Exception(w)

    # for key, value in glm.iteritems(): print key
    # not in GLMGrid?

    # FIX! don't get GLMParams if it can't solve?
    if h2o.beta_features:
        GLMParams = GLMModel['glm']
    else:
        GLMParams = GLMModel["GLMParams"]

    family = GLMParams["family"]

    if h2o.beta_features:
        # number of submodels = number of lambda
        # min of 2. lambda_max is first
        submodels = GLMModel['submodels']
        lambdas = GLMModel['lambdas']
        # since all our tests?? only use one lambda, the best_lamda_idx should = 1
        best_lambda_idx = GLMModel['best_lambda_idx']
        print "best_lambda_idx:", best_lambda_idx
        lambda_max = GLMModel['lambda_max']
        print "lambda_max:", lambda_max

        # currently lambda_max is not set by tomas. ..i.e.not valid
        if 1 == 0 and lambda_max <= lambdas[best_lambda_idx]:
            raise Exception(
                "lambda_max %s should always be > the lambda result %s we're checking"
                % (lambda_max, lambdas[best_lambda_idx]))

        # submodels0 = submodels[0]
        # submodels1 = submodels[-1] # hackery to make it work when there's just one

        if (best_lambda_idx >= len(lambdas)) or (best_lambda_idx < 0):
            raise Exception(
                "best_lambda_idx: %s should point to one of lambdas (which has len %s)"
                % (best_lambda_idx, len(lambdas)))

        if (best_lambda_idx >= len(submodels)) or (best_lambda_idx < 0):
            raise Exception(
                "best_lambda_idx: %s should point to one of submodels (which has len %s)"
                % (best_lambda_idx, len(submodels)))

        submodels1 = submodels[
            best_lambda_idx]  # hackery to make it work when there's just one
        iterations = submodels1['iteration']

    else:
        iterations = GLMModel['iterations']

    print "GLMModel/iterations:", iterations

    # if we hit the max_iter, that means it probably didn't converge. should be 1-maxExpectedIter
    if maxExpectedIterations is not None and iterations > maxExpectedIterations:
        raise Exception(
            "Convergence issue? GLM did iterations: %d which is greater than expected: %d"
            % (iterations, maxExpectedIterations))

    if h2o.beta_features:
        if 'validation' not in submodels1:
            raise Exception("Should be a 'validation' key in submodels1: %s" %
                            h2o.dump_json(submodels1))
        validationsList = submodels1['validation']
        validations = validationsList

    else:
        # pop the first validation from the list
        if 'validations' not in GLMModel:
            raise Exception("Should be a 'validations' key in GLMModel: %s" %
                            h2o.dump_json(GLMModel))
        validationsList = GLMModel['validations']
        # don't want to modify validationsList in case someone else looks at it
        validations = validationsList[0]

    # xval. compare what we asked for and what we got.
    n_folds = kwargs.setdefault('n_folds', None)

    # not checked in v2?
    if not h2o.beta_features:
        if not 'xval_models' in validations:
            if n_folds > 1:
                raise Exception(
                    "No cross validation models returned. Asked for " +
                    n_folds)
        else:
            xval_models = validations['xval_models']
            if n_folds and n_folds > 1:
                if len(xval_models) != n_folds:
                    raise Exception(
                        len(xval_models) +
                        " cross validation models returned. Asked for " +
                        n_folds)
            else:
                # should be default 10?
                if len(xval_models) != 10:
                    raise Exception(
                        str(len(xval_models)) +
                        " cross validation models returned. Default should be 10"
                    )

    if h2o.beta_features:
        print "GLMModel/validations"
        validations['null_deviance'] = h2o_util.cleanseInfNan(
            validations['null_deviance'])
        validations['residual_deviance'] = h2o_util.cleanseInfNan(
            validations['residual_deviance'])
        print "%15s %s" % ("null_deviance:\t", validations['null_deviance'])
        print "%15s %s" % ("residual_deviance:\t",
                           validations['residual_deviance'])

    else:
        print "GLMModel/validations"
        validations['err'] = h2o_util.cleanseInfNan(validations['err'])
        validations['nullDev'] = h2o_util.cleanseInfNan(validations['nullDev'])
        validations['resDev'] = h2o_util.cleanseInfNan(validations['resDev'])
        print "%15s %s" % ("err:\t", validations['err'])
        print "%15s %s" % ("nullDev:\t", validations['nullDev'])
        print "%15s %s" % ("resDev:\t", validations['resDev'])

    # threshold only there if binomial?
    # auc only for binomial
    if family == "binomial":
        print "%15s %s" % ("auc:\t", validations['auc'])
        if h2o.beta_features:
            best_threshold = validations['best_threshold']
            thresholds = validations['thresholds']
            print "%15s %s" % ("best_threshold:\t", best_threshold)

            # have to look up the index for the cm, from the thresholds list
            best_index = None

            # FIX! best_threshold isn't necessarily in the list. jump out if >=
            for i, t in enumerate(thresholds):
                if t >= best_threshold:  # ends up using next one if not present
                    best_index = i
                    break

            assert best_index != None, "%s %s" % (best_threshold, thresholds)
            print "Now printing the right 'best_threshold' %s from '_cms" % best_threshold

            # cm = glm['glm_model']['submodels'][0]['validation']['_cms'][-1]
            submodels = glm['glm_model']['submodels']
            cms = submodels[0]['validation']['_cms']
            assert best_index < len(cms), "%s %s" % (best_index, len(cms))
            # if we want 0.5..rounds to int
            # mid = len(cms)/2
            # cm = cms[mid]
            cm = cms[best_index]

            print "cm:", h2o.dump_json(cm['_arr'])
            predErr = cm['_predErr']
            classErr = cm['_classErr']
            # compare to predErr
            pctWrong = h2o_gbm.pp_cm_summary(cm['_arr'])
            print "predErr:", predErr
            print "calculated pctWrong from cm:", pctWrong
            print "classErr:", classErr

            # self.assertLess(pctWrong, 9,"Should see less than 9% error (class = 4)")

            print "\nTrain\n==========\n"
            print h2o_gbm.pp_cm(cm['_arr'])
        else:
            print "%15s %s" % ("threshold:\t", validations['threshold'])

    if family == "poisson" or family == "gaussian":
        print "%15s %s" % ("aic:\t", validations['aic'])

    if not h2o.beta_features:
        if math.isnan(validations['err']):
            emsg = "Why is this err = 'nan'?? %6s %s" % ("err:\t",
                                                         validations['err'])
            raise Exception(emsg)

        if math.isnan(validations['resDev']):
            emsg = "Why is this resDev = 'nan'?? %6s %s" % (
                "resDev:\t", validations['resDev'])
            raise Exception(emsg)

        # legal?
        if math.isnan(validations['nullDev']):
            pass

    # get a copy, so we don't destroy the original when we pop the intercept
    if h2o.beta_features:
        coefficients_names = GLMModel['coefficients_names']
        # print "coefficients_names:", coefficients_names
        idxs = submodels1['idxs']
        print "idxs:", idxs
        column_names = coefficients_names

        # always check both normalized and normal coefficients
        norm_beta = submodels1['norm_beta']
        # if norm_beta and len(column_names)!=len(norm_beta):
        #    print len(column_names), len(norm_beta)
        #    raise Exception("column_names and normalized_norm_beta from h2o json not same length. column_names: %s normalized_norm_beta: %s" % (column_names, norm_beta))
        #
        beta = submodels1['beta']
        # print "beta:", beta
        # if len(column_names)!=len(beta):
        #    print len(column_names), len(beta)
        #    raise Exception("column_names and beta from h2o json not same length. column_names: %s beta: %s" % (column_names, beta))

        # test wants to use normalized?
        if doNormalized:
            beta_used = norm_beta
        else:
            beta_used = beta

        coefficients = {}
        # create a dictionary with name, beta (including intercept) just like v1

        for i, b in zip(idxs, beta_used[:-1]):
            name = coefficients_names[i]
            coefficients[name] = b

        print "len(idxs)", len(idxs), "len(beta_used)", len(beta_used)
        print "coefficients:", coefficients
        print "beta:", beta
        print "norm_beta:", norm_beta

        coefficients['Intercept'] = beta_used[-1]
        print "intercept demapping info:", \
            "column_names[-i]:", column_names[-1], \
            "idxs[-1]:", idxs[-1], \
            "coefficients_names[idxs[-1]]:", coefficients_names[idxs[-1]], \
            "beta_used[-1]:", beta_used[-1], \
            "coefficients['Intercept']", coefficients['Intercept']

        # last one is intercept
        interceptName = coefficients_names[idxs[-1]]
        if interceptName != "Intercept" or abs(beta_used[-1]) < 1e-26:
            raise Exception("'Intercept' should be last in coefficient_names and beta %s %s %s" %\
                (idxs[-1], beta_used[-1], "-"+interceptName+"-"))

        # idxs has the order for non-zero coefficients, it's shorter than beta_used and column_names
        # new 5/28/14. glm can point to zero coefficients
        # for i in idxs:
        #     if beta_used[i]==0.0:
        ##        raise Exception("idxs shouldn't point to any 0 coefficients i: %s %s:" % (i, beta_used[i]))
        if len(idxs) > len(beta_used):
            raise Exception("idxs shouldn't be longer than beta_used %s %s" %
                            (len(idxs), len(beta_used)))
        intercept = coefficients.pop('Intercept', None)

        # intercept demapping info: idxs[-1]: 54 coefficient_names[[idxs[-1]]: Intercept beta_used[-1]: -6.6866753099
        # the last one shoudl be 'Intercept' ?
        column_names.pop()

    else:
        if doNormalized:
            coefficients = GLMModel['normalized_coefficients'].copy()
        else:
            coefficients = GLMModel['coefficients'].copy()
        column_names = GLMModel['column_names']
        # get the intercept out of there into it's own dictionary
        intercept = coefficients.pop('Intercept', None)
        print "First intercept:", intercept

    # have to skip the output col! get it from kwargs
    # better always be there!
    if h2o.beta_features:
        y = kwargs['response']
    else:
        y = kwargs['y']

    # the dict keys are column headers if they exist...how to order those? new: use the 'column_names'
    # from the response
    # Tomas created 'column_names which is the coefficient list in order.
    # Just use it to index coefficients! works for header or no-header cases
    # I guess now we won't print the "None" cases for dropped columns (constant columns!)
    # Because Tomas doesn't get everything in 'column_names' if dropped by GLMQuery before
    # he gets it?
    def add_to_coefficient_list_and_string(c, cList, cString):
        if c in coefficients:
            cValue = coefficients[c]
            cValueString = "%s: %.5e   " % (c, cValue)
        else:
            print "Warning: didn't see '" + c + "' in json coefficient response.",\
                  "Inserting 'None' with assumption it was dropped due to constant column)"
            cValue = None
            cValueString = "%s: %s   " % (c, cValue)

        cList.append(cValue)
        # we put each on newline for easy comparison to R..otherwise keep condensed
        if prettyPrint:
            cValueString = "H2O coefficient " + cValueString + "\n"
        # not mutable?
        return cString + cValueString

    # creating both a string for printing and a list of values
    cString = ""
    cList = []
    # print in order using col_names
    # column_names is input only now..same for header or no header, or expanded enums
    for c in column_names:
        cString = add_to_coefficient_list_and_string(c, cList, cString)

    if prettyPrint:
        print "\nH2O intercept:\t\t%.5e" % intercept
        print cString
    else:
        if not noPrint:
            print "\nintercept:", intercept, cString

    print "\nTotal # of coefficients:", len(column_names)

    # pick out the coefficent for the column we enabled for enhanced checking. Can be None.
    # FIX! temporary hack to deal with disappearing/renaming columns in GLM
    if (not allowZeroCoeff) and (colX is not None):
        absXCoeff = abs(float(coefficients[str(colX)]))
        self.assertGreater(
            absXCoeff, 1e-26,
            ("abs. value of GLM coefficients['" + str(colX) + "'] is " +
             str(absXCoeff) + ", not >= 1e-26 for X=" + str(colX)))

    # intercept is buried in there too
    absIntercept = abs(float(intercept))
    self.assertGreater(absIntercept, 1e-26,
                       ("abs. value of GLM coefficients['Intercept'] is " +
                        str(absIntercept) + ", not >= 1e-26 for Intercept"))

    # this is good if we just want min or max
    # maxCoeff = max(coefficients, key=coefficients.get)
    # for more, just invert the dictionary and ...
    if (len(coefficients) > 0):
        maxKey = max([(abs(coefficients[x]), x) for x in coefficients])[1]
        print "H2O Largest abs. coefficient value:", maxKey, coefficients[
            maxKey]
        minKey = min([(abs(coefficients[x]), x) for x in coefficients])[1]
        print "H2O Smallest abs. coefficient value:", minKey, coefficients[
            minKey]
    else:
        print "Warning, no coefficients returned. Must be intercept only?"

    # many of the GLM tests aren't single column though.
    # quick and dirty check: if all the coefficients are zero,
    # something is broken
    # intercept is in there too, but this will get it okay
    # just sum the abs value  up..look for greater than 0

    # skip this test if there is just one coefficient. Maybe pointing to a non-important coeff?
    if (not allowZeroCoeff) and (len(coefficients) > 1):
        s = 0.0
        for c in coefficients:
            v = coefficients[c]
            s += abs(float(v))

        self.assertGreater(
            s, 1e-26, ("sum of abs. value of GLM coefficients/intercept is " +
                       str(s) + ", not >= 1e-26"))

    if h2o.beta_features:
        print "submodels1, run_time (milliseconds):", submodels1['run_time']
    else:

        print "GLMModel model time (milliseconds):", GLMModel['model_time']
        print "GLMModel validation time (milliseconds):", validations[
            'val_time']
        print "GLMModel lsm time (milliseconds):", GLMModel['lsm_time']

    # shouldn't have any errors
    h2o.check_sandbox_for_errors()

    return (warnings, cList, intercept)
    def test_rf_covtype20x_fvec(self):
        h2o.beta_features = True
        importFolderPath = 'standard'

        if DO_SMALL:
            csvFilenameTrain = 'covtype.data'
            hex_key = 'covtype1x.data.A.hex'
        else:
            csvFilenameTrain = 'covtype20x.data'
            hex_key = 'covtype20x.data.A.hex'

        csvPathname = importFolderPath + "/" + csvFilenameTrain
        parseResultTrain = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, hex_key=hex_key, timeoutSecs=500)
        inspect = h2o_cmd.runInspect(key=parseResultTrain['destination_key'])
        dataKeyTrain = parseResultTrain['destination_key']
        print "Parse end", dataKeyTrain

        # have to re import since source key is gone
        # we could just copy the key, but sometimes we change the test/train data  to covtype.data
        if DO_SMALL:
            csvFilenameTest = 'covtype.data'
            hex_key = 'covtype1x.data.B.hex'
            dataKeyTest2 = 'covtype1x.data.C.hex'
        else:
            csvFilenameTest = 'covtype20x.data'
            hex_key = 'covtype20x.data.B.hex'
            dataKeyTest2 = 'covtype20x.data.C.hex'

        csvPathname = importFolderPath + "/" + csvFilenameTest
        parseResultTest = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, hex_key=hex_key, timeoutSecs=500)
        print "Parse result['destination_key']:", parseResultTest['destination_key']
        inspect = h2o_cmd.runInspect(key=parseResultTest['destination_key'])
        dataKeyTest = parseResultTest['destination_key']
        print "Parse end", dataKeyTest

        # make a 3rd key so the predict is uncached too!
        execExpr = dataKeyTest2 + "=" + dataKeyTest
        if h2o.beta_features:
            kwargs = {'str': execExpr, 'timeoutSecs': 15}
        else:
            kwargs = {'expression': execExpr, 'timeoutSecs': 15}

        resultExec = h2o_cmd.runExec(**kwargs)

        # train
        # this does RFView to understand when RF completes, so the time reported for RFView here, should be 
        # considered the "first RFView" times..subsequent have some caching?. 
        # unless the no_confusion_matrix works

        # params is mutable. This is default.
        if h2o.beta_features:
            paramDict = drf2ParamDict
            params = {
                'ntrees': 20, 
                'destination_key': 'RF_model'
            }
        else:
            paramDict = drf1ParamDict
            params = {
                'ntree': 20, 
                'out_of_bag_error_estimate': 1, 
                'model_key': 'RF_model'
            }

        colX = h2o_rf.pickRandRfParams(paramDict, params)

        kwargs = params.copy()
        if h2o.beta_features:
            timeoutSecs = 30 + kwargs['ntrees'] * 60
        else:
            timeoutSecs = 30 + kwargs['ntree'] * 60 

        start = time.time()
        rf = h2o_cmd.runRF(parseResult=parseResultTrain,
            timeoutSecs=timeoutSecs, retryDelaySecs=1, **kwargs)
        print "rf job end on ", dataKeyTrain, 'took', time.time() - start, 'seconds'

        print "\nRFView start after job completion"
        if h2o.beta_features:
            model_key = kwargs['destination_key']
            ntree = kwargs['ntrees']
        else:
            model_key = kwargs['model_key']
            ntree = kwargs['ntree']

        start = time.time()
        # this does the RFModel view for v2. but only model_key is used. Data doesn't matter? (nor ntree)
        h2o_cmd.runRFView(None, dataKeyTrain, model_key, ntree=ntree, timeoutSecs=timeoutSecs)
        print "First rfview end on ", dataKeyTrain, 'took', time.time() - start, 'seconds'

        for trial in range(1):
            # scoring
            start = time.time()
            rfView = h2o_cmd.runRFView(None, dataKeyTest, 
                model_key, ntree=ntree, timeoutSecs=timeoutSecs, out_of_bag_error_estimate=0, retryDelaySecs=1)
            print "rfview", trial, "end on ", dataKeyTest, 'took', time.time() - start, 'seconds.'

            (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView, ntree=ntree)
            self.assertAlmostEqual(classification_error, 50, delta=50, 
                msg="Classification error %s differs too much" % classification_error)
            start = time.time()
            predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key=dataKeyTest2)
            print "predict", trial, "end on ", dataKeyTest, 'took', time.time() - start, 'seconds.'

            parseKey = parseResultTrain['destination_key']
            rfModelKey  = rfView['drf_model']['_key']
            predictKey = 'Predict.hex'
            start = time.time()

            predictResult = h2o_cmd.runPredict(
                data_key=parseKey,
                model_key=rfModelKey,
                destination_key=predictKey,
                timeoutSecs=timeoutSecs)

            predictCMResult = h2o.nodes[0].predict_confusion_matrix(
                actual=parseKey,
                vactual='C54',
                predict=predictKey,
                vpredict='predict',
                )

            cm = predictCMResult['cm']

            # These will move into the h2o_gbm.py
            pctWrong = h2o_gbm.pp_cm_summary(cm);
            print "\nTest\n==========\n"
            print h2o_gbm.pp_cm(cm)

            print "Trial #", trial, "completed"
Example #38
0
    def test_GLM2_mnist_short(self):
        h2o.beta_features = True
        importFolderPath = "mnist"
        bucket = 'home-0xdiag-datasets'
        schema = 'local'

        csvFilelist = [
            ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600),
        ]

        trial = 0
        for (trainCsvFilename, testCsvFilename, timeoutSecs) in csvFilelist:
            trialStart = time.time()

            # PARSE test****************************************
            testKey = testCsvFilename + "_" + str(trial) + ".hex"
            csvPathname = importFolderPath + "/" + testCsvFilename
            start = time.time()

            parseTestResult = h2i.import_parse(bucket=bucket,
                                               path=csvPathname,
                                               schema=schema,
                                               hex_key=testKey,
                                               timeoutSecs=timeoutSecs,
                                               doSummary=False)

            elapsed = time.time() - start
            print "parse end on ", testCsvFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseTestResult['destination_key']

            print "We won't use this pruning of x on test data. See if it prunes the same as the training"

            # first col is pixel value ..use 0 here
            y = 0
            ignoreX = h2o_glm.goodXFromColumnInfo(
                y,
                key=parseTestResult['destination_key'],
                timeoutSecs=300,
                forRF=True)

            # PARSE train****************************************
            trainKey = trainCsvFilename + "_" + str(trial) + ".hex"
            csvPathname = importFolderPath + "/" + testCsvFilename
            start = time.time()
            parseTrainResult = h2i.import_parse(bucket=bucket,
                                                path=csvPathname,
                                                schema=schema,
                                                hex_key=trainKey,
                                                timeoutSecs=timeoutSecs,
                                                doSummary=False)
            elapsed = time.time() - start
            print "parse end on ", trainCsvFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseTrainResult['destination_key']

            # GLM****************************************
            print "This is the pruned x we'll use"
            ignoreX = h2o_glm.goodXFromColumnInfo(
                y,
                key=parseTrainResult['destination_key'],
                timeoutSecs=300,
                forRF=True)
            print "ignoreX:", ignoreX

            modelKey = 'GLM_model'
            params = {
                'ignored_cols': ignoreX,
                # first column is pixel value
                'response': 'C' + str(y + 1),
                'family': 'binomial',
                'lambda': 0.5,
                'alpha': 1e-4,
                'max_iter': 15,
                ## 'thresholds': 0.5,
                'n_folds': 1,
                'beta_epsilon': 1.0E-4,
                'destination_key': modelKey,
            }

            cases = [8]
            for c in cases:
                kwargs = params.copy()
                print "Trying binomial with case:", c
                # kwargs['case_val'] = c

                # do the binomial conversion with Exec2, for both training and test (h2o won't work otherwise)
                execExpr = "A.hex=%s" % (trainKey)
                h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)

                execExpr = "A.hex[,%s]=(A.hex[,%s]==%s)" % (y + 1, y + 1, c)
                h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)
                h2o_cmd.runSummary(key=trainKey,
                                   cols=0,
                                   max_ncols=1,
                                   noPrint=False)
                h2o_cmd.runSummary(key='A.hex',
                                   cols=0,
                                   max_ncols=1,
                                   noPrint=False)

                execExpr = "B.hex=%s" % (testKey)
                h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)

                execExpr = "B.hex[,%s]=(B.hex[,%s]==%s)" % (y + 1, y + 1, c)
                h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)
                h2o_cmd.runSummary(key=testKey,
                                   cols=0,
                                   max_ncols=1,
                                   noPrint=False)
                h2o_cmd.runSummary(key='B.hex',
                                   cols=0,
                                   max_ncols=1,
                                   noPrint=False)

                timeoutSecs = 1800
                start = time.time()
                aHack = {'destination_key': 'A.hex'}
                glmFirstResult = h2o_cmd.runGLM(parseResult=aHack,
                                                timeoutSecs=timeoutSecs,
                                                pollTimeoutSecs=60,
                                                noPoll=True,
                                                **kwargs)
                print "\nglmFirstResult:", h2o.dump_json(glmFirstResult)
                job_key = glmFirstResult['job_key']
                h2o_jobs.pollStatsWhileBusy(timeoutSecs=timeoutSecs,
                                            pollTimeoutSecs=60,
                                            retryDelaySecs=5)

                # double check...how come the model is bogus?
                h2o_jobs.pollWaitJobs()
                glm = h2o.nodes[0].glm_view(_modelKey=modelKey)

                elapsed = time.time() - start
                print "GLM completed in", elapsed, "seconds.", "%d pct. of timeout" % (
                    (elapsed * 100) / timeoutSecs)

                h2o_glm.simpleCheckGLM(self, glm, None, noPrint=True, **kwargs)
                modelKey = glm['glm_model']['_key']

                cm = glm['glm_model']['submodels'][0]['validation']['_cms'][
                    -1]['_arr']
                print "cm:", cm
                pctWrong = h2o_gbm.pp_cm_summary(cm)
                # self.assertLess(pctWrong, 9,"Should see less than 9% error (class = 4)")

                print "\nTrain\n==========\n"
                print h2o_gbm.pp_cm(cm)
Example #39
0
    def test_NN_airlines_small(self):
        #h2b.browseTheCloud()
        csvPathname_train = 'airlines/AirlinesTrain.csv.zip'
        csvPathname_test = 'airlines/AirlinesTest.csv.zip'
        hex_key = 'airlines_train.hex'
        validation_key = 'airlines_test.hex'
        timeoutSecs = 30
        parseResult = h2i.import_parse(bucket='smalldata',
                                       path=csvPathname_train,
                                       schema='put',
                                       hex_key=hex_key,
                                       timeoutSecs=timeoutSecs)
        parseResultV = h2i.import_parse(bucket='smalldata',
                                        path=csvPathname_test,
                                        schema='put',
                                        hex_key=validation_key,
                                        timeoutSecs=timeoutSecs)

        inspect = h2o_cmd.runInspect(None, hex_key)
        print "\n" + csvPathname_train, \
            "    numRows:", "{:,}".format(inspect['numRows']), \
            "    numCols:", "{:,}".format(inspect['numCols'])
        # this gives the last col number, which is IsDepDelayed_REC (1 or -1)
        # response = inspect['numCols'] - 1

        # this is "YES"/"NO"
        response = 'IsDepDelayed'

        #Making random id
        identifier = ''.join(
            random.sample(string.ascii_lowercase + string.digits, 10))
        model_key = 'nn_' + identifier + '.hex'

        # get the column names
        colNames = [c['name'] for c in inspect['cols']]
        print "colNames:", colNames
        usedCols = ("Origin", "Dest", "fDayofMonth", "fYear", "UniqueCarrier",
                    "fDayOfWeek", "fMonth", "DepTime", "ArrTime", "Distance")

        ignoredCols = []
        for c in colNames:
            # don't put the response in the ignore list (is there a problem if so?)
            if c not in usedCols and c != response:
                ignoredCols.append(c)

        ignoredColsString = ",".join(ignoredCols)
        print "Telling h2o to ignore these cols:"
        print ignoredColsString

        kwargs = {
            'ignored_cols': ignoredColsString,
            'response': response,
            'classification': 1,
            'destination_key': model_key,
        }
        expectedErr = 0.45  ## expected validation error for the above model
        relTol = 0.50  ## 20% rel. error tolerance due to Hogwild!

        timeoutSecs = 600
        start = time.time()
        nn = h2o_cmd.runDeepLearning(parseResult=parseResult,
                                     timeoutSecs=timeoutSecs,
                                     **kwargs)
        print "neural net end on ", csvPathname_train, " and ", csvPathname_test, 'took', time.time(
        ) - start, 'seconds'

        predict_key = 'score_' + identifier + '.hex'

        kwargs = {
            'data_key': validation_key,
            'destination_key': predict_key,
            'model_key': model_key
        }

        predictResult = h2o_cmd.runPredict(timeoutSecs=timeoutSecs, **kwargs)
        h2o_cmd.runInspect(key=predict_key, verbose=True)

        kwargs = {}

        predictCMResult = h2o.nodes[0].predict_confusion_matrix(
            actual=validation_key,
            vactual=response,
            predict=predict_key,
            vpredict='predict',
            timeoutSecs=timeoutSecs,
            **kwargs)

        cm = predictCMResult['cm']

        print h2o_gbm.pp_cm(cm)
        actualErr = h2o_gbm.pp_cm_summary(cm) / 100.

        print "actual   classification error:" + format(actualErr)
        print "expected classification error:" + format(expectedErr)
        if actualErr != expectedErr and abs(
            (expectedErr - actualErr) / expectedErr) > relTol:
            raise Exception(
                "Scored classification error of %s is not within %s %% relative error of %s"
                % (actualErr, float(relTol) * 100, expectedErr))
Example #40
0
    def test_rf_covtype20x_fvec(self):
        h2o.beta_features = True
        importFolderPath = 'standard'

        if DO_SMALL:
            csvFilenameTrain = 'covtype.data'
            hex_key = 'covtype1x.data.A.hex'
        else:
            csvFilenameTrain = 'covtype20x.data'
            hex_key = 'covtype20x.data.A.hex'

        csvPathname = importFolderPath + "/" + csvFilenameTrain
        parseResultTrain = h2i.import_parse(bucket='home-0xdiag-datasets',
                                            path=csvPathname,
                                            hex_key=hex_key,
                                            timeoutSecs=500)
        inspect = h2o_cmd.runInspect(key=parseResultTrain['destination_key'])
        dataKeyTrain = parseResultTrain['destination_key']
        print "Parse end", dataKeyTrain

        # have to re import since source key is gone
        # we could just copy the key, but sometimes we change the test/train data  to covtype.data
        if DO_SMALL:
            csvFilenameTest = 'covtype.data'
            hex_key = 'covtype1x.data.B.hex'
            dataKeyTest2 = 'covtype1x.data.C.hex'
        else:
            csvFilenameTest = 'covtype20x.data'
            hex_key = 'covtype20x.data.B.hex'
            dataKeyTest2 = 'covtype20x.data.C.hex'

        csvPathname = importFolderPath + "/" + csvFilenameTest
        parseResultTest = h2i.import_parse(bucket='home-0xdiag-datasets',
                                           path=csvPathname,
                                           hex_key=hex_key,
                                           timeoutSecs=500)
        print "Parse result['destination_key']:", parseResultTest[
            'destination_key']
        inspect = h2o_cmd.runInspect(key=parseResultTest['destination_key'])
        dataKeyTest = parseResultTest['destination_key']
        print "Parse end", dataKeyTest

        # make a 3rd key so the predict is uncached too!
        execExpr = dataKeyTest2 + "=" + dataKeyTest
        kwargs = {'str': execExpr, 'timeoutSecs': 15}
        resultExec = h2o_cmd.runExec(**kwargs)

        # train
        # this does RFView to understand when RF completes, so the time reported for RFView here, should be
        # considered the "first RFView" times..subsequent have some caching?.
        # unless the no_confusion_matrix works

        # params is mutable. This is default.
        paramDict = drf2ParamDict
        params = {'ntrees': 20, 'destination_key': 'RF_model'}

        colX = h2o_rf.pickRandRfParams(paramDict, params)

        kwargs = params.copy()
        timeoutSecs = 30 + kwargs['ntrees'] * 60

        start = time.time()
        rf = h2o_cmd.runRF(parseResult=parseResultTrain,
                           timeoutSecs=timeoutSecs,
                           retryDelaySecs=1,
                           **kwargs)
        print "rf job end on ", dataKeyTrain, 'took', time.time(
        ) - start, 'seconds'

        print "\nRFView start after job completion"
        model_key = kwargs['destination_key']
        ntree = kwargs['ntrees']

        start = time.time()
        # this does the RFModel view for v2. but only model_key is used. Data doesn't matter? (nor ntree)
        h2o_cmd.runRFView(None,
                          dataKeyTrain,
                          model_key,
                          ntree=ntree,
                          timeoutSecs=timeoutSecs)
        print "First rfview end on ", dataKeyTrain, 'took', time.time(
        ) - start, 'seconds'

        for trial in range(1):
            # scoring
            start = time.time()
            rfView = h2o_cmd.runRFView(None,
                                       dataKeyTest,
                                       model_key,
                                       ntree=ntree,
                                       timeoutSecs=timeoutSecs,
                                       out_of_bag_error_estimate=0,
                                       retryDelaySecs=1)
            print "rfview", trial, "end on ", dataKeyTest, 'took', time.time(
            ) - start, 'seconds.'

            (classification_error, classErrorPctList,
             totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView, ntree=ntree)
            self.assertAlmostEqual(
                classification_error,
                50,
                delta=50,
                msg="Classification error %s differs too much" %
                classification_error)
            start = time.time()
            predict = h2o.nodes[0].generate_predictions(model_key=model_key,
                                                        data_key=dataKeyTest2)
            print "predict", trial, "end on ", dataKeyTest, 'took', time.time(
            ) - start, 'seconds.'

            parseKey = parseResultTrain['destination_key']
            rfModelKey = rfView['drf_model']['_key']
            predictKey = 'Predict.hex'
            start = time.time()

            predictResult = h2o_cmd.runPredict(data_key=parseKey,
                                               model_key=rfModelKey,
                                               destination_key=predictKey,
                                               timeoutSecs=timeoutSecs)

            predictCMResult = h2o.nodes[0].predict_confusion_matrix(
                actual=parseKey,
                vactual='C55',
                predict=predictKey,
                vpredict='predict',
            )

            cm = predictCMResult['cm']

            # These will move into the h2o_gbm.py
            pctWrong = h2o_gbm.pp_cm_summary(cm)
            print "\nTest\n==========\n"
            print h2o_gbm.pp_cm(cm)

            print "Trial #", trial, "completed"
Example #41
0
    def test_NN_mnist(self):
        #h2b.browseTheCloud()
        csvPathname_train = 'mnist/train.csv.gz'
        csvPathname_test  = 'mnist/test.csv.gz'
        hex_key = 'mnist_train.hex'
        validation_key = 'mnist_test.hex'
        timeoutSecs = 30
        parseResult  = h2i.import_parse(bucket='smalldata', path=csvPathname_train, schema='put', hex_key=hex_key, timeoutSecs=timeoutSecs)
        parseResultV = h2i.import_parse(bucket='smalldata', path=csvPathname_test, schema='put', hex_key=validation_key, timeoutSecs=timeoutSecs)

        inspect = h2o_cmd.runInspect(None, hex_key)
        print "\n" + csvPathname_train, \
            "    numRows:", "{:,}".format(inspect['numRows']), \
            "    numCols:", "{:,}".format(inspect['numCols'])
        response = inspect['numCols'] - 1

        #Making random id
        identifier = ''.join(random.sample(string.ascii_lowercase + string.digits, 10))
        model_key = 'nn_' + identifier + '.hex'

        kwargs = {
            'ignored_cols'                 : None,
            'response'                     : response,
            'classification'               : 1,
            'activation'                   : 'RectifierWithDropout',
            'input_dropout_ratio'          : 0.2,
            'hidden'                       : '117,131,129',
            'adaptive_rate'                : 0,
            'rate'                         : 0.005,
            'rate_annealing'               : 1e-6,
            'momentum_start'               : 0.5,
            'momentum_ramp'                : 100000,
            'momentum_stable'              : 0.9,
            'l1'                           : 0.00001,
            'l2'                           : 0.0000001,
            'seed'                         : 98037452452,
            'loss'                         : 'CrossEntropy',
            'max_w2'                       : 15,
            'initial_weight_distribution'  : 'UniformAdaptive',
            #'initial_weight_scale'         : 0.01,
            'epochs'                       : 2.0,
            'destination_key'              : model_key,
            'validation'                   : validation_key,
            'score_interval'               : 10000
            }
        expectedErr = 0.057 ## expected validation error for the above model
        relTol = 0.20 ## 20% rel. error tolerance due to Hogwild!

        timeoutSecs = 600
        start = time.time()
        nn = h2o_cmd.runDeepLearning(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
        print "neural net end on ", csvPathname_train, " and ", csvPathname_test, 'took', time.time() - start, 'seconds'

        predict_key = 'score_' + identifier + '.hex'

        kwargs = {
            'data_key': validation_key,
            'destination_key': predict_key,
            'model_key': model_key
            }

        predictResult = h2o_cmd.runPredict(timeoutSecs=timeoutSecs, **kwargs)

        h2o_cmd.runInspect(key=predict_key, verbose=True)

        kwargs = {
        }

        predictCMResult = h2o.nodes[0].predict_confusion_matrix(
            actual=validation_key,
            vactual=response,
            predict=predict_key,
            vpredict='predict',
            timeoutSecs=timeoutSecs, **kwargs)

        cm = predictCMResult['cm']

        print h2o_gbm.pp_cm(cm)
        actualErr = h2o_gbm.pp_cm_summary(cm)/100.;

        print "actual   classification error:" + format(actualErr)
        print "expected classification error:" + format(expectedErr)
        if actualErr != expectedErr and abs((expectedErr - actualErr)/expectedErr) > relTol:
            raise Exception("Scored classification error of %s is not within %s %% relative error of %s" %
                            (actualErr, float(relTol)*100, expectedErr))
Example #42
0
    def test_GLM2_mnist_reals(self):
        h2o.beta_features = True
        importFolderPath = "mnist"
        csvFilelist = [
            ("mnist_reals_training.csv.gz", "mnist_reals_testing.csv.gz",    600), 
        ]
        trial = 0
        for (trainCsvFilename, testCsvFilename, timeoutSecs) in csvFilelist:
            trialStart = time.time()

            # PARSE test****************************************
            testKey = testCsvFilename + "_" + str(trial) + ".hex"
            start = time.time()
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path="mnist/" + testCsvFilename, schema='put',
                hex_key=testKey, timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "parse end on ", testCsvFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "We won't use this pruning of x on test data. See if it prunes the same as the training"
            y = 0 # first column is pixel value
            print "y:"
            x = h2o_glm.goodXFromColumnInfo(y, key=parseResult['destination_key'], timeoutSecs=300)

            # PARSE train****************************************
            trainKey = trainCsvFilename + "_" + str(trial) + ".hex"
            start = time.time()
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path="mnist/" + trainCsvFilename, schema='put',
                hex_key=trainKey, timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "parse end on ", trainCsvFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseResult['destination_key']

            # GLM****************************************
            print "This is the pruned x GLM will use"
            x = h2o_glm.goodXFromColumnInfo(y, key=parseResult['destination_key'], timeoutSecs=300)
            print "x:", x

            modelKey = "mnist"
            params = {
                'response': y,
                'family': 'binomial',
                'lambda': 1.0E-5,
                'alpha': 0.0,
                'max_iter': 10,
                'n_folds': 1,
                'beta_epsilon': 1.0E-4,
                'destination_key': modelKey
                }

            # for c in [0,1,2,3,4,5,6,7,8,9]:
            # just do a couple digits
            for c in [0,7]:
                print "Trying binomial with case:", c
                execExpr="A.hex=%s;A.hex[,%s]=(A.hex[,%s]==%s)" % (trainKey, y+1, y+1, c)
                h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)

                kwargs = params.copy()

                timeoutSecs = 1800
                start = time.time()
                aHack = {'destination_key': 'A.hex'}
                glm = h2o_cmd.runGLM(parseResult=aHack, timeoutSecs=timeoutSecs, pollTimeoutSecs=60, **kwargs)
                elapsed = time.time() - start
                print "GLM completed in", elapsed, "seconds.", \
                    "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

                h2o_glm.simpleCheckGLM(self, glm, None, noPrint=True, **kwargs)

               # Score **********************************************
                execExpr="B.hex=%s;B.hex[,%s]=(B.hex[,%s]==%s)" % (testKey, y+1, y+1, c)
                h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)

                print "Problems with test data having different enums than train? just use train for now"
                predictKey = 'Predict.hex'
                start = time.time()

                predictResult = h2o_cmd.runPredict(
                    data_key="B.hex",
                    model_key=modelKey,
                    destination_key=predictKey,
                    timeoutSecs=timeoutSecs)

                predictCMResult = h2o.nodes[0].predict_confusion_matrix(
                    actual="B.hex",
                    vactual='C' + str(y+1),
                    predict=predictKey,
                    vpredict='predict',
                    )

                cm = predictCMResult['cm']

                # These will move into the h2o_gbm.py
                pctWrong = h2o_gbm.pp_cm_summary(cm);
                # self.assertLess(pctWrong, 8,"Should see less than 7 pct error (class = 4): %s" % pctWrong)

                print "\nTest\n==========\n"
                print h2o_gbm.pp_cm(cm)
    def test_GBM_covtype_train_test(self):
        h2o.beta_features = False
        bucket = 'home-0xdiag-datasets'
        modelKey = 'GBMModelKey'
        files = [
                ('standard', 'covtype.shuffled.90pct.data', 'covtype.train.hex', 1800, 'C55', 'covtype.shuffled.10pct.data', 'covtype.test.hex')
                ]

        # h2b.browseTheCloud()

        for (importFolderPath, trainFilename, trainKey, timeoutSecs, response, testFilename, testKey) in files:
            h2o.beta_features = False #turn off beta_features
            # PARSE train****************************************
            start = time.time()
            xList = []
            eList = []
            fList = []

            # Parse (train)****************************************
            if h2o.beta_features:
                print "Parsing to fvec directly! Have to noPoll=true!, and doSummary=False!"
            parseTrainResult = h2i.import_parse(bucket=bucket, path=importFolderPath + "/" + trainFilename, schema='local',
                hex_key=trainKey, timeoutSecs=timeoutSecs, noPoll=h2o.beta_features, doSummary=False)
            # hack
            if h2o.beta_features:
                h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs)
                print "Filling in the parseTrainResult['destination_key'] for h2o"
                parseTrainResult['destination_key'] = trainKey

            elapsed = time.time() - start
            print "train parse end on ", trainFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "train parse result:", parseTrainResult['destination_key']

            # Parse (test)****************************************
            if h2o.beta_features:
                print "Parsing to fvec directly! Have to noPoll=true!, and doSummary=False!"
            parseTestResult = h2i.import_parse(bucket=bucket, path=importFolderPath + "/" + testFilename, schema='local',
                hex_key=testKey, timeoutSecs=timeoutSecs, noPoll=h2o.beta_features, doSummary=False)
            # hack
            if h2o.beta_features:
                h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs)
                print "Filling in the parseTestResult['destination_key'] for h2o"
                parseTestResult['destination_key'] = testKey

            elapsed = time.time() - start
            print "test parse end on ", testFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "test parse result:", parseTestResult['destination_key']

            # GBM (train iterate)****************************************
            inspect = h2o_cmd.runInspect(key=parseTestResult['destination_key'])
            ntrees = 2
            # fails with 40
            for max_depth in [40, 5]:
                params = {
                    'learn_rate': .2,
                    'nbins': 1024,
                    'ntrees': ntrees,
                    'max_depth': max_depth,
                    'min_rows': 10,
                    'response': response,
                    'ignored_cols_by_name': None,
                }
                print "Using these parameters for GBM: ", params
                kwargs = params.copy()

                h2o.beta_features = True
                # translate it (only really need to do once . out of loop?
                h2o_cmd.runInspect(key=parseTrainResult['destination_key'])
                ### h2o_cmd.runSummary(key=parsTraineResult['destination_key'])

                # GBM train****************************************
                trainStart = time.time()
                gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult,
                    noPoll=True, timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs)
                # hack
                if h2o.beta_features:
                    h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs)
                trainElapsed = time.time() - trainStart
                print "GBM training completed in", trainElapsed, "seconds. On dataset: ", trainFilename

                gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey)
                # errrs from end of list? is that the last tree?
                errsLast = gbmTrainView['gbm_model']['errs'][-1]
                print "GBM 'errsLast'", errsLast

                cm = gbmTrainView['gbm_model']['cms'][-1]['_arr']
                pctWrongTrain = h2o_gbm.pp_cm_summary(cm);
                print "Last line of this cm might be NAs, not CM"
                print "\nTrain\n==========\n"
                print h2o_gbm.pp_cm(cm)

                # GBM test****************************************
                predictKey = 'Predict.hex'
                h2o_cmd.runInspect(key=parseTestResult['destination_key'])
                start = time.time()
                gbmTestResult = h2o_cmd.runPredict(
                    data_key=parseTestResult['destination_key'], 
                    model_key=modelKey,
                    destination_key=predictKey,
                    timeoutSecs=timeoutSecs)
                # hack
                if h2o.beta_features:
                    h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs)
                elapsed = time.time() - start
                print "GBM predict completed in", elapsed, "seconds. On dataset: ", testFilename

                gbmPredictCMResult =h2o.nodes[0].predict_confusion_matrix(
                    actual=parseTestResult['destination_key'],
                    vactual=response,
                    predict=predictKey,
                    vpredict='predict', # choices are 7 (now) and 'predict'
                    )

                # errrs from end of list? is that the last tree?
                # all we get is cm
                cm = gbmPredictCMResult['cm']

                # These will move into the h2o_gbm.py
                pctWrong = h2o_gbm.pp_cm_summary(cm);
                print "\nTest\n==========\n"
                print h2o_gbm.pp_cm(cm)

                # xList.append(ntrees)
                xList.append(max_depth)
                eList.append(pctWrong)
                fList.append(trainElapsed)

            h2o.beta_features = False
            xLabel = 'max_depth'
            eLabel = 'pctWrong'
            fLabel = 'trainElapsed'
            eListTitle = ""
            fListTitle = ""
            h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
Example #44
0
    def test_NN2_mnist_multi(self):
        #h2b.browseTheCloud()
        h2o.beta_features = True
        csvPathname_train = 'mnist/train.csv.gz'
        csvPathname_test = 'mnist/test.csv.gz'
        hex_key = 'mnist_train.hex'
        validation_key = 'mnist_test.hex'
        timeoutSecs = 90
        parseResult = h2i.import_parse(bucket='smalldata',
                                       path=csvPathname_train,
                                       schema='put',
                                       hex_key=hex_key,
                                       timeoutSecs=timeoutSecs)
        parseResultV = h2i.import_parse(bucket='smalldata',
                                        path=csvPathname_test,
                                        schema='put',
                                        hex_key=validation_key,
                                        timeoutSecs=timeoutSecs)
        inspect = h2o_cmd.runInspect(None, hex_key)
        print "\n" + csvPathname_train, \
            "    numRows:", "{:,}".format(inspect['numRows']), \
            "    numCols:", "{:,}".format(inspect['numCols'])
        response = inspect['numCols'] - 1

        #Making random id
        identifier = ''.join(
            random.sample(string.ascii_lowercase + string.digits, 10))
        model_key = 'nn_' + identifier + '.hex'

        kwargs = {
            'ignored_cols': None,
            'response': response,
            'classification': 1,
            'activation': 'RectifierWithDropout',
            'input_dropout_ratio': 0.2,
            'hidden': '117,131,129',
            'rate': 0.005,
            'rate_annealing': 1e-6,
            'momentum_start': 0.5,
            'momentum_ramp': 100000,
            'momentum_stable': 0.9,
            'l1': 0.00001,
            'l2': 0.0000001,
            'seed': 98037452452,
            'loss': 'CrossEntropy',
            'max_w2': 15,
            'initial_weight_distribution': 'UniformAdaptive',
            #'initial_weight_scale'         : 0.01,
            'epochs': 20.0,
            'destination_key': model_key,
            'validation': validation_key,
        }
        ###expectedErr = 0.0362 ## from single-threaded mode
        expectedErr = 0.03  ## observed actual value with Hogwild

        timeoutSecs = 600
        start = time.time()
        nn = h2o_cmd.runDeepLearning(parseResult=parseResult,
                                     timeoutSecs=timeoutSecs,
                                     **kwargs)
        print "neural net end on ", csvPathname_train, " and ", csvPathname_test, 'took', time.time(
        ) - start, 'seconds'

        #### Now score using the model, and check the validation error
        expectedErr = 0.046
        relTol = 0.1
        predict_key = 'Predict.hex'

        kwargs = {
            'data_key': validation_key,
            'destination_key': predict_key,
            'model_key': model_key
        }
        predictResult = h2o_cmd.runPredict(timeoutSecs=timeoutSecs, **kwargs)
        h2o_cmd.runInspect(key=predict_key, verbose=True)

        kwargs = {}

        predictCMResult = h2o.nodes[0].predict_confusion_matrix(
            actual=validation_key,
            vactual=response,
            predict=predict_key,
            vpredict='predict',
            timeoutSecs=timeoutSecs,
            **kwargs)

        cm = predictCMResult['cm']

        print h2o_gbm.pp_cm(cm)
        actualErr = h2o_gbm.pp_cm_summary(cm) / 100.

        print "actual   classification error:" + format(actualErr)
        print "expected classification error:" + format(expectedErr)
        if actualErr != expectedErr and abs(
            (expectedErr - actualErr) / expectedErr) > relTol:
            raise Exception(
                "Scored classification error of %s is not within %s %% relative error of %s"
                % (actualErr, float(relTol) * 100, expectedErr))
Example #45
0
    def test_GBM_with_cancels(self):
        """Check that one GBM job finishes while sibling GBM jobs are cancelled.

        After import and parse into 'c.hex', a 'GBMGood' job is started
        without polling. Then, 20 times over, five 'GBMBad<j>' jobs are
        started and each is cancelled by its job key. Finally the test waits
        for jobs matching 'GBMGood', fetches the model view and prints the
        last error entry (plus the confusion matrix when DO_CLASSIFICATION
        is set). DO_CLASSIFICATION and DELETE_KEYS are defined outside this
        method.
        """

        print "Sets h2o.beta_features like -bf at command line"
        print "this will redirect import and parse to the 2 variants"
        h2o.beta_features = True

        # Initial value only; the for-loop below rebinds importFolderPath
        # from each tuple.
        importFolderPath = 'standard'
        # NOTE(review): timeoutSecs is assigned here and again further down,
        # but no call in this method receives it -- the calls use literals.
        timeoutSecs = 500
        # (import folder, file name or pattern, response column index)
        csvFilenameAll = [
            # have to use col name for response?
            ("manyfiles-nflx-gz", "file_1.dat.gz", 378),
            # ("manyfiles-nflx-gz", "file_[1-9].dat.gz", 378),
            # ("standard", "covtype.data", 54),
            # ("standard", "covtype20x.data", 54),
            ]
        # csvFilenameList = random.sample(csvFilenameAll,1)
        csvFilenameList = csvFilenameAll

        # pop open a browser on the cloud
        # h2b.browseTheCloud()

        for (importFolderPath, csvFilename, response) in csvFilenameList:
            # creates csvFilename.hex from file in importFolder dir 
            csvPathname = importFolderPath + "/" + csvFilename 
            
            ### h2o.beta_features = False

            (importResult, importPattern) = h2i.import_only(bucket='home-0xdiag-datasets', path=csvPathname, schema='local', timeoutSecs=50)
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='local', hex_key='c.hex', 
                timeoutSecs=500, noPoll=False, doSummary=False) # can't do summary until parse result is correct json

            h2o.check_sandbox_for_errors()

            # wait for it to show up in jobs?
            ## time.sleep(2)
            # no pattern waits for all
            ## h2o_jobs.pollWaitJobs(pattern=None, timeoutSecs=300, pollTimeoutSecs=10, retryDelaySecs=5)

            # hack it because no response from Parse2
            # (replaces whatever import_parse returned with a minimal dict)
            if h2o.beta_features:
                parseResult = {'destination_key': 'c.hex'}

            print "\nparseResult", h2o.dump_json(parseResult)

            print "Parse result['destination_key']:", parseResult['destination_key']
            ## What's wrong here? too big?
            ### inspect = h2o_cmd.runInspect(key=parseResult['destination_key'], timeoutSecs=30, verbose=True)

            h2o.check_sandbox_for_errors()

            # have to avoid this on nflx data. colswap with exec
            # Exception: rjson error in gbm: Argument 'response' error: Only integer or enum/factor columns can be classified

            if importFolderPath=='manyfiles-nflx-gz':
                if DO_CLASSIFICATION:
                    # need to flip the right col! (R wise)
                    # overwrites the response column (indexed as response+1
                    # in the exec expression) with the result of "> 15"
                    execExpr = 'c.hex[,%s]=c.hex[,%s]>15' % (response+1,response+1)
                    kwargs = { 'str': execExpr }
                    resultExec = h2o_cmd.runExec(**kwargs)

                # lets look at the response column now
                s = h2o_cmd.runSummary(key="c.hex", cols=response, max_ncols=1)
                # Python 2: range() returns a list, so remove() below works
                x = range(542)
                # remove the output too! (378)
                xIgnore = []
                # BUG if you add unsorted 378 to end. remove for now
                for i in [3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 16, 17, 18, 19, 20, 424, 425, 426, 540, 541, response]:
                    if i not in x:
                        print "x:", x
                        print 'missing?', i
                    # list.remove raises ValueError when i is absent, so the
                    # diagnostic above is followed by a failure in that case
                    x.remove(i)
                    xIgnore.append(i)

                # NOTE(review): x is not read again in this method after
                # being joined; only xIgnore is passed on to GBM.
                x = ",".join(map(str,x))
                # ignored columns are passed by name: "C" + index
                def colIt(x): return "C" + str(x)
                xIgnore = ",".join(map(colIt, xIgnore))
            else:
                # leave one col ignored, just to see?
                xIgnore = 0

            modelKey = "GBMGood"
            params = {
                'destination_key': modelKey,
                'ignored_cols_by_name': xIgnore,
                'learn_rate': .1,
                'ntrees': 2,
                'max_depth': 8,
                'min_rows': 1,
                'response': "C" + str(response),
                'classification': 1 if DO_CLASSIFICATION else 0,
                'grid_parallelism': 4,
                }

            kwargs = params.copy()
            timeoutSecs = 1800
            start = time.time()
            # the job we expect to finish; started without polling
            GBMFirstResult = h2o_cmd.runGBM(parseResult=parseResult, noPoll=True,**kwargs)
            print "\nGBMFirstResult:", h2o.dump_json(GBMFirstResult)
            # no pattern waits for all

            for i in range(20):
                # now issue a couple background GBM jobs that we'll kill
                jobids = []     
                for j in range(5):
                    # same parameters as the good job, different destination
                    kwargs['destination_key'] = 'GBMBad' + str(j)
                    GBMFirstResult = h2o_cmd.runGBM(parseResult=parseResult, noPoll=True,**kwargs)
                    jobids.append(GBMFirstResult['job_key'])

                # have to pass the job id
                for j in jobids:
                    h2o.nodes[0].jobs_cancel(key=j)


            # wait only for jobs matching the good model's key
            h2o_jobs.pollWaitJobs(pattern='GBMGood', timeoutSecs=300, pollTimeoutSecs=10, retryDelaySecs=5)
            # elapsed covers the good job, all launch/cancel rounds and the wait
            elapsed = time.time() - start
            print "GBM training completed in", elapsed, "seconds."

            gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey)
            # errrs from end of list? is that the last tree?
            errsLast = gbmTrainView['gbm_model']['errs'][-1]

            print "GBM 'errsLast'", errsLast
            if DO_CLASSIFICATION:
                cm = gbmTrainView['gbm_model']['cm']
                pctWrongTrain = h2o_gbm.pp_cm_summary(cm);
                print "Last line of this cm might be NAs, not CM"
                print "\nTrain\n==========\n"
                print h2o_gbm.pp_cm(cm)
            else:
                print "GBMTrainView:", h2o.dump_json(gbmTrainView['gbm_model']['errs'])

            h2o.check_sandbox_for_errors()

            if DELETE_KEYS:
                h2i.delete_keys_from_import_result(pattern=csvFilename, importResult=importResult)
Example #46
0
    def test_GLM2_mnist(self):
        if not SCIPY_INSTALLED:
            pass

        else:
            SYNDATASETS_DIR = h2o.make_syn_dir()

            csvFilelist = [
                (10000, 500, 'cA', 60),
            ]

            trial = 0
            for (rowCount, colCount, hex_key, timeoutSecs) in csvFilelist:
                trialStart = time.time()

                # PARSE test****************************************
                csvFilename = 'syn_' + "binary" + "_" + str(
                    rowCount) + 'x' + str(colCount) + '.csv'
                csvPathname = SYNDATASETS_DIR + "/" + csvFilename
                write_syn_dataset(csvPathname, rowCount, colCount)

                start = time.time()
                parseResult = h2i.import_parse(path=csvPathname,
                                               schema='put',
                                               hex_key=hex_key,
                                               timeoutSecs=timeoutSecs)
                elapsed = time.time() - start
                print "parse end on ", csvFilename, 'took', elapsed, 'seconds',\
                    "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

                # GLM****************************************
                modelKey = 'GLM_model'
                y = colCount
                kwargs = {
                    'response': 'C' + str(y + 1),
                    'family': 'binomial',
                    'lambda': 1e-4,
                    'alpha': 0,
                    'max_iter': 15,
                    'n_folds': 1,
                    'beta_epsilon': 1.0E-4,
                    'destination_key': modelKey,
                }

                # GLM wants the output col to be strictly 0,1 integer
                execExpr = "aHack=%s; aHack[,%s] = aHack[,%s]==1" % (
                    hex_key, y + 1, y + 1)
                h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)
                aHack = {'destination_key': 'aHack'}

                timeoutSecs = 1800
                start = time.time()
                glm = h2o_cmd.runGLM(parseResult=aHack,
                                     timeoutSecs=timeoutSecs,
                                     pollTimeoutSecs=60,
                                     **kwargs)
                elapsed = time.time() - start
                print "GLM completed in", elapsed, "seconds.", \
                    "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

                h2o_glm.simpleCheckGLM(self, glm, None, noPrint=True, **kwargs)
                modelKey = glm['glm_model']['_key']

                # This seems wrong..what's the format of the cm?
                lambdaMax = glm['glm_model']['lambda_max']
                print "lambdaMax:", lambdaMax

                best_threshold = glm['glm_model']['submodels'][0][
                    'validation']['best_threshold']
                print "best_threshold", best_threshold

                # pick the middle one?
                cm = glm['glm_model']['submodels'][0]['validation']['_cms'][5][
                    '_arr']
                print "cm:", cm
                pctWrong = h2o_gbm.pp_cm_summary(cm)
                # self.assertLess(pctWrong, 9,"Should see less than 9% error (class = 4)")

                print "\nTrain\n==========\n"
                print h2o_gbm.pp_cm(cm)

                # Score *******************************
                # this messes up if you use case_mode/case_vale above
                print "\nPredict\n==========\n"
                predictKey = 'Predict.hex'
                start = time.time()

                predictResult = h2o_cmd.runPredict(data_key='aHack',
                                                   model_key=modelKey,
                                                   destination_key=predictKey,
                                                   timeoutSecs=timeoutSecs)

                predictCMResult = h2o.nodes[0].predict_confusion_matrix(
                    actual='aHack',
                    vactual='C' + str(y + 1),
                    predict=predictKey,
                    vpredict='predict',
                )

                cm = predictCMResult['cm']

                # These will move into the h2o_gbm.py
                pctWrong = h2o_gbm.pp_cm_summary(cm)
                self.assertLess(pctWrong, 50, "Should see less than 50% error")

                print "\nTest\n==========\n"
                print h2o_gbm.pp_cm(cm)
    def test_GBM_many_cols(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        if localhost:
            tryList = [
                (100000, 400, 'cA', 300), 
                ]
        else:
            tryList = [
                # (10000, 10, 'cB', 300), 
                # (10000, 50, 'cC', 300), 
                (100000, 100, 'cD', 300), 
                (100000, 200, 'cE', 300), 
                (100000, 500, 'cG', 300), 
                (100000, 1000, 'cI', 300), 
                ]

        ### h2b.browseTheCloud()
        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            # csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

            # PARSE train****************************************
            h2o.beta_features = False #turn off beta_features
            start = time.time()
            xList = []
            eList = []
            fList = []

            h2o.beta_features = False
            modelKey = 'GBMModelKey'

            # Parse (train)****************************************
            if h2o.beta_features:
                print "Parsing to fvec directly! Have to noPoll=true!, and doSummary=False!"
            parseTrainResult = h2i.import_parse(bucket=None, path=csvPathname, schema='put',
                hex_key=hex_key, timeoutSecs=timeoutSecs, noPoll=h2o.beta_features, doSummary=False)
            # hack
            if h2o.beta_features:
                h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs)
                print "Filling in the parseTrainResult['destination_key'] for h2o"
                parseTrainResult['destination_key'] = trainKey

            elapsed = time.time() - start
            print "train parse end on ", csvPathname, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "train parse result:", parseTrainResult['destination_key']

            # Logging to a benchmark file
            algo = "Parse"
            # l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format(
                # len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename, elapsed)
            l = '{:d} jvms, {:d}MB heap, {:s} {:s} {:6.2f} secs'.format(
                len(h2o.nodes), h2o.nodes[0].java_heap_MB, algo, csvFilename, elapsed)
            print l
            h2o.cloudPerfH2O.message(l)

            # if you set beta_features here, the fvec translate will happen with the Inspect not the GBM
            # h2o.beta_features = True
            inspect = h2o_cmd.runInspect(key=parseTrainResult['destination_key'])
            print "\n" + csvPathname, \
                "    num_rows:", "{:,}".format(inspect['num_rows']), \
                "    num_cols:", "{:,}".format(inspect['num_cols'])
            num_rows = inspect['num_rows']
            num_cols = inspect['num_cols']
            h2o_cmd.infoFromInspect(inspect, csvPathname)

            ### h2o_cmd.runSummary(key=parsTraineResult['destination_key'])

            # GBM(train iterate)****************************************
            h2o.beta_features = True
            # was failing with 100 trees
            # ntrees = 100
            # for max_depth in [5,10,20,40]:
            ntrees = 10
            for max_depth in [5]:
                params = {
                    'learn_rate': .2,
                    'nbins': 10, # 1024 fail
                    'ntrees': ntrees,
                    'max_depth': max_depth,
                    'min_rows': 10,
                    'response': num_cols-1,
                    'ignored_cols_by_name': None,
                }
                print "Using these parameters for GBM: ", params
                kwargs = params.copy()
                h2o.beta_features = True

                trainStart = time.time()
                gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult,
                    noPoll=h2o.beta_features, timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs)
                # hack
                if h2o.beta_features:
                    h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs)
                trainElapsed = time.time() - trainStart
                print "GBM training completed in", trainElapsed, "seconds. On dataset: ", csvPathname

                # Logging to a benchmark file
                algo = "GBM " + " ntrees=" + str(ntrees) + " max_depth=" + str(max_depth)
                l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format(
                    len(h2o.nodes), h2o.nodes[0].java_heap_MB, algo, csvFilename, trainElapsed)
                print l
                h2o.cloudPerfH2O.message(l)

                gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey)
                # errrs from end of list? is that the last tree?
                errsLast = gbmTrainView['gbm_model']['errs'][-1]
                print "GBM 'errsLast'", errsLast

                cm = gbmTrainView['gbm_model']['cm']
                pctWrongTrain = h2o_gbm.pp_cm_summary(cm);
                print "Last line of this cm might be NAs, not CM"
                print "\nTrain\n==========\n"
                print h2o_gbm.pp_cm(cm)

                # xList.append(ntrees)
                xList.append(max_depth)
                eList.append(pctWrongTrain)
                fList.append(trainElapsed)

        h2o.beta_features = False
        # just plot the last one
        if DO_PLOT_IF_KEVIN:
            xLabel = 'max_depth'
            eLabel = 'pctWrong'
            fLabel = 'trainElapsed'
            eListTitle = ""
            fListTitle = ""
            h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
Example #48
0
    def test_GBM_params_rand2(self):
        bucket = 'home-0xdiag-datasets'
        modelKey = 'GBMModelKey'
        files = [
            # ('standard', 'covtype.shuffled.90pct.data', 'covtype.train.hex', 1800, 54, 'covtype.shuffled.10pct.data', 'covtype.test.hex')
            ('standard', 'covtype.shuffled.10pct.sorted.data',
             'covtype.train.hex', 1800, 54, 'covtype.shuffled.10pct.data',
             'covtype.test.hex')
        ]

        for (importFolderPath, trainFilename, trainKey, timeoutSecs, response,
             testFilename, testKey) in files:
            # PARSE train****************************************
            start = time.time()
            xList = []
            eList = []
            fList = []

            # Parse (train)****************************************
            parseTrainResult = h2i.import_parse(bucket=bucket,
                                                path=importFolderPath + "/" +
                                                trainFilename,
                                                schema='local',
                                                hex_key=trainKey,
                                                timeoutSecs=timeoutSecs,
                                                doSummary=False)

            elapsed = time.time() - start
            print "train parse end on ", trainFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "train parse result:", parseTrainResult['destination_key']

            # Parse (test)****************************************
            parseTestResult = h2i.import_parse(bucket=bucket,
                                               path=importFolderPath + "/" +
                                               testFilename,
                                               schema='local',
                                               hex_key=testKey,
                                               timeoutSecs=timeoutSecs,
                                               doSummary=False)
            elapsed = time.time() - start
            print "test parse end on ", testFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "test parse result:", parseTestResult['destination_key']

            # GBM (train iterate)****************************************
            inspect = h2o_cmd.runInspect(
                key=parseTestResult['destination_key'])
            paramsDict = define_gbm_params()
            for trial in range(3):
                # translate it (only really need to do once . out of loop?
                h2o_cmd.runInspect(key=parseTrainResult['destination_key'])
                ### h2o_cmd.runSummary(key=parsTraineResult['destination_key'])

                # use this to set any defaults you want if the pick doesn't set
                params = {
                    'response': 54,
                    'ignored_cols_by_name': 'C1,C2,C3,C4,C5',
                    'ntrees': 2,
                    'validation': parseTestResult['destination_key'],
                }
                h2o_gbm.pickRandGbmParams(paramsDict, params)
                print "Using these parameters for GBM: ", params
                kwargs = params.copy()

                # GBM train****************************************
                trainStart = time.time()
                gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult,
                                                timeoutSecs=timeoutSecs,
                                                destination_key=modelKey,
                                                **kwargs)
                trainElapsed = time.time() - trainStart
                print "GBM training completed in", trainElapsed, "seconds. On dataset: ", trainFilename

                gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey)
                # errrs from end of list? is that the last tree?
                errsLast = gbmTrainView['gbm_model']['errs'][-1]
                print "GBM 'errsLast'", errsLast

                cm = gbmTrainView['gbm_model']['cms'][-1][
                    '_arr']  # use the last one
                pctWrongTrain = h2o_gbm.pp_cm_summary(cm)
                print "Last line of this cm might be NAs, not CM"
                print "\nTrain\n==========\n"
                print h2o_gbm.pp_cm(cm)

                # GBM test****************************************
                predictKey = 'Predict.hex'
                h2o_cmd.runInspect(key=parseTestResult['destination_key'])
                start = time.time()
                gbmTestResult = h2o_cmd.runPredict(
                    data_key=parseTestResult['destination_key'],
                    model_key=modelKey,
                    destination_key=predictKey,
                    timeoutSecs=timeoutSecs)
                elapsed = time.time() - start
                print "GBM predict completed in", elapsed, "seconds. On dataset: ", testFilename

                if DO_PREDICT_CM:
                    gbmPredictCMResult = h2o.nodes[0].predict_confusion_matrix(
                        actual=parseTestResult['destination_key'],
                        vactual='predict',
                        predict=predictKey,
                        vpredict='predict',  # choices are 7 (now) and 'predict'
                    )

                    # errrs from end of list? is that the last tree?
                    # all we get is cm
                    cm = gbmPredictCMResult['cm']

                    # These will move into the h2o_gbm.py
                    pctWrong = h2o_gbm.pp_cm_summary(cm)
                    print "Last line of this cm is really NAs, not CM"
                    print "\nTest\n==========\n"
                    print h2o_gbm.pp_cm(cm)

                # xList.append(ntrees)
                if 'max_depth' in params and params['max_depth']:
                    xList.append(params['max_depth'])
                    eList.append(pctWrongTrain)
                    fList.append(trainElapsed)

            xLabel = 'max_depth'
            eLabel = 'pctWrongTrain'
            fLabel = 'trainElapsed'
            eListTitle = ""
            fListTitle = ""
            h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel,
                              fListTitle, fList, fLabel)
Example #49
0
    def test_c9_GLM_airlines_fvec(self):
        h2o.beta_features = True

        files = [('airlines', 'airlines_all.csv', 'airlines_all.hex', 1800,
                  'IsDepDelayed')]

        for importFolderPath, csvFilename, trainKey, timeoutSecs, response in files:
            # PARSE train****************************************
            csvPathname = importFolderPath + "/" + csvFilename

            start = time.time()
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets',
                                           path=csvPathname,
                                           schema='local',
                                           hex_key=trainKey,
                                           timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "parse end on ", csvFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseResult['destination_key']

            # GLM (train)****************************************
            params = {
                # 'lambda': 1e-4,
                # 'alpha': 0.5,
                'lambda':
                1e-8,
                'alpha':
                0.0,
                'max_iter':
                30,
                'n_folds':
                3,
                'family':
                'binomial',
                'destination_key':
                "GLMKEY",
                'response':
                response,
                'ignored_cols':
                'CRSDepTime,CRSArrTime,ActualElapsedTime,CRSElapsedTime,AirTime,ArrDelay,DepDelay,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,IsArrDelayed'
            }
            kwargs = params.copy()
            timeoutSecs = 1800
            start = time.time()
            glm = h2o_cmd.runGLM(parseResult=parseResult,
                                 timeoutSecs=timeoutSecs,
                                 **kwargs)
            elapsed = time.time() - start
            print "GLM training completed in", elapsed, "seconds. On dataset: ", csvFilename
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)

            if h2o.beta_features:
                modelKey = glm['glm_model']['_key']

                submodels = glm['glm_model']['submodels']
                # hackery to make it work when there's just one
                validation = submodels[-1]['validation']
                best_threshold = validation['best_threshold']
                thresholds = validation['thresholds']
                # have to look up the index for the cm, from the thresholds list
                best_index = None
                for i, t in enumerate(thresholds):
                    if t == best_threshold:
                        best_index = i
                        break
                cms = validation['_cms']
                cm = cms[best_index]
                pctWrong = h2o_gbm.pp_cm_summary(cm['_arr'])
                # FIX! should look at prediction error/class error?
                # self.assertLess(pctWrong, 9,"Should see less than 40% error")

                print "\nTrain\n==========\n"
                print h2o_gbm.pp_cm(cm['_arr'])

                # Score *******************************
                # this messes up if you use case_mode/case_vale above
                predictKey = 'Predict.hex'
                start = time.time()

                predictResult = h2o_cmd.runPredict(data_key=trainKey,
                                                   model_key=modelKey,
                                                   destination_key=predictKey,
                                                   timeoutSecs=timeoutSecs)

                predictCMResult = h2o.nodes[0].predict_confusion_matrix(
                    actual=trainKey,
                    vactual=response,
                    predict=predictKey,
                    vpredict='predict',
                )

                cm = predictCMResult['cm']
                # These will move into the h2o_gbm.py
                pctWrong = h2o_gbm.pp_cm_summary(cm)
                # self.assertLess(pctWrong, 40,"Should see less than 40% error")

                print "\nTest\n==========\n"
                print h2o_gbm.pp_cm(cm)

        h2i.delete_keys_at_all_nodes(timeoutSecs=600)
Example #50
0
    def test_GBM_poker_1m(self):
        for trial in range(2):
            # PARSE train****************************************
            h2o.beta_features = False #turn off beta_features
            start = time.time()
            xList = []
            eList = []
            fList = []

            modelKey = 'GBMModelKey'
            timeoutSecs = 900
            # Parse (train)****************************************
            if h2o.beta_features:
                print "Parsing to fvec directly! Have to noPoll=true!, and doSummary=False!"

            csvPathname = 'poker/poker-hand-testing.data'
            hex_key = 'poker-hand-testing.data.hex'
            parseTrainResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put',
                hex_key=hex_key, timeoutSecs=timeoutSecs, noPoll=h2o.beta_features, doSummary=False)

            # hack
            if h2o.beta_features:
                h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs)
                print "Filling in the parseTrainResult['destination_key'] for h2o"
                parseTrainResult['destination_key'] = trainKey

            elapsed = time.time() - start
            print "train parse end on ", csvPathname, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "train parse result:", parseTrainResult['destination_key']

            # Logging to a benchmark file
            algo = "Parse"
            l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format(
                len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvPathname, elapsed)
            print l
            h2o.cloudPerfH2O.message(l)

            # if you set beta_features here, the fvec translate will happen with the Inspect not the GBM
            # h2o.beta_features = True
            inspect = h2o_cmd.runInspect(key=parseTrainResult['destination_key'])
            print "\n" + csvPathname, \
                "    num_rows:", "{:,}".format(inspect['num_rows']), \
                "    num_cols:", "{:,}".format(inspect['num_cols'])
            num_rows = inspect['num_rows']
            num_cols = inspect['num_cols']
            ### h2o_cmd.runSummary(key=parsTraineResult['destination_key'])

            # GBM(train iterate)****************************************
            h2o.beta_features = True
            ntrees = 2
            for max_depth in [5,10,20]:
                params = {
                    'learn_rate': .1,
                    'nbins': 10,
                    'ntrees': ntrees,
                    'max_depth': max_depth,
                    'min_rows': 10,
                    'response': num_cols-1,
                    'ignored_cols_by_name': None,
                }
                print "Using these parameters for GBM: ", params
                kwargs = params.copy()
                h2o.beta_features = True

                trainStart = time.time()
                gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult,
                    noPoll=h2o.beta_features, timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs)
                # hack
                if h2o.beta_features:
                    h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs)
                trainElapsed = time.time() - trainStart
                print "GBM training completed in", trainElapsed, "seconds. On dataset: ", csvPathname

                # Logging to a benchmark file
                algo = "GBM " + " ntrees=" + str(ntrees) + " max_depth=" + str(max_depth)
                l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format(
                    len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvPathname, trainElapsed)
                print l
                h2o.cloudPerfH2O.message(l)

                gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey)
                # errrs from end of list? is that the last tree?
                errsLast = gbmTrainView['gbm_model']['errs'][-1]
                print "GBM 'errsLast'", errsLast

                cm = gbmTrainView['gbm_model']['cms'][-1]['_arr'] # use the last one
                pctWrongTrain = h2o_gbm.pp_cm_summary(cm);
                print "Last line of this cm might be NAs, not CM"
                print "\nTrain\n==========\n"
                print h2o_gbm.pp_cm(cm)

                # xList.append(ntrees)
                xList.append(max_depth)
                eList.append(pctWrongTrain)
                fList.append(trainElapsed)

        h2o.beta_features = False
        # just plot the last one
        if DO_PLOT_IF_KEVIN:
            xLabel = 'max_depth'
            eLabel = 'pctWrong'
            fLabel = 'trainElapsed'
            eListTitle = ""
            fListTitle = ""
            h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
Example #51
0
    def test_GLM2_mnist(self):
        h2o.beta_features = True
        if DO_HDFS:
            importFolderPath = "mnist"
            bucket = None
            schema = 'hdfs'
        else:
            importFolderPath = "mnist"
            bucket = 'home-0xdiag-datasets'
            schema = 'local'

        csvFilelist = [
            ("mnist_training.csv.gz", "mnist_testing.csv.gz",    600), 
        ]

        trial = 0
        for (trainCsvFilename, testCsvFilename, timeoutSecs) in csvFilelist:
            trialStart = time.time()

            # PARSE test****************************************
            testKey = testCsvFilename + "_" + str(trial) + ".hex"
            csvPathname = importFolderPath + "/" + testCsvFilename
            start = time.time()

            parseTestResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema=schema, hex_key=testKey, timeoutSecs=timeoutSecs)
            
            elapsed = time.time() - start
            print "parse end on ", testCsvFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseTestResult['destination_key']

            print "We won't use this pruning of x on test data. See if it prunes the same as the training"
            y = 0 # first column is pixel value
            print "y:"
            ignoreX = h2o_glm.goodXFromColumnInfo(y, key=parseTestResult['destination_key'], timeoutSecs=300, forRF=True)

            # PARSE train****************************************
            trainKey = trainCsvFilename + "_" + str(trial) + ".hex"
            csvPathname = importFolderPath + "/" + testCsvFilename
            start = time.time()
            parseTrainResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema=schema, hex_key=trainKey, timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "parse end on ", trainCsvFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseTrainResult['destination_key']

            # GLM****************************************
            print "This is the pruned x we'll use"
            ignoreX = h2o_glm.goodXFromColumnInfo(y, key=parseTrainResult['destination_key'], timeoutSecs=300, forRF=True)
            print "ignoreX:", ignoreX 

            modelKey = 'GLM_model'
            params = {
                'ignored_cols': ignoreX, 
                'response': 'C' + str(y),
                # 'case_mode': '=',
                # 'case_val': 0,
                'family': 'binomial',
                'lambda': 0.5,
                'alpha': 1e-4,
                'max_iter': 15,
                ## 'thresholds': 0.5,
                ## 'weight': 1.0,
                'n_folds': 1,
                'beta_epsilon': 1.0E-4,
                'destination_key': modelKey,
                }

            if DO_ALL_DIGITS:
                cases = [0,1,2,3,4,5,6,7,8,9]
            else:
                cases = [8]

            for c in cases:
                kwargs = params.copy()
                print "Trying binomial with case:", c
                # kwargs['case_val'] = c

                # do the binomial conversion with Exec2, for both training and test (h2o won't work otherwise)
                if DO_BUG:
                    execExpr="A.hex=%s;A.hex[,%s]=(A.hex[,%s]==%s)" % (trainKey, y+1, y+1, c)
                    h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)
                else:
                    execExpr="A.hex=%s" % (trainKey)
                    h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)

                    execExpr="A.hex[,%s]=(A.hex[,%s]==%s)" % (y+1, y+1, c)
                    h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)

                if DO_BUG:
                    execExpr="B.hex=%s;B.hex[,%s]=(B.hex[,%s]==%s)" % (testKey, y+1, y+1, c)
                    h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)
                else:
                    execExpr="B.hex=%s" % (testKey)
                    h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)

                    execExpr="B.hex[,%s]=(B.hex[,%s]==%s)" % (y+1, y+1, c)
                    h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)

                timeoutSecs = 1800
                start = time.time()
                aHack = {'destination_key': 'A.hex'}
                glmFirstResult = h2o_cmd.runGLM(parseResult=aHack, timeoutSecs=timeoutSecs, pollTimeoutSecs=60, 
                    noPoll=True, **kwargs)
                h2o_jobs.pollStatsWhileBusy(timeoutSecs=timeoutSecs, pollTimeoutSecs=60, retryDelaySecs=5)
                glm = h2o.nodes[0].glm_view(_modelKey=modelKey)

                elapsed = time.time() - start
                print "GLM completed in", elapsed, "seconds.", \
                    "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

                h2o_glm.simpleCheckGLM(self, glm, None, noPrint=True, **kwargs)
                modelKey = glm['glm_model']['_selfKey']

                # This seems wrong..what's the format of the cm?
                if 1==0:
                    cm = glm['glm_model']['submodels'][0]['validation']['_cms'][0]['_arr']
                    print "cm:", cm
                    pctWrong = h2o_gbm.pp_cm_summary(cm);
                    # self.assertLess(pctWrong, 9,"Should see less than 9% error (class = 4)")

                    print "\nTrain\n==========\n"
                    print h2o_gbm.pp_cm(cm)


                # Score *******************************
                # this messes up if you use case_mode/case_vale above
                predictKey = 'Predict.hex'
                start = time.time()

                predictResult = h2o_cmd.runPredict(
                    data_key='B.hex',
                    model_key=modelKey,
                    destination_key=predictKey,
                    timeoutSecs=timeoutSecs)

                predictCMResult = h2o.nodes[0].predict_confusion_matrix(
                    actual='B.hex',
                    vactual='C' + str(y),
                    predict=predictKey,
                    vpredict='predict',
                    )

                cm = predictCMResult['cm']

                # These will move into the h2o_gbm.py
                pctWrong = h2o_gbm.pp_cm_summary(cm);
                self.assertLess(pctWrong, 9,"Should see less than 9% error (class = 4)")

                print "\nTest\n==========\n"
                print h2o_gbm.pp_cm(cm)
    def test_GLM2_enums_score_superset(self):
        """Score a GLM against data holding an enum value the model never saw.

        For every (rowCount, colCount, hex_key, timeoutSecs) trial, two
        synthetic csv files are written with the same seed: one from a
        10-entry enum list (used to build a binomial GLM) and one from five
        of those enums plus the extra value "xyzzy" (used for scoring). The
        model is then run through predict, AUC and the predict confusion
        matrix on the scoring data; the AUC must be within 0.15 of 0.5.
        """
        h2o.beta_features = True
        print "FIX!: this should cause an error. We should detect that it's not causing an error/warning?"
        synDir = h2o.make_syn_dir()

        # (rowCount, colCount, hex_key, timeoutSecs): 200 rows, 1..6 columns,
        # keys 'cD'..'cI', 300 second GLM timeout.
        trials = [(200, cols, 'c' + letter, 300)
                  for cols, letter in enumerate('DEFGHI', 1)]

        # Separators are spelled as hex: the parser is handed the integer
        # code while the file writer gets the literal character.
        commaHex = '2c'
        newlineHex = '0a'
        colSepChar = commaHex.decode('hex')
        colSepInt = int(commaHex, base=16)
        rowSepChar = newlineHex.decode('hex')

        for (rowCount, colCount, hex_key, timeoutSecs) in trials:
            print "colSepChar:", colSepChar
            print "rowSepChar:", rowSepChar

            # The same seed is handed to both dataset writes below.
            SEEDPERFILE = random.randint(0, sys.maxint)
            shape = '%sx%s' % (rowCount, colCount)
            csvFilename = 'syn_enums_' + shape + '.csv'
            csvPathname = synDir + '/' + csvFilename
            csvScoreFilename = 'syn_enums_score_' + shape + '.csv'
            csvScorePathname = synDir + '/' + csvScoreFilename

            enumList = create_enum_list(listSize=10)
            # Half of the model's enums, plus one value that is not in the
            # model's enumList at all.
            enumListForScore = random.sample(enumList, 5) + ["xyzzy"]

            datasets = (
                (csvPathname, enumList,
                 "for glm model building"),
                (csvScorePathname, enumListForScore,
                 "for glm scoring with prior model (using enum subset)"),
            )
            for pathname, enums, purpose in datasets:
                print "Creating random", pathname, purpose
                write_syn_dataset(pathname,
                                  enums,
                                  rowCount,
                                  colCount,
                                  SEEDPERFILE,
                                  colSepChar=colSepChar,
                                  rowSepChar=rowSepChar)

            # Model-building data
            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='put',
                                           hex_key=hex_key,
                                           timeoutSecs=30,
                                           separator=colSepInt)
            modelDataKey = parseResult['destination_key']
            print "Parse result['destination_key']:", modelDataKey

            print "\n" + csvFilename
            columnInfo = h2o_cmd.columnInfoFromInspect(
                modelDataKey, exceptionOnMissingValues=True)
            # Unpacked only to insist on the five-part result shape.
            (missingValues, constantValues, enumSizes, colTypes, colNames) = columnInfo

            responseCol = colCount
            modelKey = 'enums'
            kwargs = {
                'destination_key': modelKey,
                'response': responseCol,
                'max_iter': 1,
                'n_folds': 1,
                'alpha': 0.2,
                'lambda': 1e-5,
                'family': 'binomial'
            }

            glmStart = time.time()
            glm = h2o_cmd.runGLM(parseResult=parseResult,
                                 timeoutSecs=timeoutSecs,
                                 pollTimeoutSecs=180,
                                 **kwargs)
            print "glm end on ", parseResult['destination_key'], \
                'took', time.time() - glmStart, 'seconds'
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)

            # Scoring data
            scoreDataKey = "score_" + hex_key
            h2i.import_parse(path=csvScorePathname,
                             schema='put',
                             hex_key=scoreDataKey,
                             timeoutSecs=30,
                             separator=colSepInt)

            # Score *******************************
            # this messes up if you use case_mode/case_vale above
            predictKey = 'Predict.hex'
            scoreStart = time.time()

            h2o_cmd.runPredict(data_key=scoreDataKey,
                               model_key=modelKey,
                               destination_key=predictKey,
                               timeoutSecs=timeoutSecs)

            # AUC needs a binomial result; predictions and actuals both come
            # from the scoring dataset.
            resultAUC = h2o.nodes[0].generate_auc(thresholds=None,
                                                  actual=scoreDataKey,
                                                  predict=predictKey,
                                                  vactual=responseCol,
                                                  vpredict=1)
            auc = resultAUC['AUC']
            self.assertAlmostEqual(
                auc,
                0.5,
                delta=0.15,
                msg="actual auc: %s not close enough to 0.5" % auc)

            predictCMResult = h2o.nodes[0].predict_confusion_matrix(
                actual=scoreDataKey,
                predict=predictKey,
                vactual='C' + str(responseCol + 1),
                vpredict='predict',
            )
            cm = predictCMResult['cm']

            # These will move into the h2o_gbm.py
            pctWrong = h2o_gbm.pp_cm_summary(cm)

            print "\nTest\n==========\n"
            print h2o_gbm.pp_cm(cm)
Example #53
0
    def test_GLM2_covtype_train(self):
        h2o.beta_features = True
        importFolderPath = "standard"
        csvFilename = 'covtype.shuffled.data'
        csvPathname = importFolderPath + "/" + csvFilename
        hex_key = csvFilename + ".hex"

        # Parse and Exec************************************************
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=180)

        execExpr="A.hex=%s" % parseResult['destination_key']
        h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)

        # use exec to change the output col to binary, case_mode/case_val doesn't work if we use predict
        # will have to live with random extract. will create variance
        # class 4 = 1, everything else 0
        y = 54
        execExpr="A.hex[,%s]=(A.hex[,%s]==%s)" % (y+1, y+1, 4)
        h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)

        inspect = h2o_cmd.runInspect(key="A.hex")
        print "\n" + csvPathname, \
            "    numRows:", "{:,}".format(inspect['numRows']), \
            "    numCols:", "{:,}".format(inspect['numCols'])

        # Split Test/Train************************************************
        # how many rows for each pct?
        numRows = inspect['numRows']
        pct10 = int(numRows * .1)
        rowsForPct = [i * pct10 for i in range(0,11)]
        # this can be slightly less than 10%
        last10 = numRows - rowsForPct[9]
        rowsForPct[10] = last10
        # use mod below for picking "rows-to-do" in case we do more than 9 trials
        # use 10 if 0 just to see (we copied 10 to 0 above)
        rowsForPct[0] = rowsForPct[10]

        print "Creating the key of the last 10% data, for scoring"
        trainDataKey = "rTrain"
        testDataKey = "rTest"
        # start at 90% rows + 1
        
        # GLM, predict, CM*******************************************************8
        kwargs = {
            'response': 'C' + str(y+1),
            'max_iter': 20, 
            'n_folds': 0, 
            'alpha': 0.1, 
            'lambda': 1e-5, 
            'family': 'binomial',
        }
        timeoutSecs = 180

        for trial in range(10):
            # always slice from the beginning
            rowsToUse = rowsForPct[trial%10] 

            # test/train split **********************************************8
            h2o_cmd.createTestTrain(srcKey='A.hex', trainDstKey=trainDataKey, testDstKey=testDataKey, trainPercent=90)
            aHack = {'destination_key': trainDataKey}
            parseKey = trainDataKey

            # GLM **********************************************8
            start = time.time()
            glm = h2o_cmd.runGLM(parseResult=aHack, timeoutSecs=timeoutSecs, pollTimeoutSecs=180, **kwargs)
            print "glm end on ", parseResult['destination_key'], 'took', time.time() - start, 'seconds'
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
            modelKey = glm['glm_model']['_key']

            # Score **********************************************
            predictKey = 'Predict.hex'
            start = time.time()

            predictResult = h2o_cmd.runPredict(
                data_key=testDataKey,
                model_key=modelKey,
                destination_key=predictKey,
                timeoutSecs=timeoutSecs)

            predictCMResult = h2o.nodes[0].predict_confusion_matrix(
                actual=testDataKey,
                vactual='C' + str(y+1),
                predict=predictKey,
                vpredict='predict',
                )

            cm = predictCMResult['cm']

            # These will move into the h2o_gbm.py
            pctWrong = h2o_gbm.pp_cm_summary(cm);
            self.assertLess(pctWrong, 8,"Should see less than 7% error (class = 4)")

            print "\nTest\n==========\n"
            print h2o_gbm.pp_cm(cm)

            print "Trial #", trial, "completed", "using %6.2f" % (rowsToUse*100.0/numRows), "pct. of all rows"
Example #54
0
    def test_GBM_mnist_fvec(self):
        h2o.beta_features = True
        importFolderPath = "mnist"
        csvFilename = "mnist_training.csv.gz"
        timeoutSecs = 1800
        trialStart = time.time()

        # PARSE train****************************************
        trainKey = csvFilename + "_" + ".hex"
        start = time.time()
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets',
                                       path=importFolderPath + "/" +
                                       csvFilename,
                                       schema='put',
                                       hex_key=trainKey,
                                       timeoutSecs=timeoutSecs)

        elapsed = time.time() - start
        print "parse end on ", csvFilename, 'took', elapsed, 'seconds',\
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
        print "parse result:", parseResult['destination_key']

        # GBM (train)****************************************
        modelKey = "GBM_model"
        params = {
            'classification': 1,  # faster? 
            'destination_key': modelKey,
            'learn_rate': .1,
            'ntrees': 3,
            'max_depth': 8,
            'min_rows': 1,
            'response':
            0,  # this dataset has the response in the last col (0-9 to check)
            # 'ignored_cols_by_name': range(200,784) # only use the first 200 for speed?
        }

        kwargs = params.copy()
        timeoutSecs = 1800
        #noPoll -> False when GBM finished
        start = time.time()
        GBMFirstResult = h2o_cmd.runGBM(parseResult=parseResult,
                                        noPoll=True,
                                        **kwargs)
        h2o_jobs.pollStatsWhileBusy(timeoutSecs=1200,
                                    pollTimeoutSecs=120,
                                    retryDelaySecs=5)
        elapsed = time.time() - start

        print "GBM training completed in", elapsed, "seconds.", \
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

        gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey)
        errsLast = gbmTrainView['gbm_model']['errs'][-1]

        print "GBM 'errsLast'", errsLast
        if DO_CLASSIFICATION:
            cms = gbmTrainView['gbm_model']['cms']
            cm = cms[-1]['_arr']  # use the last one
            print "GBM cms[-1]['_predErr']:", cms[-1]['_predErr']
            print "GBM cms[-1]['_classErr']:", cms[-1]['_classErr']
            pctWrongTrain = h2o_gbm.pp_cm_summary(cm)
            print "\nTrain\n==========\n"
            print h2o_gbm.pp_cm(cm)
        else:
            print "GBMTrainView:", h2o.dump_json(
                gbmTrainView['gbm_model']['errs'])
Example #55
0
    def test_GBM_poker_1m(self):
        h2o.beta_features = True
        for trial in range(2):
            # PARSE train****************************************
            h2o.beta_features = False #turn off beta_features
            start = time.time()
            xList = []
            eList = []
            fList = []

            modelKey = 'GBMModelKey'
            timeoutSecs = 900
            # Parse (train)****************************************
            if h2o.beta_features:
                print "Parsing to fvec directly! Have to noPoll=true!, and doSummary=False!"

            csvPathname = 'poker/poker-hand-testing.data'
            hex_key = 'poker-hand-testing.data.hex'
            parseTrainResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put',
                hex_key=hex_key, timeoutSecs=timeoutSecs, noPoll=h2o.beta_features, doSummary=False)

            # hack
            if h2o.beta_features:
                h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs)
                print "Filling in the parseTrainResult['destination_key'] for h2o"
                parseTrainResult['destination_key'] = trainKey

            elapsed = time.time() - start
            print "train parse end on ", csvPathname, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "train parse result:", parseTrainResult['destination_key']

            # Logging to a benchmark file
            algo = "Parse"
            l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format(
                len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvPathname, elapsed)
            print l
            h2o.cloudPerfH2O.message(l)

            # if you set beta_features here, the fvec translate will happen with the Inspect not the GBM
            # h2o.beta_features = True
            inspect = h2o_cmd.runInspect(key=parseTrainResult['destination_key'])
            print "\n" + csvPathname, \
                "    numRows:", "{:,}".format(inspect['numRows']), \
                "    numCols:", "{:,}".format(inspect['numCols'])
            numRows = inspect['numRows']
            numCols = inspect['numCols']
            ### h2o_cmd.runSummary(key=parsTraineResult['destination_key'])

            # GBM(train iterate)****************************************
            h2o.beta_features = True
            ntrees = 2
            for max_depth in [5,10,20]:
                params = {
                    'learn_rate': .1,
                    'nbins': 10,
                    'ntrees': ntrees,
                    'max_depth': max_depth,
                    'min_rows': 10,
                    'response': numCols-1,
                    'ignored_cols_by_name': None,
                }
                print "Using these parameters for GBM: ", params
                kwargs = params.copy()
                h2o.beta_features = True

                trainStart = time.time()
                gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult,
                    noPoll=h2o.beta_features, timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs)
                # hack
                if h2o.beta_features:
                    h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs)
                trainElapsed = time.time() - trainStart
                print "GBM training completed in", trainElapsed, "seconds. On dataset: ", csvPathname

                # Logging to a benchmark file
                algo = "GBM " + " ntrees=" + str(ntrees) + " max_depth=" + str(max_depth)
                l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format(
                    len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvPathname, trainElapsed)
                print l
                h2o.cloudPerfH2O.message(l)

                gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey)
                # errrs from end of list? is that the last tree?
                errsLast = gbmTrainView['gbm_model']['errs'][-1]
                print "GBM 'errsLast'", errsLast

                cm = gbmTrainView['gbm_model']['cms'][-1]['_arr'] # use the last one
                pctWrongTrain = h2o_gbm.pp_cm_summary(cm);
                print "Last line of this cm might be NAs, not CM"
                print "\nTrain\n==========\n"
                print h2o_gbm.pp_cm(cm)

                # xList.append(ntrees)
                xList.append(max_depth)
                eList.append(pctWrongTrain)
                fList.append(trainElapsed)

        h2o.beta_features = False
        # just plot the last one
        if DO_PLOT_IF_KEVIN:
            xLabel = 'max_depth'
            eLabel = 'pctWrong'
            fLabel = 'trainElapsed'
            eListTitle = ""
            fListTitle = ""
            h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
Example #56
0
    def test_DeepLearning_mnist(self):
        #h2b.browseTheCloud()
        csvPathname_train = 'mnist/train.csv.gz'
        csvPathname_test  = 'mnist/test.csv.gz'
        hex_key = 'mnist_train.hex'
        validation_key = 'mnist_test.hex'
        timeoutSecs = 300
        parseResult  = h2i.import_parse(bucket='smalldata', path=csvPathname_train, schema='put', hex_key=hex_key, timeoutSecs=timeoutSecs)
        parseResultV = h2i.import_parse(bucket='smalldata', path=csvPathname_test, schema='put', hex_key=validation_key, timeoutSecs=timeoutSecs)

        inspect = h2o_cmd.runInspect(None, hex_key)
        print "\n" + csvPathname_train, \
            "    numRows:", "{:,}".format(inspect['numRows']), \
            "    numCols:", "{:,}".format(inspect['numCols'])
        response = inspect['numCols'] - 1

        #Making random id
        identifier = ''.join(random.sample(string.ascii_lowercase + string.digits, 10))
        model_key = 'deeplearning_' + identifier + '.hex'

        kwargs = {
            'ignored_cols'                 : None,
            'response'                     : response,
            'classification'               : 1,
            'activation'                   : 'RectifierWithDropout',
            'input_dropout_ratio'          : 0.2,
            'hidden'                       : '1024,1024,2048',
            'adaptive_rate'                : 1,
            'rho'                          : 0.99,
            'epsilon'                      : 1e-8,
            'train_samples_per_iteration'  : -1, ## 0: better accuracy!  -1: best scalability!  10000: best accuracy?
#            'rate'                         : 0.01,
#            'rate_annealing'               : 1e-6,
#            'momentum_start'               : 0.5,
#            'momentum_ramp'                : 1800000,
#            'momentum_stable'              : 0.99,
            'l1'                           : 1e-5,
            'l2'                           : 0.0,
            'seed'                         : 98037452452,
            'loss'                         : 'CrossEntropy',
            'max_w2'                       : 15,
            'initial_weight_distribution'  : 'UniformAdaptive',
            'epochs'                       : 128, #enough for 64 nodes
            'destination_key'              : model_key,
            'validation'                   : validation_key,
            'score_interval'               : 10000 #don't score until the end
            }

        timeoutSecs = 7200
        start = time.time()
        deeplearning = h2o_cmd.runDeepLearning(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
        print "neural net end on ", csvPathname_train, " and ", csvPathname_test, 'took', time.time() - start, 'seconds'

        predict_key = 'score_' + identifier + '.hex'

        kwargs = {
            'data_key': validation_key,
            'destination_key': predict_key,
            'model_key': model_key
            }

        predictResult = h2o_cmd.runPredict(timeoutSecs=timeoutSecs, **kwargs)

        h2o_cmd.runInspect(key=predict_key, verbose=True)

        kwargs = {
        }

        predictCMResult = h2o.nodes[0].predict_confusion_matrix(
            actual=validation_key,
            vactual=response,
            predict=predict_key,
            vpredict='predict',
            timeoutSecs=timeoutSecs, **kwargs)

        cm = predictCMResult['cm']

        print h2o_gbm.pp_cm(cm)
        actualErr = h2o_gbm.pp_cm_summary(cm)/100.;

        print "actual   classification error:" + format(actualErr)
    def test_GBM_manyfiles_train_test(self):
        h2o.beta_features = True
        bucket = 'home-0xdiag-datasets'
        modelKey = 'GBMModelKey'
        if localhost:
            files = [
                # None forces numCols to be used. assumes you set it from Inspect
                # problems with categoricals not in the train data set? (warnings in h2o stdout)
                ## ('manyfiles-nflx-gz', 'file_1.dat.gz', 'file_1.hex', 1800, None, 'file_11.dat.gz', 'test.hex')
                # just use matching
                ('manyfiles-nflx-gz', 'file_1.dat.gz', 'train.hex', 1800, None, 'file_1.dat.gz', 'test.hex')
                ]
        else:
            files = [
                # None forces numCols to be used. assumes you set it from Inspect
                ('manyfiles-nflx-gz', 'file_[0-9].dat.gz', 'train.hex', 1800, None, 'file_1[0-9].dat.gz', 'test.hex')
                ]

        # if I got to hdfs, it's here
        # hdfs://192.168.1.176/datasets/manyfiles-nflx-gz/file_99.dat.gz

        h2b.browseTheCloud()
        for (importFolderPath, trainFilename, trainKey, timeoutSecs, response, testFilename, testKey) in files:
            # PARSE train****************************************
            start = time.time()
            xList = []
            eList = []
            fList = []

            # Parse (train)****************************************
            csvPathname = importFolderPath + "/" + trainFilename
            parseTrainResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local',
                hex_key=trainKey, timeoutSecs=timeoutSecs, doSummary=False)
            elapsed = time.time() - start
            print "train parse end on ", trainFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "train parse result:", parseTrainResult['destination_key']

            ### h2o_cmd.runSummary(key=parsTraineResult['destination_key'])

            inspect = h2o_cmd.runInspect(key=parseTrainResult['destination_key'])
            print "\n" + csvPathname, \
                "    numRows:", "{:,}".format(inspect['numRows']), \
                "    numCols:", "{:,}".format(inspect['numCols'])
            numRows = inspect['numRows']
            numCols = inspect['numCols']

            # Make col 378 it something we can do binomial regression on!
            execExpr = '%s[,378+1]=%s[,378+1]>15' % (trainKey, trainKey)
            resultExec = h2o_cmd.runExec(str=execExpr, timeoutSecs=60)

            # Parse (test)****************************************
            parseTestResult = h2i.import_parse(bucket=bucket, path=importFolderPath + "/" + testFilename, schema='local',
                hex_key=testKey, timeoutSecs=timeoutSecs, doSummary=False)
            elapsed = time.time() - start
            print "test parse end on ", testFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "test parse result:", parseTestResult['destination_key']

            # Make col 378 it something we can do binomial regression on!
            execExpr = '%s[,378+1]=%s[,378+1]>15' % (testKey, testKey)
            resultExec = h2o_cmd.runExec(str=execExpr, timeoutSecs=60)

            # Note ..no inspect of test data here..so translate happens later?

            # GBM (train iterate)****************************************
            # if not response:
            #     response = numCols - 1
            # response = 378
            response = 'C379'

            # randomly ignore a bunch of cols, just to make it go faster
            x = range(numCols)
            del x[response]
            ignored_cols_by_name = ",".join(map(lambda x: 'C' + str(x), random.sample(x, 300)))

            print "Using the same response %s for train and test (which should have a output value too)" % response

            ntrees = 10
            # ignore 200 random cols (not the response)
            for max_depth in [5, 40]:
                params = {
                    'learn_rate': .2,
                    'nbins': 1024,
                    'ntrees': ntrees,
                    'max_depth': max_depth,
                    'min_rows': 10,
                    'response': 'C' + str(response),
                    'ignored_cols_by_name': ignored_cols_by_name,
                }
            



                ### print "Using these parameters for GBM: ", params
                kwargs = params.copy()

                # GBM train****************************************
                trainStart = time.time()
                gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult,
                    timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs)
                # hack
                trainElapsed = time.time() - trainStart
                print "GBM training completed in", trainElapsed, "seconds. On dataset: ", trainFilename

                gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey)
                # errrs from end of list? is that the last tree?
                errsLast = gbmTrainView['gbm_model']['errs'][-1]
                print "GBM 'errsLast'", errsLast

                cm = gbmTrainView['gbm_model']['cms'][-1]['_arr'] # use the last one
                pctWrongTrain = h2o_gbm.pp_cm_summary(cm);
                print "Last line of this cm might be NAs, not CM"
                print "\nTrain\n==========\n"
                print h2o_gbm.pp_cm(cm)

                # GBM test****************************************
                predictKey = 'Predict.hex'
                ### h2o_cmd.runInspect(key=parseTestResult['destination_key'])
                start = time.time()
                gbmTestResult = h2o_cmd.runPredict(
                    data_key=parseTestResult['destination_key'], 
                    model_key=modelKey,
                    destination_key=predictKey,
                    timeoutSecs=timeoutSecs)
                elapsed = time.time() - start
                print "GBM predict completed in", elapsed, "seconds. On dataset: ", testFilename

                gbmPredictCMResult =h2o.nodes[0].predict_confusion_matrix(
                    actual=parseTestResult['destination_key'],
                    vactual='C' + str(response),
                    predict=predictKey,
                    vpredict='predict', # choices are 0 and 'predict'
                    )

                # errrs from end of list? is that the last tree?
                # all we get is cm
                cm = gbmPredictCMResult['cm']

                # These will move into the h2o_gbm.py
                pctWrong = h2o_gbm.pp_cm_summary(cm);
                print "Last line of this cm is really NAs, not CM"
                print "\nTest\n==========\n"
                print h2o_gbm.pp_cm(cm)

                # xList.append(ntrees)
                xList.append(max_depth)
                eList.append(pctWrong)
                fList.append(trainElapsed)

            xLabel = 'max_depth'
            eLabel = 'pctWrong'
            fLabel = 'trainElapsed'
            eListTitle = ""
            fListTitle = ""
            h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
Example #58
0
    def test_DeepLearning_twovalues(self):
        """Train and score a deep learning model on a tiny two-valued dataset.

        For each entry of twoValueList a synthetic csv is written with
        write_syn_dataset (defined elsewhere in this file), parsed, used to
        train a one-epoch model that validates on the same key, and then
        scored with runPredict. The error derived from the predict confusion
        matrix must equal expectedErr or lie within relTol relative error of
        it, otherwise an Exception is raised.
        """
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        csvFilename = "syn_twovalues.csv"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        # Two row templates handed to write_syn_dataset.
        # NOTE(review): presumably one template per output value -- confirm
        # against write_syn_dataset.
        rowDataTrue = "1, 0, 65, 1, 2, 1, 1, 4, 1, 4, 1, 4"
        rowDataFalse = "0, 1, 0, -1, -2, -1, -1, -4, -1, -4, -1, -4"

        # (outputTrue, outputFalse, case, coeffNum); only the first two
        # fields are used in the loop body below.
        twoValueList = [
            ('A', 'B', 0, 14),
            ('A', 'B', 1, 14),
            (0, 1, 0, 12),
            (0, 1, 1, 12),
            (0, 1, 'NaN', 12),
            (1, 0, 'NaN', 12),
            (-1, 1, 0, 12),
            (-1, 1, 1, 12),
            (-1e1, 1e1, 1e1, 12),
            (-1e1, 1e1, -1e1, 12),
        ]

        trial = 0
        for (outputTrue, outputFalse, case, coeffNum) in twoValueList:
            # The same csv path is rewritten on every trial.
            write_syn_dataset(csvPathname, 20, rowDataTrue, rowDataFalse,
                              str(outputTrue), str(outputFalse))

            start = time.time()
            # Per-trial keys; the model validates on the data it trains on.
            hex_key = csvFilename + "_" + str(trial)
            model_key = 'trial_' + str(trial) + '.hex'
            validation_key = hex_key

            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='put',
                                           hex_key=hex_key)
            print "using outputTrue: %s outputFalse: %s" % (outputTrue,
                                                            outputFalse)

            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvPathname, \
                "    numRows:", "{:,}".format(inspect['numRows']), \
                "    numCols:", "{:,}".format(inspect['numCols'])
            # numCols - 1: used both as 'C<n>' in the model parameters and as
            # a bare integer for the confusion matrix below.
            response = inspect['numCols'] - 1

            # Deep learning parameters
            kwargs = {
                'ignored_cols': None,
                'response': 'C' + str(response),
                'classification': 1,
                'activation': 'Tanh',
                #'input_dropout_ratio'          : 0.2,
                'hidden': '500',
                'rate': 0.01,
                'rate_annealing': 1e-6,
                'momentum_start': 0,
                'momentum_stable': 0,
                'l1': 0.0,
                'l2': 1e-4,
                'seed': 80023842348,
                'loss': 'CrossEntropy',
                #'max_w2'                       : 15,
                #'warmup_samples'               : 0,
                'initial_weight_distribution': 'UniformAdaptive',
                #'initial_weight_scale'         : 0.01,
                'epochs': 1.0,
                'destination_key': model_key,
                'validation': hex_key,
            }

            timeoutSecs = 60
            # Timer restarted so the report covers only the training call.
            start = time.time()
            h2o_cmd.runDeepLearning(parseResult=parseResult,
                                    timeoutSecs=timeoutSecs,
                                    **kwargs)
            print "trial #", trial, "Deep Learning end on ", csvFilename, ' took', time.time(
            ) - start, 'seconds'

            #### Now score using the model, and check the validation error
            expectedErr = 0.001
            relTol = 0.01
            predict_key = 'Predict.hex'

            # kwargs is reused here for the predict call's arguments.
            kwargs = {
                'data_key': validation_key,
                'destination_key': predict_key,
                'model_key': model_key
            }
            predictResult = h2o_cmd.runPredict(timeoutSecs=timeoutSecs,
                                               **kwargs)
            h2o_cmd.runInspect(key=predict_key, verbose=True)

            # No extra arguments for the confusion matrix call.
            kwargs = {}

            predictCMResult = h2o.nodes[0].predict_confusion_matrix(
                actual=validation_key,
                vactual=response,
                predict=predict_key,
                vpredict='predict',
                timeoutSecs=timeoutSecs,
                **kwargs)

            cm = predictCMResult['cm']

            print h2o_gbm.pp_cm(cm)
            # pp_cm_summary's result is divided by 100 before the comparison.
            actualErr = h2o_gbm.pp_cm_summary(cm) / 100.

            print "actual   classification error:" + format(actualErr)
            print "expected classification error:" + format(expectedErr)
            # Exact equality passes; otherwise the relative error must not
            # exceed relTol.
            if actualErr != expectedErr and abs(
                (expectedErr - actualErr) / expectedErr) > relTol:
                raise Exception(
                    "Scored classification error of %s is not within %s %% relative error of %s"
                    % (actualErr, float(relTol) * 100, expectedErr))

            trial += 1