Esempio n. 1
0
def scoreRF(scoreParseResult,
            trainResult,
            vactual=None,
            timeoutSecs=120,
            **kwargs):
    # Run validation on dataset

    parseKey = scoreParseResult['destination_key']
    if h2o.beta_features:
        # this is how we're supposed to do scorin?
        rfModelKey = trainResult['drf_model']['_key']
        predictKey = 'Predict.hex'
        start = time.time()
        predictResult = h2o_cmd.runPredict(data_key=parseKey,
                                           model_key=rfModelKey,
                                           destination_key=predictKey,
                                           timeoutSecs=timeoutSecs,
                                           **kwargs)

        h2o_cmd.runInspect(key='Predict.hex', verbose=True)

        predictCMResult = h2o.nodes[0].predict_confusion_matrix(
            actual=parseKey,
            vactual=vactual,
            predict=predictKey,
            vpredict='predict',
            timeoutSecs=timeoutSecs,
            **kwargs)

        rftime = time.time() - start

        cm = predictCMResult['cm']

        # These will move into the h2o_gbm.py
        pctWrong = h2o_gbm.pp_cm_summary(cm)
        print "\nTest\n==========\n"
        print h2o_gbm.pp_cm(cm)
        scoreResult = predictCMResult

    else:
        ntree = trainResult['ntree']
        rfModelKey = trainResult['model_key']
        start = time.time()
        # NOTE: response_variable is required, and passed from kwargs here
        # out_of_bag_error_estimate=0 is required for scoring. H2O will assert if 1 and different data set
        # compared to training
        kwargs['out_of_bag_error_estimate'] = 0
        scoreResult = h2o_cmd.runRFView(None,
                                        parseKey,
                                        rfModelKey,
                                        ntree=ntree,
                                        timeoutSecs=timeoutSecs,
                                        **kwargs)

    rftime = time.time() - start
    h2o.verboseprint("RF score results: ", scoreResult)
    h2o.verboseprint("RF computation took {0} sec".format(rftime))
    scoreResult['python_call_timer'] = rftime
    return scoreResult
Esempio n. 2
0
        def predict_and_compare_csvs(model_key):
            start = time.time()
            predict = h2o_cmd.runPredict(model_key=model_key, data_key=hexKey, destination_key=predictHexKey)
            print "runPredict end on ", hexKey, " took", time.time() - start, 'seconds'
            h2o.check_sandbox_for_errors()
            inspect = h2o_cmd.runInspect(key=predictHexKey)
            h2o_cmd.infoFromInspect(inspect, 'predict.hex')

            h2o.nodes[0].csv_download(src_key=predictHexKey, csvPathname=csvPredictPathname)
            h2o.nodes[0].csv_download(src_key=execHexKey, csvPathname=csvExecPathname)
            h2o.check_sandbox_for_errors()

            print "Do a check of the original output col against predicted output"
            translate = {1: 0.0, 2: 1.0, 3: 1.0, 4: 1.0, 5: 1.0, 6: 1.0, 7: 1.0}
            (rowNum1, originalOutput) = compare_csv_last_col(csvExecPathname,
                msg="Original, after being exec'ed", skipHeader=True)
            (rowNum2, predictOutput)  = compare_csv_last_col(csvPredictPathname, 
                msg="Predicted", skipHeader=True)

            # no header on source
            if (rowNum1 != rowNum2):
                raise Exception("original rowNum1: %s not same as downloaded predict (w/header) rowNum2: \
                    %s" % (rowNum1, rowNum2))

            wrong = 0
            wrong0 = 0
            wrong1 = 0
            for rowNum,(o,p) in enumerate(zip(originalOutput, predictOutput)):
                o = float(o)
                p = float(p)
                if o!=p:
                    msg = "Comparing original output col vs predicted. row %s differs. \
                        original: %s predicted: %s"  % (rowNum, o, p)
                    if p==0.0 and wrong0==10:
                        print "Not printing any more predicted=0 mismatches"
                    elif p==0.0 and wrong0<10:
                        print msg
                    if p==1.0 and wrong1==10:
                        print "Not printing any more predicted=1 mismatches"
                    elif p==1.0 and wrong1<10:
                        print msg

                    if p==0.0:
                        wrong0 += 1
                    elif p==1.0:
                        wrong1 += 1

                    wrong += 1

            print "wrong0:", wrong0
            print "wrong1:", wrong1
            print "\nTotal wrong:", wrong
            print "Total:", len(originalOutput)
            pctWrong = (100.0 * wrong)/len(originalOutput)
            print "wrong/Total * 100 ", pctWrong
            # I looked at what h2o can do for modelling with binomial and it should get better than 25% error?
            if pctWrong > 16.0:
                raise Exception("pct wrong: %s too high. Expect < 16 pct error" % pctWrong)
Esempio n. 3
0
        def predict_and_compare_csvs(model_key):
            start = time.time()
            predict = h2o_cmd.runPredict(model_key=model_key, data_key=hexKey, destination_key=predictHexKey)
            print "runPredict end on ", hexKey, " took", time.time() - start, 'seconds'
            h2o.check_sandbox_for_errors()
            inspect = h2o_cmd.runInspect(key=predictHexKey)
            h2o_cmd.infoFromInspect(inspect, 'predict.hex')

            h2o.nodes[0].csv_download(src_key=predictHexKey, csvPathname=csvPredictPathname)
            h2o.nodes[0].csv_download(src_key=execHexKey, csvPathname=csvExecPathname)
            h2o.check_sandbox_for_errors()

            print "Do a check of the original output col against predicted output"
            translate = {1: 0.0, 2: 1.0, 3: 1.0, 4: 1.0, 5: 1.0, 6: 1.0, 7: 1.0}
            (rowNum1, originalOutput) = compare_csv_last_col(csvExecPathname,
                msg="Original, after being exec'ed", skipHeader=True)
            (rowNum2, predictOutput)  = compare_csv_last_col(csvPredictPathname, 
                msg="Predicted", skipHeader=True)

            # no header on source
            if (rowNum1 != rowNum2):
                raise Exception("original rowNum1: %s not same as downloaded predict (w/header) rowNum2: \
                    %s" % (rowNum1, rowNum2))

            wrong = 0
            wrong0 = 0
            wrong1 = 0
            for rowNum,(o,p) in enumerate(zip(originalOutput, predictOutput)):
                o = float(o)
                p = float(p)
                if o!=p:
                    msg = "Comparing original output col vs predicted. row %s differs. \
                        original: %s predicted: %s"  % (rowNum, o, p)
                    if p==0.0 and wrong0==10:
                        print "Not printing any more predicted=0 mismatches"
                    elif p==0.0 and wrong0<10:
                        print msg
                    if p==1.0 and wrong1==10:
                        print "Not printing any more predicted=1 mismatches"
                    elif p==1.0 and wrong1<10:
                        print msg

                    if p==0.0:
                        wrong0 += 1
                    elif p==1.0:
                        wrong1 += 1

                    wrong += 1

            print "wrong0:", wrong0
            print "wrong1:", wrong1
            print "\nTotal wrong:", wrong
            print "Total:", len(originalOutput)
            pctWrong = (100.0 * wrong)/len(originalOutput)
            print "wrong/Total * 100 ", pctWrong
            # I looked at what h2o can do for modelling with binomial and it should get better than 25% error?
            if pctWrong > 10.0:
                raise Exception("pct wrong too high. Expect < 10% error")
Esempio n. 4
0
File: h2o_rf.py Progetto: Jfeng3/h2o
def scoreRF(scoreParseResult, trainResult, vactual=None, timeoutSecs=120, **kwargs):
    # Run validation on dataset

    parseKey = scoreParseResult['destination_key']
    if h2o.beta_features:
        # this is how we're supposed to do scorin?
        rfModelKey  = trainResult['drf_model']['_key']
        predictKey = 'Predict.hex'
        start = time.time()
        predictResult = h2o_cmd.runPredict(
            data_key=parseKey,
            model_key=rfModelKey,
            destination_key=predictKey,
            timeoutSecs=timeoutSecs, **kwargs)

        h2o_cmd.runInspect(key='Predict.hex', verbose=True)

        predictCMResult = h2o.nodes[0].predict_confusion_matrix(
            actual=parseKey,
            vactual=vactual,
            predict=predictKey,
            vpredict='predict', 
            timeoutSecs=timeoutSecs, **kwargs)
            
        rftime      = time.time()-start 

        cm = predictCMResult['cm']

        # These will move into the h2o_gbm.py
        pctWrong = h2o_gbm.pp_cm_summary(cm);
        print "\nTest\n==========\n"
        print h2o_gbm.pp_cm(cm)
        scoreResult = predictCMResult

    else:
        ntree = trainResult['ntree']
        rfModelKey  = trainResult['model_key']
        start = time.time()
        # NOTE: response_variable is required, and passed from kwargs here
        # out_of_bag_error_estimate=0 is required for scoring. H2O will assert if 1 and different data set
        # compared to training
        kwargs['out_of_bag_error_estimate'] = 0
        scoreResult = h2o_cmd.runRFView(None, parseKey, rfModelKey, ntree=ntree, timeoutSecs=timeoutSecs, **kwargs)

    rftime      = time.time()-start 
    h2o.verboseprint("RF score results: ", scoreResult)
    h2o.verboseprint("RF computation took {0} sec".format(rftime))
    scoreResult['python_call_timer'] = rftime
    return scoreResult
Esempio n. 5
0
def scoreRF(scoreParseResult,
            trainResult,
            vactual=None,
            timeoutSecs=120,
            **kwargs):
    # Run validation on dataset

    parseKey = scoreParseResult['destination_key']
    # this is how we're supposed to do scorin?
    rfModelKey = trainResult['drf_model']['_key']
    predictKey = 'Predict.hex'
    start = time.time()
    predictResult = h2o_cmd.runPredict(data_key=parseKey,
                                       model_key=rfModelKey,
                                       destination_key=predictKey,
                                       timeoutSecs=timeoutSecs,
                                       **kwargs)

    h2o_cmd.runInspect(key='Predict.hex', verbose=True)

    predictCMResult = h2o.nodes[0].predict_confusion_matrix(
        actual=parseKey,
        vactual=vactual,
        predict=predictKey,
        vpredict='predict',
        timeoutSecs=timeoutSecs,
        **kwargs)

    rftime = time.time() - start

    cm = predictCMResult['cm']

    # These will move into the h2o_gbm.py
    pctWrong = h2o_gbm.pp_cm_summary(cm)
    print "\nTest\n==========\n"
    print h2o_gbm.pp_cm(cm)
    scoreResult = predictCMResult

    rftime = time.time() - start
    h2o.verboseprint("RF score results: ", scoreResult)
    h2o.verboseprint("RF computation took {0} sec".format(rftime))
    scoreResult['python_call_timer'] = rftime
    return scoreResult
Esempio n. 6
0
def scoreRF(scoreParseResult, trainResult, vactual=None, timeoutSecs=120, **kwargs):
    # Run validation on dataset

    parseKey = scoreParseResult['destination_key']
    # this is how we're supposed to do scorin?
    rfModelKey  = trainResult['drf_model']['_key']
    predictKey = 'Predict.hex'
    start = time.time()
    predictResult = h2o_cmd.runPredict(
        data_key=parseKey,
        model_key=rfModelKey,
        destination_key=predictKey,
        timeoutSecs=timeoutSecs, **kwargs)

    h2o_cmd.runInspect(key='Predict.hex', verbose=True)

    predictCMResult = h2o.nodes[0].predict_confusion_matrix(
        actual=parseKey,
        vactual=vactual,
        predict=predictKey,
        vpredict='predict', 
        timeoutSecs=timeoutSecs, **kwargs)
        
    rftime      = time.time()-start 

    cm = predictCMResult['cm']

    # These will move into the h2o_gbm.py
    pctWrong = h2o_gbm.pp_cm_summary(cm);
    print "\nTest\n==========\n"
    print h2o_gbm.pp_cm(cm)
    scoreResult = predictCMResult

    rftime      = time.time()-start 
    h2o.verboseprint("RF score results: ", scoreResult)
    h2o.verboseprint("RF computation took {0} sec".format(rftime))
    scoreResult['python_call_timer'] = rftime
    return scoreResult
Esempio n. 7
0
    def test_GLM_enums_unbalanced(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        n = 2000
        tryList = [
            (n, 1, 'cD', 300),
            (n, 2, 'cE', 300),
            (n, 4, 'cF', 300),
            (n, 8, 'cG', 300),
            (n, 16, 'cH', 300),
            (n, 32, 'cI', 300),
        ]

        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            # using the comma is nice to ensure no craziness
            colSepHexString = '2c'  # comma
            colSepChar = colSepHexString.decode('hex')
            colSepInt = int(colSepHexString, base=16)
            print "colSepChar:", colSepChar

            rowSepHexString = '0a'  # newline
            rowSepChar = rowSepHexString.decode('hex')
            print "rowSepChar:", rowSepChar

            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str(
                colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            csvScoreFilename = 'syn_enums_score_' + str(rowCount) + 'x' + str(
                colCount) + '.csv'
            csvScorePathname = SYNDATASETS_DIR + '/' + csvScoreFilename

            enumList = create_enum_list(listSize=10)
            # use half of the enums for creating the scoring dataset
            enumListForScore = random.sample(enumList, 5)

            print "Creating random", csvPathname, "for glm2 model building"
            write_syn_dataset(csvPathname,
                              enumList,
                              rowCount,
                              colCount,
                              SEEDPERFILE,
                              colSepChar=colSepChar,
                              rowSepChar=rowSepChar)

            print "Creating another random", csvScorePathname, "for glm2 scoring with prior model (using enum subset)"
            write_syn_dataset(csvScorePathname,
                              enumListForScore,
                              rowCount,
                              colCount,
                              SEEDPERFILE,
                              colSepChar=colSepChar,
                              rowSepChar=rowSepChar)

            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='put',
                                           hex_key=hex_key,
                                           timeoutSecs=30,
                                           separator=colSepInt)
            print csvFilename, 'parse time:', parseResult['response']['time']
            print "Parse result['destination_key']:", parseResult[
                'destination_key']

            print "\n" + csvFilename
            (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \
                h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=True)

            testDataKey = "score_" + hex_key
            parseResult = h2i.import_parse(path=csvScorePathname,
                                           schema='put',
                                           hex_key=testDataKey,
                                           timeoutSecs=30,
                                           separator=colSepInt)

            y = colCount
            modelKey = 'glm_model'
            kwargs = {
                'standardize': 0,
                'destination_key': modelKey,
                'response': 'C' + str(y + 1),
                'max_iter': 200,
                'family': 'binomial',
                'n_folds': 0,
                'alpha': 0,
                'lambda': 0,
            }

            start = time.time()

            updateList = [
                {
                    'alpha': 0.5,
                    'lambda': 1e-4
                },
                {
                    'alpha': 0.25,
                    'lambda': 1e-6
                },
                {
                    'alpha': 0.0,
                    'lambda': 1e-12
                },
                {
                    'alpha': 0.5,
                    'lambda': 1e-12
                },
                {
                    'alpha': 0.0,
                    'lambda': 1e-12
                },
                {
                    'alpha': 0.0,
                    'lambda': 0
                },
            ]

            # Try each one
            h2o.beta_features = True
            for updateDict in updateList:
                print "\n#################################################################"
                print updateDict
                kwargs.update(updateDict)
                print "If we poll, we get a message saying it was cancelled by user??"
                glm = h2o_cmd.runGLM(parseResult=parseResult,
                                     timeoutSecs=timeoutSecs,
                                     pollTimeoutSecs=180,
                                     noPoll=True,
                                     **kwargs)
                h2j.pollWaitJobs(timeoutSecs=300,
                                 pollTimeoutSecs=300,
                                 retryDelaySecs=5,
                                 errorIfCancelled=True)
                glm = h2o.nodes[0].glm_view(_modelKey=modelKey)
                print "glm2 end on ", parseResult[
                    'destination_key'], 'took', time.time() - start, 'seconds'

                glm_model = glm['glm_model']
                _names = glm_model['_names']
                modelKey = glm_model['_key']
                coefficients_names = glm_model['coefficients_names']
                submodels = glm_model['submodels'][0]

                beta = submodels['beta']
                norm_beta = submodels['norm_beta']
                iteration = submodels['iteration']

                validation = submodels['validation']

                if not validation or 'avg_err' not in validation:
                    raise Exception("glm: %s" % h2o.dump_json(glm) + \
                        "\nNo avg_err in validation." + \
                        "\nLikely if you look back, the job was cancelled, so there's no cross validation.")

                avg_err = validation['avg_err']
                auc = validation['auc']
                aic = validation['aic']
                null_deviance = validation['null_deviance']
                residual_deviance = validation['residual_deviance']

                print '_names', _names
                print 'coefficients_names', coefficients_names
                # did beta get shortened? the simple check confirms names/beta/norm_beta are same length
                print 'beta', beta
                print 'iteration', iteration
                print 'avg_err', avg_err
                print 'auc', auc

                h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
                if iteration > 20:
                    raise Exception(
                        "Why take so many iterations:  %s in this glm2 training?"
                        % iterations)

            # Score **********************************************
                print "Problems with test data having different enums than train? just use train for now"
                testDataKey = hex_key
                predictKey = 'Predict.hex'
                start = time.time()

                predictResult = h2o_cmd.runPredict(data_key=testDataKey,
                                                   model_key=modelKey,
                                                   destination_key=predictKey,
                                                   timeoutSecs=timeoutSecs)

                predictCMResult = h2o.nodes[0].predict_confusion_matrix(
                    actual=testDataKey,
                    vactual='C' + str(y),
                    predict=predictKey,
                    vpredict='predict',
                )

                cm = predictCMResult['cm']

                # These will move into the h2o_gbm.py
                pctWrong = h2o_gbm.pp_cm_summary(cm)
                self.assertLess(
                    pctWrong, 8,
                    "Should see less than 7 pct error (class = 4): %s" %
                    pctWrong)

                print "\nTest\n==========\n"
                print h2o_gbm.pp_cm(cm)

                if 1 == 0:
                    # stuff from GLM1

                    classErr = glmScore['validation']['classErr']
                    auc = glmScore['validation']['auc']
                    err = glmScore['validation']['err']
                    nullDev = glmScore['validation']['nullDev']
                    resDev = glmScore['validation']['resDev']
                    h2o_glm.simpleCheckGLMScore(self, glmScore, **kwargs)

                    print "score classErr:", classErr
                    print "score err:", err
                    print "score auc:", auc
                    print "score resDev:", resDev
                    print "score nullDev:", nullDev

                    if math.isnan(resDev):
                        emsg = "Why is this resDev = 'nan'?? %6s %s" % (
                            "resDev:\t", validation['resDev'])
                        raise Exception(emsg)

                    # what is reasonable?
                    # self.assertAlmostEqual(err, 0.3, delta=0.15, msg="actual err: %s not close enough to 0.3" % err)
                    self.assertAlmostEqual(
                        auc,
                        0.5,
                        delta=0.15,
                        msg="actual auc: %s not close enough to 0.5" % auc)

                    if math.isnan(err):
                        emsg = "Why is this err = 'nan'?? %6s %s" % ("err:\t",
                                                                     err)
                        raise Exception(emsg)

                    if math.isnan(resDev):
                        emsg = "Why is this resDev = 'nan'?? %6s %s" % (
                            "resDev:\t", resDev)
                        raise Exception(emsg)

                    if math.isnan(nullDev):
                        emsg = "Why is this nullDev = 'nan'?? %6s %s" % (
                            "nullDev:\t", nullDev)
Esempio n. 8
0
    def test_DeepLearning_mnist(self):
        #h2b.browseTheCloud()
        csvPathname_train = 'mnist/train.csv.gz'
        csvPathname_test  = 'mnist/test.csv.gz'
        hex_key = 'mnist_train.hex'
        validation_key = 'mnist_test.hex'
        timeoutSecs = 300
        parseResult  = h2i.import_parse(bucket='smalldata', path=csvPathname_train, schema='put', hex_key=hex_key, timeoutSecs=timeoutSecs)
        parseResultV = h2i.import_parse(bucket='smalldata', path=csvPathname_test, schema='put', hex_key=validation_key, timeoutSecs=timeoutSecs)

        inspect = h2o_cmd.runInspect(None, hex_key)
        print "\n" + csvPathname_train, \
            "    numRows:", "{:,}".format(inspect['numRows']), \
            "    numCols:", "{:,}".format(inspect['numCols'])
        response = inspect['numCols'] - 1

        #Making random id
        identifier = ''.join(random.sample(string.ascii_lowercase + string.digits, 10))
        model_key = 'deeplearning_' + identifier + '.hex'

        kwargs = {
            'ignored_cols'                 : None,
            'response'                     : response,
            'classification'               : 1,
            'activation'                   : 'RectifierWithDropout',
            'input_dropout_ratio'          : 0.2,
            'hidden'                       : '1024,1024,2048',
            'adaptive_rate'                : 1,
            'rho'                          : 0.99,
            'epsilon'                      : 1e-8,
            'train_samples_per_iteration'  : -1, ## 0: better accuracy!  -1: best scalability!  10000: best accuracy?
#            'rate'                         : 0.01,
#            'rate_annealing'               : 1e-6,
#            'momentum_start'               : 0.5,
#            'momentum_ramp'                : 1800000,
#            'momentum_stable'              : 0.99,
            'l1'                           : 1e-5,
            'l2'                           : 0.0,
            'seed'                         : 98037452452,
            'loss'                         : 'CrossEntropy',
            'max_w2'                       : 15,
            'initial_weight_distribution'  : 'UniformAdaptive',
            'epochs'                       : 128, #enough for 64 nodes
            'destination_key'              : model_key,
            'validation'                   : validation_key,
            'score_interval'               : 10000 #don't score until the end
            }

        timeoutSecs = 7200
        start = time.time()
        deeplearning = h2o_cmd.runDeepLearning(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
        print "neural net end on ", csvPathname_train, " and ", csvPathname_test, 'took', time.time() - start, 'seconds'

        predict_key = 'score_' + identifier + '.hex'

        kwargs = {
            'data_key': validation_key,
            'destination_key': predict_key,
            'model_key': model_key
            }

        predictResult = h2o_cmd.runPredict(timeoutSecs=timeoutSecs, **kwargs)

        h2o_cmd.runInspect(key=predict_key, verbose=True)

        kwargs = {
        }

        predictCMResult = h2o.nodes[0].predict_confusion_matrix(
            actual=validation_key,
            vactual=response,
            predict=predict_key,
            vpredict='predict',
            timeoutSecs=timeoutSecs, **kwargs)

        cm = predictCMResult['cm']

        print h2o_gbm.pp_cm(cm)
        actualErr = h2o_gbm.pp_cm_summary(cm)/100.;

        print "actual   classification error:" + format(actualErr)
Esempio n. 9
0
    def test_c10_rel_gbm(self):
        print "not necessarily run as user=0xcust..., can't use a  schema='put' here"
        print "Want to be able to run python as jenkins"
        print "I guess for big 0xcust files, we don't need schema='put'"
        print "For files that we want to put (for testing put), we can get non-private files"

        # Parse Test***********************************************************
        importFolderPath = '/mnt/0xcustomer-datasets/c3'
        testFilename = 'classification1Test.txt'
        testPathname = importFolderPath + "/" + testFilename

        start = time.time()
        parseTestResult = h2i.import_parse(path=testPathname, schema='local', timeoutSecs=500, doSummary=True)
        print "Parse of", parseTestResult['destination_key'], "took", time.time() - start, "seconds"

        # Parse Train***********************************************************
        importFolderPath = '/mnt/0xcustomer-datasets/c3'
        trainFilename = 'classification1Train.txt'
        trainPathname = importFolderPath + "/" + trainFilename

        start = time.time()
        parseTrainResult = h2i.import_parse(path=trainPathname, schema='local', 
            timeoutSecs=500, doSummary=True)
        print "Parse of", parseTrainResult['destination_key'], "took", time.time() - start, "seconds"

        start = time.time()
        inspect = h2o_cmd.runInspect(None, parseTrainResult['destination_key'], timeoutSecs=500)
        print "Inspect:", parseTrainResult['destination_key'], "took", time.time() - start, "seconds"
        h2o_cmd.infoFromInspect(inspect, trainPathname)
        # num_rows = inspect['num_rows']
        # num_cols = inspect['num_cols']
        # do summary of the parsed dataset last, since we know it fails on this dataset
        summaryResult = h2o_cmd.runSummary(key=parseTrainResult['destination_key'])
        h2o_cmd.infoFromSummary(summaryResult, noPrint=False)

        # GBM Train***********************************************************
        x = [6,7,8,10,12,31,32,33,34,35,36,37,40,41,42,43,44,45,46,47,49,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70]
        # response = 0
        # doesn't work if index is used?
        response = 'outcome'

        # x = range(inspect['num_cols'])
        # del x[response]
        ntrees = 10
        # fails with 40
        params = {
            'learn_rate': .2,
            'nbins': 1024,
            'ntrees': ntrees,
            'max_depth': 20,
            'min_rows': 2,
            'response': response,
            'cols': x,
            # 'ignored_cols_by_name': None,
        }
        print "Using these parameters for GBM: ", params
        kwargs = params.copy()
        modelKey = 'GBMModelKey'

        timeoutSecs = 900

        trainStart = time.time()
        gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult,
            timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs)
        trainElapsed = time.time() - trainStart
        print "GBM training completed in", trainElapsed, "seconds. On dataset: ", trainFilename

        gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey)
        # errrs from end of list? is that the last tree?
        errsLast = gbmTrainView['gbm_model']['errs'][-1]
        print "GBM 'errsLast'", errsLast

        # get the last cm
        cm = gbmTrainView['gbm_model']['cms'][-1]['_arr']
        pctWrongTrain = h2o_gbm.pp_cm_summary(cm);
        print "Last line of this cm might be NAs, not CM"
        print "\nTrain\n==========\n"
        print h2o_gbm.pp_cm(cm)

        # GBM test****************************************
        predictKey = 'Predict.hex'
        h2o_cmd.runInspect(key=parseTestResult['destination_key'])
        start = time.time()
        gbmTestResult = h2o_cmd.runPredict(
            data_key=parseTestResult['destination_key'],
            model_key=modelKey,
            destination_key=predictKey,
            timeoutSecs=timeoutSecs)
        elapsed = time.time() - start
        print "GBM predict completed in", elapsed, "seconds. On dataset: ", testFilename


        if DO_PREDICT_CM:
            gbmPredictCMResult =h2o.nodes[0].predict_confusion_matrix(
                actual=parseTestResult['destination_key'],
                vactual='predict',
                predict=predictKey,
                vpredict='predict', # choices are 7 (now) and 'predict'
                )

            # errrs from end of list? is that the last tree?
            # all we get is cm
            cm = gbmPredictCMResult['cm']

            # These will move into the h2o_gbm.py
            pctWrong = h2o_gbm.pp_cm_summary(cm);
            print "Last line of this cm is really NAs, not CM"
            print "\nTest\n==========\n"
            print h2o_gbm.pp_cm(cm)
Esempio n. 10
0
    def test_DeepLearning_twovalues(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        csvFilename = "syn_twovalues.csv"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        rowDataTrue    = "1, 0, 65, 1, 2, 1, 1, 4, 1, 4, 1, 4"
        rowDataFalse   = "0, 1, 0, -1, -2, -1, -1, -4, -1, -4, -1, -4" 

        twoValueList = [
            ('A','B',0, 14),
            ('A','B',1, 14),
            (0,1,0, 12),
            (0,1,1, 12),
            (0,1,'NaN', 12),
            (1,0,'NaN', 12),
            (-1,1,0, 12),
            (-1,1,1, 12),
            (-1e1,1e1,1e1, 12),
            (-1e1,1e1,-1e1, 12),
            ]

        trial = 0
        for (outputTrue, outputFalse, case, coeffNum) in twoValueList:
            write_syn_dataset(csvPathname, 20, 
                rowDataTrue, rowDataFalse, str(outputTrue), str(outputFalse))

            start = time.time()
            hex_key = csvFilename + "_" + str(trial)
            model_key = 'trial_' + str(trial) + '.hex'
            validation_key = hex_key

            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key)
            print "using outputTrue: %s outputFalse: %s" % (outputTrue, outputFalse)

            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvPathname, \
                "    numRows:", "{:,}".format(inspect['numRows']), \
                "    numCols:", "{:,}".format(inspect['numCols'])
            response = inspect['numCols']
            response = 'C' + str(response)

            kwargs = {
                'ignored_cols'                 : None,
                'response'                     : response,
                'classification'               : 1,
                'activation'                   : 'Tanh',
                #'input_dropout_ratio'          : 0.2,
                'hidden'                       : '113,71,54',
                'rate'                         : 0.01,
                'rate_annealing'               : 1e-6,
                'momentum_start'               : 0,
                'momentum_stable'              : 0,
                'l1'                           : 0.0,
                'l2'                           : 1e-6,
                'seed'                         : 80023842348,
                'loss'                         : 'CrossEntropy',
                #'max_w2'                       : 15,
                'initial_weight_distribution'  : 'UniformAdaptive',
                #'initial_weight_scale'         : 0.01,
                'epochs'                       : 100,
                'destination_key'              : model_key,
                'validation'                   : hex_key,
            }

            timeoutSecs = 60
            start = time.time()
            h2o_cmd.runDeepLearning(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
            print "trial #", trial, "Deep Learning end on ", csvFilename, ' took', time.time() - start, 'seconds'

            #### Now score using the model, and check the validation error
            expectedErr = 0.00
            relTol = 0.01
            predict_key = 'Predict.hex'

            kwargs = {
                'data_key': validation_key,
                'destination_key': predict_key,
                'model_key': model_key
            }
            predictResult = h2o_cmd.runPredict(timeoutSecs=timeoutSecs, **kwargs)
            h2o_cmd.runInspect(key=predict_key, verbose=True)

            kwargs = {
            }

            predictCMResult = h2o.nodes[0].predict_confusion_matrix(
                actual=validation_key,
                vactual=response,
                predict=predict_key,
                vpredict='predict',
                timeoutSecs=timeoutSecs, **kwargs)

            cm = predictCMResult['cm']

            print h2o_gbm.pp_cm(cm)
            actualErr = h2o_gbm.pp_cm_summary(cm)/100.

            print "actual   classification error:" + format(actualErr)
            print "expected classification error:" + format(expectedErr)
            if actualErr != expectedErr and abs((expectedErr - actualErr)/expectedErr) > relTol:
                raise Exception("Scored classification error of %s is not within %s %% relative error of %s" %
                                (actualErr, float(relTol)*100, expectedErr))


            trial += 1
Esempio n. 11
0
    def test_c9_GLM_airlines_fvec(self):
        h2o.beta_features = True

        files = [('airlines', 'airlines_all.csv', 'airlines_all.hex', 1800,
                  'IsDepDelayed')]

        for importFolderPath, csvFilename, trainKey, timeoutSecs, response in files:
            # PARSE train****************************************
            csvPathname = importFolderPath + "/" + csvFilename

            start = time.time()
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets',
                                           path=csvPathname,
                                           schema='local',
                                           hex_key=trainKey,
                                           timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "parse end on ", csvFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseResult['destination_key']

            # GLM (train)****************************************
            params = {
                # 'lambda': 1e-4,
                # 'alpha': 0.5,
                'lambda':
                1e-8,
                'alpha':
                0.0,
                'max_iter':
                30,
                'n_folds':
                3,
                'family':
                'binomial',
                'destination_key':
                "GLMKEY",
                'response':
                response,
                'ignored_cols':
                'CRSDepTime,CRSArrTime,ActualElapsedTime,CRSElapsedTime,AirTime,ArrDelay,DepDelay,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,IsArrDelayed'
            }
            kwargs = params.copy()
            timeoutSecs = 1800
            start = time.time()
            glm = h2o_cmd.runGLM(parseResult=parseResult,
                                 timeoutSecs=timeoutSecs,
                                 **kwargs)
            elapsed = time.time() - start
            print "GLM training completed in", elapsed, "seconds. On dataset: ", csvFilename
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)

            if h2o.beta_features:
                modelKey = glm['glm_model']['_key']

                submodels = glm['glm_model']['submodels']
                # hackery to make it work when there's just one
                validation = submodels[-1]['validation']
                best_threshold = validation['best_threshold']
                thresholds = validation['thresholds']
                # have to look up the index for the cm, from the thresholds list
                best_index = None
                for i, t in enumerate(thresholds):
                    if t == best_threshold:
                        best_index = i
                        break
                cms = validation['_cms']
                cm = cms[best_index]
                pctWrong = h2o_gbm.pp_cm_summary(cm['_arr'])
                # FIX! should look at prediction error/class error?
                # self.assertLess(pctWrong, 9,"Should see less than 40% error")

                print "\nTrain\n==========\n"
                print h2o_gbm.pp_cm(cm['_arr'])

                # Score *******************************
                # this messes up if you use case_mode/case_vale above
                predictKey = 'Predict.hex'
                start = time.time()

                predictResult = h2o_cmd.runPredict(data_key=trainKey,
                                                   model_key=modelKey,
                                                   destination_key=predictKey,
                                                   timeoutSecs=timeoutSecs)

                predictCMResult = h2o.nodes[0].predict_confusion_matrix(
                    actual=trainKey,
                    vactual=response,
                    predict=predictKey,
                    vpredict='predict',
                )

                cm = predictCMResult['cm']
                # These will move into the h2o_gbm.py
                pctWrong = h2o_gbm.pp_cm_summary(cm)
                # self.assertLess(pctWrong, 40,"Should see less than 40% error")

                print "\nTest\n==========\n"
                print h2o_gbm.pp_cm(cm)

        h2i.delete_keys_at_all_nodes(timeoutSecs=600)
Esempio n. 12
0
    def test_DeepLearning_mnist(self):
        #h2b.browseTheCloud()
        h2o.beta_features = True
        csvPathname_train = 'mnist/train.csv.gz'
        csvPathname_test  = 'mnist/test.csv.gz'
        hex_key = 'mnist_train.hex'
        validation_key = 'mnist_test.hex'
        timeoutSecs = 300
        parseResult  = h2i.import_parse(bucket='smalldata', path=csvPathname_train, schema='put', hex_key=hex_key, timeoutSecs=timeoutSecs)
        parseResultV = h2i.import_parse(bucket='smalldata', path=csvPathname_test, schema='put', hex_key=validation_key, timeoutSecs=timeoutSecs)

        inspect = h2o_cmd.runInspect(None, hex_key)
        print "\n" + csvPathname_train, \
            "    numRows:", "{:,}".format(inspect['numRows']), \
            "    numCols:", "{:,}".format(inspect['numCols'])
        response = inspect['numCols'] - 1

        #Making random id
        identifier = ''.join(random.sample(string.ascii_lowercase + string.digits, 10))
        model_key = 'deeplearning_' + identifier + '.hex'

        kwargs = {
            'ignored_cols'                 : None,
            'response'                     : response,
            'classification'               : 1,
            'activation'                   : 'RectifierWithDropout',
            'input_dropout_ratio'          : 0.2,
            'hidden'                       : '1024,1024,2048',
            'adaptive_rate'                : 1,
            'rho'                          : 0.99,
            'epsilon'                      : 1e-8,
            'train_samples_per_iteration'  : -1, ## 0: better accuracy!  -1: best scalability!  10000: best accuracy?
#            'rate'                         : 0.01,
#            'rate_annealing'               : 1e-6,
#            'momentum_start'               : 0.5,
#            'momentum_ramp'                : 1800000,
#            'momentum_stable'              : 0.99,
            'l1'                           : 1e-5,
            'l2'                           : 0.0,
            'seed'                         : 98037452452,
            'loss'                         : 'CrossEntropy',
            'max_w2'                       : 15,
            'initial_weight_distribution'  : 'UniformAdaptive',
            'epochs'                       : 128, #enough for 64 nodes
            'destination_key'              : model_key,
            'validation'                   : validation_key,
            'score_interval'               : 10000 #don't score until the end
            }

        timeoutSecs = 7200
        start = time.time()
        deeplearning = h2o_cmd.runDeepLearning(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
        print "neural net end on ", csvPathname_train, " and ", csvPathname_test, 'took', time.time() - start, 'seconds'

        predict_key = 'score_' + identifier + '.hex'

        kwargs = {
            'data_key': validation_key,
            'destination_key': predict_key,
            'model_key': model_key
            }

        h2o.beta_features = True
        predictResult = h2o_cmd.runPredict(timeoutSecs=timeoutSecs, **kwargs)

        h2o_cmd.runInspect(key=predict_key, verbose=True)

        kwargs = {
        }

        predictCMResult = h2o.nodes[0].predict_confusion_matrix(
            actual=validation_key,
            vactual=response,
            predict=predict_key,
            vpredict='predict',
            timeoutSecs=timeoutSecs, **kwargs)

        cm = predictCMResult['cm']

        print h2o_gbm.pp_cm(cm)
        actualErr = h2o_gbm.pp_cm_summary(cm)/100.;

        print "actual   classification error:" + format(actualErr)

        h2o.beta_features = False
Esempio n. 13
0
    def test_GLM2_mnist(self):
        if not SCIPY_INSTALLED:
            pass

        else:
            SYNDATASETS_DIR = h2o.make_syn_dir()

            csvFilelist = [
                (10000, 500, 'cA', 60),
            ]

            trial = 0
            for (rowCount, colCount, hex_key, timeoutSecs) in csvFilelist:
                trialStart = time.time()

                # PARSE test****************************************
                csvFilename = 'syn_' + "binary" + "_" + str(
                    rowCount) + 'x' + str(colCount) + '.csv'
                csvPathname = SYNDATASETS_DIR + "/" + csvFilename
                write_syn_dataset(csvPathname, rowCount, colCount)

                start = time.time()
                parseResult = h2i.import_parse(path=csvPathname,
                                               schema='put',
                                               hex_key=hex_key,
                                               timeoutSecs=timeoutSecs)
                elapsed = time.time() - start
                print "parse end on ", csvFilename, 'took', elapsed, 'seconds',\
                    "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

                # GLM****************************************
                modelKey = 'GLM_model'
                y = colCount
                kwargs = {
                    'response': 'C' + str(y + 1),
                    'family': 'binomial',
                    'lambda': 1e-4,
                    'alpha': 0,
                    'max_iter': 15,
                    'n_folds': 1,
                    'beta_epsilon': 1.0E-4,
                    'destination_key': modelKey,
                }

                # GLM wants the output col to be strictly 0,1 integer
                execExpr = "aHack=%s; aHack[,%s] = aHack[,%s]==1" % (
                    hex_key, y + 1, y + 1)
                h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)
                aHack = {'destination_key': 'aHack'}

                timeoutSecs = 1800
                start = time.time()
                glm = h2o_cmd.runGLM(parseResult=aHack,
                                     timeoutSecs=timeoutSecs,
                                     pollTimeoutSecs=60,
                                     **kwargs)
                elapsed = time.time() - start
                print "GLM completed in", elapsed, "seconds.", \
                    "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

                h2o_glm.simpleCheckGLM(self, glm, None, noPrint=True, **kwargs)
                modelKey = glm['glm_model']['_key']

                # This seems wrong..what's the format of the cm?
                lambdaMax = glm['glm_model']['lambda_max']
                print "lambdaMax:", lambdaMax

                best_threshold = glm['glm_model']['submodels'][0][
                    'validation']['best_threshold']
                print "best_threshold", best_threshold

                # pick the middle one?
                cm = glm['glm_model']['submodels'][0]['validation']['_cms'][5][
                    '_arr']
                print "cm:", cm
                pctWrong = h2o_gbm.pp_cm_summary(cm)
                # self.assertLess(pctWrong, 9,"Should see less than 9% error (class = 4)")

                print "\nTrain\n==========\n"
                print h2o_gbm.pp_cm(cm)

                # Score *******************************
                # this messes up if you use case_mode/case_vale above
                print "\nPredict\n==========\n"
                predictKey = 'Predict.hex'
                start = time.time()

                predictResult = h2o_cmd.runPredict(data_key='aHack',
                                                   model_key=modelKey,
                                                   destination_key=predictKey,
                                                   timeoutSecs=timeoutSecs)

                predictCMResult = h2o.nodes[0].predict_confusion_matrix(
                    actual='aHack',
                    vactual='C' + str(y + 1),
                    predict=predictKey,
                    vpredict='predict',
                )

                cm = predictCMResult['cm']

                # These will move into the h2o_gbm.py
                pctWrong = h2o_gbm.pp_cm_summary(cm)
                self.assertLess(pctWrong, 50, "Should see less than 50% error")

                print "\nTest\n==========\n"
                print h2o_gbm.pp_cm(cm)
Esempio n. 14
0
    def test_GLM2_ints_unbalanced(self):
        h2o.beta_features = True
        ### h2b.browseTheCloud()
        SYNDATASETS_DIR = h2o.make_syn_dir()

        n = 2000
        tryList = [
            (n, 1, 'cD', 300),
            (n, 2, 'cE', 300),
            (n, 4, 'cF', 300),
            (n, 8, 'cG', 300),
            (n, 16, 'cH', 300),
            (n, 32, 'cI', 300),
        ]

        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            # using the comma is nice to ensure no craziness
            colSepHexString = '2c'  # comma
            colSepChar = colSepHexString.decode('hex')
            colSepInt = int(colSepHexString, base=16)
            print "colSepChar:", colSepChar

            rowSepHexString = '0a'  # newline
            rowSepChar = rowSepHexString.decode('hex')
            print "rowSepChar:", rowSepChar

            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str(
                colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            csvScoreFilename = 'syn_enums_score_' + str(rowCount) + 'x' + str(
                colCount) + '.csv'
            csvScorePathname = SYNDATASETS_DIR + '/' + csvScoreFilename

            enumList = create_enum_list()
            # use half of the enums for creating the scoring dataset
            enumListForScore = random.sample(enumList, 5)

            print "Creating random", csvPathname, "for glm model building"
            write_syn_dataset(csvPathname,
                              enumList,
                              rowCount,
                              colCount,
                              SEEDPERFILE,
                              colSepChar=colSepChar,
                              rowSepChar=rowSepChar)

            print "Creating random", csvScorePathname, "for glm scoring with prior model (using enum subset)"
            write_syn_dataset(csvScorePathname,
                              enumListForScore,
                              rowCount,
                              colCount,
                              SEEDPERFILE,
                              colSepChar=colSepChar,
                              rowSepChar=rowSepChar)

            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='put',
                                           hex_key=hex_key,
                                           timeoutSecs=30,
                                           separator=colSepInt)
            print "Parse result['destination_key']:", parseResult[
                'destination_key']

            print "\n" + csvFilename
            (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \
                h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=True)

            y = colCount
            modelKey = 'xyz'
            kwargs = {
                'destination_key': modelKey,
                'response': y,
                'max_iter': 200,
                'family': 'binomial',
                'n_folds': 10,
                'alpha': 0,
                'lambda': 0,
            }

            start = time.time()

            updateList = [
                {
                    'alpha': 0.5,
                    'lambda': 1e-4
                },
                {
                    'alpha': 0.25,
                    'lambda': 1e-6
                },
                {
                    'alpha': 0.0,
                    'lambda': 1e-8
                },
                {
                    'alpha': 0.5,
                    'lambda': 0.0
                },
                {
                    'alpha': 0.0,
                    'lambda': 0.0
                },
            ]

            # Try each one
            for updateDict in updateList:
                print "\n#################################################################"
                print updateDict
                kwargs.update(updateDict)
                glm = h2o_cmd.runGLM(parseResult=parseResult,
                                     timeoutSecs=timeoutSecs,
                                     pollTimeoutSecs=180,
                                     **kwargs)
                print "glm end on ", parseResult[
                    'destination_key'], 'took', time.time() - start, 'seconds'

                h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)

                parseResult = h2i.import_parse(path=csvScorePathname,
                                               schema='put',
                                               hex_key="B.hex",
                                               timeoutSecs=30,
                                               separator=colSepInt)

                predictKey = 'Predict.hex'
                predictResult = h2o_cmd.runPredict(data_key="B.hex",
                                                   model_key=modelKey,
                                                   destination_key=predictKey,
                                                   timeoutSecs=timeoutSecs)

                predictCMResult = h2o.nodes[0].predict_confusion_matrix(
                    actual="B.hex",
                    vactual='C' + str(y + 1),
                    predict=predictKey,
                    vpredict='predict',
                )

                cm = predictCMResult['cm']
                pctWrong = h2o_gbm.pp_cm_summary(cm)
                print "\nTest\n==========\n"
                print h2o_gbm.pp_cm(cm)
Esempio n. 15
0
    def test_NN_airlines_small(self):
        #h2b.browseTheCloud()
        h2o.beta_features = True
        csvPathname_train = 'airlines/AirlinesTrain.csv.zip'
        csvPathname_test  = 'airlines/AirlinesTest.csv.zip'
        hex_key = 'airlines_train.hex'
        validation_key = 'airlines_test.hex'
        timeoutSecs = 30
        parseResult  = h2i.import_parse(bucket='smalldata', path=csvPathname_train, schema='put', hex_key=hex_key, timeoutSecs=timeoutSecs)
        parseResultV = h2i.import_parse(bucket='smalldata', path=csvPathname_test, schema='put', hex_key=validation_key, timeoutSecs=timeoutSecs)

        inspect = h2o_cmd.runInspect(None, hex_key)
        print "\n" + csvPathname_train, \
            "    numRows:", "{:,}".format(inspect['numRows']), \
            "    numCols:", "{:,}".format(inspect['numCols'])
        # this gives the last col number, which is IsDepDelayed_REC (1 or -1)
        # response = inspect['numCols'] - 1

        # this is "YES"/"NO"
        response = 'IsDepDelayed'

        #Making random id
        identifier = ''.join(random.sample(string.ascii_lowercase + string.digits, 10))
        model_key = 'nn_' + identifier + '.hex'

        # get the column names
        colNames = [c['name'] for c in inspect['cols']]
        print "colNames:", colNames
        usedCols = ("Origin", "Dest", "fDayofMonth", "fYear", "UniqueCarrier", "fDayOfWeek", "fMonth", "DepTime", "ArrTime", "Distance")

        ignoredCols = []
        for c in colNames:
            # don't put the response in the ignore list (is there a problem if so?)
            if c not in usedCols and c != response:
                ignoredCols.append(c)

        ignoredColsString = ",".join(ignoredCols)
        print "Telling h2o to ignore these cols:"
        print ignoredColsString

        kwargs = {
            'ignored_cols'                 : ignoredColsString,
            'response'                     : response,
            'classification'               : 1,
            'destination_key'              : model_key,
            }
        expectedErr = 0.45 ## expected validation error for the above model
        relTol = 0.50 ## 20% rel. error tolerance due to Hogwild!

        timeoutSecs = 600
        start = time.time()
        nn = h2o_cmd.runDeepLearning(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
        print "neural net end on ", csvPathname_train, " and ", csvPathname_test, 'took', time.time() - start, 'seconds'

        predict_key = 'score_' + identifier + '.hex'

        kwargs = {
            'data_key': validation_key,
            'destination_key': predict_key,
            'model_key': model_key
            }

        h2o.beta_features = True
        predictResult = h2o_cmd.runPredict(timeoutSecs=timeoutSecs, **kwargs)
        h2o_cmd.runInspect(key=predict_key, verbose=True)

        kwargs = {
        }

        predictCMResult = h2o.nodes[0].predict_confusion_matrix(
            actual=validation_key,
            vactual=response,
            predict=predict_key,
            vpredict='predict',
            timeoutSecs=timeoutSecs, **kwargs)

        cm = predictCMResult['cm']

        print h2o_gbm.pp_cm(cm)
        actualErr = h2o_gbm.pp_cm_summary(cm)/100.;

        print "actual   classification error:" + format(actualErr)
        print "expected classification error:" + format(expectedErr)
        if actualErr != expectedErr and abs((expectedErr - actualErr)/expectedErr) > relTol:
            raise Exception("Scored classification error of %s is not within %s %% relative error of %s" %
                            (actualErr, float(relTol)*100, expectedErr))

        h2o.beta_features = False
Esempio n. 16
0
    def test_GLM2_ints_unbalanced(self):
        h2o.beta_features = True
        ### h2b.browseTheCloud()
        SYNDATASETS_DIR = h2o.make_syn_dir()

        n = 2000
        tryList = [
            (n, 1, 'cD', 300), 
            (n, 2, 'cE', 300), 
            (n, 4, 'cF', 300), 
            (n, 8, 'cG', 300), 
            (n, 16, 'cH', 300), 
            (n, 32, 'cI', 300), 
            ]

        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            # using the comma is nice to ensure no craziness
            colSepHexString = '2c' # comma
            colSepChar = colSepHexString.decode('hex')
            colSepInt = int(colSepHexString, base=16)
            print "colSepChar:", colSepChar

            rowSepHexString = '0a' # newline
            rowSepChar = rowSepHexString.decode('hex')
            print "rowSepChar:", rowSepChar

            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            csvScoreFilename = 'syn_enums_score_' + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvScorePathname = SYNDATASETS_DIR + '/' + csvScoreFilename

            enumList = create_enum_list()
            # use half of the enums for creating the scoring dataset
            enumListForScore = random.sample(enumList,5)

            print "Creating random", csvPathname, "for glm model building"
            write_syn_dataset(csvPathname, enumList, rowCount, colCount, SEEDPERFILE, 
                colSepChar=colSepChar, rowSepChar=rowSepChar)

            print "Creating random", csvScorePathname, "for glm scoring with prior model (using enum subset)"
            write_syn_dataset(csvScorePathname, enumListForScore, rowCount, colCount, SEEDPERFILE, 
                colSepChar=colSepChar, rowSepChar=rowSepChar)

            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, 
                timeoutSecs=30, separator=colSepInt)
            print "Parse result['destination_key']:", parseResult['destination_key']

            print "\n" + csvFilename
            (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \
                h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=True)

            y = colCount
            modelKey = 'xyz'
            kwargs = {
                'destination_key': modelKey,
                'response': y, 
                'max_iter': 200, 
                'family': 'binomial',
                'n_folds': 10, 
                'alpha': 0, 
                'lambda': 0, 
                }

            start = time.time()

            updateList= [ 
                {'alpha': 0.5, 'lambda': 1e-4},
                {'alpha': 0.25, 'lambda': 1e-6},
                {'alpha': 0.0, 'lambda': 1e-8},
                {'alpha': 0.5, 'lambda': 0.0},
                {'alpha': 0.0, 'lambda': 0.0},
            ]


            # Try each one
            for updateDict in updateList:
                print "\n#################################################################"
                print updateDict
                kwargs.update(updateDict)
                glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=180, **kwargs)
                print "glm end on ", parseResult['destination_key'], 'took', time.time() - start, 'seconds'

                h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)

                parseResult = h2i.import_parse(path=csvScorePathname, schema='put', hex_key="B.hex",
                    timeoutSecs=30, separator=colSepInt)

                predictKey = 'Predict.hex'
                predictResult = h2o_cmd.runPredict(
                    data_key="B.hex",
                    model_key=modelKey,
                    destination_key=predictKey,
                    timeoutSecs=timeoutSecs)

                predictCMResult = h2o.nodes[0].predict_confusion_matrix(
                    actual="B.hex",
                    vactual='C' + str(y+1),
                    predict=predictKey,
                    vpredict='predict',
                    )

                cm = predictCMResult['cm']
                pctWrong = h2o_gbm.pp_cm_summary(cm);
                print "\nTest\n==========\n"
                print h2o_gbm.pp_cm(cm)
Esempio n. 17
0
    def test_GBM_manyfiles_train_test(self):
        bucket = 'home-0xdiag-datasets'
        modelKey = 'GBMModelKey'
        if localhost:
            files = [
                # None forces num_cols to be used. assumes you set it from Inspect
                ('manyfiles-nflx-gz', 'file_1[0-9][0-9].dat.gz',
                 'file_100.hex', 1800, None, 'file_1.dat.gz', 'file_1_test.hex'
                 )
            ]
        else:
            files = [
                # None forces num_cols to be used. assumes you set it from Inspect
                ('manyfiles-nflx-gz', 'file_[0-9].dat.gz', 'file_10.hex', 1800,
                 None, 'file_1[0-9].dat.gz', 'file_10_test.hex')
            ]

        # if I got to hdfs, it's here
        # hdfs://192.168.1.176/datasets/manyfiles-nflx-gz/file_99.dat.gz

        # h2b.browseTheCloud()
        for (importFolderPath, trainFilename, trainKey, timeoutSecs, response,
             testFilename, testKey) in files:
            h2o.beta_features = False  #turn off beta_features
            # PARSE train****************************************
            start = time.time()
            xList = []
            eList = []
            fList = []

            # Parse (train)****************************************
            if h2o.beta_features:
                print "Parsing to fvec directly! Have to noPoll=true!, and doSummary=False!"
            csvPathname = importFolderPath + "/" + trainFilename
            parseTrainResult = h2i.import_parse(bucket=bucket,
                                                path=csvPathname,
                                                schema='s3n',
                                                hex_key=trainKey,
                                                timeoutSecs=timeoutSecs,
                                                noPoll=h2o.beta_features,
                                                doSummary=False)
            # hack
            if h2o.beta_features:
                h2j.pollWaitJobs(timeoutSecs=timeoutSecs,
                                 pollTimeoutSecs=timeoutSecs)
                print "Filling in the parseTrainResult['destination_key'] for h2o"
                parseTrainResult['destination_key'] = trainKey

            elapsed = time.time() - start
            print "train parse end on ", trainFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "train parse result:", parseTrainResult['destination_key']

            ### h2o_cmd.runSummary(key=parsTraineResult['destination_key'])

            # if you set beta_features here, the fvec translate will happen with the Inspect not the GBM
            # h2o.beta_features = True
            inspect = h2o_cmd.runInspect(
                key=parseTrainResult['destination_key'])
            print "\n" + csvPathname, \
                "    num_rows:", "{:,}".format(inspect['num_rows']), \
                "    num_cols:", "{:,}".format(inspect['num_cols'])
            num_rows = inspect['num_rows']
            num_cols = inspect['num_cols']

            # Make col 378 it something we can do binomial regression on!
            execExpr = '%s[,378] = %s[,378]>15 ? 1 : 0' % (trainKey, trainKey)
            resultExec = h2o_cmd.runExec(str=execExpr, timeoutSecs=500)

            # Parse (test)****************************************
            if h2o.beta_features:
                print "Parsing to fvec directly! Have to noPoll=true!, and doSummary=False!"

            parseTestResult = h2i.import_parse(bucket=bucket,
                                               path=importFolderPath + "/" +
                                               testFilename,
                                               schema='local',
                                               hex_key=testKey,
                                               timeoutSecs=timeoutSecs,
                                               noPoll=h2o.beta_features,
                                               doSummary=False)
            # hack
            if h2o.beta_features:
                h2j.pollWaitJobs(timeoutSecs=timeoutSecs,
                                 pollTimeoutSecs=timeoutSecs)
                print "Filling in the parseTestResult['destination_key'] for h2o"
                parseTestResult['destination_key'] = testKey

            elapsed = time.time() - start
            print "test parse end on ", testFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "test parse result:", parseTestResult['destination_key']

            # Make col 378 it something we can do binomial regression on!
            print "Slow! exec is converting all imported keys?, not just what was parsed"
            execExpr = '%s[,378] = %s[,378]>15 ? 1 : 0' % (testKey, testKey,
                                                           testKey)
            resultExec = h2o_cmd.runExec(str=execExpr, timeoutSecs=300)

            # Note ..no inspect of test data here..so translate happens later?

            # GBM (train iterate)****************************************
            # if not response:
            #     response = num_cols - 1
            response = 378
            print "Using the same response %s for train and test (which should have a output value too)" % response

            ntrees = 10
            for max_depth in [5, 10, 20, 40]:
                params = {
                    'learn_rate': .2,
                    'nbins': 1024,
                    'ntrees': ntrees,
                    'max_depth': max_depth,
                    'min_rows': 10,
                    'response': response,
                    # 'ignored_cols':
                }
                print "Using these parameters for GBM: ", params
                kwargs = params.copy()
                h2o.beta_features = True

                # GBM train****************************************
                trainStart = time.time()
                gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult,
                                                noPoll=True,
                                                timeoutSecs=timeoutSecs,
                                                destination_key=modelKey,
                                                **kwargs)
                # hack
                if h2o.beta_features:
                    h2j.pollWaitJobs(timeoutSecs=timeoutSecs,
                                     pollTimeoutSecs=timeoutSecs)
                trainElapsed = time.time() - trainStart
                print "GBM training completed in", trainElapsed, "seconds. On dataset: ", trainFilename

                gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey)
                # errrs from end of list? is that the last tree?
                errsLast = gbmTrainView['gbm_model']['errs'][-1]
                print "GBM 'errsLast'", errsLast

                cm = gbmTrainView['gbm_model']['cm']
                pctWrongTrain = h2o_gbm.pp_cm_summary(cm)
                print "Last line of this cm might be NAs, not CM"
                print "\nTrain\n==========\n"
                print h2o_gbm.pp_cm(cm)

                # GBM test****************************************
                if doPredict:
                    predictKey = 'Predict.hex'
                    ### h2o_cmd.runInspect(key=parseTestResult['destination_key'])
                    start = time.time()
                    gbmTestResult = h2o_cmd.runPredict(
                        data_key=parseTestResult['destination_key'],
                        model_key=modelKey,
                        destination_key=predictKey,
                        timeoutSecs=timeoutSecs)
                    # hack
                    if h2o.beta_features:
                        h2j.pollWaitJobs(timeoutSecs=timeoutSecs,
                                         pollTimeoutSecs=timeoutSecs)
                    elapsed = time.time() - start
                    print "GBM predict completed in", elapsed, "seconds. On dataset: ", testFilename

                    print "This is crazy!"
                    gbmPredictCMResult = h2o.nodes[0].predict_confusion_matrix(
                        actual=parseTestResult['destination_key'],
                        vactual=response,
                        predict=predictKey,
                        vpredict='predict',  # choices are 0 and 'predict'
                    )

                    # errrs from end of list? is that the last tree?
                    # all we get is cm
                    cm = gbmPredictCMResult['cm']

                    # These will move into the h2o_gbm.py
                    pctWrong = h2o_gbm.pp_cm_summary(cm)
                    print "Last line of this cm is really NAs, not CM"
                    print "\nTest\n==========\n"
                    print h2o_gbm.pp_cm(cm)

                    # xList.append(ntrees)
                    xList.append(max_depth)
                    eList.append(pctWrong)
                    fList.append(trainElapsed)

            h2o.beta_features = False

            if doPredict:
                xLabel = 'max_depth'
                eLabel = 'pctWrong'
                fLabel = 'trainElapsed'
                eListTitle = ""
                fListTitle = ""
                h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel,
                                  fListTitle, fList, fLabel)
Esempio n. 18
0
    def test_c10_rel_gbm(self):
        h2o.beta_features = True
        print "not necessarily run as user=0xcust..., can't use a  schema='put' here"
        print "Want to be able to run python as jenkins"
        print "I guess for big 0xcust files, we don't need schema='put'"
        print "For files that we want to put (for testing put), we can get non-private files"

        # Parse Test***********************************************************
        importFolderPath = '/mnt/0xcustomer-datasets/c3'
        testFilename = 'classification1Test.txt'
        testPathname = importFolderPath + "/" + testFilename

        start = time.time()
        parseTestResult = h2i.import_parse(path=testPathname, schema='local', timeoutSecs=500, doSummary=True)
        print "Parse of", parseTestResult['destination_key'], "took", time.time() - start, "seconds"

        # Parse Train***********************************************************
        importFolderPath = '/mnt/0xcustomer-datasets/c3'
        trainFilename = 'classification1Train.txt'
        trainPathname = importFolderPath + "/" + trainFilename

        start = time.time()
        parseTrainResult = h2i.import_parse(path=trainPathname, schema='local', 
            timeoutSecs=500, doSummary=True)
        print "Parse of", parseTrainResult['destination_key'], "took", time.time() - start, "seconds"

        start = time.time()
        inspect = h2o_cmd.runInspect(None, parseTrainResult['destination_key'], timeoutSecs=500)
        print "Inspect:", parseTrainResult['destination_key'], "took", time.time() - start, "seconds"
        h2o_cmd.infoFromInspect(inspect, trainPathname)
        # num_rows = inspect['num_rows']
        # num_cols = inspect['num_cols']
        # do summary of the parsed dataset last, since we know it fails on this dataset
        summaryResult = h2o_cmd.runSummary(key=parseTrainResult['destination_key'])
        h2o_cmd.infoFromSummary(summaryResult, noPrint=False)

        # GBM Train***********************************************************
        x = [6,7,8,10,12,31,32,33,34,35,36,37,40,41,42,43,44,45,46,47,49,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70]
        # response = 0
        # doesn't work if index is used?
        response = 'outcome'

        # x = range(inspect['num_cols'])
        # del x[response]
        ntrees = 10
        # fails with 40
        params = {
            'learn_rate': .2,
            'nbins': 1024,
            'ntrees': ntrees,
            'max_depth': 20,
            'min_rows': 2,
            'response': response,
            'cols': x,
            # 'ignored_cols_by_name': None,
        }
        print "Using these parameters for GBM: ", params
        kwargs = params.copy()
        modelKey = 'GBMModelKey'

        timeoutSecs = 900

        trainStart = time.time()
        gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult,
            timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs)
        trainElapsed = time.time() - trainStart
        print "GBM training completed in", trainElapsed, "seconds. On dataset: ", trainFilename

        gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey)
        # errrs from end of list? is that the last tree?
        errsLast = gbmTrainView['gbm_model']['errs'][-1]
        print "GBM 'errsLast'", errsLast

        # get the last cm
        cm = gbmTrainView['gbm_model']['cms'][-1]['_arr']
        pctWrongTrain = h2o_gbm.pp_cm_summary(cm);
        print "Last line of this cm might be NAs, not CM"
        print "\nTrain\n==========\n"
        print h2o_gbm.pp_cm(cm)

        # GBM test****************************************
        predictKey = 'Predict.hex'
        h2o_cmd.runInspect(key=parseTestResult['destination_key'])
        start = time.time()
        gbmTestResult = h2o_cmd.runPredict(
            data_key=parseTestResult['destination_key'],
            model_key=modelKey,
            destination_key=predictKey,
            timeoutSecs=timeoutSecs)
        elapsed = time.time() - start
        print "GBM predict completed in", elapsed, "seconds. On dataset: ", testFilename


        if DO_PREDICT_CM:
            gbmPredictCMResult =h2o.nodes[0].predict_confusion_matrix(
                actual=parseTestResult['destination_key'],
                vactual='predict',
                predict=predictKey,
                vpredict='predict', # choices are 7 (now) and 'predict'
                )

            # errrs from end of list? is that the last tree?
            # all we get is cm
            cm = gbmPredictCMResult['cm']

            # These will move into the h2o_gbm.py
            pctWrong = h2o_gbm.pp_cm_summary(cm);
            print "Last line of this cm is really NAs, not CM"
            print "\nTest\n==========\n"
            print h2o_gbm.pp_cm(cm)
    def test_GBM_params_rand2(self):
        h2o.beta_features = False
        bucket = 'home-0xdiag-datasets'
        modelKey = 'GBMModelKey'
        files = [
                # ('standard', 'covtype.shuffled.90pct.data', 'covtype.train.hex', 1800, 54, 'covtype.shuffled.10pct.data', 'covtype.test.hex')
                ('standard', 'covtype.shuffled.10pct.sorted.data', 'covtype.train.hex', 1800, 54, 'covtype.shuffled.10pct.data', 'covtype.test.hex')
                ]

        for (importFolderPath, trainFilename, trainKey, timeoutSecs, response, testFilename, testKey) in files:
            h2o.beta_features = False #turn off beta_features
            # PARSE train****************************************
            start = time.time()
            xList = []
            eList = []
            fList = []

            # Parse (train)****************************************
            if h2o.beta_features:
                print "Parsing to fvec directly! Have to noPoll=true!, and doSummary=False!"
            parseTrainResult = h2i.import_parse(bucket=bucket, path=importFolderPath + "/" + trainFilename, schema='local',
                hex_key=trainKey, timeoutSecs=timeoutSecs, noPoll=h2o.beta_features, doSummary=False)
            # hack
            if h2o.beta_features:
                h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs)
                print "Filling in the parseTrainResult['destination_key'] for h2o"
                parseTrainResult['destination_key'] = trainKey

            elapsed = time.time() - start
            print "train parse end on ", trainFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "train parse result:", parseTrainResult['destination_key']

            # Parse (test)****************************************
            if h2o.beta_features:
                print "Parsing to fvec directly! Have to noPoll=true!, and doSummary=False!"
            parseTestResult = h2i.import_parse(bucket=bucket, path=importFolderPath + "/" + testFilename, schema='local',
                hex_key=testKey, timeoutSecs=timeoutSecs, noPoll=h2o.beta_features, doSummary=False)
            # hack
            if h2o.beta_features:
                h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs)
                print "Filling in the parseTestResult['destination_key'] for h2o"
                parseTestResult['destination_key'] = testKey

            elapsed = time.time() - start
            print "test parse end on ", testFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "test parse result:", parseTestResult['destination_key']

            # GBM (train iterate)****************************************
            inspect = h2o_cmd.runInspect(key=parseTestResult['destination_key'])
            paramsDict = define_gbm_params()
            for trial in range(3):
                h2o.beta_features = True
                # translate it (only really need to do once . out of loop?
                h2o_cmd.runInspect(key=parseTrainResult['destination_key'])
                ### h2o_cmd.runSummary(key=parsTraineResult['destination_key'])

                # use this to set any defaults you want if the pick doesn't set
                print "Regression!"
                params = {'response': 54, 'ignored_cols_by_name': '5,6,7,8,9', 'ntrees': 2, 'classification': 0}
                h2o_gbm.pickRandGbmParams(paramsDict, params)
                print "Using these parameters for GBM: ", params
                kwargs = params.copy()

                # GBM train****************************************
                trainStart = time.time()
                gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult,
                    noPoll=True, timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs)
                # hack
                if h2o.beta_features:
                    h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs)
                trainElapsed = time.time() - trainStart
                print "GBM training completed in", trainElapsed, "seconds. On dataset: ", trainFilename

                gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey)
                # errrs from end of list? is that the last tree?
                errsLast = gbmTrainView['gbm_model']['errs'][-1]
                print "GBM 'errsLast'", errsLast

                cm = gbmTrainView['gbm_model']['cm']
                pctWrongTrain = h2o_gbm.pp_cm_summary(cm);
                print "Last line of this cm might be NAs, not CM"
                print "\nTrain\n==========\n"
                print h2o_gbm.pp_cm(cm)

                # GBM test****************************************
                predictKey = 'Predict.hex'
                h2o_cmd.runInspect(key=parseTestResult['destination_key'])
                start = time.time()
                gbmTestResult = h2o_cmd.runPredict(
                    data_key=parseTestResult['destination_key'], 
                    model_key=modelKey,
                    destination_key=predictKey,
                    timeoutSecs=timeoutSecs)
                # hack
                if h2o.beta_features:
                    h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs)
                elapsed = time.time() - start
                print "GBM predict completed in", elapsed, "seconds. On dataset: ", testFilename
                print "FIX! where do we get the summary info on the test data after predict?"

            h2o.beta_features = False
Esempio n. 20
0
    def test_GLM2_enums_score_superset(self):
        """Score a binomial GLM on data holding an enum level unseen in training.

        For 1 to 6 columns, a synthetic enum dataset is written from a list of
        10 enums and a scoring dataset from 5 of those enums plus the extra
        level "xyzzy".  A GLM is built on the first, predictions are made on
        the second, the AUC is asserted to be within 0.15 of 0.5, and the
        prediction confusion matrix is printed.
        """
        h2o.beta_features = True
        print "FIX!: this should cause an error. We should detect that it's not causing an error/warning?"
        SYNDATASETS_DIR = h2o.make_syn_dir()

        n = 200
        # (rowCount, colCount, hex_key, timeoutSecs): colCount runs 1..6
        hexKeys = ("cD", "cE", "cF", "cG", "cH", "cI")
        tryList = [(n, cols, key, 300) for cols, key in enumerate(hexKeys, 1)]

        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            # using the comma is nice to ensure no craziness
            colSepHexString = "2c"  # comma
            rowSepHexString = "0a"  # newline
            colSepChar = colSepHexString.decode("hex")
            colSepInt = int(colSepHexString, base=16)
            print "colSepChar:", colSepChar
            rowSepChar = rowSepHexString.decode("hex")
            print "rowSepChar:", rowSepChar

            SEEDPERFILE = random.randint(0, sys.maxint)
            sizeTag = "%sx%s" % (rowCount, colCount)
            csvFilename = "syn_enums_" + sizeTag + ".csv"
            csvScoreFilename = "syn_enums_score_" + sizeTag + ".csv"
            csvPathname = SYNDATASETS_DIR + "/" + csvFilename
            csvScorePathname = SYNDATASETS_DIR + "/" + csvScoreFilename

            enumList = create_enum_list(listSize=10)
            # Scoring uses half of the model's enums plus one level ("xyzzy")
            # that is not in the model's enumList.
            enumListForScore = random.sample(enumList, 5) + ["xyzzy"]

            print "Creating random", csvPathname, "for glm model building"
            write_syn_dataset(csvPathname, enumList, rowCount, colCount, SEEDPERFILE,
                              colSepChar=colSepChar, rowSepChar=rowSepChar)

            print "Creating random", csvScorePathname, "for glm scoring with prior model (using enum subset)"
            write_syn_dataset(csvScorePathname, enumListForScore, rowCount, colCount, SEEDPERFILE,
                              colSepChar=colSepChar, rowSepChar=rowSepChar)

            parseResult = h2i.import_parse(path=csvPathname,
                                           schema="put",
                                           hex_key=hex_key,
                                           timeoutSecs=30,
                                           separator=colSepInt)
            trainDataKey = parseResult["destination_key"]
            print "Parse result['destination_key']:", trainDataKey

            print "\n" + csvFilename
            # The five dicts are not used further; the call is made with
            # exceptionOnMissingValues=True.
            (missingValuesDict, constantValuesDict, enumSizeDict,
             colTypeDict, colNameDict) = h2o_cmd.columnInfoFromInspect(
                 trainDataKey, exceptionOnMissingValues=True)

            # Response is passed as index colCount for GLM and AUC, and as
            # name "C<colCount+1>" for the confusion matrix.
            y = colCount
            modelKey = "enums"
            kwargs = {
                "destination_key": modelKey,
                "response": y,
                "max_iter": 1,
                "n_folds": 1,
                "alpha": 0.2,
                "lambda": 1e-5,
                "family": "binomial",
            }

            glmStart = time.time()
            glm = h2o_cmd.runGLM(parseResult=parseResult,
                                 timeoutSecs=timeoutSecs,
                                 pollTimeoutSecs=180,
                                 **kwargs)
            print "glm end on ", trainDataKey, "took", time.time() - glmStart, "seconds"
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)

            scoreDataKey = "score_" + hex_key
            h2i.import_parse(path=csvScorePathname,
                             schema="put",
                             hex_key=scoreDataKey,
                             timeoutSecs=30,
                             separator=colSepInt)

            # Score *******************************
            # this messes up if you use case_mode/case_vale above
            predictKey = "Predict.hex"
            h2o_cmd.runPredict(data_key=scoreDataKey,
                               model_key=modelKey,
                               destination_key=predictKey,
                               timeoutSecs=timeoutSecs)

            # just get a predict and AUC on the same data. has to be binomial result
            node = h2o.nodes[0]
            resultAUC = node.generate_auc(thresholds=None,
                                          actual=scoreDataKey,
                                          predict=predictKey,
                                          vactual=y,
                                          vpredict=1)
            auc = resultAUC["AUC"]
            self.assertAlmostEqual(auc, 0.5, delta=0.15,
                                   msg="actual auc: %s not close enough to 0.5" % auc)

            predictCMResult = node.predict_confusion_matrix(actual=scoreDataKey,
                                                            predict=predictKey,
                                                            vactual="C%d" % (y + 1),
                                                            vpredict="predict")
            cm = predictCMResult["cm"]

            # These will move into the h2o_gbm.py
            h2o_gbm.pp_cm_summary(cm)

            print "\nTest\n==========\n"
            print h2o_gbm.pp_cm(cm)
Esempio n. 21
0
    def test_NN_airlines_small(self):
        #h2b.browseTheCloud()
        csvPathname_train = 'airlines/AirlinesTrain.csv.zip'
        csvPathname_test = 'airlines/AirlinesTest.csv.zip'
        hex_key = 'airlines_train.hex'
        validation_key = 'airlines_test.hex'
        timeoutSecs = 30
        parseResult = h2i.import_parse(bucket='smalldata',
                                       path=csvPathname_train,
                                       schema='put',
                                       hex_key=hex_key,
                                       timeoutSecs=timeoutSecs)
        parseResultV = h2i.import_parse(bucket='smalldata',
                                        path=csvPathname_test,
                                        schema='put',
                                        hex_key=validation_key,
                                        timeoutSecs=timeoutSecs)

        inspect = h2o_cmd.runInspect(None, hex_key)
        print "\n" + csvPathname_train, \
            "    numRows:", "{:,}".format(inspect['numRows']), \
            "    numCols:", "{:,}".format(inspect['numCols'])
        # this gives the last col number, which is IsDepDelayed_REC (1 or -1)
        # response = inspect['numCols'] - 1

        # this is "YES"/"NO"
        response = 'IsDepDelayed'

        #Making random id
        identifier = ''.join(
            random.sample(string.ascii_lowercase + string.digits, 10))
        model_key = 'nn_' + identifier + '.hex'

        # get the column names
        colNames = [c['name'] for c in inspect['cols']]
        print "colNames:", colNames
        usedCols = ("Origin", "Dest", "fDayofMonth", "fYear", "UniqueCarrier",
                    "fDayOfWeek", "fMonth", "DepTime", "ArrTime", "Distance")

        ignoredCols = []
        for c in colNames:
            # don't put the response in the ignore list (is there a problem if so?)
            if c not in usedCols and c != response:
                ignoredCols.append(c)

        ignoredColsString = ",".join(ignoredCols)
        print "Telling h2o to ignore these cols:"
        print ignoredColsString

        kwargs = {
            'ignored_cols': ignoredColsString,
            'response': response,
            'classification': 1,
            'destination_key': model_key,
        }
        expectedErr = 0.45  ## expected validation error for the above model
        relTol = 0.50  ## 20% rel. error tolerance due to Hogwild!

        timeoutSecs = 600
        start = time.time()
        nn = h2o_cmd.runDeepLearning(parseResult=parseResult,
                                     timeoutSecs=timeoutSecs,
                                     **kwargs)
        print "neural net end on ", csvPathname_train, " and ", csvPathname_test, 'took', time.time(
        ) - start, 'seconds'

        predict_key = 'score_' + identifier + '.hex'

        kwargs = {
            'data_key': validation_key,
            'destination_key': predict_key,
            'model_key': model_key
        }

        predictResult = h2o_cmd.runPredict(timeoutSecs=timeoutSecs, **kwargs)
        h2o_cmd.runInspect(key=predict_key, verbose=True)

        kwargs = {}

        predictCMResult = h2o.nodes[0].predict_confusion_matrix(
            actual=validation_key,
            vactual=response,
            predict=predict_key,
            vpredict='predict',
            timeoutSecs=timeoutSecs,
            **kwargs)

        cm = predictCMResult['cm']

        print h2o_gbm.pp_cm(cm)
        actualErr = h2o_gbm.pp_cm_summary(cm) / 100.

        print "actual   classification error:" + format(actualErr)
        print "expected classification error:" + format(expectedErr)
        if actualErr != expectedErr and abs(
            (expectedErr - actualErr) / expectedErr) > relTol:
            raise Exception(
                "Scored classification error of %s is not within %s %% relative error of %s"
                % (actualErr, float(relTol) * 100, expectedErr))
Esempio n. 22
0
    def test_GBM_params_rand2(self):
        bucket = 'home-0xdiag-datasets'
        modelKey = 'GBMModelKey'
        files = [
            # ('standard', 'covtype.shuffled.90pct.data', 'covtype.train.hex', 1800, 54, 'covtype.shuffled.10pct.data', 'covtype.test.hex')
            ('standard', 'covtype.shuffled.10pct.sorted.data',
             'covtype.train.hex', 1800, 54, 'covtype.shuffled.10pct.data',
             'covtype.test.hex')
        ]

        for (importFolderPath, trainFilename, trainKey, timeoutSecs, response,
             testFilename, testKey) in files:
            # PARSE train****************************************
            start = time.time()
            xList = []
            eList = []
            fList = []

            # Parse (train)****************************************
            parseTrainResult = h2i.import_parse(bucket=bucket,
                                                path=importFolderPath + "/" +
                                                trainFilename,
                                                schema='local',
                                                hex_key=trainKey,
                                                timeoutSecs=timeoutSecs,
                                                doSummary=False)

            elapsed = time.time() - start
            print "train parse end on ", trainFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "train parse result:", parseTrainResult['destination_key']

            # Parse (test)****************************************
            parseTestResult = h2i.import_parse(bucket=bucket,
                                               path=importFolderPath + "/" +
                                               testFilename,
                                               schema='local',
                                               hex_key=testKey,
                                               timeoutSecs=timeoutSecs,
                                               doSummary=False)
            elapsed = time.time() - start
            print "test parse end on ", testFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "test parse result:", parseTestResult['destination_key']

            # GBM (train iterate)****************************************
            inspect = h2o_cmd.runInspect(
                key=parseTestResult['destination_key'])
            paramsDict = define_gbm_params()
            for trial in range(3):
                # translate it (only really need to do once . out of loop?
                h2o_cmd.runInspect(key=parseTrainResult['destination_key'])
                ### h2o_cmd.runSummary(key=parsTraineResult['destination_key'])

                # use this to set any defaults you want if the pick doesn't set
                params = {
                    'response': 54,
                    'ignored_cols_by_name': 'C1,C2,C3,C4,C5',
                    'ntrees': 2,
                    'validation': parseTestResult['destination_key'],
                }
                h2o_gbm.pickRandGbmParams(paramsDict, params)
                print "Using these parameters for GBM: ", params
                kwargs = params.copy()

                # GBM train****************************************
                trainStart = time.time()
                gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult,
                                                timeoutSecs=timeoutSecs,
                                                destination_key=modelKey,
                                                **kwargs)
                trainElapsed = time.time() - trainStart
                print "GBM training completed in", trainElapsed, "seconds. On dataset: ", trainFilename

                gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey)
                # errrs from end of list? is that the last tree?
                errsLast = gbmTrainView['gbm_model']['errs'][-1]
                print "GBM 'errsLast'", errsLast

                cm = gbmTrainView['gbm_model']['cms'][-1][
                    '_arr']  # use the last one
                pctWrongTrain = h2o_gbm.pp_cm_summary(cm)
                print "Last line of this cm might be NAs, not CM"
                print "\nTrain\n==========\n"
                print h2o_gbm.pp_cm(cm)

                # GBM test****************************************
                predictKey = 'Predict.hex'
                h2o_cmd.runInspect(key=parseTestResult['destination_key'])
                start = time.time()
                gbmTestResult = h2o_cmd.runPredict(
                    data_key=parseTestResult['destination_key'],
                    model_key=modelKey,
                    destination_key=predictKey,
                    timeoutSecs=timeoutSecs)
                elapsed = time.time() - start
                print "GBM predict completed in", elapsed, "seconds. On dataset: ", testFilename

                if DO_PREDICT_CM:
                    gbmPredictCMResult = h2o.nodes[0].predict_confusion_matrix(
                        actual=parseTestResult['destination_key'],
                        vactual='predict',
                        predict=predictKey,
                        vpredict='predict',  # choices are 7 (now) and 'predict'
                    )

                    # errrs from end of list? is that the last tree?
                    # all we get is cm
                    cm = gbmPredictCMResult['cm']

                    # These will move into the h2o_gbm.py
                    pctWrong = h2o_gbm.pp_cm_summary(cm)
                    print "Last line of this cm is really NAs, not CM"
                    print "\nTest\n==========\n"
                    print h2o_gbm.pp_cm(cm)

                # xList.append(ntrees)
                if 'max_depth' in params and params['max_depth']:
                    xList.append(params['max_depth'])
                    eList.append(pctWrongTrain)
                    fList.append(trainElapsed)

            xLabel = 'max_depth'
            eLabel = 'pctWrongTrain'
            fLabel = 'trainElapsed'
            eListTitle = ""
            fListTitle = ""
            h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel,
                              fListTitle, fList, fLabel)
Esempio n. 23
0
    def test_DeepLearning_c21(self):
        importFolderPath = '/mnt/0xcustomer-datasets/c21'
        csvPathname_train = importFolderPath + '/persona_clean_deep.tsv.zip'
        csvPathname_test  = importFolderPath + '/persona_clean_deep.tsv.zip'
        hex_key = 'train.hex'
        validation_key = 'test.hex'
        timeoutSecs = 300
        parseResult  = h2i.import_parse(path=csvPathname_train, hex_key=hex_key, 
            timeoutSecs=timeoutSecs, doSummary=DO_SUMMARY)
        parseResultV = h2i.import_parse(path=csvPathname_test, hex_key=validation_key, 
            timeoutSecs=timeoutSecs, doSummary=DO_SUMMARY)
        inspect = h2o_cmd.runInspect(None, hex_key)
        print "\n" + csvPathname_train, \
            "    numRows:", "{:,}".format(inspect['numRows']), \
            "    numCols:", "{:,}".format(inspect['numCols'])

        response = 'any_response'

        #Making random id
        identifier = ''.join(random.sample(string.ascii_lowercase + string.digits, 10))
        model_key = 'nn_' + identifier + '.hex'

        # use defaults otherwise
        # need to change epochs otherwise it takes too long
        kwargs = {
            'epochs'                       : 0.001,
            'response'                     : response,
            'destination_key'              : model_key,
            'validation'                   : validation_key,
        }
        ###expectedErr = 0.0362 ## from single-threaded mode
        expectedErr = 0.03 ## observed actual value with Hogwild

        timeoutSecs = 600
        start = time.time()
        nn = h2o_cmd.runDeepLearning(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
        print "neural net end on ", csvPathname_train, " and ", csvPathname_test, 'took', time.time() - start, 'seconds'

        #### Now score using the model, and check the validation error
        expectedErr = 0.046
        relTol = 0.35 # allow 35% tolerance. kbn
        predict_key = 'Predict.hex'

        kwargs = {
            'data_key': validation_key,
            'destination_key': predict_key,
            'model_key': model_key
        }
        predictResult = h2o_cmd.runPredict(timeoutSecs=timeoutSecs, **kwargs)
        h2o_cmd.runInspect(key=predict_key, verbose=True)

        kwargs = {
        }

        predictCMResult = h2o.nodes[0].predict_confusion_matrix(
            actual=validation_key,
            vactual=response,
            predict=predict_key,
            vpredict='predict',
            timeoutSecs=timeoutSecs, **kwargs)

        cm = predictCMResult['cm']

        print h2o_gbm.pp_cm(cm)
        actualErr = h2o_gbm.pp_cm_summary(cm)/100.

        print "actual   classification error:" + format(actualErr)
        print "expected classification error:" + format(expectedErr)
        if actualErr != expectedErr and abs((expectedErr - actualErr)/expectedErr) > relTol:
            raise Exception("Scored classification error of %s is not within %s %% relative error of %s" %
                            (actualErr, float(relTol)*100, expectedErr))
Esempio n. 24
0
    def test_GBM_covtype_train_test(self):
        h2o.beta_features = False
        bucket = 'home-0xdiag-datasets'
        modelKey = 'GBMModelKey'
        files = [
                ('standard', 'covtype.shuffled.90pct.data', 'covtype.train.hex', 1800, 'C55', 'covtype.shuffled.10pct.data', 'covtype.test.hex')
                ]

        # h2b.browseTheCloud()

        for (importFolderPath, trainFilename, trainKey, timeoutSecs, response, testFilename, testKey) in files:
            h2o.beta_features = False #turn off beta_features
            # PARSE train****************************************
            start = time.time()
            xList = []
            eList = []
            fList = []

            # Parse (train)****************************************
            if h2o.beta_features:
                print "Parsing to fvec directly! Have to noPoll=true!, and doSummary=False!"
            parseTrainResult = h2i.import_parse(bucket=bucket, path=importFolderPath + "/" + trainFilename, schema='local',
                hex_key=trainKey, timeoutSecs=timeoutSecs, noPoll=h2o.beta_features, doSummary=False)
            # hack
            if h2o.beta_features:
                h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs)
                print "Filling in the parseTrainResult['destination_key'] for h2o"
                parseTrainResult['destination_key'] = trainKey

            elapsed = time.time() - start
            print "train parse end on ", trainFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "train parse result:", parseTrainResult['destination_key']

            # Parse (test)****************************************
            if h2o.beta_features:
                print "Parsing to fvec directly! Have to noPoll=true!, and doSummary=False!"
            parseTestResult = h2i.import_parse(bucket=bucket, path=importFolderPath + "/" + testFilename, schema='local',
                hex_key=testKey, timeoutSecs=timeoutSecs, noPoll=h2o.beta_features, doSummary=False)
            # hack
            if h2o.beta_features:
                h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs)
                print "Filling in the parseTestResult['destination_key'] for h2o"
                parseTestResult['destination_key'] = testKey

            elapsed = time.time() - start
            print "test parse end on ", testFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "test parse result:", parseTestResult['destination_key']

            # GBM (train iterate)****************************************
            inspect = h2o_cmd.runInspect(key=parseTestResult['destination_key'])
            ntrees = 2
            # fails with 40
            for max_depth in [40, 5]:
                params = {
                    'learn_rate': .2,
                    'nbins': 1024,
                    'ntrees': ntrees,
                    'max_depth': max_depth,
                    'min_rows': 10,
                    'response': response,
                    'ignored_cols_by_name': None,
                }
                print "Using these parameters for GBM: ", params
                kwargs = params.copy()

                h2o.beta_features = True
                # translate it (only really need to do once . out of loop?
                h2o_cmd.runInspect(key=parseTrainResult['destination_key'])
                ### h2o_cmd.runSummary(key=parsTraineResult['destination_key'])

                # GBM train****************************************
                trainStart = time.time()
                gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult,
                    noPoll=True, timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs)
                # hack
                if h2o.beta_features:
                    h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs)
                trainElapsed = time.time() - trainStart
                print "GBM training completed in", trainElapsed, "seconds. On dataset: ", trainFilename

                gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey)
                # errrs from end of list? is that the last tree?
                errsLast = gbmTrainView['gbm_model']['errs'][-1]
                print "GBM 'errsLast'", errsLast

                cm = gbmTrainView['gbm_model']['cms'][-1]['_arr']
                pctWrongTrain = h2o_gbm.pp_cm_summary(cm);
                print "Last line of this cm might be NAs, not CM"
                print "\nTrain\n==========\n"
                print h2o_gbm.pp_cm(cm)

                # GBM test****************************************
                predictKey = 'Predict.hex'
                h2o_cmd.runInspect(key=parseTestResult['destination_key'])
                start = time.time()
                gbmTestResult = h2o_cmd.runPredict(
                    data_key=parseTestResult['destination_key'], 
                    model_key=modelKey,
                    destination_key=predictKey,
                    timeoutSecs=timeoutSecs)
                # hack
                if h2o.beta_features:
                    h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs)
                elapsed = time.time() - start
                print "GBM predict completed in", elapsed, "seconds. On dataset: ", testFilename

                gbmPredictCMResult =h2o.nodes[0].predict_confusion_matrix(
                    actual=parseTestResult['destination_key'],
                    vactual=response,
                    predict=predictKey,
                    vpredict='predict', # choices are 7 (now) and 'predict'
                    )

                # errrs from end of list? is that the last tree?
                # all we get is cm
                cm = gbmPredictCMResult['cm']

                # These will move into the h2o_gbm.py
                pctWrong = h2o_gbm.pp_cm_summary(cm);
                print "\nTest\n==========\n"
                print h2o_gbm.pp_cm(cm)

                # xList.append(ntrees)
                xList.append(max_depth)
                eList.append(pctWrong)
                fList.append(trainElapsed)

            h2o.beta_features = False
            xLabel = 'max_depth'
            eLabel = 'pctWrong'
            fLabel = 'trainElapsed'
            eListTitle = ""
            fListTitle = ""
            h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
Esempio n. 25
0
    def test_GLM2_tmp(self):
        importFolderPath = "/tmp"
        csvFilename = 's.csv'
        bcFilename = 'bc.csv'

        csvPathname = importFolderPath + "/" + csvFilename
        bcPathname = importFolderPath + "/" + bcFilename

        hex_key = csvFilename + ".hex"
        bc_key = bcFilename + ".hex"

        # Parse
        parseResult = h2i.import_parse(path=csvPathname,
                                       schema='put',
                                       hex_key=hex_key,
                                       timeoutSecs=180)
        inspect = h2o_cmd.runInspect(key=hex_key)
        print "\n" + csvPathname, \
            "    numRows:", "{:,}".format(inspect['numRows']), \
            "    numCols:", "{:,}".format(inspect['numCols'])

        bcResult = h2i.import_parse(path=bcPathname,
                                    schema='put',
                                    hex_key=bc_key,
                                    timeoutSecs=180)
        inspect = h2o_cmd.runInspect(key=bc_key)
        print "\n" + bcPathname, \
            "    numRows:", "{:,}".format(inspect['numRows']), \
            "    numCols:", "{:,}".format(inspect['numCols'])

        # Split Test/Train************************************************
        # how many rows for each pct?
        numRows = inspect['numRows']
        trainDataKey = hex_key
        testDataKey = hex_key

        # GLM, predict, CM*******************************************************8
        kwargs = {
            'response': "response",
            'non_negative': 0,
            'standardize': 1,
            'strong_rules': 1,
            'alpha': 0,
            'max_iter': 100,
            'lambda_min_ratio': -1,
            'higher_accuracy': 1,
            'beta_constraints': bc_key,
            'link': "family_default",
            'use_all_factor_levels': 0,
            'variable_importances': 0,
            'lambda': 0,
            'prior': 0.00301875221383974,
            'nlambdas': -1,
            'source': hex_key,
            'lambda_search': 0,
            'disable_line_search': 0,
            'n_folds': 0,
            'family': "binomial",
            'beta_epsilon': 1e-04,
            'intercept': 1,
            'max_predictors': -1,
            # "used_cols"':  "4,5,18,37,38,53,66,73,90,93,95,96,112,117,135,158,165,166,168,177,180",
            # 'ignored_cols': "1,2,3,4,5,6,7,8,9,11,12,14,15,16,17,18,19,20,21,22,23,24,25,26,27,29,31,32,34,35,36,37,38,40,41,42,43,44,45,46,47,48,49,51,52,53,54,55,56,57,58,59,60,61,62,63,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,91,92,93,94,95,96,97,98,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,119,120,121,123,124,125,126,128,129,133,134,135,136,137,138,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,157,158,159,160,161,162,163,164,165,166,167,168,169,170,171,173,174,176,177,178,179",
        }

        timeoutSecs = 180

        for trial in range(10):
            parseKey = trainDataKey

            # GLM **********************************************8
            start = time.time()
            glm = h2o_cmd.runGLM(parseResult=parseResult,
                                 timeoutSecs=timeoutSecs,
                                 pollTimeoutSecs=180,
                                 **kwargs)
            print "glm end on ", parseResult[
                'destination_key'], 'took', time.time() - start, 'seconds'
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
            modelKey = glm['glm_model']['_key']

            # Score **********************************************
            predictKey = 'Predict.hex'
            start = time.time()

            predictResult = h2o_cmd.runPredict(data_key=testDataKey,
                                               model_key=modelKey,
                                               destination_key=predictKey,
                                               timeoutSecs=timeoutSecs)

            predictCMResult = h2o.nodes[0].predict_confusion_matrix(
                actual=testDataKey,
                vactual='response',
                predict=predictKey,
                vpredict='predict',
            )

            cm = predictCMResult['cm']

            # These will move into the h2o_gbm.py
            pctWrong = h2o_gbm.pp_cm_summary(cm)
            self.assertLess(pctWrong, 8, "Should see less than 7% error")

            print "\nTest\n==========\n"
            print h2o_gbm.pp_cm(cm)

            print "Trial #", trial, "completed"
Esempio n. 26
0
                errsLast = gbmTrainView['gbm_model']['errs'][-1]
                print "GBM 'errsLast'", errsLast

                cm = gbmTrainView['gbm_model']['cms'][-1]['_arr']
                pctWrongTrain = h2o_gbm.pp_cm_summary(cm);
                print "Last line of this cm might be NAs, not CM"
                print "\nTrain\n==========\n"
                print h2o_gbm.pp_cm(cm)

                # GBM test****************************************
                predictKey = 'Predict.hex'
                h2o_cmd.runInspect(key=parseTestResult['destination_key'])
                start = time.time()
                gbmTestResult = h2o_cmd.runPredict(
                    data_key=parseTestResult['destination_key'], 
                    model_key=modelKey,
                    destination_key=predictKey,
                    timeoutSecs=timeoutSecs)
                elapsed = time.time() - start
                print "GBM predict completed in", elapsed, "seconds. On dataset: ", testFilename

                gbmPredictCMResult =h2o.nodes[0].predict_confusion_matrix(
                    actual=parseTestResult['destination_key'],
                    vactual=response,
                    predict=predictKey,
                    vpredict='predict', # choices are 7 (now) and 'predict'
                    )

                # errrs from end of list? is that the last tree?
                # all we get is cm
                cm = gbmPredictCMResult['cm']
    def test_GLM2_covtype_train_predict_all_all(self):
        importFolderPath = "standard"
        csvFilename = "covtype.shuffled.data"
        csvPathname = importFolderPath + "/" + csvFilename
        hex_key = csvFilename + ".hex"

        # Parse and Exec************************************************
        parseResult = h2i.import_parse(
            bucket="home-0xdiag-datasets", path=csvPathname, schema="put", hex_key=hex_key, timeoutSecs=180
        )

        execExpr = "A.hex=%s" % parseResult["destination_key"]
        h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)

        # use exec to change the output col to binary, case_mode/case_val doesn't work if we use predict
        # will have to live with random extract. will create variance
        # class 4 = 1, everything else 0
        y = 54
        execExpr = "A.hex[,%s]=(A.hex[,%s]==%s)" % (y + 1, y + 1, 1)  # class 1
        h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)

        inspect = h2o_cmd.runInspect(key="A.hex")
        print "\n" + csvPathname, "    numRows:", "{:,}".format(inspect["numRows"]), "    numCols:", "{:,}".format(
            inspect["numCols"]
        )

        print "Use same data (full) for train and test"
        trainDataKey = "A.hex"
        testDataKey = "A.hex"
        # start at 90% rows + 1

        # GLM, predict, CM*******************************************************8
        kwargs = {
            "response": "C" + str(y + 1),
            "max_iter": 20,
            "n_folds": 0,
            # 'alpha': 0.1,
            # 'lambda': 1e-5,
            "alpha": 0.0,
            "lambda": None,
            "family": "binomial",
        }
        timeoutSecs = 60

        for trial in range(1):
            # test/train split **********************************************8
            aHack = {"destination_key": trainDataKey}

            # GLM **********************************************8
            start = time.time()
            glm = h2o_cmd.runGLM(parseResult=aHack, timeoutSecs=timeoutSecs, pollTimeoutSecs=180, **kwargs)
            print "glm end on ", parseResult["destination_key"], "took", time.time() - start, "seconds"
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)

            modelKey = glm["glm_model"]["_key"]
            submodels = glm["glm_model"]["submodels"]
            # hackery to make it work when there's just one
            validation = submodels[-1]["validation"]
            best_threshold = validation["best_threshold"]
            thresholds = validation["thresholds"]

            # have to look up the index for the cm, from the thresholds list
            best_index = None
            for i, t in enumerate(thresholds):
                if t == best_threshold:
                    best_index = i
                    break
            cms = validation["_cms"]
            cm = cms[best_index]
            trainPctWrong = h2o_gbm.pp_cm_summary(cm["_arr"])

            # Score **********************************************
            predictKey = "Predict.hex"
            start = time.time()

            predictResult = h2o_cmd.runPredict(
                data_key=testDataKey, model_key=modelKey, destination_key=predictKey, timeoutSecs=timeoutSecs
            )

            predictCMResult = h2o.nodes[0].predict_confusion_matrix(
                actual=testDataKey, vactual="C" + str(y + 1), predict=predictKey, vpredict="predict"
            )

            cm = predictCMResult["cm"]

            # These will move into the h2o_gbm.py
            pctWrong = h2o_gbm.pp_cm_summary(cm)
            self.assertEqual(
                pctWrong, trainPctWrong, "Should see the same error rate on train and predict? (same data set)"
            )

            print "\nTest\n==========\n"
            print h2o_gbm.pp_cm(cm)

            print "Trial #", trial, "completed"
Esempio n. 28
0
    def test_DeepLearning_c21(self):
        importFolderPath = '/mnt/0xcustomer-datasets/c21'
        csvPathname_train = importFolderPath + '/persona_clean_deep.tsv.zip'
        csvPathname_test = importFolderPath + '/persona_clean_deep.tsv.zip'
        hex_key = 'train.hex'
        validation_key = 'test.hex'
        timeoutSecs = 300
        parseResult = h2i.import_parse(path=csvPathname_train,
                                       hex_key=hex_key,
                                       timeoutSecs=timeoutSecs,
                                       doSummary=DO_SUMMARY)
        parseResultV = h2i.import_parse(path=csvPathname_test,
                                        hex_key=validation_key,
                                        timeoutSecs=timeoutSecs,
                                        doSummary=DO_SUMMARY)
        inspect = h2o_cmd.runInspect(None, hex_key)
        print "\n" + csvPathname_train, \
            "    numRows:", "{:,}".format(inspect['numRows']), \
            "    numCols:", "{:,}".format(inspect['numCols'])

        response = 'any_response'

        #Making random id
        identifier = ''.join(
            random.sample(string.ascii_lowercase + string.digits, 10))
        model_key = 'nn_' + identifier + '.hex'

        # use defaults otherwise
        # need to change epochs otherwise it takes too long
        kwargs = {
            'epochs': 0.001,
            'response': response,
            'destination_key': model_key,
            'validation': validation_key,
        }
        ###expectedErr = 0.0362 ## from single-threaded mode
        expectedErr = 0.03  ## observed actual value with Hogwild

        timeoutSecs = 600
        start = time.time()
        nn = h2o_cmd.runDeepLearning(parseResult=parseResult,
                                     timeoutSecs=timeoutSecs,
                                     **kwargs)
        print "neural net end on ", csvPathname_train, " and ", csvPathname_test, 'took', time.time(
        ) - start, 'seconds'

        #### Now score using the model, and check the validation error
        expectedErr = 0.046
        relTol = 0.35  # allow 35% tolerance. kbn
        predict_key = 'Predict.hex'

        kwargs = {
            'data_key': validation_key,
            'destination_key': predict_key,
            'model_key': model_key
        }
        predictResult = h2o_cmd.runPredict(timeoutSecs=timeoutSecs, **kwargs)
        h2o_cmd.runInspect(key=predict_key, verbose=True)

        kwargs = {}

        predictCMResult = h2o.nodes[0].predict_confusion_matrix(
            actual=validation_key,
            vactual=response,
            predict=predict_key,
            vpredict='predict',
            timeoutSecs=timeoutSecs,
            **kwargs)

        cm = predictCMResult['cm']

        print h2o_gbm.pp_cm(cm)
        actualErr = h2o_gbm.pp_cm_summary(cm) / 100.

        print "actual   classification error:" + format(actualErr)
        print "expected classification error:" + format(expectedErr)
        if actualErr != expectedErr and abs(
            (expectedErr - actualErr) / expectedErr) > relTol:
            raise Exception(
                "Scored classification error of %s is not within %s %% relative error of %s"
                % (actualErr, float(relTol) * 100, expectedErr))
Esempio n. 29
0
    def test_GLM2_mnist(self):
        if DO_HDFS:
            importFolderPath = "mnist"
            bucket = None
            schema = 'hdfs'
        else:
            importFolderPath = "mnist"
            bucket = 'home-0xdiag-datasets'
            schema = 'local'

        csvFilelist = [
            ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600),
        ]

        trial = 0
        for (trainCsvFilename, testCsvFilename, timeoutSecs) in csvFilelist:
            trialStart = time.time()

            # PARSE test****************************************
            testKey = testCsvFilename + "_" + str(trial) + ".hex"
            csvPathname = importFolderPath + "/" + testCsvFilename
            start = time.time()

            parseTestResult = h2i.import_parse(bucket=bucket,
                                               path=csvPathname,
                                               schema=schema,
                                               hex_key=testKey,
                                               timeoutSecs=timeoutSecs)

            elapsed = time.time() - start
            print "parse end on ", testCsvFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseTestResult['destination_key']

            print "We won't use this pruning of x on test data. See if it prunes the same as the training"
            y = 0  # first column is pixel value
            print "y:"
            ignoreX = h2o_glm.goodXFromColumnInfo(
                y,
                key=parseTestResult['destination_key'],
                timeoutSecs=300,
                returnIgnoreX=True)

            # PARSE train****************************************
            trainKey = trainCsvFilename + "_" + str(trial) + ".hex"
            csvPathname = importFolderPath + "/" + testCsvFilename
            start = time.time()
            parseTrainResult = h2i.import_parse(bucket=bucket,
                                                path=csvPathname,
                                                schema=schema,
                                                hex_key=trainKey,
                                                timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "parse end on ", trainCsvFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseTrainResult['destination_key']

            # GLM****************************************
            print "This is the pruned x we'll use"
            ignoreX = h2o_glm.goodXFromColumnInfo(
                y,
                key=parseTrainResult['destination_key'],
                timeoutSecs=300,
                returnIgnoreX=True)
            print "ignoreX:", ignoreX

            modelKey = 'GLM_model'
            params = {
                'ignored_cols': ignoreX,
                'response': 'C' + str(y + 1),
                'family': 'binomial',
                'lambda': 0.5,
                'alpha': 1e-4,
                'max_iter': 15,
                ## 'thresholds': 0.5,
                'n_folds': 1,
                'beta_epsilon': 1.0E-4,
                'destination_key': modelKey,
            }

            if DO_ALL_DIGITS:
                cases = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
            else:
                cases = [8]

            for c in cases:
                kwargs = params.copy()
                print "Trying binomial with case:", c
                # kwargs['case_val'] = c

                # do the binomial conversion with Exec2, for both training and test (h2o won't work otherwise)
                if DO_BUG:
                    execExpr = "A.hex=%s;A.hex[,%s]=(A.hex[,%s]==%s)" % (
                        trainKey, y + 1, y + 1, c)
                    h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)
                else:
                    execExpr = "A.hex=%s" % (trainKey)
                    h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)
                    execExpr = "A.hex[,%s]=(A.hex[,%s]==%s)" % (y + 1, y + 1,
                                                                c)
                    h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)

                if DO_BUG:
                    execExpr = "B.hex=%s;B.hex[,%s]=(B.hex[,%s]==%s)" % (
                        testKey, y + 1, y + 1, c)
                    h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)
                else:
                    execExpr = "B.hex=%s" % (testKey)
                    h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)

                    execExpr = "B.hex[,%s]=(B.hex[,%s]==%s)" % (y + 1, y + 1,
                                                                c)
                    h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)

                timeoutSecs = 1800
                start = time.time()
                aHack = {'destination_key': 'A.hex'}
                glmFirstResult = h2o_cmd.runGLM(parseResult=aHack,
                                                timeoutSecs=timeoutSecs,
                                                pollTimeoutSecs=60,
                                                noPoll=True,
                                                **kwargs)
                print "\nglmFirstResult:", h2o.dump_json(glmFirstResult)
                job_key = glmFirstResult['job_key']
                h2o_jobs.pollStatsWhileBusy(timeoutSecs=timeoutSecs,
                                            pollTimeoutSecs=60,
                                            retryDelaySecs=5)

                # double check...how come the model is bogus?
                h2o_jobs.pollWaitJobs()
                glm = h2o.nodes[0].glm_view(_modelKey=modelKey)

                elapsed = time.time() - start
                print "GLM completed in", elapsed, "seconds.", \
                    "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

                h2o_glm.simpleCheckGLM(self, glm, None, noPrint=True, **kwargs)
                modelKey = glm['glm_model']['_key']

                # This seems wrong..what's the format of the cm?
                cm = glm['glm_model']['submodels'][0]['validation']['_cms'][
                    -1]['_arr']
                print "cm:", cm
                pctWrong = h2o_gbm.pp_cm_summary(cm)
                # self.assertLess(pctWrong, 9,"Should see less than 9% error (class = 4)")

                print "\nTrain\n==========\n"
                print h2o_gbm.pp_cm(cm)

                # Score *******************************
                # this messes up if you use case_mode/case_vale above
                predictKey = 'Predict.hex'
                start = time.time()

                predictResult = h2o_cmd.runPredict(data_key='B.hex',
                                                   model_key=modelKey,
                                                   destination_key=predictKey,
                                                   timeoutSecs=timeoutSecs)

                predictCMResult = h2o.nodes[0].predict_confusion_matrix(
                    actual='B.hex',
                    vactual='C' + str(y + 1),
                    predict=predictKey,
                    vpredict='predict',
                )

                cm = predictCMResult['cm']

                # These will move into the h2o_gbm.py
                pctWrong = h2o_gbm.pp_cm_summary(cm)
                self.assertLess(pctWrong, 9,
                                "Should see less than 9% error (class = 4)")

                print "\nTest\n==========\n"
                print h2o_gbm.pp_cm(cm)
Esempio n. 30
0
    def test_GBM_params_rand2(self):
        h2o.beta_features = False
        bucket = 'home-0xdiag-datasets'
        modelKey = 'GBMModelKey'
        files = [
                # ('standard', 'covtype.shuffled.90pct.data', 'covtype.train.hex', 1800, 54, 'covtype.shuffled.10pct.data', 'covtype.test.hex')
                ('standard', 'covtype.shuffled.10pct.sorted.data', 'covtype.train.hex', 1800, 54, 'covtype.shuffled.10pct.data', 'covtype.test.hex')
                ]

        for (importFolderPath, trainFilename, trainKey, timeoutSecs, response, testFilename, testKey) in files:
            h2o.beta_features = False #turn off beta_features
            # PARSE train****************************************
            start = time.time()
            xList = []
            eList = []
            fList = []

            # Parse (train)****************************************
            if h2o.beta_features:
                print "Parsing to fvec directly! Have to noPoll=true!, and doSummary=False!"
            parseTrainResult = h2i.import_parse(bucket=bucket, path=importFolderPath + "/" + trainFilename, schema='local',
                hex_key=trainKey, timeoutSecs=timeoutSecs, noPoll=h2o.beta_features, doSummary=False)
            # hack
            if h2o.beta_features:
                h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs)
                print "Filling in the parseTrainResult['destination_key'] for h2o"
                parseTrainResult['destination_key'] = trainKey

            elapsed = time.time() - start
            print "train parse end on ", trainFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "train parse result:", parseTrainResult['destination_key']

            # Parse (test)****************************************
            if h2o.beta_features:
                print "Parsing to fvec directly! Have to noPoll=true!, and doSummary=False!"
            parseTestResult = h2i.import_parse(bucket=bucket, path=importFolderPath + "/" + testFilename, schema='local',
                hex_key=testKey, timeoutSecs=timeoutSecs, noPoll=h2o.beta_features, doSummary=False)
            # hack
            if h2o.beta_features:
                h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs)
                print "Filling in the parseTestResult['destination_key'] for h2o"
                parseTestResult['destination_key'] = testKey

            elapsed = time.time() - start
            print "test parse end on ", testFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "test parse result:", parseTestResult['destination_key']

            # GBM (train iterate)****************************************
            inspect = h2o_cmd.runInspect(key=parseTestResult['destination_key'])
            paramsDict = define_gbm_params()
            for trial in range(3):
                h2o.beta_features = True
                # translate it (only really need to do once . out of loop?
                h2o_cmd.runInspect(key=parseTrainResult['destination_key'])
                ### h2o_cmd.runSummary(key=parsTraineResult['destination_key'])

                # use this to set any defaults you want if the pick doesn't set
                params = {
                    'response': 54, 
                    'ignored_cols_by_name': 
                    '0,1,2,3,4', 
                    'ntrees': 2,
                    'validation': parseTestResult['destination_key'],
                }
                h2o_gbm.pickRandGbmParams(paramsDict, params)
                print "Using these parameters for GBM: ", params
                kwargs = params.copy()

                # GBM train****************************************
                trainStart = time.time()
                gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult,
                    noPoll=True, timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs)
                # hack
                if h2o.beta_features:
                    h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs)
                trainElapsed = time.time() - trainStart
                print "GBM training completed in", trainElapsed, "seconds. On dataset: ", trainFilename

                gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey)
                # errrs from end of list? is that the last tree?
                errsLast = gbmTrainView['gbm_model']['errs'][-1]
                print "GBM 'errsLast'", errsLast

                cm = gbmTrainView['gbm_model']['cms'][-1] # use the last one
                pctWrongTrain = h2o_gbm.pp_cm_summary(cm);
                print "Last line of this cm might be NAs, not CM"
                print "\nTrain\n==========\n"
                print h2o_gbm.pp_cm(cm)

                # GBM test****************************************
                predictKey = 'Predict.hex'
                h2o_cmd.runInspect(key=parseTestResult['destination_key'])
                start = time.time()
                gbmTestResult = h2o_cmd.runPredict(
                    data_key=parseTestResult['destination_key'], 
                    model_key=modelKey,
                    destination_key=predictKey,
                    timeoutSecs=timeoutSecs)
                # hack
                if h2o.beta_features:
                    h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs)
                elapsed = time.time() - start
                print "GBM predict completed in", elapsed, "seconds. On dataset: ", testFilename

                if DO_PREDICT_CM:
                    gbmPredictCMResult = h2o.nodes[0].predict_confusion_matrix(
                        actual=parseTestResult['destination_key'],
                        vactual='predict',
                        predict=predictKey,
                        vpredict='predict', # choices are 7 (now) and 'predict'
                        )

                    # errrs from end of list? is that the last tree?
                    # all we get is cm
                    cm = gbmPredictCMResult['cms'][-1] # use the last one

                    # These will move into the h2o_gbm.py
                    pctWrong = h2o_gbm.pp_cm_summary(cm);
                    print "Last line of this cm is really NAs, not CM"
                    print "\nTest\n==========\n"
                    print h2o_gbm.pp_cm(cm)

                # xList.append(ntrees)
                if 'max_depth' in params and params['max_depth']:
                    xList.append(params['max_depth'])
                    eList.append(pctWrongTrain)
                    fList.append(trainElapsed)

            h2o.beta_features = False
            xLabel = 'max_depth'
            eLabel = 'pctWrongTrain'
            fLabel = 'trainElapsed'
            eListTitle = ""
            fListTitle = ""
            h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
Esempio n. 31
0
    def test_GLM2_enums_score_superset(self):
        """Build a binomial GLM on synthetic enum data and score data holding an unseen enum.

        For column counts 1..6, two csv files are written with the same seed:
        one from the full enum list (model building) and one from a random
        sample of five of those enums plus the extra value "xyzzy" (scoring).
        The model is built on the first, then predict, AUC and a confusion
        matrix are run against the second. AUC must be within 0.15 of 0.5.
        """
        h2o.beta_features = True
        print "FIX!: this should cause an error. We should detect that it's not causing an error/warning?"
        SYNDATASETS_DIR = h2o.make_syn_dir()

        n = 200
        hexKeys = ['cD', 'cE', 'cF', 'cG', 'cH', 'cI']
        # (rowCount, colCount, hex_key, timeoutSecs): one case per column count 1..6
        tryList = [(n, cols, key, 300) for cols, key in enumerate(hexKeys, 1)]

        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            # comma as the column separator: nice to ensure no craziness
            colSepHexString = '2c'
            colSepChar = colSepHexString.decode('hex')
            colSepInt = int(colSepHexString, 16)
            print "colSepChar:", colSepChar

            # newline as the row separator
            rowSepHexString = '0a'
            rowSepChar = rowSepHexString.decode('hex')
            print "rowSepChar:", rowSepChar

            SEEDPERFILE = random.randint(0, sys.maxint)
            dims = str(rowCount) + 'x' + str(colCount)
            csvFilename = 'syn_enums_' + dims + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            csvScoreFilename = 'syn_enums_score_' + dims + '.csv'
            csvScorePathname = SYNDATASETS_DIR + '/' + csvScoreFilename

            enumList = create_enum_list(listSize=10)
            # half of the model's enums, plus one value the model has never seen
            enumListForScore = random.sample(enumList, 5) + ["xyzzy"]

            sepArgs = dict(colSepChar=colSepChar, rowSepChar=rowSepChar)

            print "Creating random", csvPathname, "for glm model building"
            write_syn_dataset(csvPathname, enumList, rowCount, colCount,
                              SEEDPERFILE, **sepArgs)

            print "Creating random", csvScorePathname, "for glm scoring with prior model (using enum subset)"
            write_syn_dataset(csvScorePathname, enumListForScore, rowCount,
                              colCount, SEEDPERFILE, **sepArgs)

            trainParse = h2i.import_parse(path=csvPathname, schema='put',
                                          hex_key=hex_key, timeoutSecs=30,
                                          separator=colSepInt)
            trainKey = trainParse['destination_key']
            print "Parse result['destination_key']:", trainKey

            print "\n" + csvFilename
            # called with exceptionOnMissingValues=True; the returned dicts are not used here
            (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \
                h2o_cmd.columnInfoFromInspect(trainKey, exceptionOnMissingValues=True)

            y = colCount
            modelKey = 'enums'
            glmArgs = {
                'destination_key': modelKey,
                'response': y,
                'max_iter': 1,
                'n_folds': 1,
                'alpha': 0.2,
                'lambda': 1e-5,
                'family': 'binomial'
            }

            start = time.time()
            glmResult = h2o_cmd.runGLM(parseResult=trainParse,
                                       timeoutSecs=timeoutSecs,
                                       pollTimeoutSecs=180,
                                       **glmArgs)
            print "glm end on ", trainKey, 'took', time.time() - start, 'seconds'
            h2o_glm.simpleCheckGLM(self, glmResult, None, **glmArgs)

            # parse the scoring dataset under its own key
            scoreDataKey = "score_" + hex_key
            scoreParse = h2i.import_parse(path=csvScorePathname, schema='put',
                                          hex_key=scoreDataKey, timeoutSecs=30,
                                          separator=colSepInt)

            # Score *******************************
            # this messes up if you use case_mode/case_vale above
            predictKey = 'Predict.hex'
            start = time.time()

            predictResult = h2o_cmd.runPredict(data_key=scoreDataKey,
                                               model_key=modelKey,
                                               destination_key=predictKey,
                                               timeoutSecs=timeoutSecs)

            # predict and AUC on the same data; has to be a binomial result
            aucResult = h2o.nodes[0].generate_auc(thresholds=None,
                                                  actual=scoreDataKey,
                                                  predict='Predict.hex',
                                                  vactual=y,
                                                  vpredict=1)
            auc = aucResult['AUC']
            self.assertAlmostEqual(auc, 0.5, delta=0.15,
                                   msg="actual auc: %s not close enough to 0.5" % auc)

            cmResult = h2o.nodes[0].predict_confusion_matrix(
                actual=scoreDataKey,
                predict=predictKey,
                vactual='C' + str(y + 1),
                vpredict='predict',
            )
            cm = cmResult['cm']

            # These will move into the h2o_gbm.py
            pctWrong = h2o_gbm.pp_cm_summary(cm)

            print "\nTest\n==========\n"
            print h2o_gbm.pp_cm(cm)
Esempio n. 32
0
    def test_GBM_regression_rand2(self):
        bucket = 'home-0xdiag-datasets'
        modelKey = 'GBMModelKey'
        files = [
            # ('standard', 'covtype.shuffled.90pct.data', 'covtype.train.hex', 1800, 54, 'covtype.shuffled.10pct.data', 'covtype.test.hex')
            ('standard', 'covtype.shuffled.10pct.sorted.data',
             'covtype.train.hex', 1800, 'C55', 'covtype.shuffled.10pct.data',
             'covtype.test.hex')
        ]

        for (importFolderPath, trainFilename, trainKey, timeoutSecs, response,
             testFilename, testKey) in files:
            # PARSE train****************************************
            start = time.time()
            xList = []
            eList = []
            fList = []

            # Parse (train)****************************************
            parseTrainResult = h2i.import_parse(bucket=bucket,
                                                path=importFolderPath + "/" +
                                                trainFilename,
                                                schema='local',
                                                hex_key=trainKey,
                                                timeoutSecs=timeoutSecs,
                                                doSummary=False)
            elapsed = time.time() - start
            print "train parse end on ", trainFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "train parse result:", trainKey

            # Parse (test)****************************************
            parseTestResult = h2i.import_parse(bucket=bucket,
                                               path=importFolderPath + "/" +
                                               testFilename,
                                               schema='local',
                                               hex_key=testKey,
                                               timeoutSecs=timeoutSecs,
                                               doSummary=False)
            elapsed = time.time() - start
            print "test parse end on ", testFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "test parse result:", testKey

            paramsDict = define_gbm_params()
            for trial in range(3):
                # use this to set any defaults you want if the pick doesn't set
                print "Regression!"
                params = {
                    'response': 'C55',
                    # 'ignored_cols_by_name': 'C5,C6,C7,C8,C9',
                    'ntrees': 2,
                    'classification': 0,
                    'validation': testKey,
                }
                h2o_gbm.pickRandGbmParams(paramsDict, params)
                print "Using these parameters for GBM: ", params
                kwargs = params.copy()

                # GBM train****************************************
                trainStart = time.time()
                gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult,
                                                timeoutSecs=timeoutSecs,
                                                destination_key=modelKey,
                                                **kwargs)
                trainElapsed = time.time() - trainStart
                print "GBM training completed in", trainElapsed, "seconds. On dataset: ", trainFilename

                gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey)
                print "gbmTrainView:", h2o.dump_json(gbmTrainView)
                # errrs from end of list? is that the last tree?
                errsLast = gbmTrainView['gbm_model']['errs'][-1]
                print "GBM 'errsLast'", errsLast

                # for regression, the cms are all null, so don't print

                # GBM test****************************************
                predictKey = 'Predict.hex'
                start = time.time()
                gbmTestResult = h2o_cmd.runPredict(data_key=testKey,
                                                   model_key=modelKey,
                                                   destination_key=predictKey,
                                                   timeoutSecs=timeoutSecs)
                elapsed = time.time() - start
                print "GBM predict completed in", elapsed, "seconds. On dataset: ", testFilename
                print "FIX! where do we get the summary info on the test data after predict?"
Esempio n. 33
0
    def test_DeepLearning_twovalues(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        csvFilename = "syn_twovalues.csv"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        rowDataTrue = "1, 0, 65, 1, 2, 1, 1, 4, 1, 4, 1, 4"
        rowDataFalse = "0, 1, 0, -1, -2, -1, -1, -4, -1, -4, -1, -4"

        twoValueList = [
            ('A', 'B', 0, 14),
            ('A', 'B', 1, 14),
            (0, 1, 0, 12),
            (0, 1, 1, 12),
            (0, 1, 'NaN', 12),
            (1, 0, 'NaN', 12),
            (-1, 1, 0, 12),
            (-1, 1, 1, 12),
            (-1e1, 1e1, 1e1, 12),
            (-1e1, 1e1, -1e1, 12),
        ]

        trial = 0
        for (outputTrue, outputFalse, case, coeffNum) in twoValueList:
            write_syn_dataset(csvPathname, 20, rowDataTrue, rowDataFalse,
                              str(outputTrue), str(outputFalse))

            start = time.time()
            hex_key = csvFilename + "_" + str(trial)
            model_key = 'trial_' + str(trial) + '.hex'
            validation_key = hex_key

            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='put',
                                           hex_key=hex_key)
            print "using outputTrue: %s outputFalse: %s" % (outputTrue,
                                                            outputFalse)

            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvPathname, \
                "    numRows:", "{:,}".format(inspect['numRows']), \
                "    numCols:", "{:,}".format(inspect['numCols'])
            response = inspect['numCols'] - 1

            kwargs = {
                'ignored_cols': None,
                'response': 'C' + str(response),
                'classification': 1,
                'activation': 'Tanh',
                #'input_dropout_ratio'          : 0.2,
                'hidden': '500',
                'rate': 0.01,
                'rate_annealing': 1e-6,
                'momentum_start': 0,
                'momentum_stable': 0,
                'l1': 0.0,
                'l2': 1e-4,
                'seed': 80023842348,
                'loss': 'CrossEntropy',
                #'max_w2'                       : 15,
                #'warmup_samples'               : 0,
                'initial_weight_distribution': 'UniformAdaptive',
                #'initial_weight_scale'         : 0.01,
                'epochs': 1.0,
                'destination_key': model_key,
                'validation': hex_key,
            }

            timeoutSecs = 60
            start = time.time()
            h2o_cmd.runDeepLearning(parseResult=parseResult,
                                    timeoutSecs=timeoutSecs,
                                    **kwargs)
            print "trial #", trial, "Deep Learning end on ", csvFilename, ' took', time.time(
            ) - start, 'seconds'

            #### Now score using the model, and check the validation error
            expectedErr = 0.001
            relTol = 0.01
            predict_key = 'Predict.hex'

            kwargs = {
                'data_key': validation_key,
                'destination_key': predict_key,
                'model_key': model_key
            }
            predictResult = h2o_cmd.runPredict(timeoutSecs=timeoutSecs,
                                               **kwargs)
            h2o_cmd.runInspect(key=predict_key, verbose=True)

            kwargs = {}

            predictCMResult = h2o.nodes[0].predict_confusion_matrix(
                actual=validation_key,
                vactual=response,
                predict=predict_key,
                vpredict='predict',
                timeoutSecs=timeoutSecs,
                **kwargs)

            cm = predictCMResult['cm']

            print h2o_gbm.pp_cm(cm)
            actualErr = h2o_gbm.pp_cm_summary(cm) / 100.

            print "actual   classification error:" + format(actualErr)
            print "expected classification error:" + format(expectedErr)
            if actualErr != expectedErr and abs(
                (expectedErr - actualErr) / expectedErr) > relTol:
                raise Exception(
                    "Scored classification error of %s is not within %s %% relative error of %s"
                    % (actualErr, float(relTol) * 100, expectedErr))

            trial += 1
Esempio n. 34
0
    def test_rf_covtype20x_fvec(self):
        h2o.beta_features = True
        importFolderPath = 'standard'

        if DO_SMALL:
            csvFilenameTrain = 'covtype.data'
            hex_key = 'covtype1x.data.A.hex'
        else:
            csvFilenameTrain = 'covtype20x.data'
            hex_key = 'covtype20x.data.A.hex'

        csvPathname = importFolderPath + "/" + csvFilenameTrain
        parseResultTrain = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, hex_key=hex_key, timeoutSecs=500)
        inspect = h2o_cmd.runInspect(key=parseResultTrain['destination_key'])
        dataKeyTrain = parseResultTrain['destination_key']
        print "Parse end", dataKeyTrain

        # have to re import since source key is gone
        # we could just copy the key, but sometimes we change the test/train data  to covtype.data
        if DO_SMALL:
            csvFilenameTest = 'covtype.data'
            hex_key = 'covtype1x.data.B.hex'
            dataKeyTest2 = 'covtype1x.data.C.hex'
        else:
            csvFilenameTest = 'covtype20x.data'
            hex_key = 'covtype20x.data.B.hex'
            dataKeyTest2 = 'covtype20x.data.C.hex'

        csvPathname = importFolderPath + "/" + csvFilenameTest
        parseResultTest = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, hex_key=hex_key, timeoutSecs=500)
        print "Parse result['destination_key']:", parseResultTest['destination_key']
        inspect = h2o_cmd.runInspect(key=parseResultTest['destination_key'])
        dataKeyTest = parseResultTest['destination_key']
        print "Parse end", dataKeyTest

        # make a 3rd key so the predict is uncached too!
        execExpr = dataKeyTest2 + "=" + dataKeyTest
        if h2o.beta_features:
            kwargs = {'str': execExpr, 'timeoutSecs': 15}
        else:
            kwargs = {'expression': execExpr, 'timeoutSecs': 15}

        resultExec = h2o_cmd.runExec(**kwargs)

        # train
        # this does RFView to understand when RF completes, so the time reported for RFView here, should be 
        # considered the "first RFView" times..subsequent have some caching?. 
        # unless the no_confusion_matrix works

        # params is mutable. This is default.
        if h2o.beta_features:
            paramDict = drf2ParamDict
            params = {
                'ntrees': 20, 
                'destination_key': 'RF_model'
            }
        else:
            paramDict = drf1ParamDict
            params = {
                'ntree': 20, 
                'out_of_bag_error_estimate': 1, 
                'model_key': 'RF_model'
            }

        colX = h2o_rf.pickRandRfParams(paramDict, params)

        kwargs = params.copy()
        if h2o.beta_features:
            timeoutSecs = 30 + kwargs['ntrees'] * 60
        else:
            timeoutSecs = 30 + kwargs['ntree'] * 60 

        start = time.time()
        rf = h2o_cmd.runRF(parseResult=parseResultTrain,
            timeoutSecs=timeoutSecs, retryDelaySecs=1, **kwargs)
        print "rf job end on ", dataKeyTrain, 'took', time.time() - start, 'seconds'

        print "\nRFView start after job completion"
        if h2o.beta_features:
            model_key = kwargs['destination_key']
            ntree = kwargs['ntrees']
        else:
            model_key = kwargs['model_key']
            ntree = kwargs['ntree']

        start = time.time()
        # this does the RFModel view for v2. but only model_key is used. Data doesn't matter? (nor ntree)
        h2o_cmd.runRFView(None, dataKeyTrain, model_key, ntree=ntree, timeoutSecs=timeoutSecs)
        print "First rfview end on ", dataKeyTrain, 'took', time.time() - start, 'seconds'

        for trial in range(1):
            # scoring
            start = time.time()
            rfView = h2o_cmd.runRFView(None, dataKeyTest, 
                model_key, ntree=ntree, timeoutSecs=timeoutSecs, out_of_bag_error_estimate=0, retryDelaySecs=1)
            print "rfview", trial, "end on ", dataKeyTest, 'took', time.time() - start, 'seconds.'

            (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView, ntree=ntree)
            self.assertAlmostEqual(classification_error, 50, delta=50, 
                msg="Classification error %s differs too much" % classification_error)
            start = time.time()
            predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key=dataKeyTest2)
            print "predict", trial, "end on ", dataKeyTest, 'took', time.time() - start, 'seconds.'

            parseKey = parseResultTrain['destination_key']
            rfModelKey  = rfView['drf_model']['_key']
            predictKey = 'Predict.hex'
            start = time.time()

            predictResult = h2o_cmd.runPredict(
                data_key=parseKey,
                model_key=rfModelKey,
                destination_key=predictKey,
                timeoutSecs=timeoutSecs)

            predictCMResult = h2o.nodes[0].predict_confusion_matrix(
                actual=parseKey,
                vactual='C54',
                predict=predictKey,
                vpredict='predict',
                )

            cm = predictCMResult['cm']

            # These will move into the h2o_gbm.py
            pctWrong = h2o_gbm.pp_cm_summary(cm);
            print "\nTest\n==========\n"
            print h2o_gbm.pp_cm(cm)

            print "Trial #", trial, "completed"
Esempio n. 35
0
    def test_GLM2_covtype20x_train(self):
        h2o.beta_features = True
        importFolderPath = "standard"
        csvFilename = 'covtype20x.data'
        csvPathname = importFolderPath + "/" + csvFilename
        hex_key = csvFilename + ".hex"

        # Parse and Exec************************************************
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=180)

        execExpr="A.hex=%s" % parseResult['destination_key']
        h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)

        # use exec to change the output col to binary, case_mode/case_val doesn't work if we use predict
        # will have to live with random extract. will create variance
        # class 4 = 1, everything else 0
        y = 54
        execExpr="A.hex[,%s]=(A.hex[,%s]==%s)" % (y+1, y+1, 4)
        h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)

        inspect = h2o_cmd.runInspect(key="A.hex")
        print "\n" + csvPathname, \
            "    numRows:", "{:,}".format(inspect['numRows']), \
            "    numCols:", "{:,}".format(inspect['numCols'])

        # Split Test/Train************************************************
        # how many rows for each pct?
        numRows = inspect['numRows']
        pct10 = int(numRows * .1)
        rowsForPct = [i * pct10 for i in range(0,11)]
        # this can be slightly less than 10%
        last10 = numRows - rowsForPct[9]
        rowsForPct[10] = last10
        # use mod below for picking "rows-to-do" in case we do more than 9 trials
        # use 10 if 0 just to see (we copied 10 to 0 above)
        rowsForPct[0] = rowsForPct[10]

        print "Creating the key of the last 10% data, for scoring"
        trainDataKey = "rTrain"
        testDataKey = "rTest"
        # start at 90% rows + 1
        
        # GLM, predict, CM*******************************************************8
        kwargs = {
            'response': 'C' + str(y),
            'max_iter': 20, 
            'n_folds': 0, 
            'alpha': 0.1, 
            'lambda': 1e-5, 
            'family': 'binomial',
            'classification': 1,
        }
        timeoutSecs = 60

        for trial in range(100):
            # always slice from the beginning
            rowsToUse = rowsForPct[trial%10] 

            # test/train split **********************************************8
            h2o_cmd.createTestTrain(srcKey='A.hex', trainDstKey=trainDataKey, testDstKey=testDataKey, trainPercent=90)
            aHack = {'destination_key': trainDataKey}
            parseKey = trainDataKey

            # GLM **********************************************8
            start = time.time()
            glm = h2o_cmd.runGLM(parseResult=aHack, timeoutSecs=timeoutSecs, pollTimeoutSecs=180, **kwargs)
            print "glm end on ", parseResult['destination_key'], 'took', time.time() - start, 'seconds'
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
            modelKey = glm['glm_model']['_key']

            # Score **********************************************
            predictKey = 'Predict.hex'
            start = time.time()

            predictResult = h2o_cmd.runPredict(
                data_key=testDataKey,
                model_key=modelKey,
                destination_key=predictKey,
                timeoutSecs=timeoutSecs)

            predictCMResult = h2o.nodes[0].predict_confusion_matrix(
                actual=testDataKey,
                vactual='C' + str(y),
                predict=predictKey,
                vpredict='predict',
                )

            cm = predictCMResult['cm']

            # These will move into the h2o_gbm.py
            pctWrong = h2o_gbm.pp_cm_summary(cm);
            self.assertLess(pctWrong, 8,"Should see less than 7% error (class = 4)")

            print "\nTest\n==========\n"
            print h2o_gbm.pp_cm(cm)

            print "Trial #", trial, "completed", "using %6.2f" % (rowsToUse*100.0/numRows), "pct. of all rows"
Esempio n. 36
0
    def test_GLM_enums_unbalanced(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()

        n = 2000
        tryList = [
            (n, 1, 'cD', 300), 
            (n, 2, 'cE', 300), 
            (n, 4, 'cF', 300), 
            (n, 8, 'cG', 300), 
            (n, 16, 'cH', 300), 
            (n, 32, 'cI', 300), 
            ]

        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            # using the comma is nice to ensure no craziness
            colSepHexString = '2c' # comma
            colSepChar = colSepHexString.decode('hex')
            colSepInt = int(colSepHexString, base=16)
            print "colSepChar:", colSepChar

            rowSepHexString = '0a' # newline
            rowSepChar = rowSepHexString.decode('hex')
            print "rowSepChar:", rowSepChar

            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            csvScoreFilename = 'syn_enums_score_' + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvScorePathname = SYNDATASETS_DIR + '/' + csvScoreFilename

            enumList = create_enum_list(listSize=10)
            # use half of the enums for creating the scoring dataset
            enumListForScore = random.sample(enumList,5)

            print "Creating random", csvPathname, "for glm2 model building"
            write_syn_dataset(csvPathname, enumList, rowCount, colCount, SEEDPERFILE, 
                colSepChar=colSepChar, rowSepChar=rowSepChar)

            print "Creating another random", csvScorePathname, "for glm2 scoring with prior model (using enum subset)"
            write_syn_dataset(csvScorePathname, enumListForScore, rowCount, colCount, SEEDPERFILE, 
                colSepChar=colSepChar, rowSepChar=rowSepChar)

            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, 
                timeoutSecs=30, separator=colSepInt)
            print "Parse result['destination_key']:", parseResult['destination_key']

            print "\n" + csvFilename
            (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \
                h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=True)

            testDataKey = "score_" + hex_key
            parseResult = h2i.import_parse(path=csvScorePathname, schema='put', hex_key=testDataKey,
                timeoutSecs=30, separator=colSepInt)

            y = colCount
            modelKey = 'glm_model'
            kwargs = {
                'standardize': 0,
                'destination_key': modelKey,
                'response': 'C' + str(y+1), 
                'max_iter': 200, 
                'family': 'binomial',
                'n_folds': 0, 
                'alpha': 0, 
                'lambda': 0, 
                }

            start = time.time()

            updateList= [ 
                {'alpha': 0.5, 'lambda': 1e-4},
                {'alpha': 0.25, 'lambda': 1e-6},
                {'alpha': 0.0, 'lambda': 1e-12},
                {'alpha': 0.5, 'lambda': 1e-12},
                {'alpha': 0.0, 'lambda': 1e-12},
                {'alpha': 0.0, 'lambda': 0},
            ]

            # Try each one
            for updateDict in updateList:
                print "\n#################################################################"
                print updateDict
                kwargs.update(updateDict)
                print "If we poll, we get a message saying it was cancelled by user??"
                glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=180, **kwargs)
                print "glm2 end on ", parseResult['destination_key'], 'took', time.time() - start, 'seconds'

                glm_model = glm['glm_model']
                _names = glm_model['_names']
                modelKey = glm_model['_key']
                coefficients_names = glm_model['coefficients_names']
                submodels = glm_model['submodels'][0]

                beta = submodels['beta']
                norm_beta = submodels['norm_beta']
                iteration = submodels['iteration']

                validation = submodels['validation']

                if not validation or 'avg_err' not in validation:
                    raise Exception("glm: %s" % h2o.dump_json(glm) + \
                        "\nNo avg_err in validation." + \
                        "\nLikely if you look back, the job was cancelled, so there's no cross validation.")
        
                avg_err = validation['avg_err']
                auc = validation['auc']
                aic = validation['aic']
                null_deviance = validation['null_deviance']
                residual_deviance = validation['residual_deviance']

                print '_names', _names
                print 'coefficients_names', coefficients_names
                # did beta get shortened? the simple check confirms names/beta/norm_beta are same length
                print 'beta', beta
                print 'iteration', iteration
                print 'avg_err', avg_err
                print 'auc', auc

                h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
                if iteration > 20:
                    raise Exception("Why take so many iterations:  %s in this glm2 training?" % iterations)

               # Score **********************************************
                print "Problems with test data having different enums than train? just use train for now"
                testDataKey = hex_key
                predictKey = 'Predict.hex'
                start = time.time()

                predictResult = h2o_cmd.runPredict(
                    data_key=testDataKey,
                    model_key=modelKey,
                    destination_key=predictKey,
                    timeoutSecs=timeoutSecs)

                predictCMResult = h2o.nodes[0].predict_confusion_matrix(
                    actual=testDataKey,
                    vactual='C' + str(y+1),
                    predict=predictKey,
                    vpredict='predict',
                    )

                cm = predictCMResult['cm']

                # These will move into the h2o_gbm.py
                pctWrong = h2o_gbm.pp_cm_summary(cm);
                # self.assertLess(pctWrong, 8,"Should see less than 7 pct error (class = 4): %s" % pctWrong)

                print "\nTest\n==========\n"
                print h2o_gbm.pp_cm(cm)


                if 1==0:
                    # stuff from GLM1

                    classErr = glmScore['validation']['classErr']
                    auc = glmScore['validation']['auc']
                    err = glmScore['validation']['err']
                    nullDev = glmScore['validation']['nullDev']
                    resDev = glmScore['validation']['resDev']
                    h2o_glm.simpleCheckGLMScore(self, glmScore, **kwargs)

                    print "score classErr:", classErr
                    print "score err:", err
                    print "score auc:", auc
                    print "score resDev:", resDev
                    print "score nullDev:", nullDev

                    if math.isnan(resDev):
                        emsg = "Why is this resDev = 'nan'?? %6s %s" % ("resDev:\t", validation['resDev'])
                        raise Exception(emsg)

                    # what is reasonable?
                    # self.assertAlmostEqual(err, 0.3, delta=0.15, msg="actual err: %s not close enough to 0.3" % err)
                    self.assertAlmostEqual(auc, 0.5, delta=0.15, msg="actual auc: %s not close enough to 0.5" % auc)

                    if math.isnan(err):
                        emsg = "Why is this err = 'nan'?? %6s %s" % ("err:\t", err)
                        raise Exception(emsg)

                    if math.isnan(resDev):
                        emsg = "Why is this resDev = 'nan'?? %6s %s" % ("resDev:\t", resDev)
                        raise Exception(emsg)

                    if math.isnan(nullDev):
                        emsg = "Why is this nullDev = 'nan'?? %6s %s" % ("nullDev:\t", nullDev)
    def test_GBM_manyfiles_train_test(self):
        bucket = 'home-0xdiag-datasets'
        modelKey = 'GBMModelKey'
        if localhost:
            files = [
                # None forces num_cols to be used. assumes you set it from Inspect
                # problems with categoricals not in the train data set? (warnings in h2o stdout)
                ## ('manyfiles-nflx-gz', 'file_1.dat.gz', 'file_1.hex', 1800, None, 'file_11.dat.gz', 'test.hex')
                # just use matching
                ('manyfiles-nflx-gz', 'file_1.dat.gz', 'train.hex', 1800, None, 'file_1.dat.gz', 'test.hex')
                ]
        else:
            files = [
                # None forces num_cols to be used. assumes you set it from Inspect
                ('manyfiles-nflx-gz', 'file_[0-9].dat.gz', 'train.hex', 1800, None, 'file_1[0-9].dat.gz', 'test.hex')
                ]

        # if I got to hdfs, it's here
        # hdfs://192.168.1.176/datasets/manyfiles-nflx-gz/file_99.dat.gz

        # h2b.browseTheCloud()
        for (importFolderPath, trainFilename, trainKey, timeoutSecs, response, testFilename, testKey) in files:
            h2o.beta_features = False #turn off beta_features
            # PARSE train****************************************
            start = time.time()
            xList = []
            eList = []
            fList = []

            # Parse (train)****************************************
            if h2o.beta_features:
                print "Parsing to fvec directly! Have to noPoll=true!, and doSummary=False!"
            csvPathname = importFolderPath + "/" + trainFilename
            parseTrainResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local',
                hex_key=trainKey, timeoutSecs=timeoutSecs, noPoll=h2o.beta_features, doSummary=False)
            # hack
            if h2o.beta_features:
                h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs)
                print "Filling in the parseTrainResult['destination_key'] for h2o"
                parseTrainResult['destination_key'] = trainKey

            elapsed = time.time() - start
            print "train parse end on ", trainFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "train parse result:", parseTrainResult['destination_key']

            ### h2o_cmd.runSummary(key=parsTraineResult['destination_key'])

            # if you set beta_features here, the fvec translate will happen with the Inspect not the GBM
            # h2o.beta_features = True
            inspect = h2o_cmd.runInspect(key=parseTrainResult['destination_key'])
            print "\n" + csvPathname, \
                "    num_rows:", "{:,}".format(inspect['num_rows']), \
                "    num_cols:", "{:,}".format(inspect['num_cols'])
            num_rows = inspect['num_rows']
            num_cols = inspect['num_cols']

            # Make col 378 it something we can do binomial regression on!
            execExpr = '%s=colSwap(%s,378,(%s[378]>15 ? 1 : 0))' % (trainKey, trainKey, trainKey)
            resultExec = h2o_cmd.runExec(expression=execExpr, timeoutSecs=60)

            # Parse (test)****************************************
            if h2o.beta_features:
                print "Parsing to fvec directly! Have to noPoll=true!, and doSummary=False!"

            parseTestResult = h2i.import_parse(bucket=bucket, path=importFolderPath + "/" + testFilename, schema='local',
                hex_key=testKey, timeoutSecs=timeoutSecs, noPoll=h2o.beta_features, doSummary=False)
            # hack
            if h2o.beta_features:
                h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs)
                print "Filling in the parseTestResult['destination_key'] for h2o"
                parseTestResult['destination_key'] = testKey

            elapsed = time.time() - start
            print "test parse end on ", testFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "test parse result:", parseTestResult['destination_key']

            # Make col 378 it something we can do binomial regression on!
            execExpr = '%s=colSwap(%s,378,(%s[378]>15 ? 1 : 0))' % (testKey, testKey, testKey)
            resultExec = h2o_cmd.runExec(expression=execExpr, timeoutSecs=60)

            # Note ..no inspect of test data here..so translate happens later?

            # GBM (train iterate)****************************************
            # if not response:
            #     response = num_cols - 1
            response = 378

            # randomly ignore a bunch of cols, just to make it go faster
            x = range(num_cols)
            del x[response]
            ignored_cols_by_name = ",".join(map(str,random.sample(x, 300)))

            print "Using the same response %s for train and test (which should have a output value too)" % response

            ntrees = 10
            # ignore 200 random cols (not the response)
            for max_depth in [5, 40]:
                params = {
                    'learn_rate': .2,
                    'nbins': 1024,
                    'ntrees': ntrees,
                    'max_depth': max_depth,
                    'min_rows': 10,
                    'response': response,
                    'ignored_cols_by_name': ignored_cols_by_name,
                }
            


                if FORCE_FAIL_CASE:
                    params = {'learn_rate': 0.2, 'classification': None, 'min_rows': 10, 'ntrees': 10, 'response': 378, 'nbins': 1024, 'ignored_cols_by_name': '256, 382, 399, 50, 176, 407, 375, 113, 170, 313, 364, 33, 361, 426, 121, 371, 232, 327, 480, 75, 37, 312, 225, 195, 244, 406, 268, 230, 321, 257, 274, 197, 35, 501, 360, 72, 213, 79, 1, 466, 362, 160, 444, 437, 5, 59, 108, 454, 73, 374, 509, 337, 183, 252, 21, 314, 100, 200, 159, 379, 405, 367, 432, 181, 8, 420, 118, 284, 281, 465, 456, 359, 291, 330, 258, 523, 243, 487, 408, 392, 15, 231, 482, 481, 70, 171, 182, 31, 409, 492, 471, 53, 45, 448, 83, 527, 452, 350, 423, 93, 447, 130, 126, 54, 354, 169, 253, 49, 42, 431, 305, 498, 216, 189, 508, 122, 308, 228, 190, 293, 451, 63, 133, 304, 397, 425, 333, 19, 158, 391, 153, 282, 112, 64, 502, 7, 16, 469, 163, 136, 40, 99, 302, 264, 325, 434, 187, 311, 286, 278, 179, 109, 348, 287, 467, 400, 164, 384, 422, 43, 117, 91, 276, 211, 175, 329, 541, 438, 145, 534, 218, 177, 317, 222, 210, 162, 402, 98, 299, 245, 385, 233, 188, 516, 143, 13, 532, 429, 172, 455, 470, 518, 236, 296, 388, 468, 110, 395, 185, 25, 489, 196, 120, 435, 165, 168, 271, 74, 510, 36, 76, 208, 223, 270, 515, 421, 87, 66, 473, 220, 46, 486, 102, 38, 156, 48, 132, 331, 51, 403, 234, 23, 449, 341, 303, 410, 479, 203, 413, 512, 513, 9, 446, 511, 55, 6, 339, 418, 476, 178, 266, 22, 141, 259, 349, 86, 144, 34, 290, 326, 318, 519, 424, 127, 174, 472, 116, 17, 152, 280, 215, 514, 103, 377, 537, 373, 238, 47, 353, 428, 94, 214, 61, 123, 386, 351, 246, 411, 101, 249, 240, 520, 307, 288, 199, 147, 436, 77, 464, 414', 'source': u'test.hex', 'validation': u'test.hex', 'max_depth': 5} 

                ### print "Using these parameters for GBM: ", params
                kwargs = params.copy()
                h2o.beta_features = True

                # GBM train****************************************
                trainStart = time.time()
                gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult,
                    noPoll=True, timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs)
                # hack
                if h2o.beta_features:
                    h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs)
                trainElapsed = time.time() - trainStart
                print "GBM training completed in", trainElapsed, "seconds. On dataset: ", trainFilename

                gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey)
                # errrs from end of list? is that the last tree?
                errsLast = gbmTrainView['gbm_model']['errs'][-1]
                print "GBM 'errsLast'", errsLast

                cm = gbmTrainView['gbm_model']['cm']
                pctWrongTrain = h2o_gbm.pp_cm_summary(cm);
                print "Last line of this cm might be NAs, not CM"
                print "\nTrain\n==========\n"
                print h2o_gbm.pp_cm(cm)

                # GBM test****************************************
                predictKey = 'Predict.hex'
                ### h2o_cmd.runInspect(key=parseTestResult['destination_key'])
                start = time.time()
                gbmTestResult = h2o_cmd.runPredict(
                    data_key=parseTestResult['destination_key'], 
                    model_key=modelKey,
                    destination_key=predictKey,
                    timeoutSecs=timeoutSecs)
                # hack
                if h2o.beta_features:
                    h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs)
                elapsed = time.time() - start
                print "GBM predict completed in", elapsed, "seconds. On dataset: ", testFilename

                print "This is crazy!"
                gbmPredictCMResult =h2o.nodes[0].predict_confusion_matrix(
                    actual=parseTestResult['destination_key'],
                    vactual=response,
                    predict=predictKey,
                    vpredict='predict', # choices are 0 and 'predict'
                    )

                # errrs from end of list? is that the last tree?
                # all we get is cm
                cm = gbmPredictCMResult['cm']

                # These will move into the h2o_gbm.py
                pctWrong = h2o_gbm.pp_cm_summary(cm);
                print "Last line of this cm is really NAs, not CM"
                print "\nTest\n==========\n"
                print h2o_gbm.pp_cm(cm)

                # xList.append(ntrees)
                xList.append(max_depth)
                eList.append(pctWrong)
                fList.append(trainElapsed)

            h2o.beta_features = False
            xLabel = 'max_depth'
            eLabel = 'pctWrong'
            fLabel = 'trainElapsed'
            eListTitle = ""
            fListTitle = ""
            h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
Esempio n. 38
0
    def test_c9_GLM_rc_fvec(self):
        h2o.beta_features = True

        files = [
                 ('c16', '140k_train_anonymised.csv', 'rc.hex', 1800,  None)
                ]

        for importFolderPath, csvFilename, trainKey, timeoutSecs, response in files:
            # PARSE train****************************************
            csvPathname = importFolderPath + "/" + csvFilename
            
            start = time.time()
            # avoid printing the coefficient names in jenkins output
            # the last col is the response, so we use a number to point to it below
            parseResult = h2i.import_parse(bucket='0xcustomer-datasets', path=csvPathname, schema='local', hex_key=trainKey, 
                header=0, timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "parse end on ", csvFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

            inspect = h2o_cmd.runInspect(key=parseResult['destination_key'])
            numRows = inspect['numRows']
            numCols = inspect['numCols']
            response = numCols-1

            # GLM (train)****************************************
            params = {
                # 'lambda': 1e-4,
                # 'alpha': 0.5,
                'lambda': 1e-8,
                'alpha': 0.0,
                'max_iter': 10,
                'n_folds': 0,
                'family': 'binomial',
                'destination_key': "GLMKEY",
                'response': response,
            }
            kwargs = params.copy()
            timeoutSecs = 1800
            start = time.time()
            glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs,**kwargs)
            elapsed = time.time() - start
            print "GLM training completed in", elapsed, "seconds. On dataset: ", csvFilename
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)

            if h2o.beta_features:
                modelKey = glm['glm_model']['_key']

                submodels = glm['glm_model']['submodels']
                # hackery to make it work when there's just one
                validation = submodels[-1]['validation']
                best_threshold = validation['best_threshold']
                thresholds = validation['thresholds']
                # have to look up the index for the cm, from the thresholds list
                best_index = None
                for i,t in enumerate(thresholds):
                    if t == best_threshold:
                        best_index = i
                        break
                cms = validation['_cms']
                cm = cms[best_index]
                pctWrong = h2o_gbm.pp_cm_summary(cm['_arr']);
                # FIX! should look at prediction error/class error?
                # self.assertLess(pctWrong, 9,"Should see less than 40% error")

                print "\nTrain\n==========\n"
                print h2o_gbm.pp_cm(cm['_arr'])

                # Score *******************************
                # this messes up if you use case_mode/case_vale above
                predictKey = 'Predict.hex'
                start = time.time()

                predictResult = h2o_cmd.runPredict(
                    data_key=trainKey,
                    model_key=modelKey,
                    destination_key=predictKey,
                    timeoutSecs=timeoutSecs)

                predictCMResult = h2o.nodes[0].predict_confusion_matrix(
                    actual=trainKey,
                    vactual=response,
                    predict=predictKey,
                    vpredict='predict',
                    )

                cm = predictCMResult['cm']
                # These will move into the h2o_gbm.py
                pctWrong = h2o_gbm.pp_cm_summary(cm);
                # self.assertLess(pctWrong, 40,"Should see less than 40% error")

                print "\nTest\n==========\n"
                print h2o_gbm.pp_cm(cm)


        h2i.delete_keys_at_all_nodes(timeoutSecs=600)
Esempio n. 39
0
    def test_GBM_regression_rand2(self):
        h2o.beta_features = True
        bucket = 'home-0xdiag-datasets'
        modelKey = 'GBMModelKey'
        files = [
                # ('standard', 'covtype.shuffled.90pct.data', 'covtype.train.hex', 1800, 54, 'covtype.shuffled.10pct.data', 'covtype.test.hex')
                ('standard', 'covtype.shuffled.10pct.sorted.data', 'covtype.train.hex', 1800, 'C54', 'covtype.shuffled.10pct.data', 'covtype.test.hex')
                ]

        for (importFolderPath, trainFilename, trainKey, timeoutSecs, response, testFilename, testKey) in files:
            # PARSE train****************************************
            start = time.time()
            xList = []
            eList = []
            fList = []

            # Parse (train)****************************************
            parseTrainResult = h2i.import_parse(bucket=bucket, path=importFolderPath + "/" + trainFilename, schema='local',
                hex_key=trainKey, timeoutSecs=timeoutSecs, doSummary=False)
            elapsed = time.time() - start
            print "train parse end on ", trainFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "train parse result:", trainKey

            # Parse (test)****************************************
            parseTestResult = h2i.import_parse(bucket=bucket, path=importFolderPath + "/" + testFilename, schema='local',
                hex_key=testKey, timeoutSecs=timeoutSecs, doSummary=False)
            elapsed = time.time() - start
            print "test parse end on ", testFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "test parse result:", testKey

            paramsDict = define_gbm_params()
            for trial in range(3):
                # use this to set any defaults you want if the pick doesn't set
                print "Regression!"
                params = {'response': 'C54', 'ignored_cols_by_name': 'C5,C6,C7,C8,C9', 'ntrees': 2, 'classification': 0}
                h2o_gbm.pickRandGbmParams(paramsDict, params)
                print "Using these parameters for GBM: ", params
                kwargs = params.copy()

                # GBM train****************************************
                trainStart = time.time()
                gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult,
                    timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs)
                trainElapsed = time.time() - trainStart
                print "GBM training completed in", trainElapsed, "seconds. On dataset: ", trainFilename

                gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey)
                # errrs from end of list? is that the last tree?
                errsLast = gbmTrainView['gbm_model']['errs'][-1]
                print "GBM 'errsLast'", errsLast

                # for regression, the cms are all null, so don't print

                # GBM test****************************************
                predictKey = 'Predict.hex'
                start = time.time()
                gbmTestResult = h2o_cmd.runPredict(
                    data_key=testKey,
                    model_key=modelKey,
                    destination_key=predictKey,
                    timeoutSecs=timeoutSecs)
                elapsed = time.time() - start
                print "GBM predict completed in", elapsed, "seconds. On dataset: ", testFilename
                print "FIX! where do we get the summary info on the test data after predict?"
    def test_GLM2_covtype_train_predict_all_all(self):
        h2o.beta_features = True
        importFolderPath = "standard"
        csvFilename = 'covtype.shuffled.data'
        csvPathname = importFolderPath + "/" + csvFilename
        hex_key = csvFilename + ".hex"

        # Parse and Exec************************************************
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=180)

        execExpr="A.hex=%s" % parseResult['destination_key']
        h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)

        # use exec to change the output col to binary, case_mode/case_val doesn't work if we use predict
        # will have to live with random extract. will create variance
        # class 4 = 1, everything else 0
        y = 54
        execExpr="A.hex[,%s]=(A.hex[,%s]==%s)" % (y+1, y+1, 1) # class 1
        h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)

        inspect = h2o_cmd.runInspect(key="A.hex")
        print "\n" + csvPathname, \
            "    numRows:", "{:,}".format(inspect['numRows']), \
            "    numCols:", "{:,}".format(inspect['numCols'])

        print "Use same data (full) for train and test"
        trainDataKey = "A.hex"
        testDataKey = "A.hex"
        # start at 90% rows + 1
        
        # GLM, predict, CM*******************************************************8
        kwargs = {
            'response': 'C' + str(y+1),
            'max_iter': 20, 
            'n_folds': 0, 
            # 'alpha': 0.1, 
            # 'lambda': 1e-5, 
            'alpha': 0.0,
            'lambda': None,
            'family': 'binomial',
        }
        timeoutSecs = 60

        for trial in range(1):
            # test/train split **********************************************8
            aHack = {'destination_key': trainDataKey}

            # GLM **********************************************8
            start = time.time()
            glm = h2o_cmd.runGLM(parseResult=aHack, timeoutSecs=timeoutSecs, pollTimeoutSecs=180, **kwargs)
            print "glm end on ", parseResult['destination_key'], 'took', time.time() - start, 'seconds'
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)

            modelKey = glm['glm_model']['_key']
            submodels = glm['glm_model']['submodels']
            # hackery to make it work when there's just one
            validation = submodels[-1]['validation']
            best_threshold = validation['best_threshold']
            thresholds = validation['thresholds']

            # have to look up the index for the cm, from the thresholds list
            best_index = None
            for i,t in enumerate(thresholds):
                if t == best_threshold:
                    best_index = i
                    break
            cms = validation['_cms']
            cm = cms[best_index]
            trainPctWrong = h2o_gbm.pp_cm_summary(cm['_arr']);

            # Score **********************************************
            predictKey = 'Predict.hex'
            start = time.time()

            predictResult = h2o_cmd.runPredict(
                data_key=testDataKey,
                model_key=modelKey,
                destination_key=predictKey,
                timeoutSecs=timeoutSecs)

            predictCMResult = h2o.nodes[0].predict_confusion_matrix(
                actual=testDataKey,
                vactual='C' + str(y+1),
                predict=predictKey,
                vpredict='predict',
                )

            cm = predictCMResult['cm']

            # These will move into the h2o_gbm.py
            pctWrong = h2o_gbm.pp_cm_summary(cm);
            self.assertEqual(pctWrong, trainPctWrong,"Should see the same error rate on train and predict? (same data set)")

            print "\nTest\n==========\n"
            print h2o_gbm.pp_cm(cm)

            print "Trial #", trial, "completed"
Esempio n. 41
0
    def test_GBM_manyfiles_train_test(self):
        bucket = 'home-0xdiag-datasets'
        modelKey = 'GBMModelKey'
        if h2o.localhost:
            files = [
                # None forces num_cols to be used. assumes you set it from Inspect
                ('manyfiles-nflx-gz', 'file_1[0-9][0-9].dat.gz', 'file_100.hex', 1800, None, 'file_1.dat.gz', 'file_1_test.hex')
                ]
        else:
            files = [
                # None forces num_cols to be used. assumes you set it from Inspect
                ('manyfiles-nflx-gz', 'file_[0-9].dat.gz', 'file_10.hex', 1800, None, 'file_1[0-9].dat.gz', 'file_10_test.hex')
                ]

        # if I got to hdfs, it's here
        # hdfs://172.16.2.176/datasets/manyfiles-nflx-gz/file_99.dat.gz

        # h2b.browseTheCloud()
        for (importFolderPath, trainFilename, trainKey, timeoutSecs, response, testFilename, testKey) in files:
            # PARSE train****************************************
            start = time.time()
            xList = []
            eList = []
            fList = []

            # Parse (train)****************************************
            csvPathname = importFolderPath + "/" + trainFilename
            parseTrainResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='s3n',
                hex_key=trainKey, timeoutSecs=timeoutSecs, doSummary=False)

            elapsed = time.time() - start
            print "train parse end on ", trainFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "train parse result:", parseTrainResult['destination_key']

            ### h2o_cmd.runSummary(key=parsTraineResult['destination_key'])
            inspect = h2o_cmd.runInspect(key=parseTrainResult['destination_key'])
            print "\n" + csvPathname, \
                "    num_rows:", "{:,}".format(inspect['num_rows']), \
                "    num_cols:", "{:,}".format(inspect['num_cols'])
            num_rows = inspect['num_rows']
            num_cols = inspect['num_cols']

            # Make col 378 it something we can do binomial regression on!
            execExpr = '%s[,378] = %s[,378]>15 ? 1 : 0' % (trainKey, trainKey)
            resultExec = h2o_cmd.runExec(str=execExpr, timeoutSecs=500)

            # Parse (test)****************************************
            parseTestResult = h2i.import_parse(bucket=bucket, path=importFolderPath + "/" + testFilename, schema='local',
                hex_key=testKey, timeoutSecs=timeoutSecs, doSummary=False)

            elapsed = time.time() - start
            print "test parse end on ", testFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "test parse result:", parseTestResult['destination_key']

            # Make col 378 it something we can do binomial regression on!
            print "Slow! exec is converting all imported keys?, not just what was parsed"
            execExpr = '%s[,378] = %s[,378]>15 ? 1 : 0' % (testKey, testKey, testKey)
            resultExec = h2o_cmd.runExec(str=execExpr, timeoutSecs=300)

            # Note ..no inspect of test data here..so translate happens later?

            # GBM (train iterate)****************************************
            # if not response:
            #     response = num_cols - 1
            response = 378
            print "Using the same response %s for train and test (which should have a output value too)" % response

            ntrees = 10
            for max_depth in [5,10,20,40]:
                params = {
                    'learn_rate': .2,
                    'nbins': 1024,
                    'ntrees': ntrees,
                    'max_depth': max_depth,
                    'min_rows': 10,
                    'response': response,
                    # 'ignored_cols': 
                }
                print "Using these parameters for GBM: ", params
                kwargs = params.copy()

                # GBM train****************************************
                trainStart = time.time()
                gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult,
                    timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs)
                trainElapsed = time.time() - trainStart
                print "GBM training completed in", trainElapsed, "seconds. On dataset: ", trainFilename

                gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey)
                # errrs from end of list? is that the last tree?
                errsLast = gbmTrainView['gbm_model']['errs'][-1]
                print "GBM 'errsLast'", errsLast

                cm = gbmTrainView['gbm_model']['cm']
                pctWrongTrain = h2o_gbm.pp_cm_summary(cm);
                print "Last line of this cm might be NAs, not CM"
                print "\nTrain\n==========\n"
                print h2o_gbm.pp_cm(cm)

                # GBM test****************************************
                if doPredict:
                    predictKey = 'Predict.hex'
                    ### h2o_cmd.runInspect(key=parseTestResult['destination_key'])
                    start = time.time()
                    gbmTestResult = h2o_cmd.runPredict(
                        data_key=parseTestResult['destination_key'], 
                        model_key=modelKey,
                        destination_key=predictKey,
                        timeoutSecs=timeoutSecs)
                    elapsed = time.time() - start
                    print "GBM predict completed in", elapsed, "seconds. On dataset: ", testFilename

                    print "This is crazy!"
                    gbmPredictCMResult =h2o.nodes[0].predict_confusion_matrix(
                        actual=parseTestResult['destination_key'],
                        vactual=response,
                        predict=predictKey,
                        vpredict='predict', # choices are 0 and 'predict'
                        )

                    # errrs from end of list? is that the last tree?
                    # all we get is cm
                    cm = gbmPredictCMResult['cm']

                    # These will move into the h2o_gbm.py
                    pctWrong = h2o_gbm.pp_cm_summary(cm);
                    print "Last line of this cm is really NAs, not CM"
                    print "\nTest\n==========\n"
                    print h2o_gbm.pp_cm(cm)

                    # xList.append(ntrees)
                    xList.append(max_depth)
                    eList.append(pctWrong)
                    fList.append(trainElapsed)


            if doPredict:
                xLabel = 'max_depth'
                eLabel = 'pctWrong'
                fLabel = 'trainElapsed'
                eListTitle = ""
                fListTitle = ""
                h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
Esempio n. 42
0
    def test_GLM2_mnist_reals(self):
        h2o.beta_features = True
        importFolderPath = "mnist"
        csvFilelist = [
            ("mnist_reals_training.csv.gz", "mnist_reals_testing.csv.gz",    600), 
        ]
        trial = 0
        for (trainCsvFilename, testCsvFilename, timeoutSecs) in csvFilelist:
            trialStart = time.time()

            # PARSE test****************************************
            testKey = testCsvFilename + "_" + str(trial) + ".hex"
            start = time.time()
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path="mnist/" + testCsvFilename, schema='put',
                hex_key=testKey, timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "parse end on ", testCsvFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "We won't use this pruning of x on test data. See if it prunes the same as the training"
            y = 0 # first column is pixel value
            print "y:"
            x = h2o_glm.goodXFromColumnInfo(y, key=parseResult['destination_key'], timeoutSecs=300)

            # PARSE train****************************************
            trainKey = trainCsvFilename + "_" + str(trial) + ".hex"
            start = time.time()
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path="mnist/" + trainCsvFilename, schema='put',
                hex_key=trainKey, timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "parse end on ", trainCsvFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseResult['destination_key']

            # GLM****************************************
            print "This is the pruned x GLM will use"
            x = h2o_glm.goodXFromColumnInfo(y, key=parseResult['destination_key'], timeoutSecs=300)
            print "x:", x

            modelKey = "mnist"
            params = {
                'response': y,
                'family': 'binomial',
                'lambda': 1.0E-5,
                'alpha': 0.0,
                'max_iter': 10,
                'n_folds': 1,
                'beta_epsilon': 1.0E-4,
                'destination_key': modelKey
                }

            # for c in [0,1,2,3,4,5,6,7,8,9]:
            # just do a couple digits
            for c in [0,7]:
                print "Trying binomial with case:", c
                execExpr="A.hex=%s;A.hex[,%s]=(A.hex[,%s]==%s)" % (trainKey, y+1, y+1, c)
                h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)

                kwargs = params.copy()

                timeoutSecs = 1800
                start = time.time()
                aHack = {'destination_key': 'A.hex'}
                glm = h2o_cmd.runGLM(parseResult=aHack, timeoutSecs=timeoutSecs, pollTimeoutSecs=60, **kwargs)
                elapsed = time.time() - start
                print "GLM completed in", elapsed, "seconds.", \
                    "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

                h2o_glm.simpleCheckGLM(self, glm, None, noPrint=True, **kwargs)

               # Score **********************************************
                execExpr="B.hex=%s;B.hex[,%s]=(B.hex[,%s]==%s)" % (testKey, y+1, y+1, c)
                h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)

                print "Problems with test data having different enums than train? just use train for now"
                predictKey = 'Predict.hex'
                start = time.time()

                predictResult = h2o_cmd.runPredict(
                    data_key="B.hex",
                    model_key=modelKey,
                    destination_key=predictKey,
                    timeoutSecs=timeoutSecs)

                predictCMResult = h2o.nodes[0].predict_confusion_matrix(
                    actual="B.hex",
                    vactual='C' + str(y+1),
                    predict=predictKey,
                    vpredict='predict',
                    )

                cm = predictCMResult['cm']

                # These will move into the h2o_gbm.py
                pctWrong = h2o_gbm.pp_cm_summary(cm);
                # self.assertLess(pctWrong, 8,"Should see less than 7 pct error (class = 4): %s" % pctWrong)

                print "\nTest\n==========\n"
                print h2o_gbm.pp_cm(cm)
Esempio n. 43
0
    def test_NN_mnist(self):
        #h2b.browseTheCloud()
        csvPathname_train = 'standard/covtype.shuffled.90pct.data'
        csvPathname_test = 'standard/covtype.shuffled.10pct.data'
        hex_key = 'covtype.hex'
        validation_key = 'covtype.hex'
        timeoutSecs = 30
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets',
                                       path=csvPathname_train,
                                       schema='local',
                                       hex_key=hex_key,
                                       timeoutSecs=timeoutSecs)
        parseResultV = h2i.import_parse(bucket='home-0xdiag-datasets',
                                        path=csvPathname_test,
                                        schema='local',
                                        hex_key=validation_key,
                                        timeoutSecs=timeoutSecs)

        inspect = h2o_cmd.runInspect(None, hex_key)
        print "\n" + csvPathname_train, \
            "    numRows:", "{:,}".format(inspect['numRows']), \
            "    numCols:", "{:,}".format(inspect['numCols'])
        response = inspect['numCols'] - 1

        #Making random id
        identifier = ''.join(
            random.sample(string.ascii_lowercase + string.digits, 10))
        model_key = 'nn_' + identifier + '.hex'

        kwargs = {
            'ignored_cols': None,
            'response': response,
            'classification': 1,
            'activation': 'RectifierWithDropout',
            'input_dropout_ratio': 0.2,
            'hidden': '117,131,129',
            'adaptive_rate': 0,
            'rate': 0.005,
            'rate_annealing': 1e-6,
            'momentum_start': 0.5,
            'momentum_ramp': 100000,
            'momentum_stable': 0.9,
            'l1': 0.00001,
            'l2': 0.0000001,
            'seed': 98037452452,
            'loss': 'CrossEntropy',
            'max_w2': 15,
            'initial_weight_distribution': 'UniformAdaptive',
            #'initial_weight_scale'         : 0.01,
            'epochs': 96.0,
            'destination_key': model_key,
            'validation': validation_key,
            'score_interval': 10000
        }
        expectedErr = 0.24  ## expected validation error for the above model
        relTol = 0.20  ## 20% rel. error tolerance due to Hogwild!

        timeoutSecs = 600
        start = time.time()
        nn = h2o_cmd.runDeepLearning(parseResult=parseResult,
                                     timeoutSecs=timeoutSecs,
                                     **kwargs)
        print "neural net end on ", csvPathname_train, " and ", csvPathname_test, 'took', time.time(
        ) - start, 'seconds'

        predict_key = 'score_' + identifier + '.hex'

        kwargs = {
            'data_key': validation_key,
            'destination_key': predict_key,
            'model_key': model_key
        }

        predictResult = h2o_cmd.runPredict(timeoutSecs=timeoutSecs, **kwargs)

        h2o_cmd.runInspect(key=predict_key, verbose=True)

        kwargs = {}

        predictCMResult = h2o.nodes[0].predict_confusion_matrix(
            actual=validation_key,
            vactual=response,
            predict=predict_key,
            vpredict='predict',
            timeoutSecs=timeoutSecs,
            **kwargs)

        cm = predictCMResult['cm']

        print h2o_gbm.pp_cm(cm)
        actualErr = h2o_gbm.pp_cm_summary(cm) / 100.

        print "actual   classification error:" + format(actualErr)
        print "expected classification error:" + format(expectedErr)
        if actualErr != expectedErr and abs(
            (expectedErr - actualErr) / expectedErr) > relTol:
            raise Exception(
                "Scored classification error of %s is not within %s %% relative error of %s"
                % (actualErr, float(relTol) * 100, expectedErr))
Esempio n. 44
0
    def test_NN_mnist(self):
        #h2b.browseTheCloud()
        csvPathname_train = 'mnist/train.csv.gz'
        csvPathname_test  = 'mnist/test.csv.gz'
        hex_key = 'mnist_train.hex'
        validation_key = 'mnist_test.hex'
        timeoutSecs = 30
        parseResult  = h2i.import_parse(bucket='smalldata', path=csvPathname_train, schema='put', hex_key=hex_key, timeoutSecs=timeoutSecs)
        parseResultV = h2i.import_parse(bucket='smalldata', path=csvPathname_test, schema='put', hex_key=validation_key, timeoutSecs=timeoutSecs)

        inspect = h2o_cmd.runInspect(None, hex_key)
        print "\n" + csvPathname_train, \
            "    numRows:", "{:,}".format(inspect['numRows']), \
            "    numCols:", "{:,}".format(inspect['numCols'])
        response = inspect['numCols'] - 1

        #Making random id
        identifier = ''.join(random.sample(string.ascii_lowercase + string.digits, 10))
        model_key = 'nn_' + identifier + '.hex'

        kwargs = {
            'ignored_cols'                 : None,
            'response'                     : response,
            'classification'               : 1,
            'activation'                   : 'RectifierWithDropout',
            'input_dropout_ratio'          : 0.2,
            'hidden'                       : '117,131,129',
            'adaptive_rate'                : 0,
            'rate'                         : 0.005,
            'rate_annealing'               : 1e-6,
            'momentum_start'               : 0.5,
            'momentum_ramp'                : 100000,
            'momentum_stable'              : 0.9,
            'l1'                           : 0.00001,
            'l2'                           : 0.0000001,
            'seed'                         : 98037452452,
            'loss'                         : 'CrossEntropy',
            'max_w2'                       : 15,
            'initial_weight_distribution'  : 'UniformAdaptive',
            #'initial_weight_scale'         : 0.01,
            'epochs'                       : 2.0,
            'destination_key'              : model_key,
            'validation'                   : validation_key,
            'score_interval'               : 10000
            }
        expectedErr = 0.057 ## expected validation error for the above model
        relTol = 0.20 ## 20% rel. error tolerance due to Hogwild!

        timeoutSecs = 600
        start = time.time()
        nn = h2o_cmd.runDeepLearning(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
        print "neural net end on ", csvPathname_train, " and ", csvPathname_test, 'took', time.time() - start, 'seconds'

        predict_key = 'score_' + identifier + '.hex'

        kwargs = {
            'data_key': validation_key,
            'destination_key': predict_key,
            'model_key': model_key
            }

        predictResult = h2o_cmd.runPredict(timeoutSecs=timeoutSecs, **kwargs)

        h2o_cmd.runInspect(key=predict_key, verbose=True)

        kwargs = {
        }

        predictCMResult = h2o.nodes[0].predict_confusion_matrix(
            actual=validation_key,
            vactual=response,
            predict=predict_key,
            vpredict='predict',
            timeoutSecs=timeoutSecs, **kwargs)

        cm = predictCMResult['cm']

        print h2o_gbm.pp_cm(cm)
        actualErr = h2o_gbm.pp_cm_summary(cm)/100.;

        print "actual   classification error:" + format(actualErr)
        print "expected classification error:" + format(expectedErr)
        if actualErr != expectedErr and abs((expectedErr - actualErr)/expectedErr) > relTol:
            raise Exception("Scored classification error of %s is not within %s %% relative error of %s" %
                            (actualErr, float(relTol)*100, expectedErr))
Esempio n. 45
0
    def test_rf_covtype20x_fvec(self):
        h2o.beta_features = True
        importFolderPath = 'standard'

        if DO_SMALL:
            csvFilenameTrain = 'covtype.data'
            hex_key = 'covtype1x.data.A.hex'
        else:
            csvFilenameTrain = 'covtype20x.data'
            hex_key = 'covtype20x.data.A.hex'

        csvPathname = importFolderPath + "/" + csvFilenameTrain
        parseResultTrain = h2i.import_parse(bucket='home-0xdiag-datasets',
                                            path=csvPathname,
                                            hex_key=hex_key,
                                            timeoutSecs=500)
        inspect = h2o_cmd.runInspect(key=parseResultTrain['destination_key'])
        dataKeyTrain = parseResultTrain['destination_key']
        print "Parse end", dataKeyTrain

        # have to re import since source key is gone
        # we could just copy the key, but sometimes we change the test/train data  to covtype.data
        if DO_SMALL:
            csvFilenameTest = 'covtype.data'
            hex_key = 'covtype1x.data.B.hex'
            dataKeyTest2 = 'covtype1x.data.C.hex'
        else:
            csvFilenameTest = 'covtype20x.data'
            hex_key = 'covtype20x.data.B.hex'
            dataKeyTest2 = 'covtype20x.data.C.hex'

        csvPathname = importFolderPath + "/" + csvFilenameTest
        parseResultTest = h2i.import_parse(bucket='home-0xdiag-datasets',
                                           path=csvPathname,
                                           hex_key=hex_key,
                                           timeoutSecs=500)
        print "Parse result['destination_key']:", parseResultTest[
            'destination_key']
        inspect = h2o_cmd.runInspect(key=parseResultTest['destination_key'])
        dataKeyTest = parseResultTest['destination_key']
        print "Parse end", dataKeyTest

        # make a 3rd key so the predict is uncached too!
        execExpr = dataKeyTest2 + "=" + dataKeyTest
        kwargs = {'str': execExpr, 'timeoutSecs': 15}
        resultExec = h2o_cmd.runExec(**kwargs)

        # train
        # this does RFView to understand when RF completes, so the time reported for RFView here, should be
        # considered the "first RFView" times..subsequent have some caching?.
        # unless the no_confusion_matrix works

        # params is mutable. This is default.
        paramDict = drf2ParamDict
        params = {'ntrees': 20, 'destination_key': 'RF_model'}

        colX = h2o_rf.pickRandRfParams(paramDict, params)

        kwargs = params.copy()
        timeoutSecs = 30 + kwargs['ntrees'] * 60

        start = time.time()
        rf = h2o_cmd.runRF(parseResult=parseResultTrain,
                           timeoutSecs=timeoutSecs,
                           retryDelaySecs=1,
                           **kwargs)
        print "rf job end on ", dataKeyTrain, 'took', time.time(
        ) - start, 'seconds'

        print "\nRFView start after job completion"
        model_key = kwargs['destination_key']
        ntree = kwargs['ntrees']

        start = time.time()
        # this does the RFModel view for v2. but only model_key is used. Data doesn't matter? (nor ntree)
        h2o_cmd.runRFView(None,
                          dataKeyTrain,
                          model_key,
                          ntree=ntree,
                          timeoutSecs=timeoutSecs)
        print "First rfview end on ", dataKeyTrain, 'took', time.time(
        ) - start, 'seconds'

        for trial in range(1):
            # scoring
            start = time.time()
            rfView = h2o_cmd.runRFView(None,
                                       dataKeyTest,
                                       model_key,
                                       ntree=ntree,
                                       timeoutSecs=timeoutSecs,
                                       out_of_bag_error_estimate=0,
                                       retryDelaySecs=1)
            print "rfview", trial, "end on ", dataKeyTest, 'took', time.time(
            ) - start, 'seconds.'

            (classification_error, classErrorPctList,
             totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView, ntree=ntree)
            self.assertAlmostEqual(
                classification_error,
                50,
                delta=50,
                msg="Classification error %s differs too much" %
                classification_error)
            start = time.time()
            predict = h2o.nodes[0].generate_predictions(model_key=model_key,
                                                        data_key=dataKeyTest2)
            print "predict", trial, "end on ", dataKeyTest, 'took', time.time(
            ) - start, 'seconds.'

            parseKey = parseResultTrain['destination_key']
            rfModelKey = rfView['drf_model']['_key']
            predictKey = 'Predict.hex'
            start = time.time()

            predictResult = h2o_cmd.runPredict(data_key=parseKey,
                                               model_key=rfModelKey,
                                               destination_key=predictKey,
                                               timeoutSecs=timeoutSecs)

            predictCMResult = h2o.nodes[0].predict_confusion_matrix(
                actual=parseKey,
                vactual='C55',
                predict=predictKey,
                vpredict='predict',
            )

            cm = predictCMResult['cm']

            # These will move into the h2o_gbm.py
            pctWrong = h2o_gbm.pp_cm_summary(cm)
            print "\nTest\n==========\n"
            print h2o_gbm.pp_cm(cm)

            print "Trial #", trial, "completed"
Esempio n. 46
0
    def test_GLM2_covtype_train(self):
        h2o.beta_features = True
        importFolderPath = "standard"
        csvFilename = 'covtype.shuffled.data'
        csvPathname = importFolderPath + "/" + csvFilename
        hex_key = csvFilename + ".hex"

        # Parse and Exec************************************************
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=180)

        execExpr="A.hex=%s" % parseResult['destination_key']
        h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)

        # use exec to change the output col to binary, case_mode/case_val doesn't work if we use predict
        # will have to live with random extract. will create variance
        # class 4 = 1, everything else 0
        y = 54
        execExpr="A.hex[,%s]=(A.hex[,%s]==%s)" % (y+1, y+1, 4)
        h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)

        inspect = h2o_cmd.runInspect(key="A.hex")
        print "\n" + csvPathname, \
            "    numRows:", "{:,}".format(inspect['numRows']), \
            "    numCols:", "{:,}".format(inspect['numCols'])

        # Split Test/Train************************************************
        # how many rows for each pct?
        numRows = inspect['numRows']
        pct10 = int(numRows * .1)
        rowsForPct = [i * pct10 for i in range(0,11)]
        # this can be slightly less than 10%
        last10 = numRows - rowsForPct[9]
        rowsForPct[10] = last10
        # use mod below for picking "rows-to-do" in case we do more than 9 trials
        # use 10 if 0 just to see (we copied 10 to 0 above)
        rowsForPct[0] = rowsForPct[10]

        print "Creating the key of the last 10% data, for scoring"
        trainDataKey = "rTrain"
        testDataKey = "rTest"
        # start at 90% rows + 1
        
        # GLM, predict, CM*******************************************************8
        kwargs = {
            'response': 'C' + str(y+1),
            'max_iter': 20, 
            'n_folds': 0, 
            'alpha': 0.1, 
            'lambda': 1e-5, 
            'family': 'binomial',
        }
        timeoutSecs = 180

        for trial in range(10):
            # always slice from the beginning
            rowsToUse = rowsForPct[trial%10] 

            # test/train split **********************************************8
            h2o_cmd.createTestTrain(srcKey='A.hex', trainDstKey=trainDataKey, testDstKey=testDataKey, trainPercent=90)
            aHack = {'destination_key': trainDataKey}
            parseKey = trainDataKey

            # GLM **********************************************8
            start = time.time()
            glm = h2o_cmd.runGLM(parseResult=aHack, timeoutSecs=timeoutSecs, pollTimeoutSecs=180, **kwargs)
            print "glm end on ", parseResult['destination_key'], 'took', time.time() - start, 'seconds'
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
            modelKey = glm['glm_model']['_key']

            # Score **********************************************
            predictKey = 'Predict.hex'
            start = time.time()

            predictResult = h2o_cmd.runPredict(
                data_key=testDataKey,
                model_key=modelKey,
                destination_key=predictKey,
                timeoutSecs=timeoutSecs)

            predictCMResult = h2o.nodes[0].predict_confusion_matrix(
                actual=testDataKey,
                vactual='C' + str(y+1),
                predict=predictKey,
                vpredict='predict',
                )

            cm = predictCMResult['cm']

            # These will move into the h2o_gbm.py
            pctWrong = h2o_gbm.pp_cm_summary(cm);
            self.assertLess(pctWrong, 8,"Should see less than 7% error (class = 4)")

            print "\nTest\n==========\n"
            print h2o_gbm.pp_cm(cm)

            print "Trial #", trial, "completed", "using %6.2f" % (rowsToUse*100.0/numRows), "pct. of all rows"
Esempio n. 47
0
    def test_GBM_manyfiles_train_test(self):
        h2o.beta_features = True
        bucket = 'home-0xdiag-datasets'
        modelKey = 'GBMModelKey'
        if localhost:
            files = [
                # None forces numCols to be used. assumes you set it from Inspect
                # problems with categoricals not in the train data set? (warnings in h2o stdout)
                ## ('manyfiles-nflx-gz', 'file_1.dat.gz', 'file_1.hex', 1800, None, 'file_11.dat.gz', 'test.hex')
                # just use matching
                ('manyfiles-nflx-gz', 'file_1.dat.gz', 'train.hex', 1800, None, 'file_1.dat.gz', 'test.hex')
                ]
        else:
            files = [
                # None forces numCols to be used. assumes you set it from Inspect
                ('manyfiles-nflx-gz', 'file_[0-9].dat.gz', 'train.hex', 1800, None, 'file_1[0-9].dat.gz', 'test.hex')
                ]

        # if I got to hdfs, it's here
        # hdfs://172.16.2.176/datasets/manyfiles-nflx-gz/file_99.dat.gz

        h2b.browseTheCloud()
        for (importFolderPath, trainFilename, trainKey, timeoutSecs, response, testFilename, testKey) in files:
            # PARSE train****************************************
            start = time.time()
            xList = []
            eList = []
            fList = []

            # Parse (train)****************************************
            csvPathname = importFolderPath + "/" + trainFilename
            parseTrainResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local',
                hex_key=trainKey, timeoutSecs=timeoutSecs, doSummary=False)
            elapsed = time.time() - start
            print "train parse end on ", trainFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "train parse result:", parseTrainResult['destination_key']

            ### h2o_cmd.runSummary(key=parsTraineResult['destination_key'])

            inspect = h2o_cmd.runInspect(key=parseTrainResult['destination_key'])
            print "\n" + csvPathname, \
                "    numRows:", "{:,}".format(inspect['numRows']), \
                "    numCols:", "{:,}".format(inspect['numCols'])
            numRows = inspect['numRows']
            numCols = inspect['numCols']

            # Make col 378 it something we can do binomial regression on!
            execExpr = '%s[,378+1]=%s[,378+1]>15' % (trainKey, trainKey)
            resultExec = h2o_cmd.runExec(str=execExpr, timeoutSecs=300)

            # Parse (test)****************************************
            parseTestResult = h2i.import_parse(bucket=bucket, path=importFolderPath + "/" + testFilename, schema='local',
                hex_key=testKey, timeoutSecs=timeoutSecs, doSummary=False)
            elapsed = time.time() - start
            print "test parse end on ", testFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "test parse result:", parseTestResult['destination_key']

            # Make col 378 it something we can do binomial regression on!
            execExpr = '%s[,378+1]=%s[,378+1]>15' % (testKey, testKey)
            resultExec = h2o_cmd.runExec(str=execExpr, timeoutSecs=300)

            # Note ..no inspect of test data here..so translate happens later?

            # GBM (train iterate)****************************************
            # if not response:
            #     response = numCols - 1
            response = 378

            # randomly ignore a bunch of cols, just to make it go faster
            x = range(numCols)
            del x[response]
            ignored_cols_by_name = ",".join(map(lambda x: 'C' + str(x+1), random.sample(x, 300)))

            print "Using the same response %s for train and test (which should have a output value too)" % "C" + str(response+1)

            ntrees = 10
            # ignore 200 random cols (not the response)
            for max_depth in [5, 40]:
                params = {
                    'learn_rate': .2,
                    'nbins': 1024,
                    'ntrees': ntrees,
                    'max_depth': max_depth,
                    'min_rows': 10,
                    'response': 'C' + str(response+1),
                    'ignored_cols_by_name': ignored_cols_by_name,
                }
            



                ### print "Using these parameters for GBM: ", params
                kwargs = params.copy()

                # GBM train****************************************
                trainStart = time.time()
                gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult,
                    timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs)
                # hack
                trainElapsed = time.time() - trainStart
                print "GBM training completed in", trainElapsed, "seconds. On dataset: ", trainFilename

                gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey)
                # errrs from end of list? is that the last tree?
                errsLast = gbmTrainView['gbm_model']['errs'][-1]
                print "GBM 'errsLast'", errsLast

                cm = gbmTrainView['gbm_model']['cms'][-1]['_arr'] # use the last one
                pctWrongTrain = h2o_gbm.pp_cm_summary(cm);
                print "Last line of this cm might be NAs, not CM"
                print "\nTrain\n==========\n"
                print h2o_gbm.pp_cm(cm)

                # GBM test****************************************
                predictKey = 'Predict.hex'
                ### h2o_cmd.runInspect(key=parseTestResult['destination_key'])
                start = time.time()
                gbmTestResult = h2o_cmd.runPredict(
                    data_key=parseTestResult['destination_key'], 
                    model_key=modelKey,
                    destination_key=predictKey,
                    timeoutSecs=timeoutSecs)
                elapsed = time.time() - start
                print "GBM predict completed in", elapsed, "seconds. On dataset: ", testFilename

                gbmPredictCMResult =h2o.nodes[0].predict_confusion_matrix(
                    actual=parseTestResult['destination_key'],
                    vactual='C' + str(response+1),
                    predict=predictKey,
                    vpredict='predict', # choices are 0 and 'predict'
                    )

                # errrs from end of list? is that the last tree?
                # all we get is cm
                cm = gbmPredictCMResult['cm']

                # These will move into the h2o_gbm.py
                pctWrong = h2o_gbm.pp_cm_summary(cm);
                print "Last line of this cm is really NAs, not CM"
                print "\nTest\n==========\n"
                print h2o_gbm.pp_cm(cm)

                # xList.append(ntrees)
                xList.append(max_depth)
                eList.append(pctWrong)
                fList.append(trainElapsed)

            xLabel = 'max_depth'
            eLabel = 'pctWrong'
            fLabel = 'trainElapsed'
            eListTitle = ""
            fListTitle = ""
            h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
Esempio n. 48
0
    def test_GBM_covtype_train_test(self):
        h2o.beta_features = False
        bucket = "home-0xdiag-datasets"
        modelKey = "GBMModelKey"
        files = [
            (
                "standard",
                "covtype.shuffled.90pct.data",
                "covtype.train.hex",
                1800,
                54,
                "covtype.shuffled.10pct.data",
                "covtype.test.hex",
            )
        ]

        # h2b.browseTheCloud()

        for (importFolderPath, trainFilename, trainKey, timeoutSecs, response, testFilename, testKey) in files:
            h2o.beta_features = False  # turn off beta_features
            # PARSE train****************************************
            start = time.time()
            xList = []
            eList = []
            fList = []

            # Parse (train)****************************************
            if h2o.beta_features:
                print "Parsing to fvec directly! Have to noPoll=true!, and doSummary=False!"
            parseTrainResult = h2i.import_parse(
                bucket=bucket,
                path=importFolderPath + "/" + trainFilename,
                schema="local",
                hex_key=trainKey,
                timeoutSecs=timeoutSecs,
                noPoll=h2o.beta_features,
                doSummary=False,
            )
            # hack
            if h2o.beta_features:
                h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs)
                print "Filling in the parseTrainResult['destination_key'] for h2o"
                parseTrainResult["destination_key"] = trainKey

            elapsed = time.time() - start
            print "train parse end on ", trainFilename, "took", elapsed, "seconds", "%d pct. of timeout" % (
                (elapsed * 100) / timeoutSecs
            )
            print "train parse result:", parseTrainResult["destination_key"]

            # Parse (test)****************************************
            if h2o.beta_features:
                print "Parsing to fvec directly! Have to noPoll=true!, and doSummary=False!"
            parseTestResult = h2i.import_parse(
                bucket=bucket,
                path=importFolderPath + "/" + testFilename,
                schema="local",
                hex_key=testKey,
                timeoutSecs=timeoutSecs,
                noPoll=h2o.beta_features,
                doSummary=False,
            )
            # hack
            if h2o.beta_features:
                h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs)
                print "Filling in the parseTestResult['destination_key'] for h2o"
                parseTestResult["destination_key"] = testKey

            elapsed = time.time() - start
            print "test parse end on ", testFilename, "took", elapsed, "seconds", "%d pct. of timeout" % (
                (elapsed * 100) / timeoutSecs
            )
            print "test parse result:", parseTestResult["destination_key"]

            # GBM (train iterate)****************************************
            inspect = h2o_cmd.runInspect(key=parseTestResult["destination_key"])
            x = range(inspect["num_cols"])
            del x[response]
            ntrees = 2
            # fails with 40
            for max_depth in [40, 5]:
                params = {
                    "learn_rate": 0.2,
                    "nbins": 1024,
                    "ntrees": ntrees,
                    "max_depth": max_depth,
                    "min_rows": 10,
                    "response": response,
                    "ignored_cols_by_name": None,
                }
                print "Using these parameters for GBM: ", params
                kwargs = params.copy()

                h2o.beta_features = True
                # translate it (only really need to do once . out of loop?
                h2o_cmd.runInspect(key=parseTrainResult["destination_key"])
                ### h2o_cmd.runSummary(key=parsTraineResult['destination_key'])

                # GBM train****************************************
                trainStart = time.time()
                gbmTrainResult = h2o_cmd.runGBM(
                    parseResult=parseTrainResult,
                    noPoll=True,
                    timeoutSecs=timeoutSecs,
                    destination_key=modelKey,
                    **kwargs
                )
                # hack
                if h2o.beta_features:
                    h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs)
                trainElapsed = time.time() - trainStart
                print "GBM training completed in", trainElapsed, "seconds. On dataset: ", trainFilename

                gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey)
                # errrs from end of list? is that the last tree?
                errsLast = gbmTrainView["gbm_model"]["errs"][-1]
                print "GBM 'errsLast'", errsLast

                cm = gbmTrainView["gbm_model"]["cms"][5]  # use the mid point
                pctWrongTrain = h2o_gbm.pp_cm_summary(cm)
                print "Last line of this cm might be NAs, not CM"
                print "\nTrain\n==========\n"
                print h2o_gbm.pp_cm(cm)

                # GBM test****************************************
                predictKey = "Predict.hex"
                h2o_cmd.runInspect(key=parseTestResult["destination_key"])
                start = time.time()
                gbmTestResult = h2o_cmd.runPredict(
                    data_key=parseTestResult["destination_key"],
                    model_key=modelKey,
                    destination_key=predictKey,
                    timeoutSecs=timeoutSecs,
                )
                # hack
                if h2o.beta_features:
                    h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs)
                elapsed = time.time() - start
                print "GBM predict completed in", elapsed, "seconds. On dataset: ", testFilename

                gbmPredictCMResult = h2o.nodes[0].predict_confusion_matrix(
                    actual=parseTestResult["destination_key"],
                    vactual=response,
                    predict=predictKey,
                    vpredict="predict",  # choices are 7 (now) and 'predict'
                )

                # errrs from end of list? is that the last tree?
                # all we get is cm
                cm = gbmPredictCMResult["cms"][-1]  # use the last one

                # These will move into the h2o_gbm.py
                pctWrong = h2o_gbm.pp_cm_summary(cm)
                print "\nTest\n==========\n"
                print h2o_gbm.pp_cm(cm)

                # xList.append(ntrees)
                xList.append(max_depth)
                eList.append(pctWrong)
                fList.append(trainElapsed)

            h2o.beta_features = False
            xLabel = "max_depth"
            eLabel = "pctWrong"
            fLabel = "trainElapsed"
            eListTitle = ""
            fListTitle = ""
            h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
Esempio n. 49
0
    def test_GLM2_mnist(self):
        h2o.beta_features = True
        if DO_HDFS:
            importFolderPath = "mnist"
            bucket = None
            schema = 'hdfs'
        else:
            importFolderPath = "mnist"
            bucket = 'home-0xdiag-datasets'
            schema = 'local'

        csvFilelist = [
            ("mnist_training.csv.gz", "mnist_testing.csv.gz",    600), 
        ]

        trial = 0
        for (trainCsvFilename, testCsvFilename, timeoutSecs) in csvFilelist:
            trialStart = time.time()

            # PARSE test****************************************
            testKey = testCsvFilename + "_" + str(trial) + ".hex"
            csvPathname = importFolderPath + "/" + testCsvFilename
            start = time.time()

            parseTestResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema=schema, hex_key=testKey, timeoutSecs=timeoutSecs)
            
            elapsed = time.time() - start
            print "parse end on ", testCsvFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseTestResult['destination_key']

            print "We won't use this pruning of x on test data. See if it prunes the same as the training"
            y = 0 # first column is pixel value
            print "y:"
            ignoreX = h2o_glm.goodXFromColumnInfo(y, key=parseTestResult['destination_key'], timeoutSecs=300, forRF=True)

            # PARSE train****************************************
            trainKey = trainCsvFilename + "_" + str(trial) + ".hex"
            csvPathname = importFolderPath + "/" + testCsvFilename
            start = time.time()
            parseTrainResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema=schema, hex_key=trainKey, timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "parse end on ", trainCsvFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseTrainResult['destination_key']

            # GLM****************************************
            print "This is the pruned x we'll use"
            ignoreX = h2o_glm.goodXFromColumnInfo(y, key=parseTrainResult['destination_key'], timeoutSecs=300, forRF=True)
            print "ignoreX:", ignoreX 

            modelKey = 'GLM_model'
            params = {
                'ignored_cols': ignoreX, 
                'response': 'C' + str(y),
                # 'case_mode': '=',
                # 'case_val': 0,
                'family': 'binomial',
                'lambda': 0.5,
                'alpha': 1e-4,
                'max_iter': 15,
                ## 'thresholds': 0.5,
                ## 'weight': 1.0,
                'n_folds': 1,
                'beta_epsilon': 1.0E-4,
                'destination_key': modelKey,
                }

            if DO_ALL_DIGITS:
                cases = [0,1,2,3,4,5,6,7,8,9]
            else:
                cases = [8]

            for c in cases:
                kwargs = params.copy()
                print "Trying binomial with case:", c
                # kwargs['case_val'] = c

                # do the binomial conversion with Exec2, for both training and test (h2o won't work otherwise)
                if DO_BUG:
                    execExpr="A.hex=%s;A.hex[,%s]=(A.hex[,%s]==%s)" % (trainKey, y+1, y+1, c)
                    h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)
                else:
                    execExpr="A.hex=%s" % (trainKey)
                    h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)

                    execExpr="A.hex[,%s]=(A.hex[,%s]==%s)" % (y+1, y+1, c)
                    h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)

                if DO_BUG:
                    execExpr="B.hex=%s;B.hex[,%s]=(B.hex[,%s]==%s)" % (testKey, y+1, y+1, c)
                    h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)
                else:
                    execExpr="B.hex=%s" % (testKey)
                    h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)

                    execExpr="B.hex[,%s]=(B.hex[,%s]==%s)" % (y+1, y+1, c)
                    h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)

                timeoutSecs = 1800
                start = time.time()
                aHack = {'destination_key': 'A.hex'}
                glmFirstResult = h2o_cmd.runGLM(parseResult=aHack, timeoutSecs=timeoutSecs, pollTimeoutSecs=60, 
                    noPoll=True, **kwargs)
                h2o_jobs.pollStatsWhileBusy(timeoutSecs=timeoutSecs, pollTimeoutSecs=60, retryDelaySecs=5)
                glm = h2o.nodes[0].glm_view(_modelKey=modelKey)

                elapsed = time.time() - start
                print "GLM completed in", elapsed, "seconds.", \
                    "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

                h2o_glm.simpleCheckGLM(self, glm, None, noPrint=True, **kwargs)
                modelKey = glm['glm_model']['_selfKey']

                # This seems wrong..what's the format of the cm?
                if 1==0:
                    cm = glm['glm_model']['submodels'][0]['validation']['_cms'][0]['_arr']
                    print "cm:", cm
                    pctWrong = h2o_gbm.pp_cm_summary(cm);
                    # self.assertLess(pctWrong, 9,"Should see less than 9% error (class = 4)")

                    print "\nTrain\n==========\n"
                    print h2o_gbm.pp_cm(cm)


                # Score *******************************
                # this messes up if you use case_mode/case_vale above
                predictKey = 'Predict.hex'
                start = time.time()

                predictResult = h2o_cmd.runPredict(
                    data_key='B.hex',
                    model_key=modelKey,
                    destination_key=predictKey,
                    timeoutSecs=timeoutSecs)

                predictCMResult = h2o.nodes[0].predict_confusion_matrix(
                    actual='B.hex',
                    vactual='C' + str(y),
                    predict=predictKey,
                    vpredict='predict',
                    )

                cm = predictCMResult['cm']

                # These will move into the h2o_gbm.py
                pctWrong = h2o_gbm.pp_cm_summary(cm);
                self.assertLess(pctWrong, 9,"Should see less than 9% error (class = 4)")

                print "\nTest\n==========\n"
                print h2o_gbm.pp_cm(cm)
Esempio n. 50
0
    def test_GLM2_covtype_train_predict_all_all(self):
        h2o.beta_features = True
        importFolderPath = "standard"
        csvFilename = 'covtype.shuffled.data'
        csvPathname = importFolderPath + "/" + csvFilename
        hex_key = csvFilename + ".hex"

        # Parse and Exec************************************************
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets',
                                       path=csvPathname,
                                       schema='put',
                                       hex_key=hex_key,
                                       timeoutSecs=180)

        execExpr = "A.hex=%s" % parseResult['destination_key']
        h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)

        # use exec to change the output col to binary, case_mode/case_val doesn't work if we use predict
        # will have to live with random extract. will create variance
        # class 4 = 1, everything else 0
        y = 54
        execExpr = "A.hex[,%s]=(A.hex[,%s]==%s)" % (y + 1, y + 1, 1)  # class 1
        h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)

        inspect = h2o_cmd.runInspect(key="A.hex")
        print "\n" + csvPathname, \
            "    numRows:", "{:,}".format(inspect['numRows']), \
            "    numCols:", "{:,}".format(inspect['numCols'])

        print "Use same data (full) for train and test"
        trainDataKey = "A.hex"
        testDataKey = "A.hex"
        # start at 90% rows + 1

        # GLM, predict, CM*******************************************************8
        kwargs = {
            'response': 'C' + str(y + 1),
            'max_iter': 20,
            'n_folds': 0,
            # 'alpha': 0.1,
            # 'lambda': 1e-5,
            'alpha': 0.0,
            'lambda': None,
            'family': 'binomial',
        }
        timeoutSecs = 60

        for trial in range(1):
            # test/train split **********************************************8
            aHack = {'destination_key': trainDataKey}

            # GLM **********************************************8
            start = time.time()
            glm = h2o_cmd.runGLM(parseResult=aHack,
                                 timeoutSecs=timeoutSecs,
                                 pollTimeoutSecs=180,
                                 **kwargs)
            print "glm end on ", parseResult[
                'destination_key'], 'took', time.time() - start, 'seconds'
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)

            modelKey = glm['glm_model']['_key']
            submodels = glm['glm_model']['submodels']
            # hackery to make it work when there's just one
            validation = submodels[-1]['validation']
            best_threshold = validation['best_threshold']
            thresholds = validation['thresholds']

            # have to look up the index for the cm, from the thresholds list
            best_index = None
            for i, t in enumerate(thresholds):
                if t == best_threshold:
                    best_index = i
                    break
            cms = validation['_cms']
            cm = cms[best_index]
            trainPctWrong = h2o_gbm.pp_cm_summary(cm['_arr'])

            # Score **********************************************
            predictKey = 'Predict.hex'
            start = time.time()

            predictResult = h2o_cmd.runPredict(data_key=testDataKey,
                                               model_key=modelKey,
                                               destination_key=predictKey,
                                               timeoutSecs=timeoutSecs)

            predictCMResult = h2o.nodes[0].predict_confusion_matrix(
                actual=testDataKey,
                vactual='C' + str(y + 1),
                predict=predictKey,
                vpredict='predict',
            )

            cm = predictCMResult['cm']

            # These will move into the h2o_gbm.py
            pctWrong = h2o_gbm.pp_cm_summary(cm)
            self.assertEqual(
                pctWrong, trainPctWrong,
                "Should see the same error rate on train and predict? (same data set)"
            )

            print "\nTest\n==========\n"
            print h2o_gbm.pp_cm(cm)

            print "Trial #", trial, "completed"
Esempio n. 51
0
    def test_NN2_mnist_multi(self):
        #h2b.browseTheCloud()
        h2o.beta_features = True
        csvPathname_train = 'mnist/train.csv.gz'
        csvPathname_test = 'mnist/test.csv.gz'
        hex_key = 'mnist_train.hex'
        validation_key = 'mnist_test.hex'
        timeoutSecs = 90
        parseResult = h2i.import_parse(bucket='smalldata',
                                       path=csvPathname_train,
                                       schema='put',
                                       hex_key=hex_key,
                                       timeoutSecs=timeoutSecs)
        parseResultV = h2i.import_parse(bucket='smalldata',
                                        path=csvPathname_test,
                                        schema='put',
                                        hex_key=validation_key,
                                        timeoutSecs=timeoutSecs)
        inspect = h2o_cmd.runInspect(None, hex_key)
        print "\n" + csvPathname_train, \
            "    numRows:", "{:,}".format(inspect['numRows']), \
            "    numCols:", "{:,}".format(inspect['numCols'])
        response = inspect['numCols'] - 1

        #Making random id
        identifier = ''.join(
            random.sample(string.ascii_lowercase + string.digits, 10))
        model_key = 'nn_' + identifier + '.hex'

        kwargs = {
            'ignored_cols': None,
            'response': response,
            'classification': 1,
            'activation': 'RectifierWithDropout',
            'input_dropout_ratio': 0.2,
            'hidden': '117,131,129',
            'rate': 0.005,
            'rate_annealing': 1e-6,
            'momentum_start': 0.5,
            'momentum_ramp': 100000,
            'momentum_stable': 0.9,
            'l1': 0.00001,
            'l2': 0.0000001,
            'seed': 98037452452,
            'loss': 'CrossEntropy',
            'max_w2': 15,
            'initial_weight_distribution': 'UniformAdaptive',
            #'initial_weight_scale'         : 0.01,
            'epochs': 20.0,
            'destination_key': model_key,
            'validation': validation_key,
        }
        ###expectedErr = 0.0362 ## from single-threaded mode
        expectedErr = 0.03  ## observed actual value with Hogwild

        timeoutSecs = 600
        start = time.time()
        nn = h2o_cmd.runDeepLearning(parseResult=parseResult,
                                     timeoutSecs=timeoutSecs,
                                     **kwargs)
        print "neural net end on ", csvPathname_train, " and ", csvPathname_test, 'took', time.time(
        ) - start, 'seconds'

        #### Now score using the model, and check the validation error
        expectedErr = 0.046
        relTol = 0.1
        predict_key = 'Predict.hex'

        kwargs = {
            'data_key': validation_key,
            'destination_key': predict_key,
            'model_key': model_key
        }
        predictResult = h2o_cmd.runPredict(timeoutSecs=timeoutSecs, **kwargs)
        h2o_cmd.runInspect(key=predict_key, verbose=True)

        kwargs = {}

        predictCMResult = h2o.nodes[0].predict_confusion_matrix(
            actual=validation_key,
            vactual=response,
            predict=predict_key,
            vpredict='predict',
            timeoutSecs=timeoutSecs,
            **kwargs)

        cm = predictCMResult['cm']

        print h2o_gbm.pp_cm(cm)
        actualErr = h2o_gbm.pp_cm_summary(cm) / 100.

        print "actual   classification error:" + format(actualErr)
        print "expected classification error:" + format(expectedErr)
        if actualErr != expectedErr and abs(
            (expectedErr - actualErr) / expectedErr) > relTol:
            raise Exception(
                "Scored classification error of %s is not within %s %% relative error of %s"
                % (actualErr, float(relTol) * 100, expectedErr))
Esempio n. 52
0
    def test_GLM2_mnist(self):
        if not SCIPY_INSTALLED:
            pass

        else:    
            h2o.beta_features = True
            SYNDATASETS_DIR = h2o.make_syn_dir()

            csvFilelist = [
                (10000, 500, 'cA', 60),
            ]

            trial = 0
            for (rowCount, colCount, hex_key, timeoutSecs) in csvFilelist:
                trialStart = time.time()

                # PARSE test****************************************
                csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
                csvPathname = SYNDATASETS_DIR + "/" + csvFilename
                write_syn_dataset(csvPathname, rowCount, colCount)

                start = time.time()
                parseResult = h2i.import_parse(path=csvPathname, schema='put', 
                    hex_key=hex_key, timeoutSecs=timeoutSecs)
                elapsed = time.time() - start
                print "parse end on ", csvFilename, 'took', elapsed, 'seconds',\
                    "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

                # GLM****************************************
                modelKey = 'GLM_model'
                y = colCount 
                kwargs = {
                    'response': 'C' + str(y+1),
                    'family': 'binomial',
                    'lambda': 1e-4, 
                    'alpha': 0,
                    'max_iter': 15,
                    'n_folds': 1,
                    'beta_epsilon': 1.0E-4,
                    'destination_key': modelKey,
                    }

                # GLM wants the output col to be strictly 0,1 integer
                execExpr = "aHack=%s; aHack[,%s] = aHack[,%s]==1" % (hex_key, y+1, y+1)
                h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)
                aHack = {'destination_key': 'aHack'}

                
                timeoutSecs = 1800
                start = time.time()
                glm = h2o_cmd.runGLM(parseResult=aHack, timeoutSecs=timeoutSecs, pollTimeoutSecs=60, **kwargs)
                elapsed = time.time() - start
                print "GLM completed in", elapsed, "seconds.", \
                    "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

                h2o_glm.simpleCheckGLM(self, glm, None, noPrint=True, **kwargs)
                modelKey = glm['glm_model']['_key']

                # This seems wrong..what's the format of the cm?
                lambdaMax = glm['glm_model']['lambda_max']
                print "lambdaMax:", lambdaMax

                best_threshold= glm['glm_model']['submodels'][0]['validation']['best_threshold']
                print "best_threshold", best_threshold

                # pick the middle one?
                cm = glm['glm_model']['submodels'][0]['validation']['_cms'][5]['_arr']
                print "cm:", cm
                pctWrong = h2o_gbm.pp_cm_summary(cm);
                # self.assertLess(pctWrong, 9,"Should see less than 9% error (class = 4)")

                print "\nTrain\n==========\n"
                print h2o_gbm.pp_cm(cm)

                # Score *******************************
                # this messes up if you use case_mode/case_vale above
                print "\nPredict\n==========\n"
                predictKey = 'Predict.hex'
                start = time.time()

                predictResult = h2o_cmd.runPredict(
                    data_key='aHack',
                    model_key=modelKey,
                    destination_key=predictKey,
                    timeoutSecs=timeoutSecs)

                predictCMResult = h2o.nodes[0].predict_confusion_matrix(
                    actual='aHack',
                    vactual='C' + str(y+1),
                    predict=predictKey,
                    vpredict='predict',
                    )

                cm = predictCMResult['cm']

                # These will move into the h2o_gbm.py
                pctWrong = h2o_gbm.pp_cm_summary(cm);
                self.assertLess(pctWrong, 50,"Should see less than 50% error")

                print "\nTest\n==========\n"
                print h2o_gbm.pp_cm(cm)
    def test_GBM_manyfiles_train_test(self):
        h2o.beta_features = True
        bucket = 'home-0xdiag-datasets'
        modelKey = 'GBMModelKey'
        if localhost:
            files = [
                # None forces numCols to be used. assumes you set it from Inspect
                # problems with categoricals not in the train data set? (warnings in h2o stdout)
                ## ('manyfiles-nflx-gz', 'file_1.dat.gz', 'file_1.hex', 1800, None, 'file_11.dat.gz', 'test.hex')
                # just use matching
                ('manyfiles-nflx-gz', 'file_1.dat.gz', 'train.hex', 1800, None, 'file_1.dat.gz', 'test.hex')
                ]
        else:
            files = [
                # None forces numCols to be used. assumes you set it from Inspect
                ('manyfiles-nflx-gz', 'file_[0-9].dat.gz', 'train.hex', 1800, None, 'file_1[0-9].dat.gz', 'test.hex')
                ]

        # if I got to hdfs, it's here
        # hdfs://192.168.1.176/datasets/manyfiles-nflx-gz/file_99.dat.gz

        h2b.browseTheCloud()
        for (importFolderPath, trainFilename, trainKey, timeoutSecs, response, testFilename, testKey) in files:
            # PARSE train****************************************
            start = time.time()
            xList = []
            eList = []
            fList = []

            # Parse (train)****************************************
            csvPathname = importFolderPath + "/" + trainFilename
            parseTrainResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local',
                hex_key=trainKey, timeoutSecs=timeoutSecs, doSummary=False)
            elapsed = time.time() - start
            print "train parse end on ", trainFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "train parse result:", parseTrainResult['destination_key']

            ### h2o_cmd.runSummary(key=parsTraineResult['destination_key'])

            inspect = h2o_cmd.runInspect(key=parseTrainResult['destination_key'])
            print "\n" + csvPathname, \
                "    numRows:", "{:,}".format(inspect['numRows']), \
                "    numCols:", "{:,}".format(inspect['numCols'])
            numRows = inspect['numRows']
            numCols = inspect['numCols']

            # Make col 378 it something we can do binomial regression on!
            execExpr = '%s[,378+1]=%s[,378+1]>15' % (trainKey, trainKey)
            resultExec = h2o_cmd.runExec(str=execExpr, timeoutSecs=60)

            # Parse (test)****************************************
            parseTestResult = h2i.import_parse(bucket=bucket, path=importFolderPath + "/" + testFilename, schema='local',
                hex_key=testKey, timeoutSecs=timeoutSecs, doSummary=False)
            elapsed = time.time() - start
            print "test parse end on ", testFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "test parse result:", parseTestResult['destination_key']

            # Make col 378 it something we can do binomial regression on!
            execExpr = '%s[,378+1]=%s[,378+1]>15' % (testKey, testKey)
            resultExec = h2o_cmd.runExec(str=execExpr, timeoutSecs=60)

            # Note ..no inspect of test data here..so translate happens later?

            # GBM (train iterate)****************************************
            # if not response:
            #     response = numCols - 1
            # response = 378
            response = 'C379'

            # randomly ignore a bunch of cols, just to make it go faster
            x = range(numCols)
            del x[response]
            ignored_cols_by_name = ",".join(map(lambda x: 'C' + str(x), random.sample(x, 300)))

            print "Using the same response %s for train and test (which should have a output value too)" % response

            ntrees = 10
            # ignore 200 random cols (not the response)
            for max_depth in [5, 40]:
                params = {
                    'learn_rate': .2,
                    'nbins': 1024,
                    'ntrees': ntrees,
                    'max_depth': max_depth,
                    'min_rows': 10,
                    'response': 'C' + str(response),
                    'ignored_cols_by_name': ignored_cols_by_name,
                }
            



                ### print "Using these parameters for GBM: ", params
                kwargs = params.copy()

                # GBM train****************************************
                trainStart = time.time()
                gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult,
                    timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs)
                # hack
                trainElapsed = time.time() - trainStart
                print "GBM training completed in", trainElapsed, "seconds. On dataset: ", trainFilename

                gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey)
                # errrs from end of list? is that the last tree?
                errsLast = gbmTrainView['gbm_model']['errs'][-1]
                print "GBM 'errsLast'", errsLast

                cm = gbmTrainView['gbm_model']['cms'][-1]['_arr'] # use the last one
                pctWrongTrain = h2o_gbm.pp_cm_summary(cm);
                print "Last line of this cm might be NAs, not CM"
                print "\nTrain\n==========\n"
                print h2o_gbm.pp_cm(cm)

                # GBM test****************************************
                predictKey = 'Predict.hex'
                ### h2o_cmd.runInspect(key=parseTestResult['destination_key'])
                start = time.time()
                gbmTestResult = h2o_cmd.runPredict(
                    data_key=parseTestResult['destination_key'], 
                    model_key=modelKey,
                    destination_key=predictKey,
                    timeoutSecs=timeoutSecs)
                elapsed = time.time() - start
                print "GBM predict completed in", elapsed, "seconds. On dataset: ", testFilename

                gbmPredictCMResult =h2o.nodes[0].predict_confusion_matrix(
                    actual=parseTestResult['destination_key'],
                    vactual='C' + str(response),
                    predict=predictKey,
                    vpredict='predict', # choices are 0 and 'predict'
                    )

                # errrs from end of list? is that the last tree?
                # all we get is cm
                cm = gbmPredictCMResult['cm']

                # These will move into the h2o_gbm.py
                pctWrong = h2o_gbm.pp_cm_summary(cm);
                print "Last line of this cm is really NAs, not CM"
                print "\nTest\n==========\n"
                print h2o_gbm.pp_cm(cm)

                # xList.append(ntrees)
                xList.append(max_depth)
                eList.append(pctWrong)
                fList.append(trainElapsed)

            xLabel = 'max_depth'
            eLabel = 'pctWrong'
            fLabel = 'trainElapsed'
            eListTitle = ""
            fListTitle = ""
            h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
    def test_c9_GLM_airlines_hdfs(self):
        files = [
                 ('datasets', 'airlines_all.csv', 'airlines_all.hex', 1800, 'IsDepDelayed')
                ]

        for importFolderPath, csvFilename, trainKey, timeoutSecs, response in files:
            # PARSE train****************************************
            csvPathname = importFolderPath + "/" + csvFilename
            
            start = time.time()
            parseResult = h2i.import_parse(path=csvPathname, schema='hdfs', hex_key=trainKey, 
                timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "parse end on ", csvFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseResult['destination_key']

            # GLM (train)****************************************
            params = {
                # 'lambda': 1e-4,
                # 'alpha': 0.5,
                'lambda': 1e-8,
                'alpha': 0.0,
                'max_iter': 10,
                'n_folds': 3,
                'family': 'binomial',
                'destination_key': "GLMKEY",
                'response': response,
                'ignored_cols': 'CRSDepTime,CRSArrTime,ActualElapsedTime,CRSElapsedTime,AirTime,ArrDelay,DepDelay,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,IsArrDelayed'
            }
            kwargs = params.copy()
            timeoutSecs = 1800
            start = time.time()
            glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs,**kwargs)
            elapsed = time.time() - start
            print "GLM training completed in", elapsed, "seconds. On dataset: ", csvFilename
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)

            modelKey = glm['glm_model']['_key']

            submodels = glm['glm_model']['submodels']
            # hackery to make it work when there's just one
            validation = submodels[-1]['validation']
            best_threshold = validation['best_threshold']
            thresholds = validation['thresholds']
            # have to look up the index for the cm, from the thresholds list
            best_index = None
            for i,t in enumerate(thresholds):
                if t == best_threshold:
                    best_index = i
                    break
            cms = validation['_cms']
            cm = cms[best_index]
            pctWrong = h2o_gbm.pp_cm_summary(cm['_arr']);
            # FIX! should look at prediction error/class error?
            # self.assertLess(pctWrong, 9,"Should see less than 40% error")

            print "\nTrain\n==========\n"
            print h2o_gbm.pp_cm(cm['_arr'])

            # Score *******************************
            # this messes up if you use case_mode/case_vale above
            predictKey = 'Predict.hex'
            start = time.time()

            predictResult = h2o_cmd.runPredict(
                data_key=trainKey,
                model_key=modelKey,
                destination_key=predictKey,
                timeoutSecs=timeoutSecs)

            predictCMResult = h2o.nodes[0].predict_confusion_matrix(
                actual=trainKey,
                vactual=response,
                predict=predictKey,
                vpredict='predict',
                )

            cm = predictCMResult['cm']
            # These will move into the h2o_gbm.py
            pctWrong = h2o_gbm.pp_cm_summary(cm);
            # self.assertLess(pctWrong, 40,"Should see less than 40% error")

            print "\nTest\n==========\n"
            print h2o_gbm.pp_cm(cm)


        h2i.delete_keys_at_all_nodes(timeoutSecs=600)