def test_rf_multinomial_fvec(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        csvFilename = "syn_multinomial.csv"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        headerData = "ID,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON"
        totalRows = 400
        colCount = 7

        for trial in range (5):
            write_syn_dataset(csvPathname, totalRows, colCount, headerData)
            # make sure all key names are unique, when we re-put and re-parse (h2o caching issues)
            hexKey = csvFilename + "_" + str(trial) + ".hex"
            ntree = 2
            kwargs = {
                'ntrees': ntree,
                'mtries': None,
                'max_depth': 20,
                'sample_rate': 0.67,
                'destination_key': None,
                'nbins': 1024,
                'seed': 784834182943470027,
            }
            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hexKey, doSummary=True)

            start = time.time()
            rfView = h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=15, pollTimeoutSecs=5, **kwargs)
            print "trial #", trial, 'took', time.time() - start, 'seconds'
            (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView, ntree=ntree)

            modelKey = rfView['drf_model']['_key']
            h2o_cmd.runScore(dataKey=parseResult['destination_key'], modelKey=modelKey, 
                vactual=colCount+1, vpredict=1, expectedAuc=0.5, doAUC=False)

            h2b.browseJsonHistoryAsUrlLastMatch("RF")
# Beispiel #2 (Example #2)
# 0
def glm_score(self, csvFilename, bucket, csvPathname, modelKey, modelPathname, timeoutSecs=30, pollTimeoutSecs=30):
    print "\nStarting GLM score of", csvFilename
    hex_key = csvFilename + ".hex"
    parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hex_key, 
        timeoutSecs=timeoutSecs, pollTimeoutSecs=pollTimeoutSecs)
    y = "10"


    # save and restore the model
    h2o.nodes[0].save_model(model=modelKey, path=modelPathname, force=1)
    # FIX! should we remove the existing key to make sure it loads? really should try both cases (existing or not)
    h2o.nodes[0].load_model(path=modelPathname)

    start = time.time()
    glmScore = h2o_cmd.runScore(dataKey=parseResult['destination_key'], modelKey=modelKey, 
        vactual=y, vpredict=1, expectedAuc=0.5, doAUC=False)
    print "GLMScore in",  (time.time() - start), "secs (python)"
    h2o.verboseprint(h2o.dump_json(glmScore))

    # compare this glm to the first one. since the files are replications, 
    # the results
    # should be similar?
    # UPDATE: format for returning results is slightly different than normal GLM
    if self.glmScore1:
        h2o_glm.compareToFirstGlm(self, 'mse', glmScore, self.glmScore1)
    else:
        self.glmScore1 = copy.deepcopy(glmScore)
# Beispiel #3 (Example #3)
# 0
def glm_score(self, csvFilename, bucket, csvPathname, modelKey, modelPathname, timeoutSecs=30, pollTimeoutSecs=30):
    """Parse csvPathname, save/reload the GLM model, then score it on the parse.

    The first call stores its score result in self.glmScore1; later calls
    compare their 'mse' against that baseline via compareToFirstGlm.

    NOTE(review): byte-for-byte duplicate of the glm_score defined earlier
    in this file -- likely a scrape/merge artifact.
    """
    print "\nStarting GLM score of", csvFilename
    hex_key = csvFilename + ".hex"
    parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hex_key, 
        timeoutSecs=timeoutSecs, pollTimeoutSecs=pollTimeoutSecs)
    # response column index (as a string) used as vactual below
    y = "10"


    # save and restore the model
    h2o.nodes[0].save_model(model=modelKey, path=modelPathname, force=1)
    # FIX! should we remove the existing key to make sure it loads? really should try both cases (existing or not)
    h2o.nodes[0].load_model(path=modelPathname)

    start = time.time()
    glmScore = h2o_cmd.runScore(dataKey=parseResult['destination_key'], modelKey=modelKey, 
        vactual=y, vpredict=1, expectedAuc=0.5, doAUC=False)
    print "GLMScore in",  (time.time() - start), "secs (python)"
    h2o.verboseprint(h2o.dump_json(glmScore))

    # compare this glm to the first one. since the files are replications, 
    # the results
    # should be similar?
    # UPDATE: format for returning results is slightly different than normal GLM
    if self.glmScore1:
        h2o_glm.compareToFirstGlm(self, 'mse', glmScore, self.glmScore1)
    else:
        self.glmScore1 = copy.deepcopy(glmScore)
    def test_rf_multinomial_fvec(self):
        """Build a 2-tree DRF model five times, each on a freshly written
        synthetic dataset, then score each model back on its training frame.

        NOTE(review): duplicate of test_rf_multinomial_fvec earlier in this
        file, differing only in formatting -- likely a scrape artifact.
        """
        SYNDATASETS_DIR = h2o.make_syn_dir()
        csvFilename = "syn_multinomial.csv"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        # header lists 9 names but colCount below is 7; write_syn_dataset
        # presumably reconciles that -- TODO confirm against its definition
        headerData = "ID,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON"
        totalRows = 400
        colCount = 7

        for trial in range(5):
            write_syn_dataset(csvPathname, totalRows, colCount, headerData)
            # make sure all key names are unique, when we re-put and re-parse (h2o caching issues)
            hexKey = csvFilename + "_" + str(trial) + ".hex"
            ntree = 2
            # fixed seed keeps the RF build reproducible across trials
            kwargs = {
                'ntrees': ntree,
                'mtries': None,
                'max_depth': 20,
                'sample_rate': 0.67,
                'destination_key': None,
                'nbins': 1024,
                'seed': 784834182943470027,
            }
            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='put',
                                           hex_key=hexKey,
                                           doSummary=True)

            start = time.time()
            rfView = h2o_cmd.runRF(parseResult=parseResult,
                                   timeoutSecs=15,
                                   pollTimeoutSecs=5,
                                   **kwargs)
            print "trial #", trial, 'took', time.time() - start, 'seconds'
            (classification_error, classErrorPctList,
             totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView, ntree=ntree)

            # score the model on the training frame; no AUC (doAUC=False)
            modelKey = rfView['drf_model']['_key']
            h2o_cmd.runScore(dataKey=parseResult['destination_key'],
                             modelKey=modelKey,
                             vactual=colCount + 1,
                             vpredict=1,
                             expectedAuc=0.5,
                             doAUC=False)

            h2b.browseJsonHistoryAsUrlLastMatch("RF")
    def test_rf_enums_mappings(self):
        """Train a tree model (RF, SpeedRF or GBM, chosen by the module-level
        SPEEDRF/GBM flags) twice per synthetic enum dataset and collect the
        classification errors, to see whether different enum mappings change
        the result.

        ROWS, COLS, ENUMS and MULTINOMIAL are module-level config values
        defined elsewhere in the file -- assumed ints/booleans (TODO confirm).
        """
        SYNDATASETS_DIR = h2o.make_syn_dir()

        tryList = [
            # (n, 1, 'cD', 300),
            # (n, 2, 'cE', 300),
            # (n, 3, 'cF', 300),
            # (n, 4, 'cG', 300),
            # (n, 5, 'cH', 300),
            # (n, 6, 'cI', 300),
            (ROWS, COLS, 'cI', 300),
            (ROWS, COLS, 'cI', 300),
            (ROWS, COLS, 'cI', 300),
        ]

        # fixed seeds so the train/score datasets are reproducible run-to-run
        # SEED_FOR_TRAIN = random.randint(0, sys.maxint)
        SEED_FOR_TRAIN = 1234567890
        SEED_FOR_SCORE = 9876543210
        errorHistory = []
        enumHistory = []
        lastcolsTrainHistory = []
        lastcolsScoreHistory = []

        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            enumList = create_enum_list(listSize=ENUMS)
            # reverse the list
            enumList.reverse()

            # using the comma is nice to ensure no craziness
            colSepHexString = '2c'  # comma
            colSepChar = colSepHexString.decode('hex')
            colSepInt = int(colSepHexString, base=16)
            print "colSepChar:", colSepChar

            rowSepHexString = '0a'  # newline
            rowSepChar = rowSepHexString.decode('hex')
            print "rowSepChar:", rowSepChar

            csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str(
                colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            csvScoreFilename = 'syn_enums_score_' + str(rowCount) + 'x' + str(
                colCount) + '.csv'
            csvScorePathname = SYNDATASETS_DIR + '/' + csvScoreFilename

            # use same enum List
            enumListForScore = enumList

            print "Creating random", csvPathname, "for rf model building"
            lastcols = write_syn_dataset(csvPathname,
                                         enumList,
                                         rowCount,
                                         colCount,
                                         colSepChar=colSepChar,
                                         rowSepChar=rowSepChar,
                                         SEED=SEED_FOR_TRAIN)

            lastcolsTrainHistory.append(lastcols)

            print "Creating random", csvScorePathname, "for rf scoring with prior model (using same enum list)"
            # same enum list/mapping, but different dataset?
            lastcols = write_syn_dataset(csvScorePathname,
                                         enumListForScore,
                                         rowCount,
                                         colCount,
                                         colSepChar=colSepChar,
                                         rowSepChar=rowSepChar,
                                         SEED=SEED_FOR_SCORE)
            lastcolsScoreHistory.append(lastcols)

            # parse the score dataset first; parseResult below is then
            # rebound to the *training* parse, which the model build uses
            scoreDataKey = "score_" + hex_key
            parseResult = h2i.import_parse(path=csvScorePathname,
                                           schema='put',
                                           hex_key=scoreDataKey,
                                           timeoutSecs=30,
                                           separator=colSepInt)

            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='put',
                                           hex_key=hex_key,
                                           timeoutSecs=30,
                                           separator=colSepInt)
            print "Parse result['destination_key']:", parseResult[
                'destination_key']

            print "\n" + csvFilename
            (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \
                h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=True)

            # response is the last column
            y = colCount
            modelKey = 'enums'
            # limit depth and number of trees to accentuate the issue with categorical split decisions

            # use mtries so both look at all cols at every split? doesn't matter for speedrf
            # does speedrf try one more time? with 3 cols, mtries=2, so another try might
            # get a look at the missing col
            # does matter for drf2. does it "just stop"
            # trying mtries always looking at all columns or 1 col might be interesting
            if SPEEDRF:
                kwargs = {
                    'sample_rate': 0.999,
                    'destination_key': modelKey,
                    'response': y,
                    'ntrees': 1,
                    'max_depth': 100,
                    # 'oobee': 1,
                    'validation': hex_key,
                    # 'validation': scoreDataKey,
                    'seed': 123456789,
                    'mtries': COLS,
                }
            elif GBM:
                kwargs = {
                    'destination_key': modelKey,
                    'response': y,
                    'validation': scoreDataKey,
                    'seed': 123456789,
                    # 'learn_rate': .1,
                    'ntrees': 1,
                    'max_depth': 100,
                    'min_rows': 1,
                    'classification': 1,
                }
            else:
                kwargs = {
                    'sample_rate': 0.999,
                    'destination_key': modelKey,
                    'response': y,
                    'classification': 1,
                    'ntrees': 1,
                    'max_depth': 100,
                    'min_rows': 1,
                    'validation': hex_key,
                    # 'validation': scoreDataKey,
                    'seed': 123456789,
                    'nbins': 1024,
                    'mtries': COLS,
                }

            # build twice with identical parameters; both errors are recorded
            for r in range(2):
                start = time.time()

                if GBM:
                    gbmResult = h2o_cmd.runGBM(parseResult=parseResult,
                                               timeoutSecs=timeoutSecs,
                                               pollTimeoutSecs=180,
                                               **kwargs)

                    print "gbm end on ", parseResult[
                        'destination_key'], 'took', time.time(
                        ) - start, 'seconds'
                    # print h2o.dump_json(gbmResult)
                    (classification_error, classErrorPctList,
                     totalScores) = h2o_gbm.simpleCheckGBMView(gbmv=gbmResult)

                elif SPEEDRF:
                    rfResult = h2o_cmd.runSpeeDRF(parseResult=parseResult,
                                                  timeoutSecs=timeoutSecs,
                                                  pollTimeoutSecs=180,
                                                  **kwargs)
                    print "speedrf end on ", parseResult[
                        'destination_key'], 'took', time.time(
                        ) - start, 'seconds'
                    (classification_error, classErrorPctList,
                     totalScores) = h2o_rf.simpleCheckRFView(rfv=rfResult)

                else:
                    rfResult = h2o_cmd.runRF(parseResult=parseResult,
                                             timeoutSecs=timeoutSecs,
                                             pollTimeoutSecs=180,
                                             **kwargs)
                    print "rf end on ", parseResult[
                        'destination_key'], 'took', time.time(
                        ) - start, 'seconds'
                    (classification_error, classErrorPctList,
                     totalScores) = h2o_rf.simpleCheckRFView(rfv=rfResult)

                h2o_cmd.runScore(dataKey=scoreDataKey,
                                 modelKey=modelKey,
                                 vactual=y,
                                 vpredict=1,
                                 doAUC=not MULTINOMIAL)  # , expectedAuc=0.5)

                errorHistory.append(classification_error)
                enumHistory.append(enumList)

            # NOTE(review): errorHistory/enumHistory accumulate across ALL
            # datasets in tryList, although the print below says "this dataset"
            print "error from all runs on this dataset (with different enum mappings)"
            print errorHistory
            for e in enumHistory:
                print e

            print "last row from all train datasets, as integer"
            for l in lastcolsTrainHistory:
                print l
            print "last row from all score datasets, as integer"
            for l in lastcolsScoreHistory:
                print l
# Beispiel #6 (Example #6)
# 0
    def test_GLM2_ints_unbalanced(self):
        ### h2b.browseTheCloud()
        SYNDATASETS_DIR = h2o.make_syn_dir()

        n = 2000
        tryList = [
            (n, 1, 'cD', 300),
            (n, 2, 'cE', 300),
            (n, 4, 'cF', 300),
            (n, 8, 'cG', 300),
            (n, 16, 'cH', 300),
            (n, 32, 'cI', 300),
        ]

        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            # using the comma is nice to ensure no craziness
            colSepHexString = '2c'  # comma
            colSepChar = colSepHexString.decode('hex')
            colSepInt = int(colSepHexString, base=16)
            print "colSepChar:", colSepChar

            rowSepHexString = '0a'  # newline
            rowSepChar = rowSepHexString.decode('hex')
            print "rowSepChar:", rowSepChar

            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str(
                colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            csvScoreFilename = 'syn_enums_score_' + str(rowCount) + 'x' + str(
                colCount) + '.csv'
            csvScorePathname = SYNDATASETS_DIR + '/' + csvScoreFilename

            enumList = create_enum_list()
            # use half of the enums for creating the scoring dataset
            enumListForScore = random.sample(enumList, 5)

            print "Creating random", csvPathname, "for glm model building"
            write_syn_dataset(csvPathname,
                              enumList,
                              rowCount,
                              colCount,
                              SEEDPERFILE,
                              colSepChar=colSepChar,
                              rowSepChar=rowSepChar)

            print "Creating random", csvScorePathname, "for glm scoring with prior model (using enum subset)"
            write_syn_dataset(csvScorePathname,
                              enumListForScore,
                              rowCount,
                              colCount,
                              SEEDPERFILE,
                              colSepChar=colSepChar,
                              rowSepChar=rowSepChar)

            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='put',
                                           hex_key=hex_key,
                                           timeoutSecs=30,
                                           separator=colSepInt)
            print "Parse result['destination_key']:", parseResult[
                'destination_key']

            print "\n" + csvFilename
            (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \
                h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=True)

            y = colCount
            modelKey = 'xyz'
            kwargs = {
                'n_folds': 0,
                'destination_key': modelKey,
                'response': y,
                'max_iter': 200,
                'family': 'binomial',
                'alpha': 0,
                'lambda': 0,
            }

            start = time.time()

            updateList = [
                {
                    'alpha': 0.5,
                    'lambda': 1e-5
                },
                # {'alpha': 0.25, 'lambda': 1e-4},
            ]

            # Try each one
            for updateDict in updateList:
                print "\n#################################################################"
                print updateDict
                kwargs.update(updateDict)
                glm = h2o_cmd.runGLM(parseResult=parseResult,
                                     timeoutSecs=timeoutSecs,
                                     pollTimeoutSecs=180,
                                     **kwargs)
                print "glm end on ", parseResult[
                    'destination_key'], 'took', time.time() - start, 'seconds'

                h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)

                parseResult = h2i.import_parse(path=csvScorePathname,
                                               schema='put',
                                               hex_key="B.hex",
                                               timeoutSecs=30,
                                               separator=colSepInt)

                h2o_cmd.runScore(dataKey="B.hex",
                                 modelKey=modelKey,
                                 vactual='C' + str(y + 1),
                                 vpredict=1,
                                 expectedAuc=0.45)
# Beispiel #7 (Example #7)
# 0
    def test_rf_enums_score_superset_fvec(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()

        n = 3000
        tryList = [
            (n, 1, 'cD', 300),
            (n, 2, 'cE', 300),
            (n, 3, 'cF', 300),
            (n, 4, 'cG', 300),
            (n, 5, 'cH', 300),
            (n, 6, 'cI', 300),
        ]

        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            # using the comma is nice to ensure no craziness
            colSepHexString = '2c'  # comma
            colSepChar = colSepHexString.decode('hex')
            colSepInt = int(colSepHexString, base=16)
            print "colSepChar:", colSepChar

            rowSepHexString = '0a'  # newline
            rowSepChar = rowSepHexString.decode('hex')
            print "rowSepChar:", rowSepChar

            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str(
                colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            csvScoreFilename = 'syn_enums_score_' + str(rowCount) + 'x' + str(
                colCount) + '.csv'
            csvScorePathname = SYNDATASETS_DIR + '/' + csvScoreFilename

            enumList = create_enum_list(listSize=10)
            # use half of the enums for creating the scoring dataset
            enumListForScore = random.sample(enumList, 5)

            # add a extra enum for scoring that's not in the model enumList
            enumListForScore.append("xyzzy")

            print "Creating random", csvPathname, "for rf model building"
            write_syn_dataset(csvPathname,
                              enumList,
                              rowCount,
                              colCount,
                              SEEDPERFILE,
                              colSepChar=colSepChar,
                              rowSepChar=rowSepChar)

            print "Creating random", csvScorePathname, "for rf scoring with prior model (using enum subset)"
            write_syn_dataset(csvScorePathname,
                              enumListForScore,
                              rowCount,
                              colCount,
                              SEEDPERFILE,
                              colSepChar=colSepChar,
                              rowSepChar=rowSepChar)

            scoreDataKey = "score_" + hex_key
            parseResult = h2i.import_parse(path=csvScorePathname,
                                           schema='put',
                                           hex_key=scoreDataKey,
                                           timeoutSecs=30,
                                           separator=colSepInt)

            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='put',
                                           hex_key=hex_key,
                                           timeoutSecs=30,
                                           separator=colSepInt)
            print "Parse result['destination_key']:", parseResult[
                'destination_key']

            print "\n" + csvFilename
            (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \
                h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=True)

            y = colCount
            modelKey = 'enums'
            ntrees = 5
            kwargs = {
                'destination_key': modelKey,
                'response': y,
                'classification': 1,
                'ntrees': ntrees,
                'validation': scoreDataKey,
            }

            start = time.time()
            rfResult = h2o_cmd.runRF(parseResult=parseResult,
                                     timeoutSecs=timeoutSecs,
                                     pollTimeoutSecs=180,
                                     **kwargs)
            print "rf end on ", parseResult[
                'destination_key'], 'took', time.time() - start, 'seconds'
            (classification_error, classErrorPctList,
             totalScores) = h2o_rf.simpleCheckRFView(rfv=rfResult,
                                                     ntree=ntrees)
            predictKey = 'Predict.hex'
            h2o_cmd.runScore(dataKey=scoreDataKey,
                             modelKey=modelKey,
                             vactual=y,
                             vpredict=1,
                             expectedAuc=0.5)
# Beispiel #8 (Example #8)
# 0
    def test_rf_many_rooz_enums_fvec(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        if 1==0 and localhost:
            n = 4000
            tryList = [
                (n, 999, 'cI', 300), 
                ]
        else:
            n = 100
            tryList = [
                (n, 1, 'cD', 300), 
                (n, 2, 'cE', 300), 
                (n, 3, 'cF', 300), 
                (n, 4, 'cG', 300), 
                (n, 5, 'cH', 300), 
                (n, 6, 'cI', 300), 
                (n, 7, 'cJ', 300), 
                (n, 9, 'cK', 300), 
                (n, 10, 'cLA', 300), 
                (n, 11, 'cDA', 300), 
                (n, 12, 'cEA', 300), 
                (n, 13, 'cFA', 300), 
                (n, 14, 'cGA', 300), 
                (n, 15, 'cHA', 300), 
                (n, 16, 'cIA', 300), 
                (n, 17, 'cJA', 300), 
                (n, 19, 'cKA', 300), 
                (n, 20, 'cLA', 300), 
                ]

        ### h2b.browseTheCloud()
        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            # can randomly pick the row and col cases.
            ### colSepCase = random.randint(0,1)
            colSepCase = 1
            # using the comma is nice to ensure no craziness
            if (colSepCase==0):
                colSepHexString = '01'
            else:
                colSepHexString = '2c' # comma

            colSepChar = colSepHexString.decode('hex')
            colSepInt = int(colSepHexString, base=16)
            print "colSepChar:", colSepChar
            print "colSepInt", colSepInt

            rowSepCase = random.randint(0,1)
            # using this instead, makes the file, 'row-readable' in an editor
            if (rowSepCase==0):
                rowSepHexString = '0a' # newline
            else:
                rowSepHexString = '0d0a' # cr + newline (windows) \r\n

            rowSepChar = rowSepHexString.decode('hex')
            print "rowSepChar:", rowSepChar

            SEEDPERFILE = random.randint(0, sys.maxint)
            if DO_TEN_INTEGERS:
                csvFilename = 'syn_rooz_int10_' + str(rowCount) + 'x' + str(colCount) + '.csv'
            else:
                csvFilename = 'syn_rooz_enums_' + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, 
                colSepChar=colSepChar, rowSepChar=rowSepChar)

            # FIX! does 'separator=' take ints or ?? hex format
            # looks like it takes the hex string (two chars)
            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, 
                timeoutSecs=30, separator=colSepInt)

            # We should be able to see the parse result?
            ### inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvFilename
            # we allow some NAs in the list above
            (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \
                h2o_cmd.columnInfoFromInspect(parseResult['destination_key'],exceptionOnMissingValues=False)

            y = colCount
            ntrees = 5
            kwargs = {
                'response': y, 
                'classification': 1,
                'ntrees': ntrees,
            }
            start = time.time()
            rfResult = h2o_cmd.runRF(parseResult=parseResult, 
                timeoutSecs=timeoutSecs, pollTimeoutSecs=180, **kwargs)
            print "rf end on ", csvPathname, 'took', time.time() - start, 'seconds'
            (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfResult, ntree=ntrees)
            modelKey = rfResult['drf_model']['_key']
            h2o_cmd.runScore(dataKey=parseResult['destination_key'], modelKey=modelKey,
                vactual=colCount, vpredict=1, expectedAuc=0.5, doAUC=False)
# Beispiel #9 (Example #9)
# 0
    def test_GLM2_enums_score_subset(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        n = 500
        tryList = [
            # (n, 1, 'cD', 300),
            # (n, 2, 'cE', 300),
            # (n, 3, 'cF', 300),
            (n, 4, 'cG', 300),
            (n, 5, 'cH', 300),
            (n, 6, 'cI', 300),
        ]

        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            # using the comma is nice to ensure no craziness
            colSepHexString = '2c'  # comma
            colSepChar = colSepHexString.decode('hex')
            colSepInt = int(colSepHexString, base=16)
            print "colSepChar:", colSepChar

            rowSepHexString = '0a'  # newline
            rowSepChar = rowSepHexString.decode('hex')
            print "rowSepChar:", rowSepChar

            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str(
                colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            csvScoreFilename = 'syn_enums_score_' + str(rowCount) + 'x' + str(
                colCount) + '.csv'
            csvScorePathname = SYNDATASETS_DIR + '/' + csvScoreFilename

            enumList = create_enum_list(listSize=10)
            # use half of the enums for creating the scoring dataset
            enumListForScore = random.sample(enumList, 5)

            print "Creating random", csvPathname, "for glm model building"
            write_syn_dataset(csvPathname,
                              enumList,
                              rowCount,
                              colCount,
                              SEEDPERFILE,
                              colSepChar=colSepChar,
                              rowSepChar=rowSepChar)

            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='put',
                                           hex_key=hex_key,
                                           timeoutSecs=30,
                                           separator=colSepInt)

            print "Creating random", csvScorePathname, "for glm scoring with prior model (using enum subset)"
            write_syn_dataset(csvScorePathname,
                              enumListForScore,
                              rowCount,
                              colCount,
                              SEEDPERFILE,
                              colSepChar=colSepChar,
                              rowSepChar=rowSepChar)

            parseResult = h2i.import_parse(path=csvScorePathname,
                                           schema='put',
                                           hex_key="score_" + hex_key,
                                           timeoutSecs=30,
                                           separator=colSepInt)

            print "\n" + csvFilename
            (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \
                h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=True)

            y = colCount
            kwargs = {
                'response': y,
                'max_iter': 8,
                'family': 'binomial',
                'n_folds': 2,
                'alpha': 0.2,
                'lambda': 1e-5
            }
            start = time.time()
            glm = h2o_cmd.runGLM(parseResult=parseResult,
                                 timeoutSecs=timeoutSecs,
                                 pollTimeoutSecs=180,
                                 **kwargs)
            print "glm end on ", parseResult[
                'destination_key'], 'took', time.time() - start, 'seconds'

            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)

            # Score *******************************
            # this messes up if you use case_mode/case_vale above
            predictKey = 'Predict.hex'
            modelKey = glm['glm_model']['_key']
            h2o_cmd.runScore(dataKey="score_" + hex_key,
                             modelKey=modelKey,
                             vactual=y,
                             vpredict=1,
                             expectedAuc=0.6)
# Beispiel #10
    def test_rf_enums_mappings(self):
        """Repeatedly train a tree model (RF, SpeedRF, or GBM, chosen by the
        module-level SPEEDRF/GBM flags) on synthetic all-enum datasets that
        reuse ONE enum list, then compare classification error across runs.

        Train and score data use fixed seeds (SEED_FOR_TRAIN/SEED_FOR_SCORE),
        so the only thing that should vary between tryList entries is h2o's
        internal enum->int mapping; differing errors across runs would point
        at instability in categorical split decisions.
        """
        SYNDATASETS_DIR = h2o.make_syn_dir()

        # (rowCount, colCount, hex_key, timeoutSecs); three identical entries
        # on purpose -- same data spec, different enum mapping per run
        tryList = [
            # (n, 1, 'cD', 300),
            # (n, 2, 'cE', 300),
            # (n, 3, 'cF', 300),
            # (n, 4, 'cG', 300),
            # (n, 5, 'cH', 300),
            # (n, 6, 'cI', 300),
            (ROWS, COLS, "cI", 300),
            (ROWS, COLS, "cI", 300),
            (ROWS, COLS, "cI", 300),
        ]

        # fixed seeds so runs are reproducible (randomized seed kept for reference)
        # SEED_FOR_TRAIN = random.randint(0, sys.maxint)
        SEED_FOR_TRAIN = 1234567890
        SEED_FOR_SCORE = 9876543210
        # accumulated across ALL tryList entries, printed after each dataset
        errorHistory = []
        enumHistory = []
        lastcolsTrainHistory = []
        lastcolsScoreHistory = []

        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            enumList = create_enum_list(listSize=ENUMS)
            # reverse the list
            enumList.reverse()

            # using the comma is nice to ensure no craziness
            colSepHexString = "2c"  # comma
            colSepChar = colSepHexString.decode("hex")
            colSepInt = int(colSepHexString, base=16)
            print "colSepChar:", colSepChar

            rowSepHexString = "0a"  # newline
            rowSepChar = rowSepHexString.decode("hex")
            print "rowSepChar:", rowSepChar

            csvFilename = "syn_enums_" + str(rowCount) + "x" + str(colCount) + ".csv"
            csvPathname = SYNDATASETS_DIR + "/" + csvFilename
            csvScoreFilename = "syn_enums_score_" + str(rowCount) + "x" + str(colCount) + ".csv"
            csvScorePathname = SYNDATASETS_DIR + "/" + csvScoreFilename

            # use same enum List
            enumListForScore = enumList

            print "Creating random", csvPathname, "for rf model building"
            # write_syn_dataset returns the last row as integers (kept for the
            # end-of-run dump below)
            lastcols = write_syn_dataset(
                csvPathname,
                enumList,
                rowCount,
                colCount,
                colSepChar=colSepChar,
                rowSepChar=rowSepChar,
                SEED=SEED_FOR_TRAIN,
            )

            lastcolsTrainHistory.append(lastcols)

            print "Creating random", csvScorePathname, "for rf scoring with prior model (using same enum list)"
            # same enum list/mapping, but different dataset?
            lastcols = write_syn_dataset(
                csvScorePathname,
                enumListForScore,
                rowCount,
                colCount,
                colSepChar=colSepChar,
                rowSepChar=rowSepChar,
                SEED=SEED_FOR_SCORE,
            )
            lastcolsScoreHistory.append(lastcols)

            # parse score data first so parseResult below ends up as the TRAIN frame
            scoreDataKey = "score_" + hex_key
            parseResult = h2i.import_parse(
                path=csvScorePathname, schema="put", hex_key=scoreDataKey, timeoutSecs=30, separator=colSepInt
            )

            parseResult = h2i.import_parse(
                path=csvPathname, schema="put", hex_key=hex_key, timeoutSecs=30, separator=colSepInt
            )
            print "Parse result['destination_key']:", parseResult["destination_key"]

            print "\n" + csvFilename
            # sanity-check the parsed columns; raises if any missing values
            (
                missingValuesDict,
                constantValuesDict,
                enumSizeDict,
                colTypeDict,
                colNameDict,
            ) = h2o_cmd.columnInfoFromInspect(parseResult["destination_key"], exceptionOnMissingValues=True)

            # response is the last column (0-based index == colCount)
            y = colCount
            modelKey = "enums"
            # limit depth and number of trees to accentuate the issue with categorical split decisions

            # use mtries so both look at all cols at every split? doesn't matter for speedrf
            # does speedrf try one more time? with 3 cols, mtries=2, so another try might
            # get a look at the missing col
            # does matter for drf2. does it "just stop"
            # trying mtries always looking at all columns or 1 col might be interesting
            if SPEEDRF:
                kwargs = {
                    "sample_rate": 0.999,
                    "destination_key": modelKey,
                    "response": y,
                    "ntrees": 1,
                    "max_depth": 100,
                    # 'oobee': 1,
                    "validation": hex_key,
                    # 'validation': scoreDataKey,
                    "seed": 123456789,
                    "mtries": COLS,
                }
            elif GBM:
                kwargs = {
                    "destination_key": modelKey,
                    "response": y,
                    "validation": scoreDataKey,
                    "seed": 123456789,
                    # 'learn_rate': .1,
                    "ntrees": 1,
                    "max_depth": 100,
                    "min_rows": 1,
                    "classification": 1,
                }
            else:
                kwargs = {
                    "sample_rate": 0.999,
                    "destination_key": modelKey,
                    "response": y,
                    "classification": 1,
                    "ntrees": 1,
                    "max_depth": 100,
                    "min_rows": 1,
                    "validation": hex_key,
                    # 'validation': scoreDataKey,
                    "seed": 123456789,
                    "nbins": 1024,
                    "mtries": COLS,
                }

            # build + score twice on the same frame to observe run-to-run variation
            for r in range(2):
                start = time.time()

                if GBM:
                    gbmResult = h2o_cmd.runGBM(
                        parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=180, **kwargs
                    )

                    print "gbm end on ", parseResult["destination_key"], "took", time.time() - start, "seconds"
                    # print h2o.dump_json(gbmResult)
                    (classification_error, classErrorPctList, totalScores) = h2o_gbm.simpleCheckGBMView(gbmv=gbmResult)

                elif SPEEDRF:
                    rfResult = h2o_cmd.runSpeeDRF(
                        parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=180, **kwargs
                    )
                    print "speedrf end on ", parseResult["destination_key"], "took", time.time() - start, "seconds"
                    (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfResult)

                else:
                    rfResult = h2o_cmd.runRF(
                        parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=180, **kwargs
                    )
                    print "rf end on ", parseResult["destination_key"], "took", time.time() - start, "seconds"
                    (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfResult)

                # AUC only makes sense for binomial; skip it when MULTINOMIAL is set
                h2o_cmd.runScore(
                    dataKey=scoreDataKey, modelKey=modelKey, vactual=y, vpredict=1, doAUC=not MULTINOMIAL
                )  # , expectedAuc=0.5)

                errorHistory.append(classification_error)
                enumHistory.append(enumList)

            print "error from all runs on this dataset (with different enum mappings)"
            print errorHistory
            for e in enumHistory:
                print e

            print "last row from all train datasets, as integer"
            for l in lastcolsTrainHistory:
                print l
            print "last row from all score datasets, as integer"
            for l in lastcolsScoreHistory:
                print l
    def test_rf_enums_score_superset_fvec(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        n = 3000
        tryList = [
            (n, 1, 'cD', 300), 
            (n, 2, 'cE', 300), 
            (n, 3, 'cF', 300), 
            (n, 4, 'cG', 300), 
            (n, 5, 'cH', 300), 
            (n, 6, 'cI', 300), 
            ]

        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            # using the comma is nice to ensure no craziness
            colSepHexString = '2c' # comma
            colSepChar = colSepHexString.decode('hex')
            colSepInt = int(colSepHexString, base=16)
            print "colSepChar:", colSepChar

            rowSepHexString = '0a' # newline
            rowSepChar = rowSepHexString.decode('hex')
            print "rowSepChar:", rowSepChar

            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            csvScoreFilename = 'syn_enums_score_' + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvScorePathname = SYNDATASETS_DIR + '/' + csvScoreFilename

            enumList = create_enum_list(listSize=10)
            # use half of the enums for creating the scoring dataset
            enumListForScore = random.sample(enumList,5)

            # add a extra enum for scoring that's not in the model enumList
            enumListForScore.append("xyzzy")

            print "Creating random", csvPathname, "for rf model building"
            write_syn_dataset(csvPathname, enumList, rowCount, colCount, SEEDPERFILE, 
                colSepChar=colSepChar, rowSepChar=rowSepChar)

            print "Creating random", csvScorePathname, "for rf scoring with prior model (using enum subset)"
            write_syn_dataset(csvScorePathname, enumListForScore, rowCount, colCount, SEEDPERFILE, 
                colSepChar=colSepChar, rowSepChar=rowSepChar)

            scoreDataKey = "score_" + hex_key
            parseResult = h2i.import_parse(path=csvScorePathname, schema='put', hex_key=scoreDataKey, 
                timeoutSecs=30, separator=colSepInt)

            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key,
                timeoutSecs=30, separator=colSepInt)
            print "Parse result['destination_key']:", parseResult['destination_key']

            print "\n" + csvFilename
            (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \
                h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=True)

            y = colCount
            modelKey = 'enums'
            ntrees = 5
            kwargs = {
                'destination_key': modelKey,
                'response': y,
                'classification': 1,
                'ntrees': ntrees,
                'validation': scoreDataKey,
            }

            start = time.time()
            rfResult = h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=180, **kwargs)
            print "rf end on ", parseResult['destination_key'], 'took', time.time() - start, 'seconds'
            (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfResult, ntree=ntrees)
            predictKey = 'Predict.hex'
            h2o_cmd.runScore(dataKey=scoreDataKey, modelKey=modelKey, vactual=y, vpredict=1, expectedAuc=0.5)
    def rf_covtype_train_oobe(self, csvFilename, checkExpectedResults=True, expectedAuc=0.5):
        # the expected results are only for the shuffled version
        # since getting 10% samples etc of the smallish dataset will vary between 
        # shuffled and non-shuffled datasets
        importFolderPath = "standard"
        csvPathname = importFolderPath + "/" + csvFilename
        hex_key = csvFilename + ".hex"
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, 
            hex_key=hex_key, timeoutSecs=180)
        inspect = h2o_cmd.runInspect(key=parseResult['destination_key'])
        print "\n" + csvPathname, \
            "    numRows:", "{:,}".format(inspect['numRows']), \
            "    numCols:", "{:,}".format(inspect['numCols'])

        numCols = inspect['numCols']
        numRows = inspect['numRows']
        pct10 = int(numRows * .1)
        rowsForPct = [i * pct10 for i in range(0,11)]
        # this can be slightly less than 10%
        last10 = numRows - rowsForPct[9]
        rowsForPct[10] = numRows
        # use mod below for picking "rows-to-do" in case we do more than 9 trials
        # use 10 if 0 just to see (we copied 10 to 0 above)
        rowsForPct[0] = rowsForPct[10]

        # 0 isn't used
        expectTrainPctRightList = [0, 85.16, 88.45, 90.24, 91.27, 92.03, 92.64, 93.11, 93.48, 93.79]
        expectScorePctRightList = [0, 88.81, 91.72, 93.06, 94.02, 94.52, 95.09, 95.41, 95.77, 95.78]

        # keep the 0 entry empty
        actualTrainPctRightList = [0]
        actualScorePctRightList = [0]
        
        trial = 0
        for rowPct in [0.9]:
            trial += 1
            # Not using this now (did use it for slicing)
            rowsToUse = rowsForPct[trial%10] 
            resultKey = "r_" + csvFilename + "_" + str(trial)
            
            # just do random split for now
            dataKeyTrain = 'rTrain.hex'
            dataKeyTest = 'rTest.hex'

            response = "C55"
            h2o_cmd.createTestTrain(hex_key, dataKeyTrain, dataKeyTest, trainPercent=90, outputClass=4, 
                outputCol=numCols-1, changeToBinomial=not DO_MULTINOMIAL)
            sliceResult = {'destination_key': dataKeyTrain}

            # adjust timeoutSecs with the number of trees
            kwargs = paramDict.copy()
            kwargs['destination_key'] = "model_" + csvFilename + "_" + str(trial)
            timeoutSecs = 30 + kwargs['ntrees'] * 20
            start = time.time()
            # have to pass validation= param to avoid getting no error results (since 100% sample..DRF2 doesn't like that)
            rfv = h2o_cmd.runRF(parseResult=sliceResult, timeoutSecs=timeoutSecs, validation=dataKeyTest, **kwargs)

            elapsed = time.time() - start
            print "RF end on ", csvPathname, 'took', elapsed, 'seconds.', \
                "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)

            (error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfv, **kwargs)
            # oobeTrainPctRight = 100 * (1.0 - error)
            oobeTrainPctRight = 100 - error
            if checkExpectedResults:
                self.assertAlmostEqual(oobeTrainPctRight, expectTrainPctRightList[trial],
                    msg="OOBE: pct. right for %s pct. training not close enough %6.2f %6.2f"% \
                        ((trial*10), oobeTrainPctRight, expectTrainPctRightList[trial]), delta=ALLOWED_DELTA)
            actualTrainPctRightList.append(oobeTrainPctRight)

            print "Now score on the last 10%. Note this is silly if we trained on 100% of the data"
            print "Or sorted by output class, so that the last 10% is the last few classes"
            rf_model = rfv['drf_model']
            used_trees = rf_model['N']
            data_key = rf_model['_dataKey']
            model_key = rf_model['_key']

            rfvScoring = h2o_cmd.runScore(dataKey=dataKeyTest, modelKey=model_key, 
                vactual=response, vpredict=1, expectedAuc=expectedAuc)
            print h2o.dump_json(rfvScoring)
            h2o_rf.simpleCheckRFScore(rfv=rfvScoring, **kwargs)
            print "hello7"
            (error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFScore(rfv=rfvScoring, **kwargs)
            fullScorePctRight = 100 - error

            h2o.nodes[0].generate_predictions(model_key=model_key, data_key=dataKeyTest)

            if checkExpectedResults:
                self.assertAlmostEqual(fullScorePctRight,expectScorePctRightList[trial],
                    msg="Full: pct. right for scoring after %s pct. training not close enough %6.2f %6.2f"% \
                        ((trial*10), fullScorePctRight, expectScorePctRightList[trial]), delta=ALLOWED_DELTA)
            actualScorePctRightList.append(fullScorePctRight)

            print "Trial #", trial, "completed", "using %6.2f" % (rowsToUse*100.0/numRows), "pct. of all rows"

        actualDelta = [abs(a-b) for a,b in zip(expectTrainPctRightList, actualTrainPctRightList)]
        niceFp = ["{0:0.2f}".format(i) for i in actualTrainPctRightList]
        print "maybe should update with actual. Remove single quotes"  
        print "actualTrainPctRightList =", niceFp
        niceFp = ["{0:0.2f}".format(i) for i in actualDelta]
        print "actualDelta =", niceFp

        actualDelta = [abs(a-b) for a,b in zip(expectScorePctRightList, actualScorePctRightList)]
        niceFp = ["{0:0.2f}".format(i) for i in actualScorePctRightList]
        print "maybe should update with actual. Remove single quotes"  
        print "actualScorePctRightList =", niceFp
        niceFp = ["{0:0.2f}".format(i) for i in actualDelta]
        print "actualDelta =", niceFp

        return rfvScoring
    def test_GLM2_enums_score_subset(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()

        n = 500
        tryList = [
            # (n, 1, 'cD', 300), 
            # (n, 2, 'cE', 300), 
            # (n, 3, 'cF', 300), 
            (n, 4, 'cG', 300), 
            (n, 5, 'cH', 300), 
            (n, 6, 'cI', 300), 
            ]

        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            # using the comma is nice to ensure no craziness
            colSepHexString = '2c' # comma
            colSepChar = colSepHexString.decode('hex')
            colSepInt = int(colSepHexString, base=16)
            print "colSepChar:", colSepChar

            rowSepHexString = '0a' # newline
            rowSepChar = rowSepHexString.decode('hex')
            print "rowSepChar:", rowSepChar

            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            csvScoreFilename = 'syn_enums_score_' + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvScorePathname = SYNDATASETS_DIR + '/' + csvScoreFilename

            enumList = create_enum_list(listSize=10)
            # use half of the enums for creating the scoring dataset
            enumListForScore = random.sample(enumList,5)

            print "Creating random", csvPathname, "for glm model building"
            write_syn_dataset(csvPathname, enumList, rowCount, colCount, SEEDPERFILE, 
                colSepChar=colSepChar, rowSepChar=rowSepChar)

            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, 
                timeoutSecs=30, separator=colSepInt)

            print "Creating random", csvScorePathname, "for glm scoring with prior model (using enum subset)"
            write_syn_dataset(csvScorePathname, enumListForScore, rowCount, colCount, SEEDPERFILE, 
                colSepChar=colSepChar, rowSepChar=rowSepChar)

            parseResult = h2i.import_parse(path=csvScorePathname, schema='put', hex_key="score_" + hex_key, 
                timeoutSecs=30, separator=colSepInt)


            print "\n" + csvFilename
            (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \
                h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=True)

            y = colCount
            kwargs = {'response': y, 'max_iter': 8, 'family': 'binomial', 'n_folds': 2, 'alpha': 0.2, 'lambda': 1e-5}
            start = time.time()
            glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=180, **kwargs)
            print "glm end on ", parseResult['destination_key'], 'took', time.time() - start, 'seconds'

            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)

            # Score *******************************
            # this messes up if you use case_mode/case_vale above
            predictKey = 'Predict.hex'
            modelKey = glm['glm_model']['_key']
            h2o_cmd.runScore(dataKey="score_" + hex_key, modelKey=modelKey, vactual=y, vpredict=1, expectedAuc=0.5)
    def test_GLM_enums_unbalanced(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()

        n = 2000
        tryList = [
            (n, 1, 'cD', 300), 
            (n, 2, 'cE', 300), 
            (n, 4, 'cF', 300), 
            (n, 8, 'cG', 300), 
            (n, 16, 'cH', 300), 
            (n, 32, 'cI', 300), 
            ]

        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            # using the comma is nice to ensure no craziness
            colSepHexString = '2c' # comma
            colSepChar = colSepHexString.decode('hex')
            colSepInt = int(colSepHexString, base=16)
            print "colSepChar:", colSepChar

            rowSepHexString = '0a' # newline
            rowSepChar = rowSepHexString.decode('hex')
            print "rowSepChar:", rowSepChar

            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            csvScoreFilename = 'syn_enums_score_' + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvScorePathname = SYNDATASETS_DIR + '/' + csvScoreFilename

            enumList = create_enum_list(listSize=10)
            # use half of the enums for creating the scoring dataset
            enumListForScore = random.sample(enumList,5)

            print "Creating random", csvPathname, "for glm2 model building"
            write_syn_dataset(csvPathname, enumList, rowCount, colCount, SEEDPERFILE, 
                colSepChar=colSepChar, rowSepChar=rowSepChar)

            print "Creating another random", csvScorePathname, "for glm2 scoring with prior model (using enum subset)"
            write_syn_dataset(csvScorePathname, enumListForScore, rowCount, colCount, SEEDPERFILE, 
                colSepChar=colSepChar, rowSepChar=rowSepChar)

            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, 
                timeoutSecs=30, separator=colSepInt)
            print "Parse result['destination_key']:", parseResult['destination_key']

            print "\n" + csvFilename
            (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \
                h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=True)

            testDataKey = "score_" + hex_key
            parseResult = h2i.import_parse(path=csvScorePathname, schema='put', hex_key=testDataKey,
                timeoutSecs=30, separator=colSepInt)

            y = colCount
            modelKey = 'glm_model'
            kwargs = {
                'standardize': 0,
                'destination_key': modelKey,
                'response': 'C' + str(y+1), 
                'max_iter': 200, 
                'family': 'binomial',
                'n_folds': 0, 
                'alpha': 0, 
                'lambda': 0, 
                }

            start = time.time()

            updateList= [ 
                {'alpha': 0.5, 'lambda': 1e-4},
                {'alpha': 0.25, 'lambda': 1e-6},
                {'alpha': 0.0, 'lambda': 1e-12},
                {'alpha': 0.5, 'lambda': 1e-12},
                {'alpha': 0.0, 'lambda': 1e-12},
                {'alpha': 0.0, 'lambda': 0},
            ]

            # Try each one
            for updateDict in updateList:
                print "\n#################################################################"
                print updateDict
                kwargs.update(updateDict)
                print "If we poll, we get a message saying it was cancelled by user??"
                glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=180, **kwargs)
                print "glm2 end on ", parseResult['destination_key'], 'took', time.time() - start, 'seconds'

                glm_model = glm['glm_model']
                _names = glm_model['_names']
                modelKey = glm_model['_key']
                coefficients_names = glm_model['coefficients_names']
                submodels = glm_model['submodels'][0]

                beta = submodels['beta']
                norm_beta = submodels['norm_beta']
                iteration = submodels['iteration']

                validation = submodels['validation']

                auc = validation['auc']
                aic = validation['aic']
                null_deviance = validation['null_deviance']
                residual_deviance = validation['residual_deviance']

                print '_names', _names
                print 'coefficients_names', coefficients_names
                # did beta get shortened? the simple check confirms names/beta/norm_beta are same length

                print 'beta', beta
                print 'iteration', iteration
                print 'auc', auc

                h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
                if iteration > 20:
                    raise Exception("Why take so many iterations:  %s in this glm2 training?" % iterations)

               # Score **********************************************
                print "Problems with test data having different enums than train? just use train for now"
                testDataKey = hex_key
                h2o_cmd.runScore(dataKey=testDataKey, modelKey=modelKey, vactual=y, vpredict=1, expectedAuc=0.5)
    def test_rf_enums_mappings_fvec(self):
        """Train RF (or SpeedRF if the module SPEEDRF flag is set) repeatedly
        on synthetic all-enum datasets that share one (reversed) enum list,
        then compare classification error across runs.

        Train/score data use fixed seeds, so only h2o's internal enum->int
        mapping should vary between tryList entries; differing errors would
        point at instability in categorical split decisions.
        """
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()

        n = 3000
        # (rowCount, colCount, hex_key, timeoutSecs); three identical entries
        # on purpose -- same data spec, different enum mapping per run
        tryList = [
            # (n, 1, 'cD', 300), 
            # (n, 2, 'cE', 300), 
            # (n, 3, 'cF', 300), 
            # (n, 4, 'cG', 300), 
            # (n, 5, 'cH', 300), 
            # (n, 6, 'cI', 300), 
            (n, 3, 'cI', 300), 
            (n, 3, 'cI', 300), 
            (n, 3, 'cI', 300), 
            ]

        # fixed seeds so runs are reproducible (randomized seed kept for reference)
        # SEED_FOR_TRAIN = random.randint(0, sys.maxint)
        SEED_FOR_TRAIN = 1234567890
        SEED_FOR_SCORE = 9876543210
        # accumulated across ALL tryList entries, printed after each dataset
        errorHistory = []
        enumHistory = []
        lastcolsTrainHistory = []
        lastcolsScoreHistory = []

        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            enumList = create_enum_list(listSize=ENUMS)
            # reverse the list
            enumList.reverse()

            # using the comma is nice to ensure no craziness
            colSepHexString = '2c' # comma
            colSepChar = colSepHexString.decode('hex')
            colSepInt = int(colSepHexString, base=16)
            print "colSepChar:", colSepChar

            rowSepHexString = '0a' # newline
            rowSepChar = rowSepHexString.decode('hex')
            print "rowSepChar:", rowSepChar

            csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            csvScoreFilename = 'syn_enums_score_' + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvScorePathname = SYNDATASETS_DIR + '/' + csvScoreFilename

            # use same enum List
            enumListForScore = enumList

            print "Creating random", csvPathname, "for rf model building"
            # write_syn_dataset returns the last row as integers (kept for the
            # end-of-run dump below)
            lastcols = write_syn_dataset(csvPathname, enumList, rowCount, colCount, 
                colSepChar=colSepChar, rowSepChar=rowSepChar, SEED=SEED_FOR_TRAIN)

            lastcolsTrainHistory.append(lastcols)

            print "Creating random", csvScorePathname, "for rf scoring with prior model (using same enum list)"
            # same enum list/mapping, but different dataset?
            lastcols = write_syn_dataset(csvScorePathname, enumListForScore, rowCount, colCount, 
                colSepChar=colSepChar, rowSepChar=rowSepChar, SEED=SEED_FOR_SCORE)
            lastcolsScoreHistory.append(lastcols)

            # parse score data first so parseResult below ends up as the TRAIN frame
            scoreDataKey = "score_" + hex_key
            parseResult = h2i.import_parse(path=csvScorePathname, schema='put', hex_key=scoreDataKey, 
                timeoutSecs=30, separator=colSepInt)

            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key,
                timeoutSecs=30, separator=colSepInt)
            print "Parse result['destination_key']:", parseResult['destination_key']

            print "\n" + csvFilename
            # sanity-check the parsed columns; raises if any missing values
            (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \
                h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=True)

            # response is the last column (0-based index == colCount)
            y = colCount
            modelKey = 'enums'
            # limit depth and number of trees to accentuate the issue with categorical split decisions

            if SPEEDRF:
                kwargs = {
                    'destination_key': modelKey,
                    'response': y,
                    'num_trees': 1,
                    'max_depth': 100,
                    'oobee': 1,
                    'seed': 123456789,
                }
            else:
                kwargs = {
                    'destination_key': modelKey,
                    'response': y,
                    'classification': 1,
                    'ntrees': 1,
                    'max_depth': 100,
                    'min_rows': 1,
                    'validation': scoreDataKey,
                    'seed': 123456789,
                }

            # build + score 4 times on the same frame to observe run-to-run variation
            for r in range(4):
                start = time.time()
                
                if SPEEDRF:
                    rfResult = h2o_cmd.runSpeeDRF(parseResult=parseResult, 
                        timeoutSecs=timeoutSecs, pollTimeoutSecs=180, **kwargs)
                else:
                    rfResult = h2o_cmd.runRF(parseResult=parseResult, 
                        timeoutSecs=timeoutSecs, pollTimeoutSecs=180, **kwargs)
                
                print "rf end on ", parseResult['destination_key'], 'took', time.time() - start, 'seconds'
                # print h2o.dump_json(rfResult)
                (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfResult)
                # AUC only makes sense for binomial; skip it when MULTINOMIAL is set
                h2o_cmd.runScore(dataKey=scoreDataKey, modelKey=modelKey, vactual=y, vpredict=1, doAUC=not MULTINOMIAL) # , expectedAuc=0.5)
                
                errorHistory.append(classification_error)
                enumHistory.append(enumList)

            print "error from all runs on this dataset (with different enum mappings)"
            print errorHistory
            for e in enumHistory:
                print e

            print "last row from all train datasets, as integer"
            for l in lastcolsTrainHistory:
                print l
            print "last row from all score datasets, as integer"
            for l in lastcolsScoreHistory:
                print l
    def test_GLM_enums_unbalanced(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()

        n = 2000
        tryList = [
            (n, 1, 'cD', 300),
            (n, 2, 'cE', 300),
            (n, 4, 'cF', 300),
            (n, 8, 'cG', 300),
            (n, 16, 'cH', 300),
            (n, 32, 'cI', 300),
        ]

        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            # using the comma is nice to ensure no craziness
            colSepHexString = '2c'  # comma
            colSepChar = colSepHexString.decode('hex')
            colSepInt = int(colSepHexString, base=16)
            print "colSepChar:", colSepChar

            rowSepHexString = '0a'  # newline
            rowSepChar = rowSepHexString.decode('hex')
            print "rowSepChar:", rowSepChar

            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str(
                colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            csvScoreFilename = 'syn_enums_score_' + str(rowCount) + 'x' + str(
                colCount) + '.csv'
            csvScorePathname = SYNDATASETS_DIR + '/' + csvScoreFilename

            enumList = create_enum_list(listSize=10)
            # use half of the enums for creating the scoring dataset
            enumListForScore = random.sample(enumList, 5)

            print "Creating random", csvPathname, "for glm2 model building"
            write_syn_dataset(csvPathname,
                              enumList,
                              rowCount,
                              colCount,
                              SEEDPERFILE,
                              colSepChar=colSepChar,
                              rowSepChar=rowSepChar)

            print "Creating another random", csvScorePathname, "for glm2 scoring with prior model (using enum subset)"
            write_syn_dataset(csvScorePathname,
                              enumListForScore,
                              rowCount,
                              colCount,
                              SEEDPERFILE,
                              colSepChar=colSepChar,
                              rowSepChar=rowSepChar)

            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='put',
                                           hex_key=hex_key,
                                           timeoutSecs=30,
                                           separator=colSepInt)
            print "Parse result['destination_key']:", parseResult[
                'destination_key']

            print "\n" + csvFilename
            (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \
                h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=True)

            testDataKey = "score_" + hex_key
            parseResult = h2i.import_parse(path=csvScorePathname,
                                           schema='put',
                                           hex_key=testDataKey,
                                           timeoutSecs=30,
                                           separator=colSepInt)

            y = colCount
            modelKey = 'glm_model'
            kwargs = {
                'standardize': 0,
                'destination_key': modelKey,
                'response': 'C' + str(y + 1),
                'max_iter': 200,
                'family': 'binomial',
                'n_folds': 0,
                'alpha': 0,
                'lambda': 0,
            }

            start = time.time()

            updateList = [
                {
                    'alpha': 0.5,
                    'lambda': 1e-4
                },
                {
                    'alpha': 0.25,
                    'lambda': 1e-6
                },
                {
                    'alpha': 0.0,
                    'lambda': 1e-12
                },
                {
                    'alpha': 0.5,
                    'lambda': 1e-12
                },
                {
                    'alpha': 0.0,
                    'lambda': 1e-12
                },
                {
                    'alpha': 0.0,
                    'lambda': 0
                },
            ]

            # Try each one
            for updateDict in updateList:
                print "\n#################################################################"
                print updateDict
                kwargs.update(updateDict)
                print "If we poll, we get a message saying it was cancelled by user??"
                glm = h2o_cmd.runGLM(parseResult=parseResult,
                                     timeoutSecs=timeoutSecs,
                                     pollTimeoutSecs=180,
                                     **kwargs)
                print "glm2 end on ", parseResult[
                    'destination_key'], 'took', time.time() - start, 'seconds'

                glm_model = glm['glm_model']
                _names = glm_model['_names']
                modelKey = glm_model['_key']
                coefficients_names = glm_model['coefficients_names']
                submodels = glm_model['submodels'][0]

                beta = submodels['beta']
                norm_beta = submodels['norm_beta']
                iteration = submodels['iteration']

                validation = submodels['validation']

                auc = validation['auc']
                aic = validation['aic']
                null_deviance = validation['null_deviance']
                residual_deviance = validation['residual_deviance']

                print '_names', _names
                print 'coefficients_names', coefficients_names
                # did beta get shortened? the simple check confirms names/beta/norm_beta are same length

                print 'beta', beta
                print 'iteration', iteration
                print 'auc', auc

                h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
                if iteration > 20:
                    raise Exception(
                        "Why take so many iterations:  %s in this glm2 training?"
                        % iterations)

            # Score **********************************************
                print "Problems with test data having different enums than train? just use train for now"
                testDataKey = hex_key
                h2o_cmd.runScore(dataKey=testDataKey,
                                 modelKey=modelKey,
                                 vactual=y,
                                 vpredict=1,
                                 expectedAuc=0.5)
    def test_GLM2_ints_unbalanced(self):
        h2o.beta_features = True
        ### h2b.browseTheCloud()
        SYNDATASETS_DIR = h2o.make_syn_dir()

        n = 2000
        tryList = [
            (n, 1, 'cD', 300), 
            (n, 2, 'cE', 300), 
            (n, 4, 'cF', 300), 
            (n, 8, 'cG', 300), 
            (n, 16, 'cH', 300), 
            (n, 32, 'cI', 300), 
            ]

        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            # using the comma is nice to ensure no craziness
            colSepHexString = '2c' # comma
            colSepChar = colSepHexString.decode('hex')
            colSepInt = int(colSepHexString, base=16)
            print "colSepChar:", colSepChar

            rowSepHexString = '0a' # newline
            rowSepChar = rowSepHexString.decode('hex')
            print "rowSepChar:", rowSepChar

            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            csvScoreFilename = 'syn_enums_score_' + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvScorePathname = SYNDATASETS_DIR + '/' + csvScoreFilename

            enumList = create_enum_list()
            # use half of the enums for creating the scoring dataset
            enumListForScore = random.sample(enumList,5)

            print "Creating random", csvPathname, "for glm model building"
            write_syn_dataset(csvPathname, enumList, rowCount, colCount, SEEDPERFILE, 
                colSepChar=colSepChar, rowSepChar=rowSepChar)

            print "Creating random", csvScorePathname, "for glm scoring with prior model (using enum subset)"
            write_syn_dataset(csvScorePathname, enumListForScore, rowCount, colCount, SEEDPERFILE, 
                colSepChar=colSepChar, rowSepChar=rowSepChar)

            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, 
                timeoutSecs=30, separator=colSepInt)
            print "Parse result['destination_key']:", parseResult['destination_key']

            print "\n" + csvFilename
            (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \
                h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=True)

            y = colCount
            modelKey = 'xyz'
            kwargs = {
                'n_folds': 0,
                'destination_key': modelKey,
                'response': y, 
                'max_iter': 200, 
                'family': 'binomial',
                'alpha': 0, 
                'lambda': 0, 
                }

            start = time.time()

            updateList= [ 
                {'alpha': 0.5, 'lambda': 1e-5},
                # {'alpha': 0.25, 'lambda': 1e-4},
            ]


            # Try each one
            for updateDict in updateList:
                print "\n#################################################################"
                print updateDict
                kwargs.update(updateDict)
                glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=180, **kwargs)
                print "glm end on ", parseResult['destination_key'], 'took', time.time() - start, 'seconds'

                h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)

                parseResult = h2i.import_parse(path=csvScorePathname, schema='put', hex_key="B.hex",
                    timeoutSecs=30, separator=colSepInt)

                h2o_cmd.runScore(dataKey="B.hex", modelKey=modelKey, 
                    vactual='C' + str(y+1), vpredict=1, expectedAuc=0.6)
Beispiel #18
0
    def rf_covtype_train_oobe(self,
                              csvFilename,
                              checkExpectedResults=True,
                              expectedAuc=0.5):
        # the expected results are only for the shuffled version
        # since getting 10% samples etc of the smallish dataset will vary between
        # shuffled and non-shuffled datasets
        importFolderPath = "standard"
        csvPathname = importFolderPath + "/" + csvFilename
        hex_key = csvFilename + ".hex"
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets',
                                       path=csvPathname,
                                       hex_key=hex_key,
                                       timeoutSecs=180)
        inspect = h2o_cmd.runInspect(key=parseResult['destination_key'])
        print "\n" + csvPathname, \
            "    numRows:", "{:,}".format(inspect['numRows']), \
            "    numCols:", "{:,}".format(inspect['numCols'])

        numCols = inspect['numCols']
        numRows = inspect['numRows']
        pct10 = int(numRows * .1)
        rowsForPct = [i * pct10 for i in range(0, 11)]
        # this can be slightly less than 10%
        last10 = numRows - rowsForPct[9]
        rowsForPct[10] = numRows
        # use mod below for picking "rows-to-do" in case we do more than 9 trials
        # use 10 if 0 just to see (we copied 10 to 0 above)
        rowsForPct[0] = rowsForPct[10]

        # 0 isn't used
        expectTrainPctRightList = [
            0, 85.16, 88.45, 90.24, 91.27, 92.03, 92.64, 93.11, 93.48, 93.79
        ]
        expectScorePctRightList = [
            0, 88.81, 91.72, 93.06, 94.02, 94.52, 95.09, 95.41, 95.77, 95.78
        ]

        # keep the 0 entry empty
        actualTrainPctRightList = [0]
        actualScorePctRightList = [0]

        trial = 0
        for rowPct in [0.9]:
            trial += 1
            # Not using this now (did use it for slicing)
            rowsToUse = rowsForPct[trial % 10]
            resultKey = "r_" + csvFilename + "_" + str(trial)

            # just do random split for now
            dataKeyTrain = 'rTrain.hex'
            dataKeyTest = 'rTest.hex'

            response = "C55"
            h2o_cmd.createTestTrain(hex_key,
                                    dataKeyTrain,
                                    dataKeyTest,
                                    trainPercent=90,
                                    outputClass=4,
                                    outputCol=numCols - 1,
                                    changeToBinomial=not DO_MULTINOMIAL)
            sliceResult = {'destination_key': dataKeyTrain}

            # adjust timeoutSecs with the number of trees
            kwargs = paramDict.copy()
            kwargs['destination_key'] = "model_" + csvFilename + "_" + str(
                trial)
            timeoutSecs = 30 + kwargs['ntrees'] * 20
            start = time.time()
            # have to pass validation= param to avoid getting no error results (since 100% sample..DRF2 doesn't like that)
            rfv = h2o_cmd.runRF(parseResult=sliceResult,
                                timeoutSecs=timeoutSecs,
                                validation=dataKeyTest,
                                **kwargs)

            elapsed = time.time() - start
            print "RF end on ", csvPathname, 'took', elapsed, 'seconds.', \
                "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)

            (error, classErrorPctList,
             totalScores) = h2o_rf.simpleCheckRFView(rfv=rfv, **kwargs)
            # oobeTrainPctRight = 100 * (1.0 - error)
            oobeTrainPctRight = 100 - error
            if checkExpectedResults:
                self.assertAlmostEqual(oobeTrainPctRight, expectTrainPctRightList[trial],
                    msg="OOBE: pct. right for %s pct. training not close enough %6.2f %6.2f"% \
                        ((trial*10), oobeTrainPctRight, expectTrainPctRightList[trial]), delta=ALLOWED_DELTA)
            actualTrainPctRightList.append(oobeTrainPctRight)

            print "Now score on the last 10%. Note this is silly if we trained on 100% of the data"
            print "Or sorted by output class, so that the last 10% is the last few classes"
            rf_model = rfv['drf_model']
            used_trees = rf_model['N']
            data_key = rf_model['_dataKey']
            model_key = rf_model['_key']

            rfvScoring = h2o_cmd.runScore(dataKey=dataKeyTest,
                                          modelKey=model_key,
                                          vactual=response,
                                          vpredict=1,
                                          expectedAuc=expectedAuc)
            print h2o.dump_json(rfvScoring)
            h2o_rf.simpleCheckRFScore(rfv=rfvScoring, **kwargs)
            print "hello7"
            (error, classErrorPctList,
             totalScores) = h2o_rf.simpleCheckRFScore(rfv=rfvScoring, **kwargs)
            fullScorePctRight = 100 - error

            h2o.nodes[0].generate_predictions(model_key=model_key,
                                              data_key=dataKeyTest)

            if checkExpectedResults:
                self.assertAlmostEqual(fullScorePctRight,expectScorePctRightList[trial],
                    msg="Full: pct. right for scoring after %s pct. training not close enough %6.2f %6.2f"% \
                        ((trial*10), fullScorePctRight, expectScorePctRightList[trial]), delta=ALLOWED_DELTA)
            actualScorePctRightList.append(fullScorePctRight)

            print "Trial #", trial, "completed", "using %6.2f" % (
                rowsToUse * 100.0 / numRows), "pct. of all rows"

        actualDelta = [
            abs(a - b)
            for a, b in zip(expectTrainPctRightList, actualTrainPctRightList)
        ]
        niceFp = ["{0:0.2f}".format(i) for i in actualTrainPctRightList]
        print "maybe should update with actual. Remove single quotes"
        print "actualTrainPctRightList =", niceFp
        niceFp = ["{0:0.2f}".format(i) for i in actualDelta]
        print "actualDelta =", niceFp

        actualDelta = [
            abs(a - b)
            for a, b in zip(expectScorePctRightList, actualScorePctRightList)
        ]
        niceFp = ["{0:0.2f}".format(i) for i in actualScorePctRightList]
        print "maybe should update with actual. Remove single quotes"
        print "actualScorePctRightList =", niceFp
        niceFp = ["{0:0.2f}".format(i) for i in actualDelta]
        print "actualDelta =", niceFp

        return rfvScoring