Example 1
    def notest_GenParity1(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        parityPl = h2o.find_file('syn_scripts/parity.pl')

        # two row dataset gets this. Avoiding it for now
        # java.lang.ArrayIndexOutOfBoundsException: 1
        # at hex.rf.Data.sample_fair(Data.java:149)

        # always match the run below!
        print "\nAssuming two row dataset is illegal. avoiding"

        for x in xrange(10, 20, 10):
            shCmdString = "perl " + parityPl + " 128 4 " + str(
                x) + " quad " + SYNDATASETS_DIR
            h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split())
            # algorithm for creating the path and filename is hardwired in parity.pl.
            csvFilename = "parity_128_4_" + str(x) + "_quad.data"

        trees = "1,2,3,4,5,6"
        timeoutSecs = 20
        # always match the gen above!
        # FIX! we fail if min is 3
        for x in xrange(10, 20, 10):
            sys.stdout.write('.')
            sys.stdout.flush()
            csvFilename = "parity_128_4_" + str(x) + "_quad.data"
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            parseResult = h2i.import_parse(path=csvPathname, schema='put')
            h2o_cmd.runSpeeDRF(parseResult=parseResult,
                               response=8,
                               ntrees=trees,
                               timeoutSecs=timeoutSecs)
            timeoutSecs += 2
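Every snippet on this page is a method of a unittest test class from the H2O (classic) Python test harness; the imports and the cloud setup around them are not shown. A rough sketch of that scaffolding is given below, assuming the harness modules (h2o, h2o_cmd, h2o_import, h2o_jobs, h2o_rf, h2o_util) are on sys.path; the class name and the build_cloud arguments are illustrative, not taken from any of these examples.

    import unittest, sys, time, random
    sys.path.extend(['.', '..', 'py'])  # the harness modules live alongside the tests
    import h2o, h2o_cmd, h2o_import as h2i, h2o_jobs, h2o_rf, h2o_util

    class SpeeDRFExamples(unittest.TestCase):  # illustrative class name

        @classmethod
        def setUpClass(cls):
            # the suite starts a local H2O cloud before running tests;
            # the exact build_cloud arguments vary per test and are assumed here
            h2o.build_cloud(node_count=1, java_heap_GB=4)

        @classmethod
        def tearDownClass(cls):
            h2o.tear_down_cloud()

        # ... the test_* / notest_* methods shown in these examples go here ...

    if __name__ == '__main__':
        h2o.unit_main()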
Example 2
 def test_speedrf_params_rand2_fvec(self):
     h2o.beta_features = True
     csvPathname = 'standard/covtype.data'
     hex_key = 'covtype.data.hex'
     for trial in range(10):
         # params is mutable. This is default.
         # response is required for SpeeDRF
         params = {
             'response': 'C55', 
             'ntrees': 1, 'mtries': 7, 
             'balance_classes': 0, 
             # never run with unconstrained balance_classes size if random sets balance_classes..too slow
             'max_after_balance_size': 2,
             'importance': 0}
         colX = h2o_util.pickRandParams(paramDict, params)
         if 'cols' in params and params['cols']:
             # exclusion
             if 'ignored_cols_by_name' in params:
                 params['ignored_cols_by_name'] = None
         else:
             if 'ignored_cols_by_name' in params and params['ignored_cols_by_name']:
                 params['mtries'] = random.randint(1,53)
             else:
                 params['mtries'] = random.randint(1,54)
             
         kwargs = params.copy()
         # adjust timeoutSecs with the number of trees
         timeoutSecs = 80 + ((kwargs['ntrees']*80) * max(1,kwargs['mtries']/60) )
         start = time.time()
         parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='put', hex_key=hex_key)
         h2o_cmd.runSpeeDRF(parseResult=parseResult, timeoutSecs=timeoutSecs, retryDelaySecs=1, **kwargs)
         elapsed = time.time()-start
         print "Trial #", trial, "completed in", elapsed, "seconds.", "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
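Example 2 (and its reformatted copy, Example 12) samples random settings from a module-level paramDict via h2o_util.pickRandParams; that dictionary is not part of the snippet. A hedged sketch of what it could look like, using only SpeeDRF parameter names that appear elsewhere on this page; the candidate value lists are illustrative guesses, and the assumption is that pickRandParams picks one value per key and merges it into params.

    # Hypothetical paramDict for the random parameter search above.
    paramDict = {
        'response': ['C55'],
        'ntrees': [1, 3, 10],
        'mtries': [None, 7, 13],
        'max_depth': [None, 10, 20, 40],
        'sample_rate': [None, 0.20, 0.67],
        'nbins': [None, 10, 100, 1024],
        'seed': [None, 784834182943470027],
        'importance': [0, 1],
        'balance_classes': [0],
        'max_after_balance_size': [None, 2],
        'ignored_cols_by_name': [None, 'C1', 'C1,C2'],
    }

The hard-coded defaults in params then override whichever slow combination the random pick lands on, as the balance_classes comment in the snippet suggests.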
Example 3
 def notest_RF_poker100(self):
     h2o.beta_features = True
     trees = 6
     timeoutSecs = 20
     csvPathname = "poker/poker100"
     parseResult = h2i.import_parse(bucket="smalldata", path=csvPathname, schema="put")
     h2o_cmd.runSpeeDRF(parseResult=parseResult, num_trees=trees, timeoutSecs=timeoutSecs)
Example 4
 def notest_RF_iris2(self):
     h2o.beta_features = True
     trees = ",".join(map(str,range(1,4)))
     timeoutSecs = 20
     csvPathname = 'iris/iris2.csv'
     parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put')
     h2o_cmd.runSpeeDRF(parseResult=parseResult, ntrees=trees, timeoutSecs=timeoutSecs)
Example 5
    def test_GenParity1(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        parityPl = h2o.find_file('syn_scripts/parity.pl')

        # two row dataset gets this. Avoiding it for now
        # java.lang.ArrayIndexOutOfBoundsException: 1
        # at hex.rf.Data.sample_fair(Data.java:149)

        # always match the run below!
        print "\nAssuming two row dataset is illegal. avoiding"

        for x in xrange(10,100,10):
            shCmdString = "perl " + parityPl + " 128 4 "+ str(x) + " quad " + SYNDATASETS_DIR
            h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split())
            # algorithm for creating the path and filename is hardwired in parity.pl.
            csvFilename = "parity_128_4_" + str(x) + "_quad.data"

        trees = 6
        timeoutSecs = 20
        # always match the gen above!
        # FIX! we fail if min is 3
        for x in xrange(10,100,10):
            sys.stdout.write('.')
            sys.stdout.flush()
            csvFilename = "parity_128_4_" + str(x) + "_quad.data"
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            parseResult = h2i.import_parse(path=csvPathname, schema='put')
            h2o_cmd.runSpeeDRF(parseResult=parseResult, response=8, ntrees=trees, timeoutSecs=timeoutSecs)

            trees += 10
            timeoutSecs += 2
Example 6
    def test_RF_poker100(self):
        MISSING_RESPONSE = True
        trees = ",".join(map(str, range(1, 4)))
        trees = "1,2"
        timeoutSecs = 20
        csvPathname = 'poker/poker100'
        parseResult = h2i.import_parse(bucket='smalldata',
                                       path=csvPathname,
                                       schema='put')
        jobs = []
        for i in range(1):
            if MISSING_RESPONSE:
                rfResult = h2o_cmd.runSpeeDRF(parseResult=parseResult,
                                              ntrees=trees,
                                              timeoutSecs=timeoutSecs)
            else:
                rfResult = h2o_cmd.runSpeeDRF(parseResult=parseResult,
                                              response='C11',
                                              ntrees=trees,
                                              timeoutSecs=timeoutSecs)
            job_key = rfResult['job_key']
            model_key = rfResult['destination_key']
            jobs.append((job_key, model_key))

        h2o_jobs.pollWaitJobs(timeoutSecs=300)

        for job_key, model_key in jobs:
            gridResult = h2o.nodes[0].speedrf_grid_view(
                job_key=job_key, destination_key=model_key)
            # h2o_rf.showRFGridResults(GBMResult, 15)

            print "speedrf grid result for %s:" % model_key, h2o.dump_json(gridResult)
Example 7
    def test_RF_poker100(self):
        MISSING_RESPONSE = False
        DO_MODEL_INSPECT = False
        trees = ",".join(map(str,range(10,50,2)))
        timeoutSecs = 20
        csvPathname = 'poker/poker100'
        parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put')
        jobs = []
        for i in range(1):
            if MISSING_RESPONSE:
                rfResult = h2o_cmd.runSpeeDRF(parseResult=parseResult, ntrees=trees, timeoutSecs=timeoutSecs)
            else:
                rfResult = h2o_cmd.runSpeeDRF(parseResult=parseResult, response='C11', ntrees=trees, timeoutSecs=timeoutSecs)
            job_key = rfResult['job_key']
            model_key = rfResult['destination_key']
            jobs.append( (job_key, model_key) )

        h2o_jobs.pollWaitJobs(timeoutSecs=300)

        for job_key, model_key  in jobs:

            gridResult = h2o.nodes[0].speedrf_grid_view(job_key=job_key, destination_key=model_key)
            print "speedrf grid result for %s:" % model_key, h2o.dump_json(gridResult)

            print "speedrf grid result errors:", gridResult['prediction_errors']
            for i,j in enumerate(gridResult['jobs']):
                if DO_MODEL_INSPECT:
                    print "\nspeedrf result %s:" % i, h2o.dump_json(h2o_cmd.runInspect(key=j['destination_key']))
                else:
                    # model = h2o.nodes[0].speedrf_view(modelKey=j['destination_key'])
                    model = h2o.nodes[0].speedrf_view(modelKey=j['destination_key'])
                    print "model:", h2o.dump_json(model)
Example 8
 def notest_RF_poker100(self):
     trees = 6
     timeoutSecs = 20
     csvPathname = 'poker/poker100'
     parseResult = h2i.import_parse(bucket='smalldata',
                                    path=csvPathname,
                                    schema='put')
     h2o_cmd.runSpeeDRF(parseResult=parseResult,
                        ntrees=trees,
                        timeoutSecs=timeoutSecs)
Example 9
 def notest_RF_iris2(self):
     trees = ",".join(map(str, range(1, 4)))
     timeoutSecs = 20
     csvPathname = 'iris/iris2.csv'
     parseResult = h2i.import_parse(bucket='smalldata',
                                    path=csvPathname,
                                    schema='put')
     h2o_cmd.runSpeeDRF(parseResult=parseResult,
                        ntrees=trees,
                        timeoutSecs=timeoutSecs)
Example 10
 def notest_RF_iris2(self):
     h2o.beta_features = True
     trees = 6
     timeoutSecs = 20
     csvPathname = 'iris/iris2.csv'
     parseResult = h2i.import_parse(bucket='smalldata',
                                    path=csvPathname,
                                    schema='put')
     h2o_cmd.runSpeeDRF(parseResult=parseResult,
                        ntrees=trees,
                        timeoutSecs=timeoutSecs)
Example 11
        def doBoth():
            h2o.verboseprint("Trial", trial)
            start = time.time()
            # make sure ntrees and max_depth are the same for both
            rfView = h2o_cmd.runRF(parseResult=parseResult, ntrees=ntrees, max_depth=40, response=response,
                timeoutSecs=600, retryDelaySecs=3)
            elapsed1 = time.time() - start
            (totalError1, classErrorPctList1, totalScores1) = h2o_rf.simpleCheckRFView(rfv=rfView)

            # reset the timer so elapsed2 measures only the SpeeDRF run
            start = time.time()
            rfView = h2o_cmd.runSpeeDRF(parseResult=parseResult, ntrees=ntrees, max_depth=40, response=response,
                timeoutSecs=600, retryDelaySecs=3)
            elapsed2 = time.time() - start
            (totalError2, classErrorPctList2, totalScores2) = h2o_rf.simpleCheckRFView(rfv=rfView)

            print "Checking that results are similar (within 20%)"
            print "DRF2 then SpeeDRF"
            print "per-class variance is large... basically we can't check very well for this dataset"
            for i, (j,k) in enumerate(zip(classErrorPctList1, classErrorPctList2)):
                print "classErrorPctList[%s]: %s %s" % (i, j, k)
                # self.assertAlmostEqual(classErrorPctList1[i], classErrorPctList2[i], 
                #    delta=1 * classErrorPctList2[i], msg="Comparing RF class %s errors for DRF2 and SpeeDRF" % i)

            print "totalError: %s %s" % (totalError1, totalError2)
            self.assertAlmostEqual(totalError1, totalError2, delta=.2 * totalError2, msg="Comparing RF total error for DRF2 and SpeeDRF")
            print "elapsed: %s %s" % (elapsed1, elapsed2)
            self.assertAlmostEqual(elapsed1, elapsed2, delta=.5 * elapsed2, msg="Comparing RF times for DRF2 and SpeeDRF")
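doBoth here (and in its reformatted copy, Example 18) is a nested function; parseResult, ntrees, response and trial are closed over from the enclosing test method, which is not shown. A rough sketch of that enclosing context follows; the dataset, response column and tree count are chosen for illustration only.

    # Hypothetical enclosing method for the doBoth() closure above.
    def test_drf2_vs_speedrf(self):
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets',
                                       path='standard/covtype.data', schema='put')
        response = 'C55'
        ntrees = 10

        def doBoth():
            pass  # body exactly as shown in the example above

        for trial in range(3):
            doBoth()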
Example 12
    def test_speedrf_params_rand2_fvec(self):
        h2o.beta_features = True
        csvPathname = 'standard/covtype.data'
        hex_key = 'covtype.data.hex'
        for trial in range(10):
            # params is mutable. This is default.
            # response is required for SpeeDRF
            params = {
                'response': 'C55',
                'ntrees': 1,
                'mtries': 7,
                'balance_classes': 0,
                # never run with unconstrained balance_classes size if random sets balance_classes..too slow
                'max_after_balance_size': 2,
                'importance': 0
            }
            colX = h2o_util.pickRandParams(paramDict, params)
            if 'cols' in params and params['cols']:
                # exclusion
                if 'ignored_cols_by_name' in params:
                    params['ignored_cols_by_name'] = None
            else:
                if 'ignored_cols_by_name' in params and params[
                        'ignored_cols_by_name']:
                    params['mtries'] = random.randint(1, 53)
                else:
                    params['mtries'] = random.randint(1, 54)

            kwargs = params.copy()
            # adjust timeoutSecs with the number of trees
            timeoutSecs = 80 + (
                (kwargs['ntrees'] * 80) * max(1, kwargs['mtries'] / 60))
            start = time.time()
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets',
                                           path=csvPathname,
                                           schema='put',
                                           hex_key=hex_key)
            h2o_cmd.runSpeeDRF(parseResult=parseResult,
                               timeoutSecs=timeoutSecs,
                               retryDelaySecs=1,
                               **kwargs)
            elapsed = time.time() - start
            print "Trial #", trial, "completed in", elapsed, "seconds.", "%d pct. of timeout" % (
                (elapsed * 100) / timeoutSecs)
Example 13
    def test_rf_float_bigexp_fvec(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        csvFilename = "syn_prostate.csv"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        headerData = "ID,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON"
        totalRows = 1000
        colCount = 7
        write_syn_dataset(csvPathname, totalRows, colCount, headerData)

        for trial in range(5):
            # grow the data set
            rowData = rand_rowData(colCount)
            num = random.randint(4096, 10096)
            append_syn_dataset(csvPathname, colCount, num)
            totalRows += num

            # make sure all key names are unique, when we re-put and re-parse (h2o caching issues)
            hex_key = csvFilename + "_" + str(trial) + ".hex"
            ntree = 2
            kwargs = {
                'response': 'AGE',
                'ntrees': ntree,
                'mtries': None,
                'max_depth': 20,
                'sample_rate': 0.67,
                'destination_key': None,
                'nbins': 1024,
                'seed': 784834182943470027,
            }
            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='put',
                                           hex_key=hex_key,
                                           doSummary=True)

            start = time.time()
            rfView = h2o_cmd.runSpeeDRF(parseResult=parseResult,
                                        timeoutSecs=15,
                                        pollTimeoutSecs=15,
                                        **kwargs)
            print "trial #", trial, "totalRows:", totalRows, "num:", num, "RF end on ", csvFilename, \
                'took', time.time() - start, 'seconds'
            rfView["drf_model"] = rfView.pop("speedrf_model")
            (classification_error, classErrorPctList,
             totalScores) = h2o_rf.simpleCheckRFView(None, rfView, ntree=ntree)

            inspect = h2o_cmd.runInspect(key=hex_key)
            cols = inspect['cols']
            #num_cols = inspect['num_cols']
            #for i,c in enumerate(cols):
            #    if i < (num_cols-1): # everything except the last col (output) should be 8 byte float
            #        colType = c['type']
            #        self.assertEqual(colType, 'float', msg="col %d should be type Real: %s" % (i, colType))

            h2o.check_sandbox_for_errors()
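Examples 13 and 16 call write_syn_dataset, append_syn_dataset and rand_rowData, module-level helpers from the original test that are not shown here. A minimal, self-contained sketch inferred only from how they are called (an ID column, a 0/1 CAPSULE column and colCount random features to match the 9-name header); the value ranges are assumptions.

    import random

    def rand_rowData(colCount):
        # colCount random float features; the value range is a guess
        return ','.join(['%0.2f' % (random.random() * 100) for _ in range(colCount)])

    def write_syn_dataset(csvPathname, rowCount, colCount, headerData):
        # fresh file: header line first, then rowCount rows of ID, 0/1, features
        with open(csvPathname, 'w') as f:
            f.write(headerData + '\n')
            for i in range(rowCount):
                f.write('%s,%s,%s\n' % (i, random.randint(0, 1), rand_rowData(colCount)))

    def append_syn_dataset(csvPathname, colCount, num):
        # grow the existing file by num more rows of the same shape
        with open(csvPathname, 'a') as f:
            for i in range(num):
                f.write('%s,%s,%s\n' % (i, random.randint(0, 1), rand_rowData(colCount)))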
Example 14
 def test_RF_poker100(self):
     trees = ",".join(map(str,range(1,4)))
     timeoutSecs = 20
     csvPathname = 'poker/poker100'
     parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put')
     rfResult = h2o_cmd.runSpeeDRF(parseResult=parseResult, ntrees=trees, timeoutSecs=timeoutSecs)
     job_key = rfResult['job_key']
     model_key = rfResult['destination_key']
     gridResult = h2o.nodes[0].speedrf_grid_view(job_key=job_key, destination_key=model_key)
     print "speedrf grid result:", h2o.dump_json(gridResult)
Example 15
    def test_RF_poker100(self):
        MISSING_RESPONSE = False
        DO_MODEL_INSPECT = False
        trees = ",".join(map(str, range(10, 50, 2)))
        timeoutSecs = 20
        csvPathname = 'poker/poker100'
        parseResult = h2i.import_parse(bucket='smalldata',
                                       path=csvPathname,
                                       schema='put')
        jobs = []
        for i in range(1):
            if MISSING_RESPONSE:
                rfResult = h2o_cmd.runSpeeDRF(parseResult=parseResult,
                                              ntrees=trees,
                                              timeoutSecs=timeoutSecs)
            else:
                rfResult = h2o_cmd.runSpeeDRF(parseResult=parseResult,
                                              response='C11',
                                              ntrees=trees,
                                              timeoutSecs=timeoutSecs)
            job_key = rfResult['job_key']
            model_key = rfResult['destination_key']
            jobs.append((job_key, model_key))

        h2o_jobs.pollWaitJobs(timeoutSecs=300)

        for job_key, model_key in jobs:

            gridResult = h2o.nodes[0].speedrf_grid_view(
                job_key=job_key, destination_key=model_key)
            print "speedrf grid result for %s:" % model_key, h2o.dump_json(gridResult)

            print "speedrf grid result errors:", gridResult[
                'prediction_errors']
            for i, j in enumerate(gridResult['jobs']):
                if DO_MODEL_INSPECT:
                    print "\nspeedrf result %s:" % i, h2o.dump_json(
                        h2o_cmd.runInspect(key=j['destination_key']))
                else:
                    # model = h2o.nodes[0].speedrf_view(modelKey=j['destination_key'])
                    model = h2o.nodes[0].speedrf_view(
                        modelKey=j['destination_key'])
                    print "model:", h2o.dump_json(model)
Example 16
    def test_rf_float_bigexp_fvec(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        csvFilename = "syn_prostate.csv"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        headerData = "ID,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON"
        totalRows = 1000
        colCount = 7
        write_syn_dataset(csvPathname, totalRows, colCount, headerData)

        for trial in range (5):
            # grow the data set
            rowData = rand_rowData(colCount)
            num = random.randint(4096, 10096)
            append_syn_dataset(csvPathname, colCount, num)
            totalRows += num

            # make sure all key names are unique, when we re-put and re-parse (h2o caching issues)
            hex_key = csvFilename + "_" + str(trial) + ".hex"
            ntree = 2
            kwargs = {
                'response': 'AGE',
                'ntrees': ntree,
                'mtries': None,
                'max_depth': 20,
                'sample_rate': 0.67,
                'destination_key': None,
                'nbins': 1024,
                'seed': 784834182943470027,
            }
            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, doSummary=True)

            start = time.time()
            rfView = h2o_cmd.runSpeeDRF(parseResult=parseResult, timeoutSecs=15, pollTimeoutSecs=15, **kwargs)
            print "trial #", trial, "totalRows:", totalRows, "num:", num, "RF end on ", csvFilename, \
                'took', time.time() - start, 'seconds'
            rfView["drf_model"] = rfView.pop("speedrf_model")
            (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(None, rfView, ntree=ntree)

            inspect = h2o_cmd.runInspect(key=hex_key)
            cols = inspect['cols']
            #num_cols = inspect['num_cols']
            #for i,c in enumerate(cols):
            #    if i < (num_cols-1): # everything except the last col (output) should be 8 byte float
            #        colType = c['type']
            #        self.assertEqual(colType, 'float', msg="col %d should be type Real: %s" % (i, colType))
        
            h2o.check_sandbox_for_errors()
Example 17
    def test_RF_poker100(self):
        MISSING_RESPONSE = True
        trees = ",".join(map(str,range(1,4)))
        trees = "1,2"
        timeoutSecs = 20
        csvPathname = 'poker/poker100'
        parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put')
        jobs = []
        for i in range(1):
            if MISSING_RESPONSE:
                rfResult = h2o_cmd.runSpeeDRF(parseResult=parseResult, ntrees=trees, timeoutSecs=timeoutSecs)
            else:
                rfResult = h2o_cmd.runSpeeDRF(parseResult=parseResult, response='C11', ntrees=trees, timeoutSecs=timeoutSecs)
            job_key = rfResult['job_key']
            model_key = rfResult['destination_key']
            jobs.append( (job_key, model_key) )

        h2o_jobs.pollWaitJobs(timeoutSecs=300)

        for job_key, model_key  in jobs:
            gridResult = h2o.nodes[0].speedrf_grid_view(job_key=job_key, destination_key=model_key)
            # h2o_rf.showRFGridResults(GBMResult, 15)

            print "speedrf grid result for %s:" % model_key, h2o.dump_json(gridResult)
Example 18
        def doBoth():
            h2o.verboseprint("Trial", trial)
            start = time.time()
            # make sure ntrees and max_depth are the same for both
            rfView = h2o_cmd.runRF(parseResult=parseResult,
                                   ntrees=ntrees,
                                   max_depth=40,
                                   response=response,
                                   timeoutSecs=600,
                                   retryDelaySecs=3)
            elapsed1 = time.time() - start
            (totalError1, classErrorPctList1,
             totalScores1) = h2o_rf.simpleCheckRFView(rfv=rfView)

            # reset the timer so elapsed2 measures only the SpeeDRF run
            start = time.time()
            rfView = h2o_cmd.runSpeeDRF(parseResult=parseResult,
                                        ntrees=ntrees,
                                        max_depth=40,
                                        response=response,
                                        timeoutSecs=600,
                                        retryDelaySecs=3)
            elapsed2 = time.time() - start
            (totalError2, classErrorPctList2,
             totalScores2) = h2o_rf.simpleCheckRFView(rfv=rfView)

            print "Checking that results are similar (within 20%)"
            print "DRF2 then SpeeDRF"
            print "per-class variance is large... basically we can't check very well for this dataset"
            for i, (j, k) in enumerate(zip(classErrorPctList1, classErrorPctList2)):
                print "classErrorPctList[%s]: %s %s" % (i, j, k)
                # self.assertAlmostEqual(classErrorPctList1[i], classErrorPctList2[i],
                #    delta=1 * classErrorPctList2[i], msg="Comparing RF class %s errors for DRF2 and SpeeDRF" % i)

            print "totalError: %s %s" % (totalError1, totalError2)
            self.assertAlmostEqual(
                totalError1,
                totalError2,
                delta=.2 * totalError2,
                msg="Comparing RF total error for DRF2 and SpeeDRF")
            print "elapsed: %s %s" % (elapsed1, elapsed2)
            self.assertAlmostEqual(
                elapsed1,
                elapsed2,
                delta=.5 * elapsed2,
                msg="Comparing RF times for DRF2 and SpeeDRF")
Example 19
    def test_rf_covtype_fvec(self):
        h2o.beta_features = True  # fvec
        importFolderPath = "standard"

        # Parse Train ******************************************************
        csvTrainFilename = 'covtype.shuffled.90pct.data'
        csvTrainPathname = importFolderPath + "/" + csvTrainFilename
        hex_key = csvTrainFilename + ".hex"
        parseTrainResult = h2i.import_parse(bucket='home-0xdiag-datasets',
                                            path=csvTrainPathname,
                                            hex_key=hex_key,
                                            timeoutSecs=180,
                                            doSummary=False)
        inspect = h2o_cmd.runInspect(None, parseTrainResult['destination_key'])

        # Parse Test ******************************************************
        csvTestFilename = 'covtype.shuffled.10pct.data'
        csvTestPathname = importFolderPath + "/" + csvTestFilename
        hex_key = csvTestFilename + ".hex"
        parseTestResult = h2i.import_parse(bucket='home-0xdiag-datasets',
                                           path=csvTestPathname,
                                           hex_key=hex_key,
                                           timeoutSecs=180)
        inspect = h2o_cmd.runInspect(None, parseTestResult['destination_key'])

        rfViewInitial = []
        xList = []
        eList = []
        fList = []
        trial = 0

        depthList = [10, 20, 30, 40]
        ntreesList = [5, 10, 20, 30]
        # ntreesList = [2]
        nbinsList = [10, 100, 1000]

        if TRY == 'max_depth':
            tryList = depthList
        elif TRY == 'ntrees':
            tryList = ntreesList
        elif TRY == 'nbins':
            tryList = nbinsList
        else:
            raise Exception("huh? %s" % TRY)

        for d in tryList:
            if TRY == 'max_depth':
                paramDict['max_depth'] = d
            elif TRY == 'ntrees':
                paramDict['ntrees'] = d
            elif TRY == 'nbins':
                paramDict['nbins'] = d
            else:
                raise Exception("huh? %s" % TRY)

            # adjust timeoutSecs with the number of trees
            # seems ec2 can be really slow
            if DO_OOBE:
                paramDict['validation'] = None
            else:
                paramDict['validation'] = parseTestResult['destination_key']

            timeoutSecs = 30 + paramDict['ntrees'] * 200

            # do TRIES starts, to see the bad id problem?
            TRIES = 5
            for i in range(TRIES):
                lastOne = i == (TRIES - 1)

                # have unique model names
                trial += 1
                kwargs = paramDict.copy()
                model_key = 'RFModel_' + str(trial)
                kwargs['destination_key'] = model_key
                data_key = parseTrainResult['destination_key']

                start = time.time()
                rfResult = h2o_cmd.runSpeeDRF(parseResult=parseTrainResult,
                                              timeoutSecs=timeoutSecs,
                                              noPoll=True,
                                              **kwargs)
                trainElapsed = time.time() - start
                print 'rf train end', i, 'on', csvTrainPathname, 'took', trainElapsed, 'seconds'

                # don't cancel the last one
                if not lastOne:
                    time.sleep(1)
                    h2o_jobs.cancelAllJobs(timeoutSecs=2)

            ### print "rfView", h2o.dump_json(rfView)
            print "We have a result from the RF above, completed but didn't do RFView yet"
            # could the RF indicate 'done' too soon?
            # if rfResult['state']=='RUNNING':
            #    raise Exception("Why is this RF still in RUNNING state? %s" % h2o.dump_json(rfResult))

            # if 'drf_model' not in rfResult:
            #    raise Exception("How come there's no drf_model in this RF result? %s" % h2o.dump_json(rfResult))
            h2o_jobs.pollWaitJobs(timeoutSecs=300)
            rfView = h2o_cmd.runSpeeDRFView(None, model_key, timeoutSecs=60)
            print "rfView:", h2o.dump_json(rfView)

            rfView["drf_model"] = rfView.pop("speedrf_model")
            rf_model = rfView['drf_model']
            cms = rf_model['cms']
            ### print "cm:", h2o.dump_json(cm)
            ntrees = rf_model['N']
            errs = rf_model['errs']
            N = rf_model['N']
            varimp = rf_model['varimp']
            treeStats = rf_model['treeStats']

            print "maxDepth:", treeStats['maxDepth']
            print "maxLeaves:", treeStats['maxLeaves']
            print "minDepth:", treeStats['minDepth']
            print "minLeaves:", treeStats['minLeaves']
            print "meanLeaves:", treeStats['meanLeaves']
            print "meanDepth:", treeStats['meanDepth']
            print "errs[0]:", errs[0]
            print "errs[-1]:", errs[-1]
            print "errs:", errs

            (classification_error, classErrorPctList,
             totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView)
            # we iterate over params, so can't really do this check
            # self.assertAlmostEqual(classification_error, 0.03, delta=0.5, msg="Classification error %s differs too much" % classification_error)

            print "classErrorPctList:", classErrorPctList
            self.assertEqual(
                len(classErrorPctList), 7,
                "Should be 7 output classes, so should have 7 class error percentages from a reasonable predict"
            )
            # FIX! should update this expected classification error
            predict = h2o.nodes[0].generate_predictions(model_key=model_key,
                                                        data_key=data_key)

            eList.append(classErrorPctList[4])
            fList.append(trainElapsed)
            if DO_PLOT:
                if TRY == 'max_depth':
                    xLabel = 'max_depth'
                elif TRY == 'ntrees':
                    xLabel = 'ntrees'
                elif TRY == 'nbins':
                    xLabel = 'nbins'
                else:
                    raise Exception("huh? %s" % TRY)
                xList.append(paramDict[xLabel])

        if DO_PLOT:
            eLabel = 'class 4 pctWrong'
            fLabel = 'trainElapsed'
            eListTitle = ""
            fListTitle = ""
            h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel,
                              fListTitle, fList, fLabel)
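Example 19 reads several module-level knobs (TRY, DO_OOBE, DO_PLOT) and a base paramDict that the loop mutates; none of them are part of the snippet. One plausible set of definitions, restricted to parameter names the example itself touches; the concrete values are guesses.

    # Module-level configuration assumed by test_rf_covtype_fvec above.
    TRY = 'ntrees'      # which parameter to sweep: 'max_depth', 'ntrees' or 'nbins'
    DO_OOBE = False     # True: out-of-bag scoring; False: validate on the 10pct test split
    DO_PLOT = True      # plot class-4 error and train time against the swept value

    # Base SpeeDRF parameters; the loop overwrites the swept key and 'validation'.
    paramDict = {
        'response': 'C55',   # assumed covtype label column, as in the other examples
        'ntrees': 10,
        'max_depth': 20,
        'nbins': 100,
        'validation': None,
        'destination_key': None,
    }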
Example 20
    def test_RF_mnist_both(self):
        h2o.beta_features = True
        importFolderPath = "mnist"
        csvFilelist = [
            # ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600, 784834182943470027),
            ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600, None,
             '*mnist*gz'),
            # to see results a 2nd time
            ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600, None,
             '*mnist*gz'),
        ]
        # IMPORT**********************************************
        # since H2O deletes the source key, we should re-import every iteration if we re-use the src in the list
        (importFolderResult,
         importPattern) = h2i.import_only(bucket='home-0xdiag-datasets',
                                          path=importFolderPath + "/*")
        ### print "importHDFSResult:", h2o.dump_json(importFolderResult)
        if 'files' in importFolderResult:
            succeededList = importFolderResult['files']
        else:
            succeededList = importFolderResult['succeeded']

        ### print "succeededList:", h2o.dump_json(succeededList)

        self.assertGreater(len(succeededList), 1,
                           "Should see more than 1 file in the import?")
        # why does this hang? can't look at storeview after import?
        print "\nTrying StoreView after the import folder"
        h2o_cmd.runStoreView(timeoutSecs=30)

        trial = 0
        allDelta = []
        for (trainCsvFilename, testCsvFilename, timeoutSecs, rfSeed,
             parsePattern) in csvFilelist:
            trialStart = time.time()

            # PARSE test****************************************
            testKey2 = testCsvFilename + "_" + str(trial) + ".hex"
            start = time.time()
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets',
                                           path=importFolderPath + "/" +
                                           testCsvFilename,
                                           hex_key=testKey2,
                                           timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "parse end on ", testCsvFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseResult['destination_key']

            print "We won't use this pruning of x on test data. See if it prunes the same as the training"
            y = 0  # first column is pixel value
            x = h2o_glm.goodXFromColumnInfo(y,
                                            key=parseResult['destination_key'],
                                            timeoutSecs=300)

            # PARSE train****************************************
            print "Use multi-file parse to grab both the mnist_testing.csv.gz and mnist_training.csv.gz for training"
            trainKey2 = trainCsvFilename + "_" + str(trial) + ".hex"
            start = time.time()
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets',
                                           path=importFolderPath + "/" +
                                           parsePattern,
                                           hex_key=trainKey2,
                                           timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "parse end on ", trainCsvFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseResult['destination_key']

            # RF+RFView (train)****************************************
            # print "This is the 'ignore=' we'll use"
            # no longer use. depend on h2o to get it right.
            ntree = 25
            params = {
                'response': 0,
                'ntrees': ntree,
                # 'data_key='mnist_training.csv.hex'
                'mtries': 28,  # fix because we ignore some cols, which will change the sqrt(cols) calc?
                'max_depth': 2147483647,
                'select_stat_type': 'ENTROPY',
                'sampling_strategy': 'RANDOM',
                'sample_rate': 0.67,
                'oobee': 1,
                # 'model_key': '__RFModel_7055e6cf-a0de-44db-b165-f5994730ac77',
                'destination_key': 'RF_model',
                'nbins': 1024,
                # 'seed': 784834182943470027,
                # 'class_weights': '0=1.0,1=1.0,2=1.0,3=1.0,4=1.0,5=1.0,6=1.0,7=1.0,8=1.0,9=1.0',
            }

            if rfSeed is None:
                params['seed'] = random.randint(0, sys.maxint)
            else:
                params['seed'] = rfSeed
            print "RF seed:", params['seed']

            kwargs = params.copy()
            print "Trying rf"
            timeoutSecs = 1800
            start = time.time()
            rfView = h2o_cmd.runSpeeDRF(parseResult=parseResult,
                                        timeoutSecs=timeoutSecs,
                                        pollTimeoutSecs=180,
                                        retryDelaySecs=2,
                                        **kwargs)
            elapsed = time.time() - start
            print "RF completed in", elapsed, "seconds.", \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            # RFView (score on test)****************************************
            (classification_error, classErrorPctList,
             totalScores) = h2o_rf.simpleCheckRFView(None, rfView, **params)
            # was 2.84
            # sometimes get 2.87?
            self.assertAlmostEqual(
                classification_error,
                1.6,
                delta=1.6,
                msg="Classification error %s differs too much" %
                classification_error)

            treeStats = rfView['speedrf_model']['treeStats']
            leaves = {
                'min': treeStats['minLeaves'],
                'mean': treeStats['meanLeaves'],
                'max': treeStats['maxLeaves']
            }
            # Expected values are from this case:
            # ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600, 784834182943470027),
            leavesExpected = {'min': 4996, 'mean': 5064.1, 'max': 5148}
            for l in leaves:
                # self.assertAlmostEqual(leaves[l], leavesExpected[l], delta=10, msg="leaves %s %s %s differs too much" % (l, leaves[l], leavesExpected[l]))
                delta = ((leaves[l] - leavesExpected[l]) / float(leaves[l])) * 100
                d = "seed: %s %s leaves: %s expected: %s pct. different %s" % (
                    params['seed'], l, leaves[l], leavesExpected[l], delta)
                print d
                allDelta.append(d)

            depth = {
                'min': treeStats['minDepth'],
                'mean': treeStats['meanDepth'],
                'max': treeStats['maxDepth']
            }
            depthExpected = {'min': 21, 'mean': 23.8, 'max': 25}
            for l in depth:
                # self.assertAlmostEqual(depth[l], depthExpected[l], delta=1, msg="depth %s %s %s differs too much" % (l, depth[l], depthExpected[l]))
                delta = ((depth[l] - depthExpected[l]) / float(depth[l])) * 100
                d = "seed: %s %s depth: %s expected: %s pct. different %s" % (
                    params['seed'], l, depth[l], depthExpected[l], delta)
                print d
                allDelta.append(d)

            # Predict (on test)****************************************
            start = time.time()
            modelKey = rfView['speedrf_model']['_key']
            predict = h2o.nodes[0].generate_predictions(
                model_key=modelKey, data_key=testKey2, timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "generate_predictions in",  elapsed, "secs", \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

        # Done *******************************************************
        print "\nShowing the results again from all the trials, to see variance"

        for d in allDelta:
            print d
Example 21
    def test_rf_predict3_fvec(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()

        timeoutSecs = 600
        predictHexKey = 'predict_0.hex'
        predictCsv = 'predict_0.csv'
        actualCsv = 'actual_0.csv'

        if 1==1:
            y = 4 # last col
            response = 'response'
            skipSrcOutputHeader = 1
            skipPredictHeader = 1
            trees = 40
            bucket = 'smalldata'
            csvPathname = 'iris/iris2.csv'
            hexKey = 'iris2.csv.hex'
            # translate = {'setosa': 0.0, 'versicolor': 1.0, 'virginica': 2.0}
            # No translate because we're using an Exec to get the data out, and that loses the encoding?
            translate = None
            # one wrong will be 0.66667. I guess with random, that can happen?
            expectedPctWrong = 0.7

        elif 1==0:
            y = 54 # last col
            response = 'C55'
            skipSrcOutputHeader = 1
            skipPredictHeader = 1
            trees = 6
            # try smaller data set compared to covtype
            bucket = 'home-0xdiag-datasets'
            csvPathname = 'standard/covtype.shuffled.10pct.data'
            hexKey = 'covtype.shuffled.10pct.data.hex'
            translate = {'1': 1, '2': 2, '3': 3, '4': 4, '5': 5, '6': 6, '7': 7}
            expectedPctWrong = 0.7
        elif 1==0:
            y = 54 # last col
            response = 'C55'
            skipSrcOutputHeader = 1
            skipPredictHeader = 1
            trees = 40
            # try smaller data set compared to covtype
            bucket = 'home-0xdiag-datasets'
            csvPathname = 'standard/covtype.shuffled.10pct.data'
            hexKey = 'covtype.shuffled.10pct.data.hex'
            # translate = {1: 0.0, 2: 1.0, 3: 1.0, 4: 1.0, 5: 1.0, 6: 1.0, 7: 1.0}
            translate = {'1': 1, '2': 2, '3': 3, '4': 4, '5': 5, '6': 6, '7': 7}
            expectedPctWrong = 0.7
        elif 1==0:
            y = 54 # last col
            response = 'C55'
            skipSrcOutputHeader = 1
            skipPredictHeader = 1
            trees = 6
            bucket = 'home-0xdiag-datasets'
            csvPathname = 'standard/covtype.data'
            hexKey = 'covtype.data.hex'
            translate = {'1': 1, '2': 2, '3': 3, '4': 4, '5': 5, '6': 6, '7': 7}
            expectedPctWrong = 0.7
        else:
            y = 0 # first col
            response = 'C1'
            skipSrcOutputHeader = 1
            skipPredictHeader = 1
            trees = 6
            bucket = 'home-0xdiag-datasets'
            csvPathname = 'mnist/mnist_training.csv.gz'
            hexKey = 'mnist_training.hex'
            translate = { \
                '0': 0, '1': 1, '2': 2, '3': 3, '4': 4, \
                '5': 5, '6': 6, '7': 7, '8': 8, '9': 9 }
            expectedPctWrong = 0.7

        csvPredictPathname = SYNDATASETS_DIR + "/" + predictCsv
        csvSrcOutputPathname = SYNDATASETS_DIR + "/" + actualCsv
        # for using below in csv reader
        csvFullname = h2i.find_folder_and_filename(bucket, csvPathname, schema='put', returnFullPath=True)

        def predict_and_compare_csvs(model_key, hex_key, translate=None, y=0):
            # have to slice out col 0 (the output) and feed result to predict
            # cols are 0:784 (1 output plus 784 input features)
            # h2e.exec_expr(execExpr="P.hex="+hex_key+"[1:784]", timeoutSecs=30)
            dataKey = "P.hex"
            h2e.exec_expr(execExpr=dataKey+"="+hex_key, timeoutSecs=30) # unneeded but interesting
            if skipSrcOutputHeader:
                print "Has header in dataset, so should be able to chop out col 0 for predict and get right answer"
                print "hack for now, can't chop out col 0 in Exec currently"
                dataKey = hex_key
            else:
                print "No header in dataset, can't chop out cols, since col numbers are used for names"
                dataKey = hex_key

            # +1 col index because R-like
            h2e.exec_expr(execExpr="Z.hex="+hex_key+"[,"+str(y+1)+"]", timeoutSecs=30)

            start = time.time()
            predict = h2o.nodes[0].generate_predictions(model_key=model_key,
                                                        data_key=hexKey, destination_key=predictHexKey)
            print "generate_predictions end on ", hexKey, " took", time.time() - start, 'seconds'
            h2o.check_sandbox_for_errors()
            inspect = h2o_cmd.runInspect(key=predictHexKey)
            h2o_cmd.infoFromInspect(inspect, 'predict.hex')

            h2o.nodes[0].csv_download(src_key="Z.hex", csvPathname=csvSrcOutputPathname)
            h2o.nodes[0].csv_download(src_key=predictHexKey, csvPathname=csvPredictPathname)
            h2o.check_sandbox_for_errors()

            print "Do a check of the original output col against predicted output"
            (rowNum1, originalOutput) = compare_csv_at_one_col(csvSrcOutputPathname,
                                                               msg="Original", colIndex=0, translate=translate, skipHeader=skipSrcOutputHeader)
            (rowNum2, predictOutput)  = compare_csv_at_one_col(csvPredictPathname,
                                                               msg="Predicted", colIndex=0, skipHeader=skipPredictHeader)

            # no header on source
            if ((rowNum1-skipSrcOutputHeader) != (rowNum2-skipPredictHeader)):
                raise Exception("original rowNum1: %s - %d not same as downloaded predict: rowNum2: %s - %d" %
                    (rowNum1, skipSrcOutputHeader, rowNum2, skipPredictHeader))

            wrong = 0
            for rowNum,(o,p) in enumerate(zip(originalOutput, predictOutput)):
                # if float(o)!=float(p):
                if str(o)!=str(p):
                    if wrong==10:
                        print "Not printing any more mismatches\n"
                    elif wrong<10:
                        msg = "Comparing original output col vs predicted. row %s differs. \
                            original: %s predicted: %s"  % (rowNum, o, p)
                        print msg
                    wrong += 1

            print "\nTotal wrong:", wrong
            print "Total:", len(originalOutput)
            pctWrong = (100.0 * wrong)/len(originalOutput)
            print "wrong/Total * 100 ", pctWrong
            # I looked at what h2o can do for modelling with binomial and it should get better than 25% error?
            if pctWrong > 2.0:
                raise Exception("pctWrong too high. Expect < 2% error because it's reusing training data")
            return pctWrong

        #*****************************************************************************

        parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey)
        kwargs = {
            'destination_key': 'rf_model',
            'response': response,
            'ntrees': trees,
            'classification': 1,
            }


        rfResult = h2o_cmd.runSpeeDRF(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
        rfResult["drf_model"] = rfResult.pop("speedrf_model")
        (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfResult)

        print "Use H2O GeneratePredictionsPage with a H2O generated model and the same data key."
        print "Does this work? (feeding in same data key) if you're predicting, "
        print "don't you need one less column (the last is output?)"
        print "WARNING: max_iter set to 8 for benchmark comparisons"
        print "y=", y
        pctWrong = predict_and_compare_csvs(model_key='rf_model', hex_key=hexKey, translate=translate, y=y)

        # we are predicting using training data...so error is really low
        # self.assertAlmostEqual(pctWrong, classification_error, delta = 0.2, 
        #     msg="predicted pctWrong: %s should be close to training classification error %s" % (pctWrong, classification_error))
        # can be zero if memorized (iris is either 0 or 0.667?)
        # just make delta 0.7 for now
        self.assertAlmostEqual(pctWrong, expectedPctWrong, delta = 0.7,
                               msg="predicted pctWrong: %s should be small because we're predicting with training data" % pctWrong)
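predict_and_compare_csvs above relies on compare_csv_at_one_col, a helper defined elsewhere in the original test. Judging only from its call sites (a pathname, a colIndex, an optional translate dict and a skipHeader count, returning the row count and the column values), a self-contained sketch could look like the following; everything beyond that signature is an assumption.

    import csv

    def compare_csv_at_one_col(csvPathname, msg=None, colIndex=0, translate=None, skipHeader=0):
        # Pull one column out of a downloaded csv, optionally mapping values through
        # 'translate', and return (rows read including any header, list of values).
        output = []
        rowNum = 0
        with open(csvPathname, 'rb') as f:
            for rowNum, row in enumerate(csv.reader(f), 1):
                if rowNum <= skipHeader:
                    continue
                value = row[colIndex]
                if translate is not None:
                    value = translate.get(value, value)
                output.append(value)
        return (rowNum, output)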
Example 22
    def test_rf_big1_nopoll_fvec(self):
        h2o.beta_features = True
        csvFilename = 'hhp_107_01.data.gz'
        hex_key = csvFilename + ".hex"

        print "\n" + csvFilename

        parseResult = h2i.import_parse(bucket='smalldata', path=csvFilename,
                                       hex_key=hex_key, timeoutSecs=15, schema='put')
        rfViewInitial = []
        # dispatch multiple jobs back to back
        for jobDispatch in range(3):
            start = time.time()
            kwargs = {}
            model_key = ""
            if OVERWRITE_RF_MODEL:
                print "Since we're overwriting here, we have to wait for each to complete noPoll=False"
                model_key = 'SRF_model'
            else:
                model_key = 'SRF_model' + str(jobDispatch)
            kwargs['ntrees'] = 1

            if OVERWRITE_RF_MODEL:
                print "Change the number of trees, while keeping the rf model key name the same"
                print "Checks that we correctly overwrite previous rf model"
                kwargs['ntrees'] += 1

            kwargs['seed'] = random.randint(0, sys.maxint)
            kwargs['response'] = "C107"

            # FIX! what model keys do these get?
            randomNode = h2o.nodes[random.randint(0,len(h2o.nodes)-1)]
            h2o_cmd.runSpeeDRF(node=randomNode, parseResult=parseResult, destination_key=model_key,
                          timeoutSecs=300, noPoll=False, **kwargs)
            print "rf job dispatch end on ", csvFilename, 'took', time.time() - start, 'seconds'
            print "\njobDispatch #", jobDispatch
            print "\n MODEL KEY: ", model_key
            rfViewInitial.append(h2o_cmd.runSpeeDRFView(None, model_key, timeoutSecs=60))

        # h2o_jobs.pollWaitJobs(pattern='SRF_model', timeoutSecs=300, pollTimeoutSecs=10, retryDelaySecs=5)

        # we saved the initial response?
        # if we do another poll they should be done now, and better to get it that 
        # way rather than the inspect (to match what simpleCheckGLM expects)
        first = None
        print "rfViewInitial", rfViewInitial
        for rfView in rfViewInitial:
            print "Checking completed job:", rfView
            print "rfView", h2o.dump_json(rfView)
            model_key = rfView["speedrf_model"]['_key']
            ntree = rfView["speedrf_model"]["parameters"]['ntrees']
            print "Temporary hack: need to do two rf views minimum, to complete a RF (confusion matrix creation)"
            # allow it to poll to complete
            rfViewResult = h2o_cmd.runSpeeDRFView(None, model_key, timeoutSecs=60)
            if first is None: # we'll use this to compare the others
                first = rfViewResult.copy()
                firstModelKey = model_key
                print "first", h2o.dump_json(first)
            else:
                print "Comparing", model_key, "to", firstModelKey
                df = h2o_util.JsonDiff(rfViewResult, first, vice_versa=True, with_values=True)

                print "df.difference:", h2o.dump_json(df.difference)
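This example and its reformatted copy, Example 23, branch on OVERWRITE_RF_MODEL, a module-level flag that is not shown. Something like the following near the top of the test file is assumed.

    # Assumed module-level switch for test_rf_big1_nopoll_fvec:
    # True  -> reuse one model key ('SRF_model') and overwrite it on every dispatch
    # False -> give each dispatched job its own key ('SRF_model0', 'SRF_model1', ...)
    OVERWRITE_RF_MODEL = False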
Example 23
    def test_rf_big1_nopoll_fvec(self):
        h2o.beta_features = True
        csvFilename = 'hhp_107_01.data.gz'
        hex_key = csvFilename + ".hex"

        print "\n" + csvFilename

        parseResult = h2i.import_parse(bucket='smalldata',
                                       path=csvFilename,
                                       hex_key=hex_key,
                                       timeoutSecs=15,
                                       schema='put')
        rfViewInitial = []
        # dispatch multiple jobs back to back
        for jobDispatch in range(3):
            start = time.time()
            kwargs = {}
            model_key = ""
            if OVERWRITE_RF_MODEL:
                print "Since we're overwriting here, we have to wait for each to complete noPoll=False"
                model_key = 'SRF_model'
            else:
                model_key = 'SRF_model' + str(jobDispatch)
            kwargs['ntrees'] = 1

            if OVERWRITE_RF_MODEL:
                print "Change the number of trees, while keeping the rf model key name the same"
                print "Checks that we correctly overwrite previous rf model"
                kwargs['ntrees'] += 1

            kwargs['seed'] = random.randint(0, sys.maxint)
            kwargs['response'] = "C107"

            # FIX! what model keys do these get?
            randomNode = h2o.nodes[random.randint(0, len(h2o.nodes) - 1)]
            h2o_cmd.runSpeeDRF(node=randomNode,
                               parseResult=parseResult,
                               destination_key=model_key,
                               timeoutSecs=300,
                               noPoll=False,
                               **kwargs)
            print "rf job dispatch end on ", csvFilename, 'took', time.time() - start, 'seconds'
            print "\njobDispatch #", jobDispatch
            print "\n MODEL KEY: ", model_key
            rfViewInitial.append(
                h2o_cmd.runSpeeDRFView(None, model_key, timeoutSecs=60))

        # h2o_jobs.pollWaitJobs(pattern='SRF_model', timeoutSecs=300, pollTimeoutSecs=10, retryDelaySecs=5)

        # we saved the initial response?
        # if we do another poll they should be done now, and better to get it that
        # way rather than the inspect (to match what simpleCheckGLM expects)
        first = None
        print "rfViewInitial", rfViewInitial
        for rfView in rfViewInitial:
            print "Checking completed job:", rfView
            print "rfView", h2o.dump_json(rfView)
            model_key = rfView["speedrf_model"]['_key']
            ntree = rfView["speedrf_model"]["parameters"]['ntrees']
            print "Temporary hack: need to do two rf views minimum, to complete a RF (confusion matrix creation)"
            # allow it to poll to complete
            rfViewResult = h2o_cmd.runSpeeDRFView(None,
                                                  model_key,
                                                  timeoutSecs=60)
            if first is None:  # we'll use this to compare the others
                first = rfViewResult.copy()
                firstModelKey = model_key
                print "first", h2o.dump_json(first)
            else:
                print "Comparing", model_key, "to", firstModelKey
                df = h2o_util.JsonDiff(rfViewResult,
                                       first,
                                       vice_versa=True,
                                       with_values=True)

                print "df.difference:", h2o.dump_json(df.difference)
Example 24
    def test_speedrf_mnist(self):
        importFolderPath = "mnist"
        csvFilelist = [
            # ("mnist_reals_testing.csv.gz", "mnist_reals_testing.csv.gz",    600), 
            # ("a.csv", "b.csv", 60),
            # ("mnist_reals_testing.csv.gz", "mnist_reals_testing.csv.gz",    600), 
            ("train.csv.gz", "test.csv.gz", 600),
            ]
        trial = 0
        for (trainCsvFilename, testCsvFilename, timeoutSecs) in csvFilelist:
            trialStart = time.time()

            # PARSE test****************************************
            testKey2 = testCsvFilename + "_" + str(trial) + ".hex"
            start = time.time()
            parseResult = h2i.import_parse(bucket='smalldata', path=importFolderPath + "/" + testCsvFilename,
                                           hex_key=testKey2, timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "parse end on ", testCsvFilename, 'took', elapsed, 'seconds', \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseResult['destination_key']

            print "We won't use this pruning of x on test data. See if it prunes the same as the training"
            y = 784 # last column is pixel value
            print "y:", y
            x = h2o_glm.goodXFromColumnInfo(y, key=parseResult['destination_key'], timeoutSecs=300)

            # PARSE train****************************************
            trainKey2 = trainCsvFilename + "_" + str(trial) + ".hex"
            start = time.time()
            parseResult = h2i.import_parse(bucket='smalldata', path=importFolderPath + "/" + trainCsvFilename,
                                           hex_key=trainKey2, timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "parse end on ", trainCsvFilename, 'took', elapsed, 'seconds', \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseResult['destination_key']

            # RF+RFView (train)****************************************
            ignore_x = h2o_glm.goodXFromColumnInfo(y, key=parseResult['destination_key'], timeoutSecs=300, returnIgnoreX=True)
            ntrees = 10
            params = {
                'response': y,
                'ignored_cols_by_name': ignore_x,
                'ntrees': ntrees,
                'mtries': 28, # fix because we ignore some cols, which will change the sqrt(cols) calc?
                'max_depth': 15,
                'sample_rate': 0.67,
                'destination_key': 'SpeeDRF_model',
                'nbins': 1024,
                'seed': 784834182943470027,
                'oobee': 1,
                }
            kwargs = params.copy()
            print "Trying rf"
            timeoutSecs = 1800
            start = time.time()
            rfv = h2o_cmd.runSpeeDRF(parseResult=parseResult,
                                timeoutSecs=timeoutSecs, pollTimeoutSecs=60, retryDelaySecs=2, **kwargs)
            elapsed = time.time() - start
            print "RF completed in", elapsed, "seconds.", \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            rfv["drf_model"] = rfv.pop("speedrf_model")
            h2o_rf.simpleCheckRFView(None, rfv, **params)
            rf_model = rfv['drf_model']
            used_trees = rf_model['N']
            data_key = rf_model['_dataKey']
            model_key = rf_model['_key']

            print "Total trees: ", used_trees
            print "On data key: ", data_key
            print "Produced model key: ", model_key
Ejemplo n.º 25
0
    def test_rf_change_data_key_fvec(self):
        importFolderPath = 'standard'

        csvFilenameTrain = 'covtype.data'
        csvPathname = importFolderPath + "/" + csvFilenameTrain
        parseResultTrain = h2i.import_parse(bucket='home-0xdiag-datasets',
                                            path=csvPathname,
                                            timeoutSecs=500)
        h2o_cmd.runInspect(key=parseResultTrain['destination_key'])
        dataKeyTrain = parseResultTrain['destination_key']
        print "Parse end", dataKeyTrain

        # we could train on covtype, and then use covtype20x for test? or vice versa
        # parseResult = parseResult
        # dataKeyTest = dataKeyTrain
        csvFilenameTest = 'covtype20x.data'
        csvPathname = importFolderPath + "/" + csvFilenameTest
        parseResultTest = h2i.import_parse(bucket='home-0xdiag-datasets',
                                           path=csvPathname,
                                           timeoutSecs=500)
        print "Parse result['destination_key']:", parseResultTest[
            'destination_key']
        inspect = h2o_cmd.runInspect(key=parseResultTest['destination_key'])
        dataKeyTest = parseResultTest['destination_key']

        print "Parse end", dataKeyTest

        # train
        # this does RFView to understand when RF completes, so the time reported for RFView here, should be
        # considered the "first RFView" times..subsequent have some caching?.
        # unless the no_confusion_matrix works

        # params is mutable. This is default.
        params = {'ntrees': 2, 'destination_key': 'RF_model'}

        # colX = h2o_rf.pickRandRfParams(paramDict, params)
        kwargs = params.copy()
        kwargs["response"] = "C55"
        # adjust timeoutSecs with the number of trees
        # seems ec2 can be really slow

        timeoutSecs = 100
        start = time.time()
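        # noPoll=True only dispatches the RF job; h2o_jobs.pollWaitJobs below blocks until the 'RF_model' job completes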
        h2o_cmd.runSpeeDRF(parseResult=parseResultTrain,
                           timeoutSecs=timeoutSecs,
                           retryDelaySecs=1,
                           noPoll=True,
                           **kwargs)
        print "rf job dispatch end on ", dataKeyTrain, 'took', time.time(
        ) - start, 'seconds'
        ### print "rf response:", h2o.dump_json(rfv)

        start = time.time()
        h2o_jobs.pollWaitJobs(pattern='RF_model',
                              timeoutSecs=360,
                              pollTimeoutSecs=120,
                              retryDelaySecs=5)
        print "rf job end on ", dataKeyTrain, 'took', time.time(
        ) - start, 'seconds'

        print "\nRFView start after job completion"
        model_key = kwargs['destination_key']
        ntrees = kwargs['ntrees']
        start = time.time()
        h2o_cmd.runSpeeDRFView(None, model_key, timeoutSecs)
        print "First rfview end on ", dataKeyTrain, 'took', time.time(
        ) - start, 'seconds'

        for trial in range(3):
            # scoring
            start = time.time()
            rfView = h2o_cmd.runSpeeDRFView(None, model_key, timeoutSecs)
            print "rfview", trial, "end on ", dataKeyTest, 'took', time.time(
            ) - start, 'seconds.'
            rfView["drf_model"] = rfView.pop("speedrf_model")
            (classification_error, classErrorPctList,
             totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView, ntree=ntrees)
            # FIX! should update this expected classification error
            # self.assertAlmostEqual(classification_error, 0.03, delta=0.5, msg="Classification error %s differs too much" % classification_error)
            start = time.time()
            predict = h2o.nodes[0].generate_predictions(model_key=model_key,
                                                        data_key=dataKeyTest)
            print "predict", trial, "end on ", dataKeyTest, 'took', time.time(
            ) - start, 'seconds.'

            print "Trial #", trial, "completed"
Ejemplo n.º 26
0
    def test_rf_change_data_key_fvec(self):
        importFolderPath = 'standard'

        csvFilenameTrain = 'covtype.data'
        csvPathname = importFolderPath + "/" + csvFilenameTrain
        parseResultTrain = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, timeoutSecs=500)
        h2o_cmd.runInspect(key=parseResultTrain['destination_key'])
        dataKeyTrain = parseResultTrain['destination_key']
        print "Parse end", dataKeyTrain

        # we could train on covtype, and then use covtype20x for test? or vice versa
        # parseResult = parseResult
        # dataKeyTest = dataKeyTrain
        csvFilenameTest = 'covtype20x.data'
        csvPathname = importFolderPath + "/" + csvFilenameTest
        parseResultTest = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, timeoutSecs=500)
        print "Parse result['destination_key']:", parseResultTest['destination_key']
        inspect = h2o_cmd.runInspect(key=parseResultTest['destination_key'])
        dataKeyTest = parseResultTest['destination_key']

        print "Parse end", dataKeyTest

        # train
        # this does RFView to understand when RF completes, so the time reported for RFView here, should be 
        # considered the "first RFView" times..subsequent have some caching?. 
        # unless the no_confusion_matrix works

        # params is mutable. This is default.
        params = {
            'ntrees': 2,
            'destination_key': 'RF_model'
        }

        # colX = h2o_rf.pickRandRfParams(paramDict, params)
        kwargs = params.copy()
        kwargs["response"] = "C55"
        # adjust timeoutSecs with the number of trees
        # seems ec2 can be really slow

        timeoutSecs = 100
        start = time.time()
        h2o_cmd.runSpeeDRF(parseResult=parseResultTrain,
                            timeoutSecs=timeoutSecs, retryDelaySecs=1, noPoll=True, **kwargs)
        print "rf job dispatch end on ", dataKeyTrain, 'took', time.time() - start, 'seconds'
        ### print "rf response:", h2o.dump_json(rfv)


        start = time.time()
        h2o_jobs.pollWaitJobs(pattern='RF_model', timeoutSecs=360, pollTimeoutSecs=120, retryDelaySecs=5)
        print "rf job end on ", dataKeyTrain, 'took', time.time() - start, 'seconds'

        print "\nRFView start after job completion"
        model_key = kwargs['destination_key']
        ntrees = kwargs['ntrees']
        start = time.time()
        h2o_cmd.runSpeeDRFView(None, model_key, timeoutSecs)
        print "First rfview end on ", dataKeyTrain, 'took', time.time() - start, 'seconds'

        for trial in range(3):
            # scoring
            start = time.time()
            rfView = h2o_cmd.runSpeeDRFView(None, model_key, timeoutSecs)
            print "rfview", trial, "end on ", dataKeyTest, 'took', time.time() - start, 'seconds.'
            rfView["drf_model"] = rfView.pop("speedrf_model")
            (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView, ntree=ntrees)
            # FIX! should update this expected classification error
            # self.assertAlmostEqual(classification_error, 0.03, delta=0.5, msg="Classification error %s differs too much" % classification_error)
            start = time.time()
            predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key=dataKeyTest)
            print "predict", trial, "end on ", dataKeyTest, 'took', time.time() - start, 'seconds.'

            print "Trial #", trial, "completed"
Ejemplo n.º 27
0
    def test_RF_many_cols_enum(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        translateList = [
            'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
            'n', 'o', 'p', 'q', 'r', 's', 't', 'u'
        ]

        tryList = [
            (10000, 100, 'cA', 300),
            (10000, 300, 'cB', 500),
            # (10000,  500, 'cC', 700),
            # (10000,  700, 'cD', 3600),
            # (10000,  900, 'cE', 3600),
            # (10000,  1000, 'cF', 3600),
            # (10000,  1300, 'cG', 3600),
            # (10000,  1700, 'cH', 3600),
            # (10000,  2000, 'cI', 3600),
            # (10000,  2500, 'cJ', 3600),
            (10000, 3000, 'cK', 3600),
        ]

        ### h2b.browseTheCloud()
        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            # csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(
                colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE,
                              translateList)

            # PARSE train****************************************
            start = time.time()
            xList = []
            eList = []
            fList = []

            modelKey = 'RFModelKey'

            # Parse (train)****************************************
            start = time.time()
            parseTrainResult = h2i.import_parse(bucket=None,
                                                path=csvPathname,
                                                schema='put',
                                                header=0,
                                                hex_key=hex_key,
                                                timeoutSecs=timeoutSecs,
                                                doSummary=False)
            elapsed = time.time() - start
            print "train parse end on ", csvPathname, 'took', elapsed, 'seconds', \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "train parse result:", parseTrainResult['destination_key']

            # Logging to a benchmark file
            algo = "Parse"
            l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format(
                len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename,
                elapsed)
            print l
            h2o.cloudPerfH2O.message(l)

            inspect = h2o_cmd.runInspect(
                key=parseTrainResult['destination_key'])
            print "\n" + csvPathname, \
                "    numRows:", "{:,}".format(inspect['numRows']), \
                "    numCols:", "{:,}".format(inspect['numCols'])
            numRows = inspect['numRows']
            numCols = inspect['numCols']
            ### h2o_cmd.runSummary(key=parseTrainResult['destination_key'])

            # RF(train iterate)****************************************
            ntrees = 10
            for max_depth in [5, 10, 20, 40]:
                params = {
                    'nbins': 1024,
                    'classification': 1,
                    'ntrees': ntrees,
                    'max_depth': max_depth,
                    'response': 'C' + str(numCols - 1),
                    'ignored_cols_by_name': None,
                }

                print "Using these parameters for RF: ", params
                kwargs = params.copy()

                trainStart = time.time()
                rfResult = h2o_cmd.runSpeeDRF(parseResult=parseTrainResult,
                                              timeoutSecs=timeoutSecs,
                                              destination_key=modelKey,
                                              **kwargs)
                trainElapsed = time.time() - trainStart
                print "RF training completed in", trainElapsed, "seconds. On dataset: ", csvPathname

                # Logging to a benchmark file
                algo = "RF " + " ntrees=" + str(ntrees) + " max_depth=" + str(
                    max_depth)
                l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format(
                    len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo,
                    csvFilename, trainElapsed)
                print l
                h2o.cloudPerfH2O.message(l)
                rfResult["drf_model"] = rfResult.pop("speedrf_model")
                errsLast = rfResult['drf_model']['errs'][-1]
                print "RF 'errsLast'", errsLast

                cm = rfResult['drf_model']['cms'][-1][
                    '_arr']  # use the last one
                pctWrongTrain = h2o_gbm.pp_cm_summary(cm)
                print "\nTrain\n==========\n"
                print h2o_gbm.pp_cm(cm)

                # xList.append(ntrees)
                xList.append(max_depth)
                eList.append(pctWrongTrain)
                fList.append(trainElapsed)

        # just plot the last one
        if 1 == 1:
            xLabel = 'max_depth'
            eLabel = 'pctWrong'
            fLabel = 'trainElapsed'
            eListTitle = ""
            fListTitle = ""
            h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel,
                              fListTitle, fList, fLabel)
Ejemplo n.º 28
0
    def test_rf_covtype_fvec(self):
        h2o.beta_features = True  # fvec
        importFolderPath = "standard"

        # Parse Train ******************************************************
        csvTrainFilename = 'covtype.shuffled.90pct.data'
        csvTrainPathname = importFolderPath + "/" + csvTrainFilename
        hex_key = csvTrainFilename + ".hex"
        parseTrainResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvTrainPathname, hex_key=hex_key,
                                            timeoutSecs=180, doSummary=False)
        inspect = h2o_cmd.runInspect(None, parseTrainResult['destination_key'])

        # Parse Test ******************************************************
        csvTestFilename = 'covtype.shuffled.10pct.data'
        csvTestPathname = importFolderPath + "/" + csvTestFilename
        hex_key = csvTestFilename + ".hex"
        parseTestResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvTestPathname, hex_key=hex_key,
                                           timeoutSecs=180)
        inspect = h2o_cmd.runInspect(None, parseTestResult['destination_key'])

        rfViewInitial = []
        xList = []
        eList = []
        fList = []
        trial = 0

        depthList  = [10, 20, 30, 40]
        ntreesList = [5, 10, 20, 30]
        # ntreesList = [2]
        nbinsList  = [10, 100, 1000]

        if TRY == 'max_depth':
            tryList = depthList
        elif TRY == 'ntrees':
            tryList = ntreesList
        elif TRY == 'nbins':
            tryList = nbinsList
        else:
            raise Exception("huh? %s" % TRY)

        for d in tryList:
            if TRY == 'max_depth':
                paramDict['max_depth'] = d
            elif TRY == 'ntrees':
                paramDict['ntrees'] = d
            elif TRY == 'nbins':
                paramDict['nbins'] = d
            else:
                raise Exception("huh? %s" % TRY)

            # adjust timeoutSecs with the number of trees
            # seems ec2 can be really slow
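            # with DO_OOBE, skip the separate validation frame and rely on out-of-bag scoring; otherwise validate on the 10pct test split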
            if DO_OOBE:
                paramDict['validation'] = None
            else:
                paramDict['validation'] = parseTestResult['destination_key']

            timeoutSecs = 30 + paramDict['ntrees'] * 200


            # do ten starts, to see the bad id problem?
            TRIES = 5
            for i in range(TRIES):
                lastOne = i==(TRIES-1)

                # have unique model names
                trial += 1
                kwargs = paramDict.copy()
                model_key = 'RFModel_' + str(trial)
                kwargs['destination_key'] = model_key
                data_key = parseTrainResult['destination_key']

                start = time.time()
                rfResult = h2o_cmd.runSpeeDRF(parseResult=parseTrainResult, timeoutSecs=timeoutSecs,
                                         noPoll=True, **kwargs)
                trainElapsed = time.time() - start
                print 'rf train end', i, 'on', csvTrainPathname, 'took', trainElapsed, 'seconds'

                # don't cancel the last one
                if not lastOne:
                    time.sleep(1)
                    h2o_jobs.cancelAllJobs(timeoutSecs=2)


            ### print "rfView", h2o.dump_json(rfView)
            print "We have a result from the RF above, completed but didn't do RFView yet"
            # could the RF indicate 'done' too soon?
            # if rfResult['state']=='RUNNING':
            #    raise Exception("Why is this RF still in RUNNING state? %s" % h2o.dump_json(rfResult))

            # if 'drf_model' not in rfResult:
            #    raise Exception("How come there's no drf_model in this RF result? %s" % h2o.dump_json(rfResult))
            h2o_jobs.pollWaitJobs(timeoutSecs=300)
            rfView = h2o_cmd.runSpeeDRFView(None, model_key, timeoutSecs=60)
            print "rfView:", h2o.dump_json(rfView)

            rfView["drf_model"] = rfView.pop("speedrf_model")
            rf_model = rfView['drf_model']
            cms = rf_model['cms']
            ### print "cm:", h2o.dump_json(cm)
            ntrees = rf_model['N']
            errs = rf_model['errs']
            N = rf_model['N']
            varimp = rf_model['varimp']
            treeStats = rf_model['treeStats']

            print "maxDepth:", treeStats['maxDepth']
            print "maxLeaves:", treeStats['maxLeaves']
            print "minDepth:", treeStats['minDepth']
            print "minLeaves:", treeStats['minLeaves']
            print "meanLeaves:", treeStats['meanLeaves']
            print "meanDepth:", treeStats['meanDepth']
            print "errs[0]:", errs[0]
            print "errs[-1]:", errs[-1]
            print "errs:", errs

            (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView)
            # we iterate over params, so can't really do this check
            # self.assertAlmostEqual(classification_error, 0.03, delta=0.5, msg="Classification error %s differs too much" % classification_error)

            print "classErrorPctList:", classErrorPctList
            self.assertEqual(len(classErrorPctList), 7, "Should be 7 output classes, so should have 7 class error percentages from a reasonable predict")
            # FIX! should update this expected classification error
            predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key=data_key)

            eList.append(classErrorPctList[4])
            fList.append(trainElapsed)
            if DO_PLOT:
                if TRY == 'max_depth':
                    xLabel = 'max_depth'
                elif TRY == 'ntrees':
                    xLabel = 'ntrees'
                elif TRY == 'nbins':
                    xLabel = 'nbins'
                else:
                    raise Exception("huh? %s" % TRY)
                xList.append(paramDict[xLabel])

        if DO_PLOT:
            eLabel = 'class 4 pctWrong'
            fLabel = 'trainElapsed'
            eListTitle = ""
            fListTitle = ""
            h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
Ejemplo n.º 29
0
    def test_speedrf_mnist(self):
        h2o.beta_features = True
        importFolderPath = "mnist"
        csvFilelist = [
            # ("mnist_reals_testing.csv.gz", "mnist_reals_testing.csv.gz",    600),
            # ("a.csv", "b.csv", 60),
            # ("mnist_reals_testing.csv.gz", "mnist_reals_testing.csv.gz",    600),
            ("train.csv.gz", "test.csv.gz", 600),
        ]
        trial = 0
        for (trainCsvFilename, testCsvFilename, timeoutSecs) in csvFilelist:
            trialStart = time.time()

            # PARSE test****************************************
            testKey2 = testCsvFilename + "_" + str(trial) + ".hex"
            start = time.time()
            parseResult = h2i.import_parse(bucket='smalldata',
                                           path=importFolderPath + "/" +
                                           testCsvFilename,
                                           hex_key=testKey2,
                                           timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "parse end on ", testCsvFilename, 'took', elapsed, 'seconds', \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseResult['destination_key']

            print "We won't use this pruning of x on test data. See if it prunes the same as the training"
            y = 784  # last column is pixel value
            print "y:"
            x = h2o_glm.goodXFromColumnInfo(y,
                                            key=parseResult['destination_key'],
                                            timeoutSecs=300)

            # PARSE train****************************************
            trainKey2 = trainCsvFilename + "_" + str(trial) + ".hex"
            start = time.time()
            parseResult = h2i.import_parse(bucket='smalldata',
                                           path=importFolderPath + "/" +
                                           trainCsvFilename,
                                           hex_key=trainKey2,
                                           timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "parse end on ", trainCsvFilename, 'took', elapsed, 'seconds', \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseResult['destination_key']

            # RF+RFView (train)****************************************
            ignore_x = h2o_glm.goodXFromColumnInfo(
                y,
                key=parseResult['destination_key'],
                timeoutSecs=300,
                forRF=True)
            ntrees = 10
            params = {
                'response': y,
                'ignored_cols_by_name': ignore_x,
                'ntrees': ntrees,
                'mtries': 28,  # fix because we ignore some cols, which will change the sqrt(cols) calc?
                'max_depth': 15,
                'sample_rate': 0.67,
                'destination_key': 'SpeeDRF_model',
                'nbins': 1024,
                'seed': 784834182943470027,
                'oobee': 1,
            }
            kwargs = params.copy()
            print "Trying rf"
            timeoutSecs = 1800
            start = time.time()
            rfv = h2o_cmd.runSpeeDRF(parseResult=parseResult,
                                     timeoutSecs=timeoutSecs,
                                     pollTimeoutSecs=60,
                                     retryDelaySecs=2,
                                     **kwargs)
            elapsed = time.time() - start
            print "RF completed in", elapsed, "seconds.", \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            rfv["drf_model"] = rfv.pop("speedrf_model")
            h2o_rf.simpleCheckRFView(None, rfv, **params)
            rf_model = rfv['drf_model']
            used_trees = rf_model['N']
            data_key = rf_model['_dataKey']
            model_key = rf_model['_key']

            print "Total trees: ", used_trees
            print "On data key: ", data_key
            print "Produced model key: ", model_key
Ejemplo n.º 30
0
    def test_rf_enums_mappings(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        tryList = [
            # (n, 1, 'cD', 300),
            # (n, 2, 'cE', 300),
            # (n, 3, 'cF', 300),
            # (n, 4, 'cG', 300),
            # (n, 5, 'cH', 300),
            # (n, 6, 'cI', 300),
            (ROWS, COLS, 'cI', 300),
            (ROWS, COLS, 'cI', 300),
            (ROWS, COLS, 'cI', 300),
        ]

        # SEED_FOR_TRAIN = random.randint(0, sys.maxint)
        SEED_FOR_TRAIN = 1234567890
        SEED_FOR_SCORE = 9876543210
        errorHistory = []
        enumHistory = []
        lastcolsTrainHistory = []
        lastcolsScoreHistory = []

        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            enumList = create_enum_list(listSize=ENUMS)
            # reverse the list
            enumList.reverse()

            # using the comma is nice to ensure no craziness
            colSepHexString = '2c'  # comma
            colSepChar = colSepHexString.decode('hex')
            colSepInt = int(colSepHexString, base=16)
            print "colSepChar:", colSepChar

            rowSepHexString = '0a'  # newline
            rowSepChar = rowSepHexString.decode('hex')
            print "rowSepChar:", rowSepChar

            csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str(
                colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            csvScoreFilename = 'syn_enums_score_' + str(rowCount) + 'x' + str(
                colCount) + '.csv'
            csvScorePathname = SYNDATASETS_DIR + '/' + csvScoreFilename

            # use same enum List
            enumListForScore = enumList

            print "Creating random", csvPathname, "for rf model building"
            lastcols = write_syn_dataset(csvPathname,
                                         enumList,
                                         rowCount,
                                         colCount,
                                         colSepChar=colSepChar,
                                         rowSepChar=rowSepChar,
                                         SEED=SEED_FOR_TRAIN)

            lastcolsTrainHistory.append(lastcols)

            print "Creating random", csvScorePathname, "for rf scoring with prior model (using same enum list)"
            # same enum list/mapping, but different dataset?
            lastcols = write_syn_dataset(csvScorePathname,
                                         enumListForScore,
                                         rowCount,
                                         colCount,
                                         colSepChar=colSepChar,
                                         rowSepChar=rowSepChar,
                                         SEED=SEED_FOR_SCORE)
            lastcolsScoreHistory.append(lastcols)

            scoreDataKey = "score_" + hex_key
            parseResult = h2i.import_parse(path=csvScorePathname,
                                           schema='put',
                                           hex_key=scoreDataKey,
                                           timeoutSecs=30,
                                           separator=colSepInt)

            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='put',
                                           hex_key=hex_key,
                                           timeoutSecs=30,
                                           separator=colSepInt)
            print "Parse result['destination_key']:", parseResult[
                'destination_key']

            print "\n" + csvFilename
            (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \
                h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=True)

            y = colCount
            modelKey = 'enums'
            # limit depth and number of trees to accentuate the issue with categorical split decisions

            # use mtries so both look at all cols at every split? doesn't matter for speedrf
            # does speedrf try one more time? with 3 cols, mtries=2, so another try might
            # get a look at the missing col
            # does matter for drf2. does it "just stop"
            # trying mtries always looking at all columns or 1 col might be interesting
            if SPEEDRF:
                kwargs = {
                    'sample_rate': 0.999,
                    'destination_key': modelKey,
                    'response': y,
                    'ntrees': 1,
                    'max_depth': 100,
                    # 'oobee': 1,
                    'validation': hex_key,
                    # 'validation': scoreDataKey,
                    'seed': 123456789,
                    'mtries': COLS,
                }
            elif GBM:
                kwargs = {
                    'destination_key': modelKey,
                    'response': y,
                    'validation': scoreDataKey,
                    'seed': 123456789,
                    # 'learn_rate': .1,
                    'ntrees': 1,
                    'max_depth': 100,
                    'min_rows': 1,
                    'classification': 1,
                }
            else:
                kwargs = {
                    'sample_rate': 0.999,
                    'destination_key': modelKey,
                    'response': y,
                    'classification': 1,
                    'ntrees': 1,
                    'max_depth': 100,
                    'min_rows': 1,
                    'validation': hex_key,
                    # 'validation': scoreDataKey,
                    'seed': 123456789,
                    'nbins': 1024,
                    'mtries': COLS,
                }

            for r in range(2):
                start = time.time()

                if GBM:
                    gbmResult = h2o_cmd.runGBM(parseResult=parseResult,
                                               timeoutSecs=timeoutSecs,
                                               pollTimeoutSecs=180,
                                               **kwargs)

                    print "gbm end on ", parseResult[
                        'destination_key'], 'took', time.time(
                        ) - start, 'seconds'
                    # print h2o.dump_json(gbmResult)
                    (classification_error, classErrorPctList,
                     totalScores) = h2o_gbm.simpleCheckGBMView(gbmv=gbmResult)

                elif SPEEDRF:
                    rfResult = h2o_cmd.runSpeeDRF(parseResult=parseResult,
                                                  timeoutSecs=timeoutSecs,
                                                  pollTimeoutSecs=180,
                                                  **kwargs)
                    print "speedrf end on ", parseResult[
                        'destination_key'], 'took', time.time(
                        ) - start, 'seconds'
                    (classification_error, classErrorPctList,
                     totalScores) = h2o_rf.simpleCheckRFView(rfv=rfResult)

                else:
                    rfResult = h2o_cmd.runRF(parseResult=parseResult,
                                             timeoutSecs=timeoutSecs,
                                             pollTimeoutSecs=180,
                                             **kwargs)
                    print "rf end on ", parseResult[
                        'destination_key'], 'took', time.time(
                        ) - start, 'seconds'
                    (classification_error, classErrorPctList,
                     totalScores) = h2o_rf.simpleCheckRFView(rfv=rfResult)

                h2o_cmd.runScore(dataKey=scoreDataKey,
                                 modelKey=modelKey,
                                 vactual=y,
                                 vpredict=1,
                                 doAUC=not MULTINOMIAL)  # , expectedAuc=0.5)

                errorHistory.append(classification_error)
                enumHistory.append(enumList)

            print "error from all runs on this dataset (with different enum mappings)"
            print errorHistory
            for e in enumHistory:
                print e

            print "last row from all train datasets, as integer"
            for l in lastcolsTrainHistory:
                print l
            print "last row from all score datasets, as integer"
            for l in lastcolsScoreHistory:
                print l
Ejemplo n.º 31
0
 def notest_RF_poker100(self):
     trees = 6
     timeoutSecs = 20
     csvPathname = 'poker/poker100'
     parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put')
     h2o_cmd.runSpeeDRF(parseResult=parseResult, ntrees=trees, timeoutSecs=timeoutSecs)
Ejemplo n.º 32
0
    def test_rf_enums_mappings(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        tryList = [
            # (n, 1, 'cD', 300),
            # (n, 2, 'cE', 300),
            # (n, 3, 'cF', 300),
            # (n, 4, 'cG', 300),
            # (n, 5, 'cH', 300),
            # (n, 6, 'cI', 300),
            (ROWS, COLS, "cI", 300),
            (ROWS, COLS, "cI", 300),
            (ROWS, COLS, "cI", 300),
        ]

        # SEED_FOR_TRAIN = random.randint(0, sys.maxint)
        SEED_FOR_TRAIN = 1234567890
        SEED_FOR_SCORE = 9876543210
        errorHistory = []
        enumHistory = []
        lastcolsTrainHistory = []
        lastcolsScoreHistory = []

        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            enumList = create_enum_list(listSize=ENUMS)
            # reverse the list
            enumList.reverse()

            # using the comma is nice to ensure no craziness
            colSepHexString = "2c"  # comma
            colSepChar = colSepHexString.decode("hex")
            colSepInt = int(colSepHexString, base=16)
            print "colSepChar:", colSepChar

            rowSepHexString = "0a"  # newline
            rowSepChar = rowSepHexString.decode("hex")
            print "rowSepChar:", rowSepChar

            csvFilename = "syn_enums_" + str(rowCount) + "x" + str(colCount) + ".csv"
            csvPathname = SYNDATASETS_DIR + "/" + csvFilename
            csvScoreFilename = "syn_enums_score_" + str(rowCount) + "x" + str(colCount) + ".csv"
            csvScorePathname = SYNDATASETS_DIR + "/" + csvScoreFilename

            # use same enum List
            enumListForScore = enumList

            print "Creating random", csvPathname, "for rf model building"
            lastcols = write_syn_dataset(
                csvPathname,
                enumList,
                rowCount,
                colCount,
                colSepChar=colSepChar,
                rowSepChar=rowSepChar,
                SEED=SEED_FOR_TRAIN,
            )

            lastcolsTrainHistory.append(lastcols)

            print "Creating random", csvScorePathname, "for rf scoring with prior model (using same enum list)"
            # same enum list/mapping, but different dataset?
            lastcols = write_syn_dataset(
                csvScorePathname,
                enumListForScore,
                rowCount,
                colCount,
                colSepChar=colSepChar,
                rowSepChar=rowSepChar,
                SEED=SEED_FOR_SCORE,
            )
            lastcolsScoreHistory.append(lastcols)

            scoreDataKey = "score_" + hex_key
            parseResult = h2i.import_parse(
                path=csvScorePathname, schema="put", hex_key=scoreDataKey, timeoutSecs=30, separator=colSepInt
            )

            parseResult = h2i.import_parse(
                path=csvPathname, schema="put", hex_key=hex_key, timeoutSecs=30, separator=colSepInt
            )
            print "Parse result['destination_key']:", parseResult["destination_key"]

            print "\n" + csvFilename
            (
                missingValuesDict,
                constantValuesDict,
                enumSizeDict,
                colTypeDict,
                colNameDict,
            ) = h2o_cmd.columnInfoFromInspect(parseResult["destination_key"], exceptionOnMissingValues=True)

            y = colCount
            modelKey = "enums"
            # limit depth and number of trees to accentuate the issue with categorical split decisions

            # use mtries so both look at all cols at every split? doesn't matter for speedrf
            # does speedrf try one more time? with 3 cols, mtries=2, so another try might
            # get a look at the missing col
            # does matter for drf2. does it "just stop"
            # trying mtries always looking at all columns or 1 col might be interesting
            if SPEEDRF:
                kwargs = {
                    "sample_rate": 0.999,
                    "destination_key": modelKey,
                    "response": y,
                    "ntrees": 1,
                    "max_depth": 100,
                    # 'oobee': 1,
                    "validation": hex_key,
                    # 'validation': scoreDataKey,
                    "seed": 123456789,
                    "mtries": COLS,
                }
            elif GBM:
                kwargs = {
                    "destination_key": modelKey,
                    "response": y,
                    "validation": scoreDataKey,
                    "seed": 123456789,
                    # 'learn_rate': .1,
                    "ntrees": 1,
                    "max_depth": 100,
                    "min_rows": 1,
                    "classification": 1,
                }
            else:
                kwargs = {
                    "sample_rate": 0.999,
                    "destination_key": modelKey,
                    "response": y,
                    "classification": 1,
                    "ntrees": 1,
                    "max_depth": 100,
                    "min_rows": 1,
                    "validation": hex_key,
                    # 'validation': scoreDataKey,
                    "seed": 123456789,
                    "nbins": 1024,
                    "mtries": COLS,
                }

            for r in range(2):
                start = time.time()

                if GBM:
                    gbmResult = h2o_cmd.runGBM(
                        parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=180, **kwargs
                    )

                    print "gbm end on ", parseResult["destination_key"], "took", time.time() - start, "seconds"
                    # print h2o.dump_json(gbmResult)
                    (classification_error, classErrorPctList, totalScores) = h2o_gbm.simpleCheckGBMView(gbmv=gbmResult)

                elif SPEEDRF:
                    rfResult = h2o_cmd.runSpeeDRF(
                        parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=180, **kwargs
                    )
                    print "speedrf end on ", parseResult["destination_key"], "took", time.time() - start, "seconds"
                    (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfResult)

                else:
                    rfResult = h2o_cmd.runRF(
                        parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=180, **kwargs
                    )
                    print "rf end on ", parseResult["destination_key"], "took", time.time() - start, "seconds"
                    (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfResult)

                h2o_cmd.runScore(
                    dataKey=scoreDataKey, modelKey=modelKey, vactual=y, vpredict=1, doAUC=not MULTINOMIAL
                )  # , expectedAuc=0.5)

                errorHistory.append(classification_error)
                enumHistory.append(enumList)

            print "error from all runs on this dataset (with different enum mappings)"
            print errorHistory
            for e in enumHistory:
                print e

            print "last row from all train datasets, as integer"
            for l in lastcolsTrainHistory:
                print l
            print "last row from all score datasets, as integer"
            for l in lastcolsScoreHistory:
                print l
Ejemplo n.º 33
0
    def test_RF_many_cols_enum(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        translateList = ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u']

        tryList = [
            (10000, 100, 'cA', 300),
            (10000, 300, 'cB', 500),
            # (10000,  500, 'cC', 700),
            # (10000,  700, 'cD', 3600),
            # (10000,  900, 'cE', 3600),
            # (10000,  1000, 'cF', 3600),
            # (10000,  1300, 'cG', 3600),
            # (10000,  1700, 'cH', 3600),
            # (10000,  2000, 'cI', 3600),
            # (10000,  2500, 'cJ', 3600),
            (10000, 3000, 'cK', 3600),
        ]

        ### h2b.browseTheCloud()
        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            # csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, translateList)

            # PARSE train****************************************
            start = time.time()
            xList = []
            eList = []
            fList = []

            modelKey = 'RFModelKey'

            # Parse (train)****************************************
            start = time.time()
            parseTrainResult = h2i.import_parse(bucket=None, path=csvPathname, schema='put', header=0,
                                                hex_key=hex_key, timeoutSecs=timeoutSecs, doSummary=False)
            elapsed = time.time() - start
            print "train parse end on ", csvPathname, 'took', elapsed, 'seconds', \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "train parse result:", parseTrainResult['destination_key']

            # Logging to a benchmark file
            algo = "Parse"
            l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format(
                len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename, elapsed)
            print l
            h2o.cloudPerfH2O.message(l)

            inspect = h2o_cmd.runInspect(key=parseTrainResult['destination_key'])
            print "\n" + csvPathname, \
                "    numRows:", "{:,}".format(inspect['numRows']), \
                "    numCols:", "{:,}".format(inspect['numCols'])
            numRows = inspect['numRows']
            numCols = inspect['numCols']
            ### h2o_cmd.runSummary(key=parseTrainResult['destination_key'])

            # RF(train iterate)****************************************
            ntrees = 10
            for max_depth in [5,10,20,40]:
                params = {
                    'nbins': 1024,
                    'classification': 1,
                    'ntrees': ntrees,
                    'max_depth': max_depth,
                    'response': 'C' + str(numCols-1),
                    'ignored_cols_by_name': None,
                    }

                print "Using these parameters for RF: ", params
                kwargs = params.copy()

                trainStart = time.time()
                rfResult = h2o_cmd.runSpeeDRF(parseResult=parseTrainResult,
                                         timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs)
                trainElapsed = time.time() - trainStart
                print "RF training completed in", trainElapsed, "seconds. On dataset: ", csvPathname

                # Logging to a benchmark file
                algo = "RF " + " ntrees=" + str(ntrees) + " max_depth=" + str(max_depth)
                l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format(
                    len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename, trainElapsed)
                print l
                h2o.cloudPerfH2O.message(l)
                rfResult["drf_model"] = rfResult.pop("speedrf_model")
                errsLast = rfResult['drf_model']['errs'][-1]
                print "RF 'errsLast'", errsLast

                cm = rfResult['drf_model']['cms'][-1]['_arr'] # use the last one
                pctWrongTrain = h2o_gbm.pp_cm_summary(cm)
                print "\nTrain\n==========\n"
                print h2o_gbm.pp_cm(cm)

                # xList.append(ntrees)
                xList.append(max_depth)
                eList.append(pctWrongTrain)
                fList.append(trainElapsed)

        # just plot the last one
        if 1==1:
            xLabel = 'max_depth'
            eLabel = 'pctWrong'
            fLabel = 'trainElapsed'
            eListTitle = ""
            fListTitle = ""
            h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
Ejemplo n.º 34
0
    def test_rf_big1_overwrite_model_fvec(self):
        csvFilename = 'hhp_107_01.data.gz'
        hex_key = csvFilename + ".hex"
        print "\n" + csvFilename
        parseResult = h2i.import_parse(bucket='smalldata',
                                       path=csvFilename,
                                       hex_key=hex_key,
                                       timeoutSecs=15,
                                       schema='put')
        firstRfView = None
        # dispatch multiple jobs back to back
        for jobDispatch in range(3):
            start = time.time()
            kwargs = {}
            if OVERWRITE_RF_MODEL:
                print "Since we're overwriting here, we have to wait for each to complete noPoll=False"
                model_key = 'RF_model'
            else:
                model_key = 'RF_model' + str(jobDispatch)

            print "Change the number of trees, while keeping the rf model key name the same"
            print "Checks that we correctly overwrite previous rf model"
            if OVERWRITE_RF_MODEL:
                kwargs['ntrees'] = 1 + jobDispatch
            else:
                kwargs['ntrees'] = 1
                # don't change the seed if we're overwriting the model. It should get
                # different results just from changing the tree count
                kwargs['seed'] = random.randint(0, sys.maxint)

            kwargs["response"] = "C107"

            # FIX! what model keys do these get?
            randomNode = h2o.nodes[random.randint(0, len(h2o.nodes) - 1)]
            h2o_cmd.runSpeeDRF(node=randomNode,
                               parseResult=parseResult,
                               destination_key=model_key,
                               timeoutSecs=300,
                               noPoll=True,
                               **kwargs)
            # FIX! are these already in there?
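            # build a minimal stand-in rfView dict with just the keys used later; the real view is fetched below with runSpeeDRFView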
            rfView = {}
            rfView['_dataKey'] = hex_key
            rfView['_key'] = model_key

            print "rf job dispatch end on ", csvFilename, 'took', time.time(
            ) - start, 'seconds'
            print "\njobDispatch #", jobDispatch

            # we're going to compare rf results to previous as we go along (so we save rf view results
            h2o_jobs.pollWaitJobs(pattern='RF_model',
                                  timeoutSecs=300,
                                  pollTimeoutSecs=10,
                                  retryDelaySecs=5)

            # In this test we're waiting after each one, so we can save the RFView results for comparison to future
            print "Checking completed job:", rfView
            print "rfView", h2o.dump_json(rfView)
            data_key = rfView['_dataKey']
            model_key = rfView['_key']
            print "Temporary hack: need to do two rf views minimum, to complete a RF (confusion matrix creation)"
            # allow it to poll to complete
            rfViewResult = h2o_cmd.runSpeeDRFView(None,
                                                  model_key,
                                                  timeoutSecs=60)
            if firstRfView is None:  # we'll use this to compare the others
                firstRfView = rfViewResult.copy()
                firstModelKey = model_key
                print "firstRfView", h2o.dump_json(firstRfView)
            else:
                print "Comparing", model_key, "to", firstModelKey
                df = h2o_util.JsonDiff(rfViewResult,
                                       firstRfView,
                                       vice_versa=True,
                                       with_values=True)
                print "df.difference:", h2o.dump_json(df.difference)
                self.assertGreater(len(df.difference), 29,
                                   msg="Want >=30 , not %d differences between the two rfView json responses. %s" % \
                                       (len(df.difference), h2o.dump_json(df.difference)))
Ejemplo n.º 35
0
    def test_RF_mnist_both(self):
        h2o.beta_features = True
        importFolderPath = "mnist"
        csvFilelist = [
            # ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600, 784834182943470027),
            ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600, None, '*mnist*gz'),
            # to see results a 2nd time
            ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600, None, '*mnist*gz'),
        ]
        # IMPORT**********************************************
        # since H2O deletes the source key, we should re-import every iteration if we re-use the src in the list
        (importFolderResult, importPattern) = h2i.import_only(bucket='home-0xdiag-datasets', path=importFolderPath + "/*")
        ### print "importHDFSResult:", h2o.dump_json(importFolderResult)
        if 'files' in importFolderResult:
            succeededList = importFolderResult['files']
        else:
            succeededList = importFolderResult['succeeded']

        ### print "succeededList:", h2o.dump_json(succeededList)

        self.assertGreater(len(succeededList),1,"Should see more than 1 files in the import?")
        # why does this hang? can't look at storeview after import?
        print "\nTrying StoreView after the import folder"
        h2o_cmd.runStoreView(timeoutSecs=30)

        trial = 0
        allDelta = []
        for (trainCsvFilename, testCsvFilename, timeoutSecs, rfSeed, parsePattern) in csvFilelist:
            trialStart = time.time()

            # PARSE test****************************************
            testKey2 = testCsvFilename + "_" + str(trial) + ".hex"
            start = time.time()
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=importFolderPath+"/"+testCsvFilename,
                hex_key=testKey2, timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "parse end on ", testCsvFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseResult['destination_key']

            print "We won't use this pruning of x on test data. See if it prunes the same as the training"
            y = 0 # first column is pixel value
            x = h2o_glm.goodXFromColumnInfo(y, key=parseResult['destination_key'], timeoutSecs=300)

            # PARSE train****************************************
            print "Use multi-file parse to grab both the mnist_testing.csv.gz and mnist_training.csv.gz for training"
            trainKey2 = trainCsvFilename + "_" + str(trial) + ".hex"
            start = time.time()
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=importFolderPath+"/"+parsePattern,
                hex_key=trainKey2, timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "parse end on ", trainCsvFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseResult['destination_key']

            # RF+RFView (train)****************************************
            # print "This is the 'ignore=' we'll use"
            # no longer use. depend on h2o to get it right.
            ntree = 25
            params = {
                'response': 0,
                'ntrees': ntree,
                # 'data_key='mnist_training.csv.hex'
                'mtries': 28, # fix because we ignore some cols, which will change the sqrt(cols) calc?
                'max_depth': 2147483647,
                'select_stat_type': 'ENTROPY',
                'sampling_strategy': 'RANDOM',
                'sample_rate': 0.67,
                'oobee': 1,
                # 'model_key': '__RFModel_7055e6cf-a0de-44db-b165-f5994730ac77',
                'destination_key': 'RF_model',
                'nbins': 1024,
                # 'seed': 784834182943470027,
               #  'class_weights': '0=1.0,1=1.0,2=1.0,3=1.0,4=1.0,5=1.0,6=1.0,7=1.0,8=1.0,9=1.0',
                }

            if rfSeed is None:
                params['seed'] = random.randint(0,sys.maxint)
            else:
                params['seed'] = rfSeed
            print "RF seed:", params['seed']

            kwargs = params.copy()
            print "Trying rf"
            timeoutSecs = 1800
            start = time.time()
            rfView = h2o_cmd.runSpeeDRF(parseResult=parseResult, 
                timeoutSecs=timeoutSecs, pollTimeoutSecs=180, retryDelaySecs=2, **kwargs)
            elapsed = time.time() - start
            print "RF completed in", elapsed, "seconds.", \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            # RFView (score on test)****************************************
            (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(None, rfView, **params)
            print "classification error is expected to be low because we included the test data in with the training!"
            self.assertAlmostEqual(classification_error, 2.84, delta=0.5, 
                msg="Classification error %s differs too much" % classification_error)
       
            treeStats = rfView['speedrf_model']['treeStats'] 
            leaves = {'min': treeStats['minLeaves'], 'mean': treeStats['meanLeaves'], 'max': treeStats['maxLeaves']}
            # Expected values are from this case:
            # ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600, 784834182943470027),
            leavesExpected = {'min': 4996, 'mean': 5064.1, 'max': 5148}
            for l in leaves:
                # self.assertAlmostEqual(leaves[l], leavesExpected[l], delta=10, msg="leaves %s %s %s differs too much" % (l, leaves[l], leavesExpected[l]))
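                # percent difference vs. the expected leaf stats; float() avoids Python 2 integer truncation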
                delta = ((leaves[l] - leavesExpected[l]) / float(leaves[l])) * 100
                d = "seed: %s %s leaves: %s expected: %s pct. different %s" % (params['seed'], l, leaves[l], leavesExpected[l], delta)
                print d
                allDelta.append(d)

            depth = {'min': treeStats['minDepth'], 'mean': treeStats['meanDepth'], 'max': treeStats['maxDepth']}
            depthExpected = {'min': 21, 'mean': 23.8, 'max': 25}
            for l in depth:
                # self.assertAlmostEqual(depth[l], depthExpected[l], delta=1, msg="depth %s %s %s differs too much" % (l, depth[l], depthExpected[l]))
                delta = ((depth[l] - depthExpected[l]) / float(depth[l])) * 100
                d = "seed: %s %s depth: %s expected: %s pct. different %s" % (params['seed'], l, depth[l], depthExpected[l], delta)
                print d
                allDelta.append(d)

            # Predict (on test)****************************************
            start = time.time()
            modelKey = rfView['speedrf_model']['_key']
            predict = h2o.nodes[0].generate_predictions(model_key=modelKey, data_key=testKey2, timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "generate_predictions in",  elapsed, "secs", \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

        # Done *******************************************************
        print "\nShowing the results again from all the trials, to see variance"
    
        for d in allDelta:
            print d
Ejemplo n.º 36
0
    def test_rf_predict3_fvec(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        timeoutSecs = 600
        predictHexKey = 'predict_0.hex'
        predictCsv = 'predict_0.csv'
        actualCsv = 'actual_0.csv'

        if 1 == 1:
            y = 4  # last col
            response = 'response'
            skipSrcOutputHeader = 1
            skipPredictHeader = 1
            trees = 40
            bucket = 'smalldata'
            csvPathname = 'iris/iris2.csv'
            hexKey = 'iris2.csv.hex'
            # translate = {'setosa': 0.0, 'versicolor': 1.0, 'virginica': 2.0}
            # No translate dict here: we're using an Exec to get the data out, and that may lose the enum encoding.
            translate = None
            # one wrong prediction out of the 150 iris rows is ~0.667 pct.; with random trees that can happen
            expectedPctWrong = 0.7

        elif 1 == 0:
            y = 54  # last col
            response = 'C55'
            skipSrcOutputHeader = 1
            skipPredictHeader = 1
            trees = 6
            # try smaller data set compared to covtype
            bucket = 'home-0xdiag-datasets'
            csvPathname = 'standard/covtype.shuffled.10pct.data'
            hexKey = 'covtype.shuffled.10pct.data.hex'
            translate = {
                '1': 1,
                '2': 2,
                '3': 3,
                '4': 4,
                '5': 5,
                '6': 6,
                '7': 7
            }
            expectedPctWrong = 0.7
        elif 1 == 0:
            y = 54  # last col
            response = 'C55'
            skipSrcOutputHeader = 1
            skipPredictHeader = 1
            trees = 40
            # try smaller data set compared to covtype
            bucket = 'home-0xdiag-datasets'
            csvPathname = 'standard/covtype.shuffled.10pct.data'
            hexKey = 'covtype.shuffled.10pct.data.hex'
            # translate = {1: 0.0, 2: 1.0, 3: 1.0, 4: 1.0, 5: 1.0, 6: 1.0, 7: 1.0}
            translate = {
                '1': 1,
                '2': 2,
                '3': 3,
                '4': 4,
                '5': 5,
                '6': 6,
                '7': 7
            }
            expectedPctWrong = 0.7
        elif 1 == 0:
            y = 54  # last col
            response = 'C55'
            skipSrcOutputHeader = 1
            skipPredictHeader = 1
            trees = 6
            bucket = 'home-0xdiag-datasets'
            csvPathname = 'standard/covtype.data'
            hexKey = 'covtype.data.hex'
            translate = {
                '1': 1,
                '2': 2,
                '3': 3,
                '4': 4,
                '5': 5,
                '6': 6,
                '7': 7
            }
            expectedPctWrong = 0.7
        else:
            y = 0  # first col
            response = 'C1'
            skipSrcOutputHeader = 1
            skipPredictHeader = 1
            trees = 6
            bucket = 'home-0xdiag-datasets'
            csvPathname = 'mnist/mnist_training.csv.gz'
            hexKey = 'mnist_training.hex'
            translate = { \
                '0': 0, '1': 1, '2': 2, '3': 3, '4': 4, \
                '5': 5, '6': 6, '7': 7, '8': 8, '9': 9 }
            expectedPctWrong = 0.7

        csvPredictPathname = SYNDATASETS_DIR + "/" + predictCsv
        csvSrcOutputPathname = SYNDATASETS_DIR + "/" + actualCsv
        # for using below in csv reader
        csvFullname = h2i.find_folder_and_filename(bucket,
                                                   csvPathname,
                                                   schema='put',
                                                   returnFullPath=True)

        def predict_and_compare_csvs(model_key, hex_key, translate=None, y=0):
            # have to slice out col 0 (the output) and feed result to predict
            # cols are 0:784 (1 output plus 784 input features)
            # h2e.exec_expr(execExpr="P.hex="+hex_key+"[1:784]", timeoutSecs=30)
            dataKey = "P.hex"
            h2e.exec_expr(execExpr=dataKey + "=" + hex_key,
                          timeoutSecs=30)  # unneeded but interesting
            if skipSrcOutputHeader:
                print "Has header in dataset, so should be able to chop out col 0 for predict and get right answer"
                print "hack for now, can't chop out col 0 in Exec currently"
                dataKey = hex_key
            else:
                print "No header in dataset, can't chop out cols, since col numbers are used for names"
                dataKey = hex_key

            # +1 col index because R-like
            h2e.exec_expr(execExpr="Z.hex=" + hex_key + "[," + str(y + 1) +
                          "]",
                          timeoutSecs=30)

            start = time.time()
            predict = h2o.nodes[0].generate_predictions(
                model_key=model_key,
                data_key=dataKey,
                destination_key=predictHexKey)
            print "generate_predictions end on ", dataKey, " took", time.time() - start, 'seconds'
            h2o.check_sandbox_for_errors()
            inspect = h2o_cmd.runInspect(key=predictHexKey)
            h2o_cmd.infoFromInspect(inspect, 'predict.hex')

            h2o.nodes[0].csv_download(src_key="Z.hex",
                                      csvPathname=csvSrcOutputPathname)
            h2o.nodes[0].csv_download(src_key=predictHexKey,
                                      csvPathname=csvPredictPathname)
            h2o.check_sandbox_for_errors()

            print "Do a check of the original output col against predicted output"
            (rowNum1, originalOutput) = compare_csv_at_one_col(
                csvSrcOutputPathname,
                msg="Original",
                colIndex=0,
                translate=translate,
                skipHeader=skipSrcOutputHeader)
            (rowNum2, predictOutput) = compare_csv_at_one_col(
                csvPredictPathname,
                msg="Predicted",
                colIndex=0,
                skipHeader=skipPredictHeader)

            # no header on source
            if ((rowNum1 - skipSrcOutputHeader) !=
                (rowNum2 - skipPredictHeader)):
                raise Exception(
                    "original rowNum1: %s - %d not same as downloaded predict rowNum2: %s - %d" %
                    (rowNum1, skipSrcOutputHeader, rowNum2, skipPredictHeader))

            wrong = 0
            for rowNum, (o, p) in enumerate(zip(originalOutput,
                                                predictOutput)):
                # if float(o)!=float(p):
                if str(o) != str(p):
                    if wrong == 10:
                        print "Not printing any more mismatches\n"
                    elif wrong < 10:
                        msg = "Comparing original output col vs predicted. row %s differs. \
                            original: %s predicted: %s" % (rowNum, o, p)
                        print msg
                    wrong += 1

            print "\nTotal wrong:", wrong
            print "Total:", len(originalOutput)
            pctWrong = (100.0 * wrong) / len(originalOutput)
            print "wrong/Total * 100 ", pctWrong
            # even a weak model should beat 25% error here; since we're scoring on the
            # training data, expect well under 2%
            if pctWrong > 2.0:
                raise Exception(
                    "pctWrong too high. Expect < 2% error because it's reusing training data"
                )
            return pctWrong

        #*****************************************************************************

        parseResult = h2i.import_parse(bucket=bucket,
                                       path=csvPathname,
                                       schema='put',
                                       hex_key=hexKey)
        kwargs = {
            'destination_key': 'rf_model',
            'response': response,
            'ntrees': trees,
            'classification': 1,
        }

        rfResult = h2o_cmd.runSpeeDRF(parseResult=parseResult,
                                      timeoutSecs=timeoutSecs,
                                      **kwargs)
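        # SpeeDRF returns its model under 'speedrf_model'; rename the key to 'drf_model'
        # so h2o_rf.simpleCheckRFView (which expects the DRF2 layout) can parse the result.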
        rfResult["drf_model"] = rfResult.pop("speedrf_model")
        (classification_error, classErrorPctList,
         totalScores) = h2o_rf.simpleCheckRFView(rfv=rfResult)

        print "Use H2O GeneratePredictionsPage with a H2O generated model and the same data key."
        print "Does this work? (feeding in same data key)if you're predicting, "
        print "don't you need one less column (the last is output?)"
        print "WARNING: max_iter set to 8 for benchmark comparisons"
        print "y=", y
        pctWrong = predict_and_compare_csvs(model_key='rf_model',
                                            hex_key=hexKey,
                                            translate=translate,
                                            y=y)

        # we are predicting using training data...so error is really low
        # self.assertAlmostEqual(pctWrong, classification_error, delta = 0.2,
        #     msg="predicted pctWrong: %s should be close to training classification error %s" % (pctWrong, classification_error))
        # can be zero if memorized (iris is either 0 or 0.667?)
        # just make delta 0.7 for now
        self.assertAlmostEqual(
            pctWrong,
            expectedPctWrong,
            delta=0.7,
            msg=
            "predicted pctWrong: %s should be small because we're predicting with training data"
            % pctWrong)
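compare_csv_at_one_col is called above but defined elsewhere in the test utilities. A minimal sketch of a reader with the same shape, with the signature inferred from the calls above (treat it as an assumption, not the framework's actual implementation):

import csv

def compare_csv_at_one_col(csvPathname, msg=None, colIndex=0, translate=None, skipHeader=0):
    # Sketch only: reads one column from a csv, optionally translating values,
    # and returns (total row count, list of column values after any header).
    output = []
    rowNum = -1
    with open(csvPathname, 'rb') as f:
        for rowNum, row in enumerate(csv.reader(f)):
            if rowNum < skipHeader:
                continue
            value = row[colIndex]
            if translate:
                value = translate.get(value, value)
            output.append(value)
    return (rowNum + 1, output)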
Example #37
0
    def test_rf_covtype20x_fvec(self):
        importFolderPath = 'standard'

        if DO_SMALL:
            csvFilenameTrain = 'covtype.data'
            hex_key = 'covtype1x.data.A.hex'
        else:
            csvFilenameTrain = 'covtype20x.data'
            hex_key = 'covtype20x.data.A.hex'

        csvPathname = importFolderPath + "/" + csvFilenameTrain
        parseResultTrain = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, hex_key=hex_key, timeoutSecs=500)
        inspect = h2o_cmd.runInspect(key=parseResultTrain['destination_key'])
        dataKeyTrain = parseResultTrain['destination_key']
        print "Parse end", dataKeyTrain

        # have to re-import since the source key is gone
        # we could just copy the key, but sometimes we change the test/train data to covtype.data
        if DO_SMALL:
            csvFilenameTest = 'covtype.data'
            hex_key = 'covtype1x.data.B.hex'
            dataKeyTest2 = 'covtype1x.data.C.hex'
        else:
            csvFilenameTest = 'covtype20x.data'
            hex_key = 'covtype20x.data.B.hex'
            dataKeyTest2 = 'covtype20x.data.C.hex'

        csvPathname = importFolderPath + "/" + csvFilenameTest
        parseResultTest = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, hex_key=hex_key, timeoutSecs=500)
        print "Parse result['destination_key']:", parseResultTest['destination_key']
        inspect = h2o_cmd.runInspect(key=parseResultTest['destination_key'])
        dataKeyTest = parseResultTest['destination_key']
        print "Parse end", dataKeyTest

        # make a 3rd key so the predict is uncached too!
        execExpr = dataKeyTest2 + "=" + dataKeyTest
        kwargs = {'str': execExpr, 'timeoutSecs': 15}
        resultExec = h2o_cmd.runExec(**kwargs)

        # train
        # This does an RFView to detect when RF completes, so the time reported for RFView here should be
        # considered the "first RFView" time; subsequent views may benefit from caching
        # (unless no_confusion_matrix changes that).

        # params is mutable. This is default.
        paramDict = drf2ParamDict
        params = {
            'ntrees': 20,
            'destination_key': 'RF_model',
            'response': "C55",
        }

        # colX = h2o_rf.pickRandRfParams(paramDict, params)

        kwargs = params.copy()
        timeoutSecs = 30 + kwargs['ntrees'] * 60

        start = time.time()
        h2o_cmd.runSpeeDRF(parseResult=parseResultTrain,
                           timeoutSecs=timeoutSecs, retryDelaySecs=1, **kwargs)
        print "rf job end on ", dataKeyTrain, 'took', time.time() - start, 'seconds'

        print "\nRFView start after job completion"
        model_key = kwargs['destination_key']
        ntree = kwargs['ntrees']

        start = time.time()
        # this does the RFModel view for v2; only model_key is used (data and ntree apparently don't matter)
        h2o_cmd.runSpeeDRFView(None, model_key, timeoutSecs=timeoutSecs)
        print "First rfview end on ", dataKeyTrain, 'took', time.time() - start, 'seconds'

        for trial in range(1):
            # scoring
            start = time.time()
            rfView = h2o_cmd.runSpeeDRFView(None, model_key, timeoutSecs=timeoutSecs)
            print "rfview", trial, "end on ", dataKeyTest, 'took', time.time() - start, 'seconds.'
            rfView["drf_model"] = rfView.pop("speedrf_model")
            (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView, ntree=ntree)
            self.assertAlmostEqual(classification_error, 50, delta=50,
                                   msg="Classification error %s differs too much" % classification_error)
            start = time.time()
            predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key=dataKeyTest2)
            print "predict", trial, "end on ", dataKeyTest, 'took', time.time() - start, 'seconds.'

            parseKey = parseResultTrain['destination_key']
            rfModelKey  = rfView['drf_model']['_key']
            predictKey = 'Predict.hex'
            start = time.time()

            predictResult = h2o_cmd.runPredict(
                data_key=parseKey,
                model_key=rfModelKey,
                destination_key=predictKey,
                timeoutSecs=timeoutSecs)

            predictCMResult = h2o.nodes[0].predict_confusion_matrix(
                actual=parseKey,
                vactual='C55',
                predict=predictKey,
                vpredict='predict',
                )

            cm = predictCMResult['cm']

            # These helpers will move into h2o_gbm.py
            pctWrong = h2o_gbm.pp_cm_summary(cm)
            print "\nTest\n==========\n"
            print h2o_gbm.pp_cm(cm)

            print "Trial #", trial, "completed"
Example #38
0
    def test_rf_enums_mappings_fvec(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()

        n = 3000
        tryList = [
            # (n, 1, 'cD', 300), 
            # (n, 2, 'cE', 300), 
            # (n, 3, 'cF', 300), 
            # (n, 4, 'cG', 300), 
            # (n, 5, 'cH', 300), 
            # (n, 6, 'cI', 300), 
            (n, 3, 'cI', 300), 
            (n, 3, 'cI', 300), 
            (n, 3, 'cI', 300), 
            ]

        # SEED_FOR_TRAIN = random.randint(0, sys.maxint)
        SEED_FOR_TRAIN = 1234567890
        SEED_FOR_SCORE = 9876543210
        errorHistory = []
        enumHistory = []
        lastcolsTrainHistory = []
        lastcolsScoreHistory = []

        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            enumList = create_enum_list(listSize=ENUMS)
            # reverse the list
            enumList.reverse()

            # using the comma is nice to ensure no craziness
            colSepHexString = '2c' # comma
            colSepChar = colSepHexString.decode('hex')
            colSepInt = int(colSepHexString, base=16)
            print "colSepChar:", colSepChar

            rowSepHexString = '0a' # newline
            rowSepChar = rowSepHexString.decode('hex')
            print "rowSepChar:", rowSepChar

            csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            csvScoreFilename = 'syn_enums_score_' + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvScorePathname = SYNDATASETS_DIR + '/' + csvScoreFilename

            # use same enum List
            enumListForScore = enumList

            print "Creating random", csvPathname, "for rf model building"
            lastcols = write_syn_dataset(csvPathname, enumList, rowCount, colCount, 
                colSepChar=colSepChar, rowSepChar=rowSepChar, SEED=SEED_FOR_TRAIN)

            lastcolsTrainHistory.append(lastcols)

            print "Creating random", csvScorePathname, "for rf scoring with prior model (using same enum list)"
            # same enum list/mapping, but different dataset?
            lastcols = write_syn_dataset(csvScorePathname, enumListForScore, rowCount, colCount, 
                colSepChar=colSepChar, rowSepChar=rowSepChar, SEED=SEED_FOR_SCORE)
            lastcolsScoreHistory.append(lastcols)

            scoreDataKey = "score_" + hex_key
            parseResult = h2i.import_parse(path=csvScorePathname, schema='put', hex_key=scoreDataKey, 
                timeoutSecs=30, separator=colSepInt)

            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key,
                timeoutSecs=30, separator=colSepInt)
            print "Parse result['destination_key']:", parseResult['destination_key']

            print "\n" + csvFilename
            (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \
                h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=True)

            y = colCount
            modelKey = 'enums'
            # limit depth and number of trees to accentuate the issue with categorical split decisions

            if SPEEDRF:
                kwargs = {
                    'destination_key': modelKey,
                    'response': y,
                    'num_trees': 1,
                    'max_depth': 100,
                    'oobee': 1,
                    'seed': 123456789,
                }
            else:
                kwargs = {
                    'destination_key': modelKey,
                    'response': y,
                    'classification': 1,
                    'ntrees': 1,
                    'max_depth': 100,
                    'min_rows': 1,
                    'validation': scoreDataKey,
                    'seed': 123456789,
                }

            for r in range(4):
                start = time.time()
                
                if SPEEDRF:
                    rfResult = h2o_cmd.runSpeeDRF(parseResult=parseResult, 
                        timeoutSecs=timeoutSecs, pollTimeoutSecs=180, **kwargs)
                else:
                    rfResult = h2o_cmd.runRF(parseResult=parseResult, 
                        timeoutSecs=timeoutSecs, pollTimeoutSecs=180, **kwargs)
                
                print "rf end on ", parseResult['destination_key'], 'took', time.time() - start, 'seconds'
                # print h2o.dump_json(rfResult)
                (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfResult)
                h2o_cmd.runScore(dataKey=scoreDataKey, modelKey=modelKey, vactual=y, vpredict=1, doAUC=not MULTINOMIAL) # , expectedAuc=0.5)
                
                errorHistory.append(classification_error)
                enumHistory.append(enumList)

            print "error from all runs on this dataset (with different enum mappings)"
            print errorHistory
            for e in enumHistory:
                print e

            print "last row from all train datasets, as integer"
            for l in lastcolsTrainHistory:
                print l
            print "last row from all score datasets, as integer"
            for l in lastcolsScoreHistory:
                print l
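create_enum_list and write_syn_dataset are called above but defined elsewhere in the test file. A minimal sketch of an enum-list generator of the same shape (the parameter names beyond listSize are assumptions inferred from the call, not the test's actual helper):

import random
import string

def create_enum_list(listSize=10, minLen=2, maxLen=8):
    # Sketch only: random alphabetic enum levels of varying length.
    enumList = []
    for _ in range(listSize):
        length = random.randint(minLen, maxLen)
        enumList.append(''.join(random.choice(string.ascii_letters) for _ in range(length)))
    return enumList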
Example #39
    def test_rf_big1_overwrite_model_fvec(self):
        h2o.beta_features = True
        csvFilename = 'hhp_107_01.data.gz'
        hex_key = csvFilename + ".hex"
        print "\n" + csvFilename
        parseResult = h2i.import_parse(bucket='smalldata', path=csvFilename,
                                       hex_key=hex_key, timeoutSecs=15, schema='put')
        firstRfView = None
        # dispatch multiple jobs back to back
        for jobDispatch in range(3):
            start = time.time()
            kwargs = {}
            if OVERWRITE_RF_MODEL:
                print "Since we're overwriting here, we have to wait for each to complete noPoll=False"
                model_key = 'RF_model'
            else:
                model_key = 'RF_model' + str(jobDispatch)

            print "Change the number of trees, while keeping the rf model key name the same"
            print "Checks that we correctly overwrite previous rf model"
            if OVERWRITE_RF_MODEL:
                kwargs['num_trees'] = 1 + jobDispatch
            else:
                kwargs['num_trees'] = 1
                # don't change the seed if we're overwriting the model. It should get 
                # different results just from changing the tree count
                kwargs['seed'] = random.randint(0, sys.maxint)

            kwargs["response"] = "C107"

            # FIX! what model keys do these get?
            randomNode = h2o.nodes[random.randint(0,len(h2o.nodes)-1)]
            h2o_cmd.runSpeeDRF(node=randomNode, parseResult=parseResult, destination_key=model_key, timeoutSecs=300,
                          noPoll=True, **kwargs)
            # FIX! are these already in there?
            rfView = {}
            rfView['_dataKey'] = hex_key
            rfView['_key'] = model_key

            print "rf job dispatch end on ", csvFilename, 'took', time.time() - start, 'seconds'
            print "\njobDispatch #", jobDispatch

            # we're going to compare rf results to previous ones as we go along (so we save the rf view results)
            h2o_jobs.pollWaitJobs(pattern='RF_model', timeoutSecs=300, pollTimeoutSecs=10, retryDelaySecs=5)

            # In this test we're waiting after each one, so we can save the RFView results for comparison to future
            print "Checking completed job:", rfView
            print "rfView", h2o.dump_json(rfView)
            data_key = rfView['_dataKey']
            model_key = rfView['_key']
            print "Temporary hack: need to do two rf views minimum, to complete a RF (confusion matrix creation)"
            # allow it to poll to complete
            rfViewResult = h2o_cmd.runSpeeDRFView(None, model_key, timeoutSecs=60)
            if firstRfView is None: # we'll use this to compare the others
                firstRfView = rfViewResult.copy()
                firstModelKey = model_key
                print "firstRfView", h2o.dump_json(firstRfView)
            else:
                print "Comparing", model_key, "to", firstModelKey
                df = h2o_util.JsonDiff(rfViewResult, firstRfView, vice_versa=True, with_values=True)
                print "df.difference:", h2o.dump_json(df.difference)
                self.assertGreater(len(df.difference), 29,
                                   msg="Want >=30 , not %d differences between the two rfView json responses. %s" % \
                                       (len(df.difference), h2o.dump_json(df.difference)))
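h2o_util.JsonDiff is used above to count differences between two rfView JSON responses. A minimal sketch of the underlying idea, counting differing leaf values between two JSON-like structures (an illustration only; h2o_util.JsonDiff's actual behavior and output format differ):

def count_json_differences(a, b, path=""):
    # Sketch only: recursively collect paths where two JSON-like values differ.
    diffs = []
    if isinstance(a, dict) and isinstance(b, dict):
        for key in set(a.keys()) | set(b.keys()):
            diffs += count_json_differences(a.get(key), b.get(key), path + "/" + str(key))
    elif isinstance(a, list) and isinstance(b, list):
        for i, (x, y) in enumerate(zip(a, b)):
            diffs += count_json_differences(x, y, "%s[%d]" % (path, i))
    elif a != b:
        diffs.append("%s: %s != %s" % (path, a, b))
    return diffs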