Example 1
    def test_parse_rand_utf8(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        print "HACK: reduce rows to 10 for debug"
        tryList = [
            # do two cols to detect bad eol behavior
            (10, 2, 'cA', 120),
            (10, 2, 'cG', 120),
            (10, 2, 'cH', 120),
            ]

        print "What about messages to log (INFO) about unmatched quotes (before eol)"
        # got this ..trying to avoid for now
        # Exception: rjson error in parse: Argument 'source_key' error: Parser setup appears to be broken, got AUTO

        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "\nCreating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEED=SEEDPERFILE)
            parseResult = h2i.import_parse(path=csvPathname, schema='put', checkHeader=0,
                hex_key=hex_key, timeoutSecs=timeoutSecs, doSummary=False)
            print "parseResult:", dump_json(parseResult)

            numRows, numCols, parse_key = h2o_cmd.infoFromParse(parseResult)
            inspect = h2o_cmd.runInspect(key=parse_key)
            missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(inspect)

            assert len(missingList) == 0
            # FIX! check type?
        
            # print "inspect:", h2o.dump_json(inspect)
            self.assertEqual(numRows, rowCount, msg='Wrong numRows: %s %s' % (numRows, rowCount))
            self.assertEqual(numCols, colCount, msg='Wrong numCols: %s %s' % (numCols, colCount))
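
write_syn_dataset() is defined at module level in the original test file and is not shown above. A minimal sketch of a generator compatible with the call above, assuming each cell is a short token of random non-ASCII characters so the parser's quote/eol handling gets exercised; the character range and token length are made up, this is not the project's actual helper:

import random

def write_syn_dataset(csvPathname, rowCount, colCount, SEED):
    # Hypothetical: colCount comma-separated tokens of multi-byte UTF-8
    # characters per row, seeded so each file is reproducible.
    r = random.Random(SEED)
    dsf = open(csvPathname, 'w+')
    for _ in range(rowCount):
        row = []
        for _ in range(colCount):
            # 0xA1..0x17F: printable Latin-1 / Latin Extended-A, no commas or quotes
            token = u''.join(unichr(r.randint(0xA1, 0x17F)) for _ in range(4))
            row.append(token.encode('utf-8'))
        dsf.write(','.join(row) + '\n')
    dsf.close()
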
Example 2
    def test_parse_1m_cols(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [(10, 65000, "cH", 30)]

        h2b.browseTheCloud()
        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)

            csvFilename = "syn_" + str(SEEDPERFILE) + "_" + str(rowCount) + "x" + str(colCount) + ".csv"
            csvPathname = SYNDATASETS_DIR + "/" + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

            start = time.time()
            print "Summary should work with 65k"
            parseResult = h2i.import_parse(
                path=csvPathname, schema="put", hex_key=hex_key, timeoutSecs=timeoutSecs, doSummary=True
            )
            print csvFilename, "parse time:", parseResult["response"]["time"]
            print "Parse and summary:", parseResult["destination_key"], "took", time.time() - start, "seconds"

            # We should be able to see the parse result?
            start = time.time()
            inspect = h2o_cmd.runInspect(None, parseResult["destination_key"], timeoutSecs=timeoutSecs)
            print "Inspect:", parseResult["destination_key"], "took", time.time() - start, "seconds"
            h2o_cmd.infoFromInspect(inspect, csvPathname)
            print "\n" + csvPathname, "    num_rows:", "{:,}".format(
                inspect["num_rows"]
            ), "    num_cols:", "{:,}".format(inspect["num_cols"])

            # should match # of cols in header or ??
            self.assertEqual(
                inspect["num_cols"],
                colCount,
                "parse created result with the wrong number of cols %s %s" % (inspect["num_cols"], colCount),
            )
            self.assertEqual(
                inspect["num_rows"],
                rowCount,
                "parse created result with the wrong number of rows (header shouldn't count) %s %s"
                % (inspect["num_rows"], rowCount),
            )

            # we should obey max_column_display
            column_limits = [25, 25000, 50000]
            for column_limit in column_limits:
                inspect = h2o_cmd.runInspect(
                    None, parseResult["destination_key"], max_column_display=column_limit, timeoutSecs=timeoutSecs
                )
                self.assertEqual(
                    len(inspect["cols"]), column_limit, "inspect obeys max_column_display = " + str(column_limit)
                )
                for r in range(0, len(inspect["rows"])):
                    # NB: +1 below because each row includes a row header row: #{row}
                    self.assertEqual(
                        len(inspect["rows"][r]),
                        column_limit + 1,
                        "inspect data rows obeys max_column_display = " + str(column_limit),
                    )
Example 3
    def test_GenParity1(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        parityPl = h2o.find_file('syn_scripts/parity.pl')

        # two row dataset gets this. Avoiding it for now
        # java.lang.ArrayIndexOutOfBoundsException: 1
        # at hex.rf.Data.sample_fair(Data.java:149)

        # always match the run below!
        print "\nAssuming two row dataset is illegal. avoiding"

        for x in xrange (10,100,10):
            shCmdString = "perl " + parityPl + " 128 4 "+ str(x) + " quad " + SYNDATASETS_DIR
            h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split())
            # algorithm for creating the path and filename is hardwired in parity.pl.
            csvFilename = "parity_128_4_" + str(x) + "_quad.data"  

        trees = 6
        timeoutSecs = 20
        # always match the gen above!
        # FIX! we fail if min is 3
        for x in xrange (10,100,10):
            sys.stdout.write('.')
            sys.stdout.flush()
            csvFilename = "parity_128_4_" + str(x) + "_quad.data"  
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            parseResult = h2i.import_parse(path=csvPathname, schema='put')
            h2o_cmd.runRF(parseResult=parseResult, trees=trees, timeoutSecs=timeoutSecs)

            trees += 10
            timeoutSecs += 2
Example 4
    def test_sort_of_prostate_with_row_schmoo(self):
        SEED = random.randint(0, sys.maxint)
        # if you have to force to redo a test
        # SEED = 
        random.seed(SEED)
        print "\nUsing random seed:", SEED

        SYNDATASETS_DIR = h2o.make_syn_dir()
        csvFilename = "syn_prostate.csv"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        headerData = "ID,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON"

        rowData = rand_rowData()
        write_syn_dataset(csvPathname, 1, headerData, rowData)

        print "This is the same format/data file used by test_same_parse, but the non-gzed version"
        print "\nSchmoo the # of rows"
        for trial in range (100):

            rowData = rand_rowData()
            num = random.randint(1, 10096)
            append_syn_dataset(csvPathname, rowData, num)
            start = time.time()

            # make sure all key names are unique, when we re-put and re-parse (h2o caching issues)
            key = csvFilename + "_" + str(trial)
            key2 = csvFilename + "_" + str(trial) + ".hex"
            key = h2o_cmd.parseFile(csvPathname=csvPathname, key=key, key2=key2, 
                timeoutSecs=70, pollTimeoutSecs=60)
            print "trial #", trial, "with num rows:", num, "parse end on ", csvFilename, \
                'took', time.time() - start, 'seconds'
            ### h2o_cmd.runInspect(key=key2)
            ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
            h2o.check_sandbox_for_errors()
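
The rand_rowData(), write_syn_dataset() and append_syn_dataset() helpers used here (and in the later prostate-style tests) live at module level and are not reproduced. A plausible sketch, assuming one CSV row of nine small numeric fields to match the header; the exact value ranges are made up:

import random

def rand_rowData():
    # Hypothetical: nine numeric fields matching the prostate-style header
    # (ID,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON); ranges are arbitrary.
    fields = [random.randint(0, 9) for _ in range(6)]
    fields += [round(random.uniform(0, 10), 1), random.randint(0, 9), random.randint(0, 9)]
    return ','.join(str(f) for f in fields)

def write_syn_dataset(csvPathname, rowCount, headerData, rowData):
    dsf = open(csvPathname, 'w+')
    dsf.write(headerData + '\n')
    for _ in range(rowCount):
        dsf.write(rowData + '\n')
    dsf.close()

def append_syn_dataset(csvPathname, rowData, num):
    # grow the existing file by num more copies of rowData between trials
    dsf = open(csvPathname, 'a')
    for _ in range(num):
        dsf.write(rowData + '\n')
    dsf.close()
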
Example 5
    def test_big_sum_fail(self):
        node = h2o.nodes[0]
        SYNDATASETS_DIR = h2o.make_syn_dir()
        csvPathname = SYNDATASETS_DIR + '/temp.csv'
        hex_key = 'temp.hex'
        for trial in range(5):
            # what about seed?
            cfResult = h2o.nodes[0].create_frame(key=hex_key,
                binary_ones_fraction=0.02, binary_fraction=0, randomize=1, 
                missing_fraction=0, integer_fraction=1, real_range=100,
                has_response=0, response_factors=2, factors=100, cols=1, 
                integer_range=100, value=0, categorical_fraction=0, rows=2.5e+08, 
                timeoutSecs=300)

            inspect = h2o_cmd.runInspect(key=hex_key)
            h2o_cmd.infoFromInspect(inspect, hex_key)

            if UNNECESSARY:
                # this is just doing a head to R. not critical
                h2e.exec_expr(execExpr="%s = %s" % (hex_key, hex_key))
                h2e.exec_expr(execExpr="Last.value.0 = %s[c(1,2,3,4,5,6),]" % hex_key)
                h2e.exec_expr(execExpr="Last.value.0 = Last.value.0")
                node.csv_download(src_key="Last.value.0", csvPathname=csvPathname)
                node.remove_key("Last.value.0")
                # not sure why this happened
                h2o_cmd.runStoreView(view=10000, offset=0)


            # Fails on this
            h2e.exec_expr(execExpr='Last.value.1 = %s[,1]' % hex_key)

            print "Trial #", trial, "completed"
Example 6
    def test_GenParity1(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        # always match the run below!
        for x in [10000]:
            # Have to split the string out to list for pipe
            shCmdString = "perl " + h2o.find_file("syn_scripts/parity.pl") + " 128 4 "+ str(x) + " quad"
            h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(),4)
            # the algorithm for creating the path and filename is hardwired in parity.pl..i.e
            csvFilename = "parity_128_4_" + str(x) + "_quad.data"  

        # always match the gen above!
        trial = 1
        for x in xrange (1,10,1):
            sys.stdout.write('.')
            sys.stdout.flush()

            # just use one file for now
            csvFilename = "parity_128_4_" + str(10000) + "_quad.data"  
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            # broke out the put separately so we can iterate a test just on the RF
            parseResult = h2i.import_parse(path=csvPathname, schema='put')

            h2o.verboseprint("Trial", trial)
            h2o_cmd.runRF(parseResult=parseResult, trees=237, depth=45, timeoutSecs=480)

            # don't change tree count yet
            ## trees += 10
            ### timeoutSecs += 2
            trial += 1
Example 7
    def test_C_RF_poker100(self):
        parseResult = h2i.import_parse(bucket='smalldata', path='poker/poker100', schema='put')
        h2o_cmd.runRF(parseResult=parseResult, trees=6, timeoutSecs=10)

        SYNDATASETS_DIR = h2o.make_syn_dir()
        # always match the run below!
        for x in xrange (11,100,10):
            # Have to split the string out to list for pipe
            shCmdString = "perl " + h2o.find_file("syn_scripts/parity.pl") + " 128 4 "+ str(x) + " quad " + SYNDATASETS_DIR
            h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(),timeout=30)
            # the algorithm for creating the path and filename is hardwired in parity.pl..i.e
            csvFilename = "parity_128_4_" + str(x) + "_quad.data"  

        trees = 6
        timeoutSecs = 60
        # always match the gen above!
        # reduce to get intermittent failures to lessen, for now
        for x in xrange (11,60,10):
            sys.stdout.write('.')
            sys.stdout.flush()
            csvFilename = "parity_128_4_" + str(x) + "_quad.data"  
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            parseResult = h2i.import_parse(path=csvPathname, schema='put')
            h2o_cmd.runRF(parseResult=parseResult, ntrees=trees, timeoutSecs=timeoutSecs)
            trees += 10
Example 8
    def test_factor_with_syn(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        # use SEED so the file isn't cached?
        csvFilenameAll = [
            ('syn_1mx8_' + str(SEED) + '.csv', 'cA', 5),
            ]

        ### csvFilenameList = random.sample(csvFilenameAll,1)
        csvFilenameList = csvFilenameAll
        ### h2b.browseTheCloud()
        lenNodes = len(h2o.nodes)
        for (csvFilename, key2, timeoutSecs) in csvFilenameList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            print "Creating random 1mx8 csv"
            write_syn_dataset(csvPathname, 1000000, SEEDPERFILE)
            # creates csvFilename.hex from file in importFolder dir 
            parseKey = h2o_cmd.parseFile(None, csvPathname, key2=key2, timeoutSecs=2000)
            print csvFilename, 'parse time:', parseKey['response']['time']
            print "Parse result['destination_key']:", parseKey['destination_key']

            # We should be able to see the parse result?
            inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])

            print "\n" + csvFilename
            h2e.exec_zero_list(zeroList)
            # does n+1 so use maxCol 6
            h2e.exec_expr_list_rand(lenNodes, exprList, key2, 
                maxCol=6, maxRow=400000, maxTrials=200, timeoutSecs=timeoutSecs)
Example 9
    def test_exec2_cbind_like_R(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()


        SEEDPERFILE = random.randint(0, sys.maxint)
        rowCount = 30000
        colCount = 150
        timeoutSecs = 60
        hex_key = "df"
        csvPathname = SYNDATASETS_DIR + "/" + "df.csv"
        write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)
        parseResult = h2i.import_parse(path=csvPathname, schema='put', 
            hex_key=hex_key, timeoutSecs=3000, retryDelaySecs=2, doSummary=False)

        colCount = 1
        hex_key = "indx"
        csvPathname = SYNDATASETS_DIR + "/" + "indx.csv"
        write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

        parseResult = h2i.import_parse(path=csvPathname, schema='local', 
            hex_key=hex_key, timeoutSecs=3000, retryDelaySecs=2, doSummary=False)

        inspect = h2o_cmd.runInspect(key=hex_key)
        print "numRows:", inspect['numRows']
        print "numCols:", inspect['numCols']

        for trial in range(10):
            for execExpr in exprList:
                start = time.time()
                execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=300)
                execTime = time.time() - start
                print 'exec took', execTime, 'seconds'

        h2o.check_sandbox_for_errors()
Example 10
    def test_parse_bad_30rows_fvec(self):
        # h2b.browseTheCloud()
        SYNDATASETS_DIR = h2o.make_syn_dir()
        csvPathname = SYNDATASETS_DIR + "/bad.data"
        dsf = open(csvPathname, "w+")
        dsf.write(datalines)
        dsf.close()

        for i in range(20):
            # every other one
            single_quotes = 1

            # force header=1 to make it not fail (doesn't deduce correctly)
            parseResult = h2i.import_parse(
                path=csvPathname, schema="put", single_quotes=single_quotes, header=1, hex_key="trial" + str(i) + ".hex"
            )
            inspect = h2o_cmd.runInspect(key=parseResult["destination_key"])
            print "\n" + csvPathname, "    numRows:", "{:,}".format(inspect["numRows"]), "    numCols:", "{:,}".format(
                inspect["numCols"]
            )
            numRows = inspect["numRows"]
            numCols = inspect["numCols"]
            self.assertEqual(numCols, 4, "Parsed wrong number of cols: %s" % numCols)
            self.assertNotEqual(
                numRows,
                30,
                "Parsed wrong number of rows. Should be 29. Didn't deduce header?: %s" % numRows,
            )
            self.assertEqual(numRows, 29, "Parsed wrong number of rows: %s" % numRows)
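
datalines is a module-level string that is not reproduced here. From the assertions (4 columns, a header plus 29 data rows, single-quoted fields), a hypothetical stand-in would look something like the following; the real bad.data content is not shown in this snippet:

# Hypothetical stand-in only: a header row plus 29 data rows with 4 columns,
# some fields single-quoted so the single_quotes=1 path gets exercised.
datalines = "col_a,col_b,col_c,col_d\n" + \
    "".join("1,'two',3.0,'x%d'\n" % i for i in range(29))
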
Example 11
    def test_parse_many_cols(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (100, 5000, 'cA', 60),
            (100, 6000, 'cB', 60),
            (100, 7000, 'cC', 60),
            (100, 8000, 'cD', 60),
            (100, 8200, 'cE', 60),
            (100, 8500, 'cF', 60),
            (100, 9000, 'cG', 60),
            (100, 10000, 'cI', 60),
            (100, 11000, 'cH', 60),
            ]

        ### h2b.browseTheCloud()
        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "\nCreating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)
            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=timeoutSecs)
            print csvFilename, 'parse time:', parseResult['response']['time']
            print "Parse result['destination_key']:", parseResult['destination_key']
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=60)
            print "\n" + csvFilename

            if not h2o.browse_disable:
                h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
                time.sleep(5)
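
As in the other wide-dataset tests, write_syn_dataset() is a module-level helper that is not shown. A minimal sketch, assuming it writes rowCount rows of colCount random integers seeded per file (the value range is arbitrary):

import random

def write_syn_dataset(csvPathname, rowCount, colCount, SEED):
    # Hypothetical wide-CSV generator, reproducible from SEEDPERFILE.
    r = random.Random(SEED)
    dsf = open(csvPathname, 'w+')
    for _ in range(rowCount):
        rowData = ','.join(str(r.randint(0, 100)) for _ in range(colCount))
        dsf.write(rowData + '\n')
    dsf.close()
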
Example 12
    def test_many_cols_01(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (100, 5000, 'cA', 5),
            (100, 10000, 'cI', 5),
            ]

        ### h2b.browseTheCloud()
        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)

            csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=120, 
                doSummary=False)
            print "Parse result['destination_key']:", parseResult['destination_key']

            # We should be able to see the parse result?
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=120)
            print "\n" + csvFilename

            if not h2o.browse_disable:
                h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
                time.sleep(5)
Example 13
    def test_sort_of_prostate_with_row_schmoo(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        csvFilename = "syn_prostate.csv"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        headerData = "ID,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON"

        rowData = rand_rowData()
        totalRows = 1000000
        write_syn_dataset(csvPathname, totalRows, headerData, rowData)

        print "This is the same format/data file used by test_same_parse, but the non-gzed version"
        print "\nSchmoo the # of rows"
        # used to fail around 50 iterations..python memory problem
        for trial in range (40):
            rowData = rand_rowData()
            num = random.randint(4096, 10096)
            append_syn_dataset(csvPathname, rowData, num)
            totalRows += num
            start = time.time()

            # make sure all key names are unique, when we re-put and re-parse (h2o caching issues)
            hex_key = csvFilename + "_" + str(trial) + ".hex"
            # On EC2 once we get to 30 trials or so, do we see polling hang? GC or spill of heap or ??
            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, 
                timeoutSecs=150, pollTimeoutSecs=150)
            print "trial #", trial, "totalRows:", totalRows, "num:", num, "parse end on ", csvFilename, \
                'took', time.time() - start, 'seconds'
            ### h2o_cmd.runInspect(key=hex_key)
            ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
            h2o.check_sandbox_for_errors()
Example 14
    def test_rf_1ktrees_fvec(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()

        # always match the run below!
        # just using one file for now
        for x in [500]:
            shCmdString = "perl " + h2o.find_file("syn_scripts/parity.pl") + " 128 4 "+ str(x) + " quad " + SYNDATASETS_DIR
            h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(),4)
            csvFilename = "parity_128_4_" + str(x) + "_quad.data"  

        # always match the gen above!
        for trial in range (1,5):
            sys.stdout.write('.')
            sys.stdout.flush()

            csvFilename = "parity_128_4_" + str(1000) + "_quad.data"  
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            hex_key = csvFilename + "_" + str(trial) + ".hex"
            parseResult = h2o_cmd.parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30)

            h2o.verboseprint("Trial", trial)
            start = time.time()
            h2o_cmd.runRF(parseResult=parseResult, trees=1000, max_depth=2, timeoutSecs=600, retryDelaySecs=3)
            print "RF #", trial,  "end on ", csvFilename, 'took', time.time() - start, 'seconds'

        print "Waiting 60 secs for TIME_WAIT sockets to go away"
        time.sleep(60)
Example 15
    def test_rf_float_rand2_fvec(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        csvFilename = "syn_prostate.csv"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        headerData = "ID,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON"
        totalRows = 10000
        write_syn_dataset(csvPathname, totalRows, headerData)

        for trial in range (5):
            rowData = rand_rowData()
            num = random.randint(4096, 10096)
            append_syn_dataset(csvPathname, num)
            totalRows += num
            start = time.time()

            # make sure all key names are unique, when we re-put and re-parse (h2o caching issues)
            hex_key = csvFilename + "_" + str(trial) + ".hex"
            # On EC2 once we get to 30 trials or so, do we see polling hang? GC or spill of heap or ??
            kwargs = {'ntrees': 5, 'max_depth': 5}
            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key)
            h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=60, pollTimeoutSecs=60, **kwargs)
            print "trial #", trial, "totalRows:", totalRows, "num:", num, "RF end on ", csvFilename, \
                'took', time.time() - start, 'seconds'
            ### h2o_cmd.runInspect(key=hex_key)
            ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
            h2o.check_sandbox_for_errors()
Example 16
    def test_KMeans_constant_col_fvec(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (100, 11, 'cA', 10),
            (100, 10, 'cB', 10),
            (100, 9, 'cC', 10),
            (100, 8, 'cD', 10),
            (100, 7, 'cE', 10),
            (100, 6, 'cF', 10),
            (100, 5, 'cG', 10),
            ]

        ### h2b.browseTheCloud()
        lenNodes = len(h2o.nodes)

        cnum = 0
        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            print "Generate synthetic dataset with first column constant = 0 and see what KMeans does"
            cnum += 1
            csvFilename = 'syn_' + str(SEED) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEED)
            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=csvFilename + ".hex")
            print "Parse result['destination_key']:", parseResult['destination_key']

            kwargs = {'k': 2, 'initialization': 'Furthest', 'destination_key': 'benign_k.hex', 'max_iter': 25}
            kmeans = h2o_cmd.runKMeans(parseResult=parseResult, timeoutSecs=5, **kwargs)
            (centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseResult, 'd', **kwargs)

            # check center list (first center) has same number of cols as source data
            self.assertEqual(colCount, len(centers[0]),
                "kmeans first center doesn't have same # of values as dataset row %s %s" % (colCount, len(centers[0])))
Example 17
    def test_many_rows_long_enums(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (1000000, 1, 'cA', 5),
            (1000000, 1, 'cA', 5),
            ]

        # h2b.browseTheCloud()

        cnum = 0
        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            cnum += 1
            csvFilename = 'syn_' + str(SEED) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEED)

            SEPARATOR = ord(',')
            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=300, 
                header=0, separator=SEPARATOR) # don't force header..we have NAs in the rows, and NAs mess up headers
            print "Parse result['destination_key']:", parseResult['destination_key']

            # We should be able to see the parse result?
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            missingValuesList = h2o_cmd.infoFromInspect(inspect, csvPathname)
            numCols = inspect['numCols']
            numRows = inspect['numRows']

            print "\n" + csvFilename
Example 18
    def test_many_cols_and_values_with_syn(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (100000, 10, 'cA', 30),
            (100, 1000, 'cB', 30),
            # (100, 900, 'cC', 30),
            # (100, 500, 'cD', 30),
            # (100, 100, 'cE', 30),
            ]
        
        for (rowCount, colCount, key2, timeoutSecs) in tryList:
            for sel in range(48): # len(caseList)
                SEEDPERFILE = random.randint(0, sys.maxint)
                csvFilename = "syn_%s_%s_%s_%s.csv" % (SEEDPERFILE, sel, rowCount, colCount)
                csvPathname = SYNDATASETS_DIR + '/' + csvFilename

                print "Creating random", csvPathname
                write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, sel)

                selKey2 = key2 + "_" + str(sel)
                parseKey = h2o_cmd.parseFile(None, csvPathname, key2=selKey2, timeoutSecs=timeoutSecs)
                print csvFilename, 'parse time:', parseKey['response']['time']
                print "Parse result['destination_key']:", parseKey['destination_key']
                inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
                print "\n" + csvFilename
Example 19
    def test_many_cols_with_syn(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (100, 10000, 'cI', 5),
            (100, 5000, 'cA', 5),
            (100, 6000, 'cB', 5),
            (100, 7000, 'cC', 5),
            (100, 8000, 'cD', 5),
            (100, 8200, 'cE', 5),
            (100, 8500, 'cF', 5),
            (100, 9000, 'cG', 5),
            (100, 11000, 'cH', 5),
            ]

        ### h2b.browseTheCloud()
        for (rowCount, colCount, key2, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)

            csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

            parseKey = h2o_cmd.parseFile(None, csvPathname, key2=key2, timeoutSecs=10)
            print csvFilename, 'parse time:', parseKey['response']['time']
            print "Parse result['destination_key']:", parseKey['destination_key']

            # We should be able to see the parse result?
            inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
            print "\n" + csvFilename

            if not h2o.browse_disable:
                h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
                time.sleep(5)
Example 20
    def test_1ktrees_job_cancel_many_fvec(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        # always match the run below!
        # just using one file for now
        for x in [1000]:
            shCmdString = "perl " + h2o.find_file("syn_scripts/parity.pl") + " 128 4 "+ str(x) + " quad " + SYNDATASETS_DIR
            h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(),4)
            csvFilename = "parity_128_4_" + str(x) + "_quad.data"  

        csvFilename = "parity_128_4_" + str(1000) + "_quad.data"  
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        hex_key = csvFilename + ".hex"
        parseResult = h2o_cmd.parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30)

        print "kick off jobs, then cancel them"
        for trial in range (1,5):
            # random 0 or 1 delay
            delay = random.uniform(0,1)
            time.sleep(delay)

            h2o.verboseprint("Trial", trial)
            start = time.time()
            h2o_cmd.runRF(parseResult=parseResult, trees=trial, max_depth=50, rfView=False, noPoll=True, timeoutSecs=30, retryDelaySecs=0.25)
            print "RF #", trial,  "started on ", csvFilename, 'took', time.time() - start, 'seconds'
            ### h2o_jobs.cancelAllJobs(timeoutSecs=10)
            h2o.check_sandbox_for_errors()

        # do one last good one
        rfView = h2o_cmd.runRF(parseResult=parseResult, trees=trial, max_depth=50, timeoutSecs=600, retryDelaySecs=3)
        (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView, ntree=trial)
Example 21
    def test_GenParity1(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        # always match the run below!
        # just using one file for now
        for x in [1000]:
            shCmdString = (
                "perl " + h2o.find_file("syn_scripts/parity.pl") + " 128 4 " + str(x) + " quad " + SYNDATASETS_DIR
            )
            h2o.spawn_cmd_and_wait("parity.pl", shCmdString.split(), 4)
            csvFilename = "parity_128_4_" + str(x) + "_quad.data"

        # always match the gen above!
        for trial in range(1, 3):
            sys.stdout.write(".")
            sys.stdout.flush()

            csvFilename = "parity_128_4_" + str(1000) + "_quad.data"
            csvPathname = SYNDATASETS_DIR + "/" + csvFilename

            hex_key = csvFilename + "_" + str(trial) + ".hex"
            parseResult = h2o_cmd.parseResult = h2i.import_parse(
                path=csvPathname, schema="put", hex_key=hex_key, timeoutSecs=30
            )

            h2o.verboseprint("Trial", trial)
            start = time.time()
            h2o_cmd.runRF(parseResult=parseResult, trees=10000, depth=2, timeoutSecs=900, retryDelaySecs=3)
            print "RF #", trial, "end on ", csvFilename, "took", time.time() - start, "seconds"

        print "Waiting 60 secs for TIME_WAIT sockets to go away"
        time.sleep(60)
Example 22
    def test_sort_of_prostate_with_row_schmoo(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        csvFilename = "syn_prostate.csv"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        headerData = "ID,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON"
        rowData = "1,0,65,1,2,1,1.4,0,6"

        write_syn_dataset(csvPathname,      99860, headerData, rowData)

        print "This is the same format/data file used by test_same_parse, but the non-gzed version"
        print "\nSchmoo the # of rows"
        print "Updating the key and key2 names for each trial"
        for trial in range (200):
            append_syn_dataset(csvPathname, rowData)
            ### start = time.time()
            # this was useful to cause failures early on. Not needed eventually
            ### key = h2o_cmd.parseFile(csvPathname=h2o.find_file("smalldata/logreg/prostate.csv"))
            ### print "Trial #", trial, "parse end on ", "prostate.csv" , 'took', time.time() - start, 'seconds'

            start = time.time()
            key = csvFilename + "_" + str(trial)
            key2 = csvFilename + "_" + str(trial) + ".hex"
            key = h2o_cmd.parseFile(csvPathname=csvPathname, key=key, key2=key2)
            print "trial #", trial, "parse end on ", csvFilename, 'took', time.time() - start, 'seconds'

            h2o_cmd.runInspect(key=key2)
            # only used this for debug to look at parse (red last row) on failure
            ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
            h2o.check_sandbox_for_errors()
Example 23
    def test_parse_specific_case3(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        hex_key = "a.hex"

        for (dataset, expNumRows, expNumCols, expNaCnt, expType, unicodeNum) in tryList:
            csvFilename = 'specific_' + str(expNumRows) + "x" + str(expNumCols) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            write_syn_dataset(csvPathname, dataset)

            parseResult = h2i.import_parse(path=csvPathname, schema='put', header=0,
                hex_key=hex_key, timeoutSecs=10, doSummary=False)
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=60)
            
            print "Parsed with special unichr(%s) which is %s:" % (unicodeNum, unichr(unicodeNum))
            print "inspect:", h2o.dump_json(inspect)
            numRows = inspect['numRows']
            self.assertEqual(numRows, expNumRows, msg='Using unichr(0x%x) Wrong numRows: %s Expected: %s' % \
                (unicodeNum, numRows, expNumRows))
            numCols = inspect['numCols']
            self.assertEqual(numCols, expNumCols, msg='Using unichr(0x%x) Wrong numCols: %s Expected: %s' % \
                (unicodeNum, numCols, expNumCols))

            # this is required for the test setup
            assert(len(expNaCnt)>=expNumCols)
            assert(len(expType)>=expNumCols)

            for k in range(expNumCols):
                naCnt = inspect['cols'][k]['naCnt']
                self.assertEqual(expNaCnt[k], naCnt, msg='Using unichr(0x%x) col: %s naCnt: %d should be: %s' % \
                    (unicodeNum, k, naCnt, expNaCnt[k]))
                stype = inspect['cols'][k]['type']
                self.assertEqual(expType[k], stype, msg='Using unichr(0x%x) col: %s type: %s should be: %s' % \
                    (unicodeNum, k, stype, expType[k]))
Example 24
    def test_many_cols_and_types(self):
        SEED = random.randint(0, sys.maxint)
        print "\nUsing random seed:", SEED
        # SEED =
        random.seed(SEED)
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (100, 5, "cA", 5),
            (1000, 59, "cB", 5),
            (5000, 128, "cC", 5),
            (6000, 507, "cD", 5),
            (9000, 663, "cE", 5),
        ]

        for (rowCount, colCount, key2, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = "syn_%s_%s_%s.csv" % (SEEDPERFILE, rowCount, colCount)
            csvPathname = SYNDATASETS_DIR + "/" + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

            parseKey = h2o_cmd.parseFile(None, csvPathname, key2=key2, timeoutSecs=30)
            print csvFilename, "parse time:", parseKey["response"]["time"]
            print "Parse result['destination_key']:", parseKey["destination_key"]
            inspect = h2o_cmd.runInspect(None, parseKey["destination_key"])
            h2o_cmd.infoFromInspect(inspect, csvPathname)

            print "\n" + csvFilename
Example 25
    def test_many_cols_with_syn(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (100, 11, 'cA', 5),
            (100, 10, 'cB', 5),
            (100, 9, 'cC', 5),
            (100, 8, 'cD', 5),
            (100, 7, 'cE', 5),
            (100, 6, 'cF', 5),
            (100, 5, 'cG', 5),
            ]

        ### h2b.browseTheCloud()
        lenNodes = len(h2o.nodes)

        cnum = 0
        for (rowCount, colCount, key2, timeoutSecs) in tryList:
            cnum += 1
            csvFilename = 'syn_' + str(SEED) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEED)
            parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex")
            print "Parse result['destination_key']:", parseKey['destination_key']

            kwargs = {'k': 2, 'epsilon': 1e-6, 'cols': None, 'destination_key': 'benign_k.hex'}
            kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, timeoutSecs=5, **kwargs)
            h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseKey, 'd', **kwargs)
Example 26
    def test_kmeans_sphere3(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        csvFilename = 'syn_spheres3_' + str(SEED) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        write_syn_dataset(csvPathname, 1000000, SEED)

        print "\nStarting", csvFilename
        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex")

        kwargs = {'k': 3, 'epsilon': 1e-6, 'cols': None, 'destination_key': 'spheres3.hex'}
        timeoutSecs = 30
        start = time.time()
        kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs)
        elapsed = time.time() - start
        print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)

        centers = h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseKey, 'd', **kwargs)
        # cluster centers can return in any order
        centersSorted = sorted(centers, key=itemgetter(0))

        self.assertAlmostEqual(centersSorted[0][0],100,delta=.2)
        self.assertAlmostEqual(centersSorted[1][0],200,delta=.2)
        self.assertAlmostEqual(centersSorted[2][0],300,delta=.2)

        self.assertAlmostEqual(centersSorted[0][1],100,delta=.2)
        self.assertAlmostEqual(centersSorted[1][1],200,delta=.2)
        self.assertAlmostEqual(centersSorted[2][1],300,delta=.2)

        self.assertAlmostEqual(centersSorted[0][2],100,delta=.2)
        self.assertAlmostEqual(centersSorted[1][2],200,delta=.2)
        self.assertAlmostEqual(centersSorted[2][2],300,delta=.2)

        model_key = kwargs['destination_key']
        show_results(csvPathname, parseKey, model_key, centers, 'd')
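
The write_syn_dataset() for this test is not shown, but the assertions pin its shape down. A sketch that scatters rowCount 3-D points around (100,100,100), (200,200,200) and (300,300,300), so the sorted cluster centers land within 0.2 of those values; the standard deviation of 1 is an assumption:

import random

def write_syn_dataset(csvPathname, rowCount, SEED):
    # Hypothetical: three tight spherical clusters in 3-D.
    r = random.Random(SEED)
    centers = [100, 200, 300]
    dsf = open(csvPathname, 'w+')
    for i in range(rowCount):
        c = centers[i % 3]
        row = [c + r.gauss(0, 1) for _ in range(3)]
        dsf.write(','.join('%f' % v for v in row) + '\n')
    dsf.close()
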
Example 27
    def test_parse_bad_30rows_fvec(self):
        # h2b.browseTheCloud()
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        csvPathname = SYNDATASETS_DIR + '/bad.data'
        dsf = open(csvPathname, "w+")
        dsf.write(datalines)
        dsf.close()

        for i in range(20):
            # every other one
            single_quotes = 1

            parseResult = h2i.import_parse(path=csvPathname, schema='put', single_quotes=single_quotes,
                hex_key="trial" + str(i) + ".hex")
            inspect = h2o_cmd.runInspect(key=parseResult['destination_key'])
            print "\n" + csvPathname, \
                "    numRows:", "{:,}".format(inspect['numRows']), \
                "    numCols:", "{:,}".format(inspect['numCols'])
            numRows = inspect['numRows']
            numCols = inspect['numCols']
            self.assertEqual(numCols, 4, "Parsed wrong number of cols: %s" % numCols)
            self.assertNotEqual(numRows, 30,
                "Parsed wrong number of rows. Should be 29. Didn't deduce header?: %s" % numRows)
            self.assertEqual(numRows, 29, "Parsed wrong number of rows: %s" % numRows)
Example 28
    def test_GenParity1(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        # always match the run below!
        # just using one file for now
        for x in [1000]:
            shCmdString = "perl " + h2o.find_file("syn_scripts/parity.pl") + " 128 4 "+ str(x) + " quad"
            h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(),4)
            csvFilename = "parity_128_4_" + str(x) + "_quad.data"  

        # always match the gen above!
        for trial in range (1,5):
            sys.stdout.write('.')
            sys.stdout.flush()

            csvFilename = "parity_128_4_" + str(1000) + "_quad.data"  
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            key2 = csvFilename + "_" + str(trial) + ".hex"
            parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=key2, timeoutSecs=30)

            h2o.verboseprint("Trial", trial)
            start = time.time()
            h2o_cmd.runRFOnly(parseKey=parseKey, trees=1000, depth=2, timeoutSecs=600, retryDelaySecs=3)
            print "RF #", trial,  "end on ", csvFilename, 'took', time.time() - start, 'seconds'

        print "Waiting 60 secs for TIME_WAIT sockets to go away"
        time.sleep(60)
Example 29
    def test_parse_fs_schmoo_fvec(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        csvFilename = "syn_prostate.csv"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        headerData = "ID,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON"
        # rowData = "1,0,65,1,2,1,1.4,0,6"
        rowData = "1,0,65,1,2,1,1,0,6"

        totalRows = 99860
        write_syn_dataset(csvPathname, totalRows, headerData, rowData)

        print "This is the same format/data file used by test_same_parse, but the non-gzed version"
        print "\nSchmoo the # of rows"
        print "Updating the key and hex_key names for each trial"
        for trial in range (200):
            append_syn_dataset(csvPathname, rowData)
            totalRows += 1

            start = time.time()
            key = csvFilename + "_" + str(trial)
            hex_key = csvFilename + "_" + str(trial) + ".hex"
            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key)
            print "trial #", trial, "totalRows:", totalRows, "parse end on ", \
                csvFilename, 'took', time.time() - start, 'seconds'

            h2o_cmd.runInspect(key=hex_key)
            # only used this for debug to look at parse (red last row) on failure
            ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
            h2o.check_sandbox_for_errors()
Example 30
    def test_D_GenParity1(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        # always match the run below!
        for x in xrange (11,100,10):
            # Have to split the string out to list for pipe
            shCmdString = "perl " + h2o.find_file("syn_scripts/parity.pl") + " 128 4 " + str(x) + " quad"
            # FIX! as long as we're doing a couple, you'd think we wouldn't have to 
            # wait for the last one to be gen'ed here before we start the first below.
            h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(),timeout=3)
            # the algorithm for creating the path and filename is hardwired in parity.pl..i.e
            csvFilename = "parity_128_4_" + str(x) + "_quad.data"  

        trees = 6
        timeoutSecs = 20
        # always match the gen above!
        # reduce to get intermittent failures to lessen, for now
        for x in xrange (11,60,10):
            csvFilename = "parity_128_4_" + str(x) + "_quad.data"  
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            # FIX! TBD do we always have to kick off the run from node 0?
            # what if we do another node?
            # FIX! do we need or want a random delay here?
            h2o_cmd.runRF( trees=trees, timeoutSecs=timeoutSecs,
                    csvPathname=csvPathname)
            trees += 10
            sys.stdout.write('.')
            sys.stdout.flush()
Example 31
    def test_parse_csv_download_fvec(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        csvFilename = "syn_prostate.csv"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        headerData = "ID,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON"

        rowData = rand_rowData()
        totalRows = 1000000
        write_syn_dataset(csvPathname, totalRows, headerData, rowData)

        print "This is the same format/data file used by test_same_parse, but the non-gzed version"
        print "\nSchmoo the # of rows"
        # failed around 50 trials..python memory problem
        for trial in range(5):
            rowData = rand_rowData()
            num = random.randint(4096, 10096)
            append_syn_dataset(csvPathname, rowData, num)
            totalRows += num

            # make sure all key names are unique, when we re-put and re-parse (h2o caching issues)
            src_key = csvFilename + "_" + str(trial)
            hex_key = csvFilename + "_" + str(trial) + ".hex"

            start = time.time()
            parseResultA = h2i.import_parse(path=csvPathname,
                                            schema='put',
                                            src_key=src_key,
                                            hex_key=hex_key)
            print "\nA trial #", trial, "totalRows:", totalRows, "parse end on ", \
                csvFilename, 'took', time.time() - start, 'seconds'

            inspect = h2o_cmd.runInspect(key=hex_key)
            missingValuesListA = h2o_cmd.infoFromInspect(inspect, csvPathname)
            numColsA = inspect['numCols']
            numRowsA = inspect['numRows']
            byteSizeA = inspect['byteSize']

            # do a little testing of saving the key as a csv
            csvDownloadPathname = SYNDATASETS_DIR + "/csvDownload.csv"
            h2o.nodes[0].csv_download(src_key=hex_key,
                                      csvPathname=csvDownloadPathname)

            # remove the original parsed key. source was already removed by h2o
            h2o.nodes[0].remove_key(hex_key)
            start = time.time()
            parseResultB = h2i.import_parse(path=csvDownloadPathname,
                                            schema='put',
                                            src_key=src_key,
                                            hex_key=hex_key)
            print "B trial #", trial, "totalRows:", totalRows, "parse end on ", \
                csvFilename, 'took', time.time() - start, 'seconds'
            inspect = h2o_cmd.runInspect(key=hex_key)
            missingValuesListB = h2o_cmd.infoFromInspect(inspect, csvPathname)
            numColsB = inspect['numCols']
            numRowsB = inspect['numRows']
            byteSizeB = inspect['byteSize']

            self.assertEqual(
                missingValuesListA, missingValuesListB,
                "missingValuesList mismatches after re-parse of downloadCsv result"
            )
            self.assertEqual(
                numColsA, numColsB,
                "numCols mismatches after re-parse of downloadCsv result")
            self.assertEqual(
                numRowsA, numRowsB,
                "numRows mismatches after re-parse of downloadCsv result")
            # self.assertEqual(byteSizeA, byteSizeB,
            #    "byteSize mismatches after re-parse of downloadCsv result %s %s" % (byteSizeA, byteSizeB))

            h2o.check_sandbox_for_errors()
Example 32
    def test_rf_float_rand_fvec(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        csvFilename = "syn_prostate.csv"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        headerData = "ID,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON"
        totalRows = 1000
        colCount = 7
        write_syn_dataset(csvPathname, totalRows, colCount, headerData)

        for trial in range(5):
            # grow the data set
            rowData = rand_rowData(colCount)
            num = random.randint(4096, 10096)
            append_syn_dataset(csvPathname, colCount, num)
            totalRows += num

            # make sure all key names are unique, when we re-put and re-parse (h2o caching issues)
            hex_key = csvFilename + "_" + str(trial) + ".hex"
            # On EC2 once we get to 30 trials or so, do we see polling hang? GC or spill of heap or ??
            ntree = 2
            kwargs = {
                'ntrees': ntree,
                'mtries': None,
                'max_depth': 20,
                'sample_rate': 0.67,
                'destination_key': None,
                'nbins': 1024,
                'seed': 784834182943470027,
            }
            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='put',
                                           hex_key=hex_key,
                                           doSummary=True)

            start = time.time()
            rfView = h2o_cmd.runRF(parseResult=parseResult,
                                   timeoutSecs=15,
                                   pollTimeoutSecs=5,
                                   **kwargs)
            print "trial #", trial, "totalRows:", totalRows, "num:", num, "RF end on ", csvFilename, \
                'took', time.time() - start, 'seconds'
            (classification_error, classErrorPctList,
             totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView, ntree=ntree)

            inspect = h2o_cmd.runInspect(key=hex_key)
            cols = inspect['cols']
            numCols = inspect['numCols']
            for i, c in enumerate(cols):
                if i < (
                        numCols - 1
                ):  # everything except the last col (output) should be 8 byte float
                    colType = c['type']
                    self.assertEqual(colType,
                                     'Real',
                                     msg="col %d should be type Real: %s" %
                                     (i, colType))

            ### h2o_cmd.runInspect(key=hex_key)
            ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
            h2o.check_sandbox_for_errors()
Example 33
    def test_summary2_percentile2(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (500000, 2, 'cD', 300, 0, 9), # expectedMin/Max must cause 10 values
            (500000, 2, 'cE', 300, 1, 10), # expectedMin/Max must cause 10 values
            (500000, 2, 'cF', 300, 2, 11), # expectedMin/Max must cause 10 values
        ]

        timeoutSecs = 10
        trial = 1
        n = h2o.nodes[0]
        lenNodes = len(h2o.nodes)

        x = 0
        for (rowCount, colCount, hex_key, timeoutSecs, expectedMin, expectedMax) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            x += 1

            csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            legalValues = {}
            for x in range(expectedMin, expectedMax):
                legalValues[x] = x
        
            write_syn_dataset(csvPathname, rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE)
            csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True)
            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30, doSummary=False)
            print "Parse result['destination_key']:", parseResult['destination_key']

            # We should be able to see the parse result?
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvFilename

            summaryResult = h2o_cmd.runSummary(key=hex_key, cols=0, max_ncols=1)
            if h2o.verbose:
                print "summaryResult:", h2o.dump_json(summaryResult)

            summaries = summaryResult['summaries']
            scipyCol = 0
            for column in summaries:
                colname = column['colname']
                coltype = column['type']
                nacnt = column['nacnt']

                stats = column['stats']
                stattype= stats['type']
                mean = stats['mean']
                sd = stats['sd']
                zeros = stats['zeros']
                mins = stats['mins']
                maxs = stats['maxs']
                pct = stats['pct']
                pctile = stats['pctile']

                hstart = column['hstart']
                hstep = column['hstep']
                hbrk = column['hbrk']
                hcnt = column['hcnt']

                for b in hcnt:
                    e = .1 * rowCount
                    self.assertAlmostEqual(b, .1 * rowCount, delta=.01*rowCount, 
                        msg="Bins not right. b: %s e: %s" % (b, e))

                print "pctile:", pctile
                print "maxs:", maxs
                self.assertEqual(maxs[0], expectedMax)
                print "mins:", mins
                self.assertEqual(mins[0], expectedMin)

                for v in pctile:
                    self.assertTrue(v >= expectedMin, 
                        "Percentile value %s should all be >= the min dataset value %s" % (v, expectedMin))
                    self.assertTrue(v <= expectedMax, 
                        "Percentile value %s should all be <= the max dataset value %s" % (v, expectedMax))
            
                eV1 = [1.0, 1.0, 1.0, 3.0, 4.0, 5.0, 7.0, 8.0, 9.0, 10.0, 10.0]
                if expectedMin==1:
                    eV = eV1
                elif expectedMin==0:
                    eV = [e-1 for e in eV1]
                elif expectedMin==2:
                    eV = [e+1 for e in eV1]
                else:
                    raise Exception("Test doesn't have the expected percentileValues for expectedMin: %s" % expectedMin)

            trial += 1

            # if colname!='' and expected[scipyCol]:
            if colname!='':
                # don't do for enums
                # also get the median with a sort (h2o_summ.percentileOnSortedlist()
                h2o_summ.quantile_comparisons(
                    csvPathnameFull,
                    skipHeader=True,
                    col=scipyCol,
                    datatype='float',
                    quantile=0.5 if DO_MEDIAN else 0.999,
                    h2oSummary2=pctile[5 if DO_MEDIAN else 10],
                    # h2oQuantilesApprox=qresult_single,
                    # h2oQuantilesExact=qresult,
                    )
            scipyCol += 1
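
A quick worked figure for the bin check above: with rowCount = 500000 spread over the 10 legal integer values, assertAlmostEqual expects roughly 0.1 * rowCount entries per histogram bin, within a 1% tolerance. A small simulation, assuming the generator really is uniform over those 10 values:

import random

rowCount = 500000
counts = {}
for _ in xrange(rowCount):
    v = random.randint(1, 10)          # assumed uniform over the 10 legal values
    counts[v] = counts.get(v, 0) + 1
print "expected per bin:", .1 * rowCount, "+/-", .01 * rowCount
print "simulated bin counts:", sorted(counts.values())
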
Example 34
    def test_GLM2_mnist(self):
        if not SCIPY_INSTALLED:
            pass

        else:
            h2o.beta_features = True
            SYNDATASETS_DIR = h2o.make_syn_dir()

            csvFilelist = [
                (10000, 500, 'cA', 60),
            ]

            trial = 0
            for (rowCount, colCount, hex_key, timeoutSecs) in csvFilelist:
                trialStart = time.time()

                # PARSE test****************************************
                csvFilename = 'syn_' + "binary" + "_" + str(
                    rowCount) + 'x' + str(colCount) + '.csv'
                csvPathname = SYNDATASETS_DIR + "/" + csvFilename
                write_syn_dataset(csvPathname, rowCount, colCount)

                start = time.time()
                parseResult = h2i.import_parse(path=csvPathname,
                                               schema='put',
                                               hex_key=hex_key,
                                               timeoutSecs=timeoutSecs)
                elapsed = time.time() - start
                print "parse end on ", csvFilename, 'took', elapsed, 'seconds',\
                    "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

                # GLM****************************************
                modelKey = 'GLM_model'
                y = colCount
                kwargs = {
                    'response': 'C' + str(y + 1),
                    'family': 'binomial',
                    'lambda': 1e-4,
                    'alpha': 0,
                    'max_iter': 15,
                    'n_folds': 1,
                    'beta_epsilon': 1.0E-4,
                    'destination_key': modelKey,
                }

                # GLM wants the output col to be strictly 0,1 integer
                execExpr = "aHack=%s; aHack[,%s] = aHack[,%s]==1" % (
                    hex_key, y + 1, y + 1)
                h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)
                aHack = {'destination_key': 'aHack'}
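                # (Added note) the Exec expression above rewrites the response column in
                # place: aHack[,y+1]==1 yields a 0/1 vector, so any value other than 1
                # becomes 0. In plain Python the same recode would be roughly
                #   [1 if v == 1 else 0 for v in responseColumn]
                # (responseColumn being an illustrative list of the response values).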

                timeoutSecs = 1800
                start = time.time()
                glm = h2o_cmd.runGLM(parseResult=aHack,
                                     timeoutSecs=timeoutSecs,
                                     pollTimeoutSecs=60,
                                     **kwargs)
                elapsed = time.time() - start
                print "GLM completed in", elapsed, "seconds.", \
                    "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

                h2o_glm.simpleCheckGLM(self, glm, None, noPrint=True, **kwargs)
                modelKey = glm['glm_model']['_key']

                # This seems wrong..what's the format of the cm?
                lambdaMax = glm['glm_model']['lambda_max']
                print "lambdaMax:", lambdaMax

                best_threshold = glm['glm_model']['submodels'][0][
                    'validation']['best_threshold']
                print "best_threshold", best_threshold

                # pick the middle one?
                cm = glm['glm_model']['submodels'][0]['validation']['_cms'][5][
                    '_arr']
                print "cm:", cm
                pctWrong = h2o_gbm.pp_cm_summary(cm)
                # self.assertLess(pctWrong, 9,"Should see less than 9% error (class = 4)")
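                # (Added, illustrative) pp_cm_summary reduces the confusion matrix to a
                # percent-wrong figure; assuming cm is a square list of per-class count
                # rows, the same number can be computed directly:
                #   total = sum(sum(row) for row in cm)
                #   right = sum(cm[i][i] for i in range(len(cm)))
                #   pctWrongManual = 100.0 * (total - right) / total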

                print "\nTrain\n==========\n"
                print h2o_gbm.pp_cm(cm)

                # Score *******************************
                # this messes up if you use case_mode/case_vale above
                print "\nPredict\n==========\n"
                predictKey = 'Predict.hex'
                start = time.time()

                predictResult = h2o_cmd.runPredict(data_key='aHack',
                                                   model_key=modelKey,
                                                   destination_key=predictKey,
                                                   timeoutSecs=timeoutSecs)

                predictCMResult = h2o.nodes[0].predict_confusion_matrix(
                    actual='aHack',
                    vactual='C' + str(y + 1),
                    predict=predictKey,
                    vpredict='predict',
                )

                cm = predictCMResult['cm']

                # These will move into the h2o_gbm.py
                pctWrong = h2o_gbm.pp_cm_summary(cm)
                self.assertLess(pctWrong, 50, "Should see less than 50% error")

                print "\nTest\n==========\n"
                print h2o_gbm.pp_cm(cm)
Example n. 35
0
    def test_parse_time(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        csvFilename = "syn_time.csv"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        headerData = None
        colCount = 6
        rowCount = 1000
        write_syn_dataset(csvPathname, rowCount, colCount, headerData)

        for trial in range(20):
            rowData = rand_rowData()
            # make sure all key names are unique, when we re-put and re-parse (h2o caching issues)
            src_key = csvFilename + "_" + str(trial)
            hex_key = csvFilename + "_" + str(trial) + ".hex"

            start = time.time()
            parseResultA = h2i.import_parse(path=csvPathname,
                                            schema='put',
                                            src_key=src_key,
                                            hex_key=hex_key)
            print "\nA trial #", trial, "parse end on ", csvFilename, 'took', time.time(
            ) - start, 'seconds'

            inspect = h2o_cmd.runInspect(key=hex_key)
            missingValuesListA = h2o_cmd.infoFromInspect(inspect, csvPathname)
            print "missingValuesListA", missingValuesListA

            num_colsA = inspect['num_cols']
            num_rowsA = inspect['num_rows']
            row_sizeA = inspect['row_size']
            value_size_bytesA = inspect['value_size_bytes']

            self.assertEqual(missingValuesListA, [],
                             "missingValuesList should be empty")
            self.assertEqual(num_colsA, colCount)
            self.assertEqual(num_rowsA, rowCount)

            # do a little testing of saving the key as a csv
            csvDownloadPathname = SYNDATASETS_DIR + "/csvDownload.csv"
            h2o.nodes[0].csv_download(src_key=hex_key,
                                      csvPathname=csvDownloadPathname)

            # remove the original parsed key. source was already removed by h2o
            h2o.nodes[0].remove_key(hex_key)
            # interesting. what happens when we do csv download with time data?
            start = time.time()
            parseResultB = h2i.import_parse(path=csvDownloadPathname,
                                            schema='put',
                                            src_key=src_key,
                                            hex_key=hex_key)
            print "B trial #", trial, "parse end on ", csvFilename, 'took', time.time(
            ) - start, 'seconds'
            inspect = h2o_cmd.runInspect(key=hex_key)
            missingValuesListB = h2o_cmd.infoFromInspect(inspect, csvPathname)
            print "missingValuesListB", missingValuesListB

            num_colsB = inspect['num_cols']
            num_rowsB = inspect['num_rows']
            row_sizeB = inspect['row_size']
            value_size_bytesB = inspect['value_size_bytes']

            self.assertEqual(
                missingValuesListA, missingValuesListB,
                "missingValuesList mismatches after re-parse of downloadCsv result"
            )
            self.assertEqual(
                num_colsA, num_colsB,
                "num_cols mismatches after re-parse of downloadCsv result")
            # H2O adds a header to the csv created. It puts quotes around the col numbers if no header
            # so I guess that's okay. So allow for an extra row here.
            self.assertEqual(
                num_rowsA, num_rowsB,
                "num_rowsA: %s num_rowsB: %s mismatch after re-parse of downloadCsv result"
                % (num_rowsA, num_rowsB))
            print "H2O writes the internal format (number) out for time."
            print "So don't do the row_size and value_size comparisons."

            # ==> syn_time.csv <==
            # 31-Oct-49, 25-NOV-10, 08-MAR-44, 23-Nov-34, 19-Feb-96, 23-JUN-30
            # 31-Oct-49, 25-NOV-10, 08-MAR-44, 23-Nov-34, 19-Feb-96, 23-JUN-30

            # ==> csvDownload.csv <==
            # "0","1","2","3","4","5"
            # 2.5219584E12,1.293264E12,2.3437116E12,2.0504736E12,3.9829788E12,1.9110204E12

            if 1 == 0:
                # extra line for column headers?
                self.assertEqual(
                    row_sizeA, row_sizeB,
                    "row_size wrong after re-parse of downloadCsv result %d %d"
                    % (row_sizeA, row_sizeB))
                self.assertEqual(
                    value_size_bytesA, value_size_bytesB,
                    "value_size_bytes mismatches after re-parse of downloadCsv result %d %d"
                    % (value_size_bytesA, value_size_bytesB))

            # FIX! should do some comparison of values?
            # maybe can use exec to checksum the columns and compare column list.
            # or compare to expected values? (what are the expected values for the number for time inside h2o?)
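            # (Added, illustrative) if the internal encoding is milliseconds since the
            # Unix epoch (the csvDownload.csv values above look that way), the expected
            # number for a date string could be reconstructed locally, e.g.:
            #   import calendar, time
            #   def expected_h2o_time_ms(dateStr):
            #       # '%d-%b-%y' matches strings like '25-NOV-10'
            #       return calendar.timegm(time.strptime(dateStr, '%d-%b-%y')) * 1000
            # and compared against the downloaded values (timezone handling may shift
            # the result slightly).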

            # FIX! should compare the results of the two parses. The infoFromInspect result?
            ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
            h2o.check_sandbox_for_errors()
Example n. 36
0
    def test_kmeans_predict3_fvec(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        timeoutSecs = 600
        predictCsv = 'predict_0.csv'
        actualCsv = 'actual_0.csv'

        if 1 == 1:
            outputClasses = 3
            y = 4  # last col
            response = 'response'
            skipSrcOutputHeader = 1
            skipPredictHeader = 1
            trees = 40
            bucket = 'smalldata'
            csvPathname = 'iris/iris2.csv'
            hexKey = 'iris2.csv.hex'
            # Huh...now we apparently need the translate. Used to be:
            # No translate because we're using an Exec to get the data out?, and that loses the encoding?
            #  translate = None
            translate = {'setosa': 0.0, 'versicolor': 1.0, 'virginica': 2.0}
            # one wrong will be 0.66667. I guess with random, that can happen?
            expectedPctWrong = 0.7

        elif 1 == 0:
            outputClasses = 6
            y = 54  # last col
            response = 'C55'
            skipSrcOutputHeader = 1
            skipPredictHeader = 1
            trees = 6
            # try smaller data set compared to covtype
            bucket = 'home-0xdiag-datasets'
            csvPathname = 'standard/covtype.shuffled.10pct.data'
            hexKey = 'covtype.shuffled.10pct.data.hex'
            translate = {
                '1': 1,
                '2': 2,
                '3': 3,
                '4': 4,
                '5': 5,
                '6': 6,
                '7': 7
            }
            expectedPctWrong = 0.7
        elif 1 == 0:
            outputClasses = 6
            y = 54  # last col
            response = 'C55'
            skipSrcOutputHeader = 1
            skipPredictHeader = 1
            trees = 40
            # try smaller data set compared to covtype
            bucket = 'home-0xdiag-datasets'
            csvPathname = 'standard/covtype.shuffled.10pct.data'
            hexKey = 'covtype.shuffled.10pct.data.hex'
            # translate = {1: 0.0, 2: 1.0, 3: 1.0, 4: 1.0, 5: 1.0, 6: 1.0, 7: 1.0}
            translate = {
                '1': 1,
                '2': 2,
                '3': 3,
                '4': 4,
                '5': 5,
                '6': 6,
                '7': 7
            }
            expectedPctWrong = 0.7
        elif 1 == 0:
            outputClasses = 6
            y = 54  # last col
            response = 'C55'
            skipSrcOutputHeader = 1
            skipPredictHeader = 1
            trees = 6
            bucket = 'home-0xdiag-datasets'
            csvPathname = 'standard/covtype.data'
            hexKey = 'covtype.data.hex'
            translate = {
                '1': 1,
                '2': 2,
                '3': 3,
                '4': 4,
                '5': 5,
                '6': 6,
                '7': 7
            }
            expectedPctWrong = 0.7
        else:
            outputClasses = 10
            y = 0  # first col
            response = 'C1'
            skipSrcOutputHeader = 1
            skipPredictHeader = 1
            trees = 6
            bucket = 'home-0xdiag-datasets'
            csvPathname = 'mnist/mnist_training.csv.gz'
            hexKey = 'mnist_training.hex'
            translate = { \
                '0': 0, '1': 1, '2': 2, '3': 3, '4': 4, \
                '5': 5, '6': 6, '7': 7, '8': 8, '9': 9 }
            expectedPctWrong = 0.7

        csvPredictPathname = SYNDATASETS_DIR + "/" + predictCsv
        csvSrcOutputPathname = SYNDATASETS_DIR + "/" + actualCsv
        # for using below in csv reader
        csvFullname = h2i.find_folder_and_filename(bucket,
                                                   csvPathname,
                                                   schema='put',
                                                   returnFullPath=True)

        def predict_and_compare_csvs(model_key,
                                     hex_key,
                                     predictHexKey,
                                     translate=None,
                                     y=0):
            # have to slice out col 0 (the output) and feed result to predict
            # cols are 0:784 (1 output plus 784 input features)
            # h2e.exec_expr(execExpr="P.hex="+hex_key+"[1:784]", timeoutSecs=30)
            dataKey = "P.hex"
            if skipSrcOutputHeader:
                print "Has header in dataset, so should be able to chop out col 0 for predict and get right answer"
                print "hack for now, can't chop out col 0 in Exec currently"
                dataKey = hex_key
            else:
                print "No header in dataset, can't chop out cols, since col numbers are used for names"
                dataKey = hex_key

            # +1 col index because R-like
            # FIX! apparently we lose the enum mapping when we slice out, and then csv download? we just get the number?
            # OH NO..it looks like we actually preserve the enum..it's in the csv downloaded
            # the prediction is the one that doesn't have it, because it's related to clusters, which have no
            # notion of output classes
            h2e.exec_expr(execExpr="Z.hex=" + hex_key + "[," + str(y + 1) +
                          "]",
                          timeoutSecs=30)

            start = time.time()
            predictResult = h2o.nodes[0].generate_predictions(
                model_key=model_key,
                data_key=hexKey,
                destination_key=predictHexKey)
            print "generate_predictions end on ", hexKey, " took", time.time(
            ) - start, 'seconds'
            print "predictResult:", h2o.dump_json(predictResult)

            h2o.check_sandbox_for_errors()
            inspect = h2o_cmd.runInspect(key=predictHexKey)
            h2o_cmd.infoFromInspect(inspect, 'predict.hex')

            h2o.nodes[0].csv_download(src_key="Z.hex",
                                      csvPathname=csvSrcOutputPathname)
            h2o.nodes[0].csv_download(src_key=predictHexKey,
                                      csvPathname=csvPredictPathname)
            h2o.check_sandbox_for_errors()

            print "Do a check of the original output col against predicted output"
            (rowNum1, originalOutput) = compare_csv_at_one_col(
                csvSrcOutputPathname,
                msg="Original",
                colIndex=0,
                translate=translate,
                skipHeader=skipSrcOutputHeader)
            (rowNum2, predictOutput) = compare_csv_at_one_col(
                csvPredictPathname,
                msg="Predicted",
                colIndex=0,
                skipHeader=skipPredictHeader)

            # no header on source
            if ((rowNum1 - skipSrcOutputHeader) !=
                (rowNum2 - skipPredictHeader)):
                raise Exception(
                    "original rowNum1: %s - %d not same as downloaded predict: rowNum2: %s - %d" %
                    (rowNum1, skipSrcOutputHeader, rowNum2, skipPredictHeader))

            wrong = 0
            for rowNum, (o, p) in enumerate(zip(originalOutput,
                                                predictOutput)):
                # if float(o)!=float(p):
                if str(o) != str(p):
                    if wrong == 10:
                        print "Not printing any more mismatches\n"
                    elif wrong < 10:
                        msg = "Comparing original output col vs predicted. row %s differs. \
                            original: %s predicted: %s" % (rowNum, o, p)
                        print msg
                    wrong += 1

            print "\nTotal wrong:", wrong
            print "Total:", len(originalOutput)
            pctWrong = (100.0 * wrong) / len(originalOutput)
            print "wrong/Total * 100 ", pctWrong
            # I looked at what h2o can do for modelling with binomial and it should get better than 25% error?

            # hack..need to fix this
            if 1 == 0:
                if pctWrong > 2.0:
                    raise Exception(
                        "pctWrong too high. Expect < 2% error because it's reusing training data"
                    )
            return pctWrong

        #*****************************************************************************

        parseResult = h2i.import_parse(bucket=bucket,
                                       path=csvPathname,
                                       schema='put',
                                       hex_key=hexKey)
        inspect = h2o_cmd.runInspect(key=parseResult['destination_key'])
        numCols = inspect["numCols"]
        numRows = inspect["numRows"]

        seed = random.randint(0, sys.maxint)
        # should pass seed
        # want to ignore the response col? we compare that to predicted

        # if we tell kmeans to ignore a column here, and then use the model on the same dataset to predict
        # does the column get ignored? (this is last col, trickier if first col. (are the centers "right"
        kwargs = {
            'ignored_cols_by_name': response,
            'seed': seed,
            # "seed": 4294494033083512223,
            'k': outputClasses,
            'initialization': 'PlusPlus',
            'destination_key': 'kmeans_model',
            'max_iter': 1000
        }

        kmeans = h2o_cmd.runKMeans(parseResult=parseResult,
                                   timeoutSecs=60,
                                   **kwargs)
        # this is what the size of each cluster was, when reported by training
        size = kmeans['model']['size']

        # tupleResultList is created like this: ( (centers[i], rows_per_cluster[i], sqr_error_per_cluster[i]) )
        # THIS DOES A PREDICT in it (we used to have to do the predict to get more training result info?)
        (centers, tupleResultList) = h2o_kmeans.bigCheckResults(
            self, kmeans, csvPathname, parseResult, 'd', **kwargs)

        # the tupleResultList has the size during predict? compare it to the sizes during training
        # I assume they're in the same order.
        size2 = [t[1] for t in tupleResultList]
        if size != size2:
            raise Exception(
                "training cluster sizes: %s are not the same as what we got from predict on same data: %s" %
                (size, size2))

        # hack...hardwire for iris here
        # keep this with sizes sorted
        expectedSizes = [
            [39, 50, 61],
            [38, 50, 62],
        ]
        sortedSize = sorted(size)
        if sortedSize not in expectedSizes:
            raise Exception(
                "I got cluster sizes %s but expected one of these: %s " %
                (sortedSize, expectedSizes))

        # check center list (first center) has same number of cols as source data
        print "centers:", centers

        # we said to ignore the output so subtract one from expected
        self.assertEqual(
            numCols - 1, len(centers[0]),
            "kmeans first center doesn't have same # of values as dataset row %s %s"
            % (numCols - 1, len(centers[0])))
        # FIX! add expected
        # h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, trial=trial)

        error = kmeans['model']['total_within_SS']
        within_cluster_variances = kmeans['model']['within_cluster_variances']
        print "within_cluster_variances:", within_cluster_variances

        print "Use H2O GeneratePredictionsPage with a H2O generated model and the same data key."
        print "Does this work? (feeding in same data key)if you're predicting, "
        print "don't you need one less column (the last is output?)"
        print "WARNING: max_iter set to 8 for benchmark comparisons"
        print "y=", y  # zero-based index matches response col name

        print ""
        print "oh I see why I can't compare predict to actual, in kmeans"
        print "the cluster order doesn't have to match the output class enum order"
        print "so I don't know what cluster, each output class will be (kmeans)"
        print "all I can say is that the prediction distribution should match the original source distribution"
        print "have to figure out what to do"
        predictHexKey = 'predict_0.hex'
        pctWrong = predict_and_compare_csvs(model_key='kmeans_model',
                                            hex_key=hexKey,
                                            predictHexKey=predictHexKey,
                                            translate=translate,
                                            y=y)

        # we are predicting using training data...so error is really low
        # self.assertAlmostEqual(pctWrong, classification_error, delta = 0.2,
        #     msg="predicted pctWrong: %s should be close to training classification error %s" % (pctWrong, classification_error))
        # can be zero if memorized (iris is either 0 or 0.667?)
        # just make delta 0.7 for now

        # HACK ignoring error for now
        if 1 == 0:
            self.assertAlmostEqual(
                pctWrong,
                expectedPctWrong,
                delta=0.7,
                msg=
                "predicted pctWrong: %s should be small because we're predicting with training data"
                % pctWrong)
Example n. 37
0
 def setUpClass(cls):
     h2o.init()
     global SYNDATASETS_DIR
     SYNDATASETS_DIR = h2o.make_syn_dir()
Example n. 38
0
    def test_parse_200k_cols_fvec(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (10, 10, 'cA', 200, 200),
            (10, 1000, 'cB', 200, 200),
            (10, 1000, 'cB', 200, 200),
            # we timeout/fail on 500k? stop at 200k
            # (10, 500000, 'cC', 200, 200),
            # (10, 1000000, 'cD', 200, 360),
            # (10, 1100000, 'cE', 60, 100),
            # (10, 1200000, 'cF', 60, 120),
        ]

        # h2b.browseTheCloud()
        for (rowCount, colCount, hex_key, timeoutSecs,
             timeoutSecs2) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)

            csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(
                rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "\nCreating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

            # import it N times and compare the N hex keys
            REPEAT = 5
            for i in range(REPEAT):
                hex_key_i = hex_key + "_" + str(i)

                start = time.time()
                parseResult = h2i.import_parse(path=csvPathname,
                                               schema='put',
                                               hex_key=hex_key_i,
                                               timeoutSecs=timeoutSecs,
                                               doSummary=False)
                print "Parse:", parseResult[
                    'destination_key'], "took", time.time() - start, "seconds"

                # We should be able to see the parse result?
                start = time.time()
                inspect = h2o_cmd.runInspect(None,
                                             parseResult['destination_key'],
                                             timeoutSecs=timeoutSecs2)
                print "Inspect:", parseResult[
                    'destination_key'], "took", time.time() - start, "seconds"
                h2o_cmd.infoFromInspect(inspect, csvPathname)
                print "\n" + csvPathname, \
                    "    numRows:", "{:,}".format(inspect['numRows']), \
                    "    numCols:", "{:,}".format(inspect['numCols'])

                # should match # of cols in header or ??
                self.assertEqual(
                    inspect['numCols'], colCount,
                    "parse created result with the wrong number of cols %s %s"
                    % (inspect['numCols'], colCount))
                self.assertEqual(inspect['numRows'], rowCount,
                    "parse created result with the wrong number of rows (header shouldn't count) %s %s" % \
                    (inspect['numRows'], rowCount))

            # compare each to 0
            for i in range(1, REPEAT):
                hex_key_i = hex_key + "_" + str(i)
                hex_key_0 = hex_key + "_0"

                print "\nComparing %s to %s" % (hex_key_i, hex_key_0)
                if 1 == 0:
                    execExpr = "%s[1,]+%s[1,]" % (hex_key_0, hex_key_i)
                    resultExec, result = h2e.exec_expr(execExpr=execExpr,
                                                       timeoutSecs=30)
                    execExpr = "%s[,1]+%s[,1]" % (hex_key_0, hex_key_i)
                    resultExec, result = h2e.exec_expr(execExpr=execExpr,
                                                       timeoutSecs=30)

                execExpr = "%s+%s" % (hex_key_0, hex_key_i)
                resultExec, result = h2e.exec_expr(execExpr=execExpr,
                                                   timeoutSecs=30)
                execExpr = "%s!=%s" % (hex_key_0, hex_key_i)
                resultExec, result = h2e.exec_expr(execExpr=execExpr,
                                                   timeoutSecs=30)
                execExpr = "%s==%s" % (hex_key_0, hex_key_i)
                resultExec, result = h2e.exec_expr(execExpr=execExpr,
                                                   timeoutSecs=30)
                execExpr = "sum(%s==%s)" % (hex_key_0, hex_key_i)
                resultExec, result = h2e.exec_expr(execExpr=execExpr,
                                                   timeoutSecs=30)
                execExpr = "s=sum(%s==%s)" % (hex_key_0, hex_key_i)
                resultExec, result = h2e.exec_expr(execExpr=execExpr,
                                                   timeoutSecs=30)

                execExpr = "s=c(1); s=sum(%s==%s)" % (hex_key_0, hex_key_i)
                resultExec, result = h2e.exec_expr(execExpr=execExpr,
                                                   timeoutSecs=30)
                execExpr = "n=c(1); n=nrow(%s)*ncol(%s))" % (hex_key,
                                                             hex_key_i)
                resultExec, result = h2e.exec_expr(execExpr=execExpr,
                                                   timeoutSecs=30)
                execExpr = "r=c(1); r=s==n"
                resultExec, result = h2e.exec_expr(execExpr=execExpr,
                                                   timeoutSecs=30)
                print "result:", result
Example n. 39
0
    def test_GLM_convergence_2(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (100, 1, 'cD', 300),
            # (100, 100, 'cE', 300),
            # (100, 200, 'cF', 300),
            # (100, 300, 'cG', 300),
            # (100, 400, 'cH', 300),
            # (100, 500, 'cI', 300),
        ]

        ### h2b.browseTheCloud()
        lenNodes = len(h2o.nodes)

        USEKNOWNFAILURE = False
        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = 'syn_%s_%sx%s.csv' % (SEEDPERFILE, rowCount,
                                                colCount)
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            print "\nCreating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

            if USEKNOWNFAILURE:
                csvFilename = 'failtoconverge_100x50.csv'
                csvPathname = 'logreg/' + csvFilename

            parseResult = h2i.import_parse(path=csvPathname,
                                           hex_key=hex_key,
                                           timeoutSecs=10,
                                           schema='put')
            print csvFilename, 'parse time:', parseResult['response']['time']
            print "Parse result['destination_key']:", parseResult[
                'destination_key']
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvFilename

            y = colCount
            kwargs = {
                'max_iter': 40,
                'lambda': 1e0,
                'alpha': 0.5,
                'weight': 1.0,
                'link': 'familyDefault',
                'n_folds': 0,
                'beta_epsilon': 1e-4,
                'thresholds': '0:1:0.01',
            }

            if USEKNOWNFAILURE:
                kwargs['y'] = 50
            else:
                kwargs['y'] = y

            emsg = None
            for i in range(3):
                start = time.time()
                glm = h2o_cmd.runGLM(parseResult=parseResult,
                                     timeoutSecs=timeoutSecs,
                                     **kwargs)
                print 'glm #', i, 'end on', csvPathname, 'took', time.time() - start, 'seconds'
                # we can pass the warning, without stopping in the test, so we can
                # redo it in the browser for comparison
                (warnings, coefficients,
                 intercept) = h2o_glm.simpleCheckGLM(self,
                                                     glm,
                                                     None,
                                                     allowFailWarning=True,
                                                     **kwargs)

                if 1 == 0:
                    print "\n", "\ncoefficients in col order:"
                    # since we're loading the x50 file all the time..the real colCount
                    # should be 50 (0 to 49)
                    if USEKNOWNFAILURE:
                        showCols = 50
                    else:
                        showCols = colCount
                    for c in range(showCols):
                        print "%s:\t%s" % (c, coefficients[c])
                    print "intercept:\t", intercept

                # gets the failed to converge, here, after we see it in the browser too
                x = re.compile("[Ff]ailed")
                if warnings:
                    print "warnings:", warnings
                    for w in warnings:
                        print "w:", w
                        if (re.search(x, w)):
                            # first
                            if emsg is None: emsg = w
                            print w
                if emsg: break

            if not h2o.browse_disable:
                h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
                time.sleep(5)
                h2b.browseJsonHistoryAsUrlLastMatch("GLM")
                time.sleep(5)

            # gets the failed to converge, here, after we see it in the browser too
            if emsg is not None:
                raise Exception(emsg)
Example n. 40
0
    def test_kmeans_sphere5(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        CLUSTERS = 5
        SPHERE_PTS = 10000
        csvFilename = 'syn_spheres100.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        centersList = write_spheres_dataset(csvPathname, CLUSTERS, SPHERE_PTS)

        print "\nStarting", csvFilename
        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname,
                                     key2=csvFilename + ".hex")

        # try 5 times, to see if all inits by h2o are good
        for trial in range(5):
            kwargs = {
                'k': CLUSTERS,
                'epsilon': 1e-6,
                'cols': None,
                'destination_key': 'syn_spheres100.hex'
            }
            timeoutSecs = 30
            start = time.time()
            kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey,
                                           timeoutSecs=timeoutSecs,
                                           **kwargs)
            elapsed = time.time() - start
            print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.',\
                "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)

            kmeansResult = h2o_cmd.runInspect(key='syn_spheres100.hex')

            ### print h2o.dump_json(kmeans)
            ### print h2o.dump_json(kmeansResult)
            h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)

            # cluster centers can return in any order
            clusters = kmeansResult['KMeansModel']['clusters']
            clustersSorted = sorted(clusters, key=itemgetter(0))
            ### print clustersSorted

            print "\nh2o result, centers sorted"
            print clustersSorted
            print "\ngenerated centers"
            print centersList
            for i, center in enumerate(centersList):
                a = center
                b = clustersSorted[i]
                print "\nexpected:", a
                print "h2o:", b  # h2o result
                aStr = ",".join(map(str, a))
                bStr = ",".join(map(str, b))
                iStr = str(i)
                self.assertAlmostEqual(a[0],
                                       b[0],
                                       delta=1,
                                       msg=aStr + "!=" + bStr +
                                       ". Sorted cluster center " + iStr +
                                       " x not correct.")
                self.assertAlmostEqual(a[1],
                                       b[1],
                                       delta=1,
                                       msg=aStr + "!=" + bStr +
                                       ". Sorted cluster center " + iStr +
                                       " y not correct.")
                self.assertAlmostEqual(a[2],
                                       b[2],
                                       delta=1,
                                       msg=aStr + "!=" + bStr +
                                       ". Sorted cluster center " + iStr +
                                       " z not correct.")

            print "Trial #", trial, "completed"
Example n. 41
0
    def test_rf_log_fvec(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()

        tryList = [
            (10000, 100, 'cA', 300),
        ]

        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)

            # CREATE test dataset******************************************************
            csvFilename = 'syn_test_' + str(rowCount) + 'x' + str(
                colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)
            testParseResult = h2i.import_parse(path=csvPathname,
                                               hex_key=hex_key,
                                               schema='put',
                                               timeoutSecs=10)
            print "Test Parse result['destination_key']:", testParseResult[
                'destination_key']
            dataKeyTest = testParseResult['destination_key']

            # CREATE train dataset******************************************************
            csvFilename = 'syn_train_' + str(rowCount) + 'x' + str(
                colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)
            trainParseResult = h2i.import_parse(path=csvPathname,
                                                hex_key=hex_key,
                                                schema='put',
                                                timeoutSecs=10)
            print "Train Parse result['destination_key']:", trainParseResult[
                'destination_key']
            dataKeyTrain = trainParseResult['destination_key']

            # RF train******************************************************
            # adjust timeoutSecs with the number of trees
            # seems ec2 can be really slow
            kwargs = paramDict.copy()
            timeoutSecs = 30 + kwargs['ntrees'] * 20
            start = time.time()
            # do oobe
            kwargs['response'] = "C" + str(colCount + 1)

            rfv = h2o_cmd.runRF(parseResult=trainParseResult,
                                timeoutSecs=timeoutSecs,
                                **kwargs)
            elapsed = time.time() - start
            print "RF end on ", csvPathname, 'took', elapsed, 'seconds.', \
                "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)

            rf_model = rfv['drf_model']
            used_trees = rf_model['N']
            data_key = rf_model['_dataKey']
            model_key = rf_model['_key']

            (classification_error, classErrorPctList,
             totalScores) = h2o_rf.simpleCheckRFView(rfv=rfv, ntree=used_trees)
            oobeTrainPctRight = 100.0 - classification_error
            expectTrainPctRight = 94
            self.assertAlmostEqual(oobeTrainPctRight, expectTrainPctRight,\
                msg="OOBE: pct. right for training not close enough %6.2f %6.2f"% (oobeTrainPctRight, expectTrainPctRight), delta=5)

            # RF score******************************************************
            print "Now score with the 2nd random dataset"
            rfv = h2o_cmd.runRFView(data_key=dataKeyTest,
                                    model_key=model_key,
                                    timeoutSecs=timeoutSecs,
                                    retryDelaySecs=1)

            (classification_error, classErrorPctList,
             totalScores) = h2o_rf.simpleCheckRFView(rfv=rfv, ntree=used_trees)
            self.assertAlmostEqual(
                classification_error,
                5.0,
                delta=2.0,
                msg="Classification error %s differs too much" %
                classification_error)
            predict = h2o.nodes[0].generate_predictions(model_key=model_key,
                                                        data_key=dataKeyTest)

            fullScorePctRight = 100.0 - classification_error
            expectScorePctRight = 94
            self.assertAlmostEqual(
                fullScorePctRight,
                expectScorePctRight,
                msg="Full: pct. right for scoring not close enough %6.2f %6.2f"
                % (fullScorePctRight, expectScorePctRight),
                delta=5)
Example n. 42
0
    def test_PCA_many_cols(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        tryList = [
            (10000, 10, 'cA', 300),
            (10000, 50, 'cB', 300),
            (10000, 100, 'cC', 300),
            # (10000, 500, 'cH', 300),
            # (10000, 1000, 'cI', 300),
        ]

        ### h2b.browseTheCloud()
        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            print(rowCount, colCount, hex_key, timeoutSecs)
            SEEDPERFILE = random.randint(0, sys.maxint)
            # csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(
                colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

            # PARSE ****************************************
            h2o.beta_features = False  #turn off beta_features
            start = time.time()

            #h2o.beta_features = False
            modelKey = 'PCAModelKey'
            scoreKey = 'PCAScoreKey'

            # Parse ****************************************
            if h2o.beta_features:
                print "Parsing to fvec directly! Have to noPoll=true!, and doSummary=False!"
            parseResult = h2i.import_parse(bucket=None,
                                           path=csvPathname,
                                           schema='put',
                                           hex_key=hex_key,
                                           timeoutSecs=timeoutSecs,
                                           noPoll=h2o.beta_features,
                                           doSummary=False)
            # hack
            if h2o.beta_features:
                h2j.pollWaitJobs(timeoutSecs=timeoutSecs,
                                 pollTimeoutSecs=timeoutSecs)
                print "Filling in the parseResult['destination_key'] for h2o"
                parseResult['destination_key'] = hex_key

            elapsed = time.time() - start
            print "parse end on ", csvPathname, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseResult['destination_key']

            # Logging to a benchmark file
            algo = "Parse"
            l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format(
                len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename,
                elapsed)
            print l
            h2o.cloudPerfH2O.message(l)

            # if you set beta_features here, the fvec translate will happen with the Inspect not the PCA
            # h2o.beta_features = True
            inspect = h2o_cmd.runInspect(key=parseResult['destination_key'])
            print "\n" + csvPathname, \
                "    num_rows:", "{:,}".format(inspect['num_rows']), \
                "    num_cols:", "{:,}".format(inspect['num_cols'])
            num_rows = inspect['num_rows']
            num_cols = inspect['num_cols']

            # PCA(tolerance iterate)****************************************
            h2o.beta_features = True
            for tolerance in [i / 10.0 for i in range(11)]:
                params = {
                    'destination_key': modelKey,
                    'tolerance': tolerance,
                    'standardize': 1,
                }
                kwargs = params.copy()
                h2o.beta_features = True
                PCAResult = {'python_elapsed': 0, 'python_%timeout': 0}
                start = time.time()
                h2o_cmd.runPCA(parseResult=parseResult,
                               timeoutSecs=timeoutSecs,
                               noPoll=True,
                               **kwargs)
                h2j.pollWaitJobs(timeoutSecs=300,
                                 pollTimeoutSecs=120,
                                 retryDelaySecs=2)
                elapsed = time.time() - start
                PCAResult['python_elapsed'] = elapsed
                PCAResult['python_%timeout'] = 1.0 * elapsed / timeoutSecs
                print "PCA completed in",     PCAResult['python_elapsed'], "seconds.", \
                      "%f pct. of timeout" % (PCAResult['python_%timeout'])

                pcaView = h2o_cmd.runPCAView(modelKey=modelKey)
                h2o_pca.simpleCheckPCA(self, pcaView)
                h2o_pca.resultsCheckPCA(self, pcaView)

                # Logging to a benchmark file
                algo = "PCA " + " tolerance=" + str(tolerance)
                l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format(
                    len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo,
                    csvFilename, PCAResult['python_elapsed'])
                print l
                h2o.cloudPerfH2O.message(l)

                #h2o.beta_features = True
                pcaInspect = pcaView
                # errrs from end of list? is that the last tree?
                sdevs = pcaInspect["pca_model"]["sdev"]
                print "PCA: standard deviations are :", sdevs
                print
                print
                propVars = pcaInspect["pca_model"]["propVar"]
                print "PCA: Proportions of variance by eigenvector are :", propVars
                print
                print
                #h2o.beta_features=False
                print
                print
                print
                num_pc = pcaInspect['pca_model']['num_pc']
                print "The number of standard deviations obtained: ", num_pc
                print
                print
                print

                if DO_PCA_SCORE:
                    # just score with same data
                    score_params = {
                        'destination_key': scoreKey,
                        'model': modelKey,
                        'num_pc': num_pc,
                        'source': hex_key,
                    }
                    kwargs = score_params.copy()
                    pcaScoreResult = h2o.nodes[0].pca_score(
                        timeoutSecs=timeoutSecs, noPoll=True, **kwargs)
                    h2j.pollWaitJobs(timeoutSecs=300,
                                     pollTimeoutSecs=120,
                                     retryDelaySecs=2)
                    print "PCAScore completed in", pcaScoreResult[
                        'python_elapsed'], "seconds. On dataset: ", csvPathname
                    print "Elapsed time was ", pcaScoreResult[
                        'python_%timeout'], "% of the timeout"

                    # Logging to a benchmark file
                    algo = "PCAScore " + " num_pc=" + str(
                        score_params['num_pc'])
                    l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format(
                        len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo,
                        csvFilename, pcaScoreResult['python_elapsed'])
                    print l
                    h2o.cloudPerfH2O.message(l)
Example n. 43
0
    def test_rf_syn_gz_cat(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        REPL = 3
        tryList = [
            # summary fails with 100000 cols
            # (10, 50, 2, 'cA', 600),
            # pass
            # (2, 50, 50, 'cA', 600),
            # (2, 100, 50, 'cA', 600),
            (REPL, 200, 50, 'cA', 600),
            (REPL, 300, 50, 'cA', 600),
            (REPL, 350, 50, 'cA', 600),
            (REPL, 375, 50, 'cB', 600),
            # fail
            (REPL, 500, 300, 'cC', 600),
            (REPL, 500, 400, 'cD', 600),
            (REPL, 500, 500, 'cE', 600),
            (10, 50, 1600, 'cF', 600),
            (10, 50, 3200, 'cG', 600),
            (10, 50, 5000, 'cH', 600),
            # at 6000, it gets connection reset on the parse on ec2
            # (6000, 50, 5000, 'cG', 600),
            # (7000, 50, 5000, 'cH', 600),
        ]

        ### h2b.browseTheCloud()

        paramDict = {
            'ntrees': 10,
            'destination_key': 'model_keyA',
            'max_depth': 10,
            'nbins': 100,
            'sample_rate': 0.80,
        }

        trial = 0
        for (FILEREPL, rowCount, colCount, hex_key, timeoutSecs) in tryList:
            trial += 1

            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(
                rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            parseResult = make_datasetgz_and_parse(SYNDATASETS_DIR,
                                                   csvFilename, hex_key,
                                                   rowCount, colCount,
                                                   FILEREPL, SEEDPERFILE,
                                                   timeoutSecs)

            if DO_RF:
                paramDict['response'] = 'C' + str(colCount)
                paramDict['mtries'] = 2
                paramDict['seed'] = random.randint(0, sys.maxint)
                kwargs = paramDict.copy()

                start = time.time()
                rfView = h2o_cmd.runRF(parseResult=parseResult,
                                       timeoutSecs=timeoutSecs,
                                       **kwargs)
                elapsed = time.time() - start
                print "RF end on ", parseResult['destination_key'], 'took', elapsed, 'seconds.', \
                    "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)

                (classification_error, classErrorPctList,
                 totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView)

                algo = "RF "
                l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs. trees: {:d} Error: {:6.2f} \
                    numRows: {:d} numCols: {:d} byteSize: {:d}'.format(
                    len(h2o.nodes), tryHeap, algo, parseResult['destination_key'], elapsed, kwargs['ntrees'], \
                    classification_error, parseResult['numRows'], parseResult['numCols'], parseResult['byteSize'])
                print l
                h2o.cloudPerfH2O.message(l)

            print "Trial #", trial, "completed"
Example n. 44
0
    def test_summary2_uniform_int_w_NA(self):
        h2o.beta_features = False
        SYNDATASETS_DIR = h2o.make_syn_dir()
        M = 100
        tryList = [
            # colname, (min, 25th, 50th, 75th, max)
            (ROWS, 1, 'B.hex', 1, 1000 * M,
             ('C1', 1.0 * M, 250.0 * M, 500.0 * M, 750.0 * M, 1000.0 * M)),
            (ROWS, 1, 'B.hex', 1, 1000, ('C1', 1.0, 250.0, 500.0, 750.0,
                                         1000.0)),
            (ROWS, 1, 'x.hex', 1, 20000, ('C1', 1.0, 5000.0, 10000.0, 15000.0,
                                          20000.0)),
            (ROWS, 1, 'x.hex', -5000, 0, ('C1', -5000.00, -3750.0, -2500.0,
                                          -1250.0, 0)),
            (ROWS, 1, 'x.hex', -100000, 100000, ('C1', -100000.0, -50000.0, 0,
                                                 50000.0, 100000.0)),

            # (ROWS, 1, 'A.hex', 1, 101,             ('C1',   1.0, 26.00, 51.00, 76.00, 101.0)),
            # (ROWS, 1, 'A.hex', -99, 99,            ('C1',  -99, -49.0, 0, 49.00, 99)),
            (ROWS, 1, 'B.hex', 1, 10000, ('C1', 1.0, 2501.0, 5001.0, 7501.0,
                                          10000.0)),
            (ROWS, 1, 'B.hex', -100, 100, ('C1', -100.0, -50.0, 0.0, 50.0,
                                           100.0)),
            (ROWS, 1, 'C.hex', 1, 100000, ('C1', 1.0, 25001.0, 50001.0,
                                           75001.0, 100000.0)),
            # (ROWS, 1, 'C.hex', -101, 101,          ('C1',  -101, -51, -1, 49.0, 100.0)),
        ]
        if not DO_REAL:
            # only 3 integer values!
            tryList.append(\
                (1000000, 1, 'x.hex', -1, 1,              ('C1',  -1.0, -1, 0.000, 1, 1.00)) \
                )

        timeoutSecs = 10
        trial = 1
        n = h2o.nodes[0]
        lenNodes = len(h2o.nodes)

        x = 0
        timeoutSecs = 60
        for (rowCount, colCount, hex_key, expectedMin, expectedMax,
             expected) in tryList:
            # max error = half the bin size?

            maxDelta = ((expectedMax - expectedMin) / (MAX_QBINS + 0.0))
            # add 5% for fp errors?
            maxDelta = 1.05 * maxDelta
            # also need to add some variance due to random distribution?
            # maybe a percentage of the mean
            distMean = (expectedMax - expectedMin) / 2
            maxShift = distMean * .01
            maxDelta = maxDelta + maxShift
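            # (Added worked example, assuming MAX_QBINS is 1000) for the (1, 1000) row:
            # bin size = (1000-1)/1000 ~= 1.0, plus 5% -> ~1.05, plus a 1% shift of the
            # half-range ((1000-1)/2 * .01 ~= 5.0) gives maxDelta ~= 6.0.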

            SEEDPERFILE = random.randint(0, sys.maxint)
            x += 1

            csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(
                colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, expectedMin,
                              expectedMax, SEEDPERFILE)
            h2o.beta_features = False
            csvPathnameFull = h2i.find_folder_and_filename(None,
                                                           csvPathname,
                                                           returnFullPath=True)
            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='put',
                                           hex_key=hex_key,
                                           timeoutSecs=60,
                                           doSummary=False)
            print "Parse result['destination_key']:", parseResult[
                'destination_key']

            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvFilename

            numRows = inspect["num_rows"]
            numCols = inspect["num_cols"]

            h2o.beta_features = True
            summaryResult = h2o_cmd.runSummary(key=hex_key,
                                               max_qbins=MAX_QBINS)
            h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult))

            # only one column
            column = summaryResult['summaries'][0]
            colname = column['colname']
            self.assertEqual(colname, expected[0])

            coltype = column['type']
            nacnt = column['nacnt']

            stats = column['stats']
            stattype = stats['type']

            # FIX! we should compare mean and sd to expected?
            mean = stats['mean']
            sd = stats['sd']

            print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(
                mean)
            print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals(
                sd)

            zeros = stats['zeros']
            mins = stats['mins']
            maxs = stats['maxs']
            h2o_util.assertApproxEqual(mins[0],
                                       expected[1],
                                       tol=maxDelta,
                                       msg='min is not approx. expected')
            h2o_util.assertApproxEqual(maxs[0],
                                       expected[5],
                                       tol=maxDelta,
                                       msg='max is not approx. expected')

            pct = stats['pct']
            # the thresholds h2o used, should match what we expected
            expectedPct = [
                0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99
            ]
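            # (Added note) pctile is ordered to match expectedPct above, so pctile[3],
            # pctile[5] and pctile[7] are the 0.25, 0.5 and 0.75 quantiles checked below,
            # and pctile[10] (0.99) is what gets passed to the scipy comparison when
            # DO_MEDIAN is off.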

            pctile = stats['pctile']
            h2o_util.assertApproxEqual(
                pctile[3],
                expected[2],
                tol=maxDelta,
                msg='25th percentile is not approx. expected')
            h2o_util.assertApproxEqual(
                pctile[5],
                expected[3],
                tol=maxDelta,
                msg='50th percentile (median) is not approx. expected')
            h2o_util.assertApproxEqual(
                pctile[7],
                expected[4],
                tol=maxDelta,
                msg='75th percentile is not approx. expected')

            hstart = column['hstart']
            hstep = column['hstep']
            hbrk = column['hbrk']
            hcnt = column['hcnt']

            print "pct:", pct
            print "hcnt:", hcnt
            print "len(hcnt)", len(hcnt)

            # don't check the last bin
            for b in hcnt[1:-1]:
                # should we be able to check for a uniform distribution in the files?
                # expect 21 thresholds, so 20 bins. each 5% of rows (uniform distribution)
                e = numRows / len(hcnt)
                # don't check the edge bins
                self.assertAlmostEqual(b,
                                       rowCount / len(hcnt),
                                       delta=.01 * rowCount,
                                       msg="Bins not right. b: %s e: %s" %
                                       (b, e))
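            # (Added note) with a uniform distribution over len(hcnt) bins, each interior
            # bin should hold roughly rowCount/len(hcnt) rows; the delta of 1% of rowCount
            # absorbs sampling noise, and the first/last bins are excluded as edge bins.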

            pt = h2o_util.twoDecimals(pctile)
            mx = h2o_util.twoDecimals(maxs)
            mn = h2o_util.twoDecimals(mins)
            print "colname:", colname, "pctile (2 places):", pt
            print "colname:", colname, "maxs: (2 places):", mx
            print "colname:", colname, "mins: (2 places):", mn

            # FIX! we should do an exec and compare using the exec quantile too
            compareActual = mn[0], pt[3], pt[5], pt[7], mx[0]
            h2p.green_print("min/25/50/75/max colname:", colname,
                            "(2 places):", compareActual)
            print "maxs colname:", colname, "(2 places):", mx
            print "mins colname:", colname, "(2 places):", mn

            trial += 1

            scipyCol = 0
            # don't check if colname is empty..means it's a string and scipy doesn't parse right?
            if colname != '':
                # don't do for enums
                # also get the median with a sort (h2o_summ.percentileOnSortedlist()
                h2o_summ.quantile_comparisons(
                    csvPathnameFull,
                    col=0,  # what col to extract from the csv
                    datatype='float',
                    quantile=0.5 if DO_MEDIAN else 0.999,
                    h2oSummary2=pctile[5 if DO_MEDIAN else 10],
                    # h2oQuantilesApprox=qresult_single,
                    # h2oQuantilesExact=qresult,
                )

            h2o.nodes[0].remove_all_keys()
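
# A minimal sketch (an assumption, not h2o_summ's actual code) of the kind of
# "percentile on a sorted list" computation that h2o_summ.quantile_comparisons /
# percentileOnSortedlist() is used for above: linear interpolation between the
# two closest ranks of an already-sorted column.
def percentile_on_sorted_list_sketch(sortedData, quantile):
    # quantile is a fraction in [0, 1], e.g. 0.5 for the median
    n = len(sortedData)
    if n == 1:
        return sortedData[0]
    pos = quantile * (n - 1)
    lo = int(pos)
    hi = min(lo + 1, n - 1)
    frac = pos - lo
    return sortedData[lo] * (1 - frac) + sortedData[hi] * frac

# usage sketch: pull column 0 out of the csv as floats, sort it, then compare
# percentile_on_sorted_list_sketch(colData, 0.5) against h2o's pctile[5]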
Esempio n. 45
0
    def test_parse_rand_enum_compress(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        if DEBUG:
            n = 20
        else:
            n = 1000000

        # from command line arg -long
        if h2o.long_test_case:
            repeat = 1000 
            scale = 10 # scale up the # of rows
            tryList = [
                (n*scale, 1, 'cI', 300), 
                (n*scale, 1, 'cI', 300), 
                (n*scale, 1, 'cI', 300), 
            ]
        else:
            repeat = 1
            scale = 1
            tryList = [
                (n, 3, 'cI', 300), 
                (n, 3, 'cI', 300), 
                (n, 3, 'cI', 300), 
            ]

        lastcolsHistory = []

        enumList = create_enum_list(listSize=ENUMS_NUM)

        for r in range(repeat):
            SEED_PER_FILE = random.randint(0, sys.maxint)
            for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
                # using the comma is nice to ensure no craziness
                colSepHexString = '2c' # comma
                colSepChar = colSepHexString.decode('hex')
                colSepInt = int(colSepHexString, base=16)
                print "colSepChar:", colSepChar

                rowSepHexString = '0a' # newline
                rowSepChar = rowSepHexString.decode('hex')
                print "rowSepChar:", rowSepChar

                csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str(colCount) + '.csv'
                csvPathname = SYNDATASETS_DIR + '/' + csvFilename

                print "Creating random", csvPathname
                # same enum list/mapping, but different dataset?
                start = time.time()
                lastcols = write_syn_dataset(csvPathname, enumList, rowCount, colCount, scale=1,
                    colSepChar=colSepChar, rowSepChar=rowSepChar, SEED=SEED_PER_FILE)
                elapsed = time.time() - start
                print "took %s seconds to create %s" % (elapsed, csvPathname)
                # why are we saving this?
                lastcolsHistory.append(lastcols)

                parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, header=0,
                    timeoutSecs=30, separator=colSepInt, doSummary=DO_SUMMARY)
                print "Parse result['destination_key']:", parseResult['destination_key']
                
                inspect = h2o_cmd.runInspect(key=parseResult['destination_key'])
                numCols = inspect['numCols']
                numRows = inspect['numRows']

                h2o_cmd.infoFromInspect(inspect)

                # Each column should get .10 (i.e. ~10%) random NAs per iteration. Within 10%?
                missingValuesList = h2o_cmd.infoFromInspect(inspect)
                # print "missingValuesList", missingValuesList
                # for mv in missingValuesList:
                #     self.assertAlmostEqual(mv, expectedNA, delta=0.1 * mv, 
                #        msg='mv %s is not approx. expected %s' % (mv, expectedNA))

                self.assertEqual(rowCount, numRows)
                self.assertEqual(colCount, numCols)

                (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \
                    h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], 
                    exceptionOnMissingValues=DISABLE_ALL_NA)
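
# A small standalone sketch of the separator handling used in the test above:
# the two-char hex string ('2c' is comma, '0a' is newline) becomes both the
# literal character written into the csv and the integer passed to parse as
# separator=. str.decode('hex') is Python 2 only; chr(int(s, 16)) is an
# equivalent spelling (assumption: parse wants the integer byte value, as the
# test passes colSepInt).
colSepHexString = '2c'
colSepChar = chr(int(colSepHexString, 16))    # ','
colSepInt = int(colSepHexString, base=16)     # 44
rowSepChar = chr(int('0a', 16))               # '\n'
print "colSepChar:", repr(colSepChar), "colSepInt:", colSepInt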
Esempio n. 46
0
    def test_parse_multi_header_rand_fvec(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        csvFilename = "syn_ints.csv"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        allowedLetters = 'abcdeABCDE01234[]'
        headerChoices = []
        for n in range(500):  # max # of cols below is 500
            done = False
            while not done:
                l = random.randint(1, 64)  # random length headers
                headerName = ''.join(
                    [random.choice(allowedLetters) for _ in range(l)])
                # we keep trying if we already have that header name. Has to be unique.
                done = headerName not in headerChoices
            headerChoices.append(headerName)

        tryList = [
            (3, 5, 9, 'cA', 60, 0),
            # (3, 5, 25, 'cA', 60, 0),
            # (10, 100, 500, 'cA', 60, 0),
        ]

        for trial in range(20):
            (fileNum, rowCount, colCount, hex_key, timeoutSecs,
             dataRowsWithHeader) = random.choice(tryList)
            print fileNum, rowCount, colCount, hex_key, timeoutSecs, dataRowsWithHeader
            # FIX! should we add a header to them randomly???
            print "Wait while", fileNum, "synthetic files are created in", SYNDATASETS_DIR
            rowxcol = str(rowCount) + 'x' + str(colCount)
            totalCols = colCount + 1  # 1 extra for output
            totalDataRows = 0
            totalHeaderRows = 0
            # random selection of parse param choices

            # HEADER_HAS_HDR_ROW = random.randint(0,1)
            HEADER_HAS_HDR_ROW = 1

            DATA_HAS_HDR_ROW = random.randint(0, 1)
            PARSE_PATTERN_INCLUDES_HEADER = random.randint(0, 1)
            # DATA_FIRST_IS_COMMENT = random.randint(0,1)
            # HEADER_FIRST_IS_COMMENT = random.randint(0,1)
            # FIX! doesn't seem to like just comment in the header file
            DATA_FIRST_IS_COMMENT = 0
            HEADER_FIRST_IS_COMMENT = 0

            GZIP_DATA = random.randint(0, 1)
            GZIP_HEADER = random.randint(0, 1)
            SEP_CHAR_GEN = random.choice(paramsDict['separator'])

            HEADER_SEP_CHAR_GEN = random.choice(paramsDict['hdr_separator'])
            if HEADER_SEP_CHAR_GEN == 'same':
                HEADER_SEP_CHAR_GEN = SEP_CHAR_GEN

            # don't put a header in a data file with a different separator?
            # Hack: if both data and header files have a header, then, just in case,
            # the header and data files should have the same separator;
            # if they don't, make the header match the data
            if DATA_HAS_HDR_ROW and HEADER_HAS_HDR_ROW:
                HEADER_SEP_CHAR_GEN = SEP_CHAR_GEN

            # New for fvec? if separators are not the same, then the header separator needs to be comma
            if HEADER_SEP_CHAR_GEN != SEP_CHAR_GEN:
                HEADER_SEP_CHAR_GEN = ','

            # screw it. make them always match
            HEADER_SEP_CHAR_GEN = SEP_CHAR_GEN

            if HEADER_SEP_CHAR_GEN in (',', ' '):
                pass
                # extra spaces? Don't add any
                # if random.randint(0,1):
                #    HEADER_SEP_CHAR_GEN = " " + HEADER_SEP_CHAR_GEN
                # if random.randint(0,1):
                #    HEADER_SEP_CHAR_GEN = HEADER_SEP_CHAR_GEN + " "

            kwargs = {}
            for k, v in paramsDict.items():
                kwargs[k] = random.choice(v)

            kwargs['separator'] = SEP_CHAR_GEN
            # parse doesn't auto-detect tab. will autodetect space and comma
            if SEP_CHAR_GEN == " " or SEP_CHAR_GEN == ",":
                del kwargs['separator']
            else:
                kwargs['separator'] = ord(SEP_CHAR_GEN)

            # randomly add leading and trailing white space
            # we have to do this after we save the single char HEADER_SEP_CHAR_GEN
            if SEP_CHAR_GEN in (',', ' '):
                if random.randint(0, 1):
                    SEP_CHAR_GEN = " " + SEP_CHAR_GEN
                if random.randint(0, 1):
                    SEP_CHAR_GEN = SEP_CHAR_GEN + " "

            print '\nHEADER_HAS_HDR_ROW:', HEADER_HAS_HDR_ROW
            print 'DATA_HAS_HDR_ROW:', DATA_HAS_HDR_ROW
            print 'PARSE_PATTERN_INCLUDES_HEADER', PARSE_PATTERN_INCLUDES_HEADER
            print 'DATA_FIRST_IS_COMMENT:', DATA_FIRST_IS_COMMENT
            print 'HEADER_FIRST_IS_COMMENT:', HEADER_FIRST_IS_COMMENT
            print 'SEP_CHAR_GEN:', "->" + SEP_CHAR_GEN + "<-"
            print 'HEADER_SEP_CHAR_GEN:', "->" + HEADER_SEP_CHAR_GEN + "<-"
            print 'GZIP_DATA:', GZIP_DATA
            print 'GZIP_HEADER:', GZIP_HEADER

            # they need to both use the same separator (h2o rule)
            # can't have duplicates
            hfhList = random.sample(headerChoices, colCount) + ["output"]
            # UPDATE: always use comma or space for header separator?? it should work no matter what
            # separator the data uses?

            headerForHeader = HEADER_SEP_CHAR_GEN.join(hfhList)
            print "headerForHeader:", headerForHeader

            # make these different
            # hfdList = [random.choice(headerChoices) for h in range(colCount)] + ["output"]
            # FIX! keep them the same for now to avoid some odd cases on what header gets used to RF
            hfdList = hfhList

            headerForData = SEP_CHAR_GEN.join(hfdList)

            # create data files
            for fileN in range(fileNum):
                csvFilenameSuffix = str(fileN) + "_" + str(SEED) + "_" + str(
                    trial) + "_" + rowxcol + '_csv'
                csvFilename = 'syn_data_' + csvFilenameSuffix
                csvPathname = SYNDATASETS_DIR + '/' + csvFilename
                rList = rand_rowData(colCount, sepChar=SEP_CHAR_GEN)
                (headerRowsDone, dataRowsDone) = write_syn_dataset(
                    csvPathname,
                    rowCount,
                    headerString=(headerForData if DATA_HAS_HDR_ROW else None),
                    rList=rList,
                    commentFirst=DATA_FIRST_IS_COMMENT,
                    sepChar=SEP_CHAR_GEN)
                totalDataRows += dataRowsDone
                totalHeaderRows += headerRowsDone
                if GZIP_DATA:
                    csvPathnamegz = csvPathname + ".gz"
                    print "gzipping to", csvPathnamegz
                    h2o_util.file_gzip(csvPathname, csvPathnamegz)
                    os.rename(
                        csvPathname, SYNDATASETS_DIR + "/not_used_data_" +
                        csvFilenameSuffix)
                    # pattern match should find the right key with csvPathname

            # create the header file
            hdrFilenameSuffix = str(SEED) + "_" + str(
                trial) + "_" + rowxcol + '_csv'
            hdrFilename = 'syn_header_' + hdrFilenameSuffix
            hdrPathname = SYNDATASETS_DIR + '/' + hdrFilename
            # dataRowsWithHeader = 0 # temp hack
            (headerRowsDone, dataRowsDone) = write_syn_dataset(
                hdrPathname,
                dataRowsWithHeader,
                headerString=(headerForHeader if HEADER_HAS_HDR_ROW else None),
                rList=rList,
                commentFirst=HEADER_FIRST_IS_COMMENT,
                sepChar=SEP_CHAR_GEN)
            # only include header file data rows if the parse pattern includes it
            if PARSE_PATTERN_INCLUDES_HEADER:
                totalDataRows += dataRowsDone
            totalHeaderRows += headerRowsDone
            if GZIP_HEADER:
                hdrPathnamegz = hdrPathname + ".gz"
                print "gzipping to", hdrPathnamegz
                h2o_util.file_gzip(hdrPathname, hdrPathnamegz)
                os.rename(
                    hdrPathname,
                    SYNDATASETS_DIR + "/not_used_header_" + hdrFilenameSuffix)
                # pattern match should find the right key with hdrPathname

            # make sure all key names are unique, when we re-put and re-parse (h2o caching issues)
            hex_key = "syn_dst" + str(trial) + ".hex"

            # DON"T get redirected to S3! (EC2 hack in config, remember!)
            # use it at the node level directly (because we gen'ed the files.
            # I suppose we could force the redirect state bits in h2o.nodes[0] to False, instead?:w

            # put them, rather than using import files, so this works if remote h2o is used
            # and python creates the files locally
            fileList = os.listdir(SYNDATASETS_DIR)
            for f in fileList:
                h2i.import_only(path=SYNDATASETS_DIR + "/" + f,
                                schema='put',
                                noPrint=True)

            h2o_cmd.runStoreView()
            headerKey = h2i.find_key(hdrFilename)
            dataKey = h2i.find_key(csvFilename)

            # use regex. the only files in the dir will be the ones we just created
            # with  *fileN* match
            print "Header Key =", headerKey

            # put the right name in
            if kwargs['header_from_file'] == 'header':
                # do we need to add the .hex suffix we know h2o will append
                kwargs['header_from_file'] = headerKey
            # use one of the data files?
            elif kwargs['header_from_file'] == 'data':
                # do we need to add the .hex suffix we know h2o will append
                kwargs['header_from_file'] = dataKey

            # if there's no header in the header file, turn off the header_from_file
            if not HEADER_HAS_HDR_ROW:
                kwargs['header_from_file'] = None

            if HEADER_HAS_HDR_ROW and (kwargs['header_from_file']
                                       == headerKey):
                ignoreForRf = hfhList[0]
            elif DATA_HAS_HDR_ROW:
                ignoreForRf = hfdList[0]
            else:
                ignoreForRf = None

            print "If header_from_file= , required to force header=1 for h2o"
            if kwargs['header_from_file']:
                kwargs['header'] = 1
            # if we have a header in a data file, tell h2o (for now)
            elif DATA_HAS_HDR_ROW:
                kwargs['header'] = 1
            else:
                kwargs['header'] = 0

            # may have error if h2o doesn't get anything!
            start = time.time()
            if PARSE_PATTERN_INCLUDES_HEADER and HEADER_HAS_HDR_ROW:
                pattern = 'syn_*' + str(trial) + "_" + rowxcol + '*'
            else:
                pattern = 'syn_data_*' + str(trial) + "_" + rowxcol + '*'

            # don't pass to parse
            kwargs.pop('hdr_separator', None)
            parseResult = h2i.parse_only(pattern=pattern,
                                         hex_key=hex_key,
                                         timeoutSecs=timeoutSecs,
                                         **kwargs)
            print "parseResult['destination_key']: " + parseResult[
                'destination_key']

            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            h2o_cmd.infoFromInspect(inspect, csvPathname)
            print "\n" + csvPathname, \
                "    numRows:", "{:,}".format(inspect['numRows']), \
                "    numCols:", "{:,}".format(inspect['numCols'])

            # more reporting: (we can error here if extra col in header,
            # causes all NA for missing col of data)
            h2o_cmd.columnInfoFromInspect(parseResult['destination_key'],
                                          exceptionOnMissingValues=False)

            # should match # of cols in header or ??
            self.assertEqual(inspect['numCols'], totalCols, \
                "parse created result with the wrong number of cols %s %s" % (inspect['numCols'], totalCols))

            # do we end up parsing one data row as a header because of a mismatch in gen/param?
            h2oLosesOneData = (headerRowsDone
                               == 0) and (kwargs['header']
                                          == 1) and not DATA_HAS_HDR_ROW
            # header in data file gets treated as data
            h2oGainsOneData = (headerRowsDone!=0) and (kwargs['header']==1) and \
                DATA_HAS_HDR_ROW and (kwargs['header_from_file'] is not None)
            # FIX! force this off for now (overrides the calculation above)
            h2oGainsOneData = False
            print "h2oLosesOneData:", h2oLosesOneData
            print "h2oGainsOneData:", h2oGainsOneData
            if h2oLosesOneData:
                totalDataRows -= 1
            if h2oGainsOneData:
                totalDataRows += 1

            if 1 == 0:  # FIX! don't check for now
                self.assertEqual(inspect['numRows'], totalDataRows,
                    "parse created result with the wrong number of rows h2o %s gen'ed: %s" % \
                    (inspect['numRows'], totalDataRows))

            # put in an ignore param, that will fail unless headers were parsed correctly
            # doesn't matter if the header got a comment, should see it

            kwargs = {
                'sample': 100,
                'depth': 25,
                'ntree': 2,
                'ignore': ignoreForRf
            }
            start = time.time()
            # h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=10, **kwargs)
            elapsed = time.time() - start
            print "%d pct. of timeout" % ((elapsed / timeoutSecs) * 100)
            print "trial #", trial, "totalDataRows:", totalDataRows, "parse end on ", csvFilename, \
                'took', time.time() - start, 'seconds'

            h2o.check_sandbox_for_errors()
            h2i.delete_keys_at_all_nodes(pattern='syn_datasets')
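
# A sketch of the gzip step used above before the put/import: h2o_util.file_gzip
# is assumed to do roughly this (this is not its source), compressing the
# synthetic csv into a .gz next to it.
import gzip
import shutil

def file_gzip_sketch(srcPath, dstPathGz):
    # stream the uncompressed file into a .gz file, block by block
    with open(srcPath, 'rb') as fIn, gzip.open(dstPathGz, 'wb') as fOut:
        shutil.copyfileobj(fIn, fOut)

# usage sketch:
# file_gzip_sketch(csvPathname, csvPathname + ".gz")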
Esempio n. 47
0
    def test_GLM2_many_enums(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()

        n = 200
        tryList = [
            (n, 1, 'cD', 300),
            (n, 2, 'cE', 300),
            (n, 3, 'cF', 300),
            (n, 4, 'cG', 300),
            (n, 5, 'cH', 300),
            (n, 6, 'cI', 300),
        ]

        ### h2b.browseTheCloud()
        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            # just randomly pick the row and col cases.
            colSepCase = random.randint(0, 1)
            colSepCase = 1
            # using the comma is nice to ensure no craziness
            if (colSepCase == 0):
                colSepHexString = '01'
                quoteChars = ",\'\""  # more choices for the unquoted string
            else:
                colSepHexString = '2c'  # comma
                quoteChars = ""

            colSepChar = colSepHexString.decode('hex')
            colSepInt = int(colSepHexString, base=16)
            print "colSepChar:", colSepChar
            print "colSepInt", colSepInt

            rowSepCase = random.randint(0, 1)
            # using this instead, makes the file, 'row-readable' in an editor
            if (rowSepCase == 0):
                rowSepHexString = '0a'  # newline
            else:
                rowSepHexString = '0d0a'  # cr + newline (windows) \r\n

            rowSepChar = rowSepHexString.decode('hex')
            print "rowSepChar:", rowSepChar

            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str(
                colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname,
                              rowCount,
                              colCount,
                              SEEDPERFILE,
                              colSepChar=colSepChar,
                              rowSepChar=rowSepChar,
                              quoteChars=quoteChars)

            # FIX! does 'separator=' take ints or ?? hex format
            # looks like it takes the hex string (two chars)
            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='put',
                                           hex_key=hex_key,
                                           timeoutSecs=30,
                                           separator=colSepInt)
            print "Parse result['destination_key']:", parseResult[
                'destination_key']

            # We should be able to see the parse result?
            ### inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvFilename
            (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \
                h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=True)

            y = colCount
            kwargs = {
                'response': y,
                'max_iter': 1,
                'n_folds': 1,
                'alpha': 0.2,
                'lambda': 1e-5
            }
            start = time.time()
            ### glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=180, **kwargs)
            print "glm end on ", csvPathname, 'took', time.time(
            ) - start, 'seconds'
Esempio n. 48
0
    def test_parse_utf8_3(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        if DEBUG:
            n = 20
        else:
            n = 10000
            n = 1000
            n = 500

        # from command line arg -long
        if h2o.long_test_case:
            repeat = 1000
        else:
            repeat = 50

        scale = 1
        tryList = [
            (n, 3, 'cI', 300),
            (n, 3, 'cI', 300),
            (n, 3, 'cI', 300),
        ]

        for r in range(repeat):
            for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
                SEEDPERFILE = random.randint(0, sys.maxint)
                # using the comma is nice to ensure no craziness
                colSepHexString = '2c'  # comma
                colSepChar = colSepHexString.decode('hex')
                colSepInt = int(colSepHexString, base=16)
                print "colSepChar:", colSepChar

                rowSepHexString = '0a'  # newline
                rowSepChar = rowSepHexString.decode('hex')
                print "rowSepChar:", rowSepChar

                csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str(
                    colCount) + '.csv'
                csvPathname = SYNDATASETS_DIR + '/' + csvFilename

                print "Creating random", csvPathname
                # same enum list/mapping, but different dataset?
                start = time.time()
                write_syn_dataset(csvPathname,
                                  rowCount,
                                  colCount,
                                  scale=1,
                                  colSepChar=colSepChar,
                                  rowSepChar=rowSepChar,
                                  SEED=SEEDPERFILE)
                elapsed = time.time() - start
                print "took %s seconds to create %s" % (elapsed, csvPathname)

                parseResult = h2i.import_parse(path=csvPathname,
                                               schema='put',
                                               hex_key=hex_key,
                                               header=0,
                                               timeoutSecs=60,
                                               separator=colSepInt,
                                               doSummary=DO_SUMMARY)
                print "Parse result['destination_key']:", parseResult[
                    'destination_key']

                inspect = h2o_cmd.runInspect(
                    key=parseResult['destination_key'])
                numCols = inspect['numCols']
                numRows = inspect['numRows']

                h2o_cmd.infoFromInspect(inspect)

                # Each column should get .10 (i.e. ~10%) random NAs per iteration. Within 10%?
                missingValuesList = h2o_cmd.infoFromInspect(inspect)
                # print "missingValuesList", missingValuesList
                # for mv in missingValuesList:
                #     self.assertAlmostEqual(mv, expectedNA, delta=0.1 * mv,
                #        msg='mv %s is not approx. expected %s' % (mv, expectedNA))

                # might have extra rows
                if numRows != rowCount:
                    raise Exception("Expect numRows %s = rowCount %s because guaranteed not to have extra eols" % \
                        (numRows, rowCount))
                # numCols should be right?
                self.assertEqual(colCount, numCols)

                (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \
                    h2o_cmd.columnInfoFromInspect(parseResult['destination_key'],
                    exceptionOnMissingValues=False)
Esempio n. 49
0
    def test_GBM_many_cols(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()

        if localhost:
            tryList = [
                (10000, 100, 'cA', 300),
            ]
        else:
            tryList = [
                # (10000, 10, 'cB', 300),
                # (10000, 50, 'cC', 300),
                (10000, 100, 'cD', 300),
                (10000, 200, 'cE', 300),
                (10000, 300, 'cF', 300),
                (10000, 400, 'cG', 300),
                (10000, 500, 'cH', 300),
                (10000, 1000, 'cI', 300),
            ]

        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            # csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(
                colCount) + '.csv'
            hdrFilename = 'hdr_' + "binary" + "_" + str(rowCount) + 'x' + str(
                colCount) + '.csv'

            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            hdrPathname = SYNDATASETS_DIR + '/' + hdrFilename
            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

            # PARSE train****************************************
            start = time.time()
            xList = []
            eList = []
            fList = []

            modelKey = 'GBMModelKey'

            # Parse (train)****************************************
            parseTrainResult = h2i.import_parse(bucket=None,
                                                path=csvPathname,
                                                schema='put',
                                                hex_key=hex_key,
                                                timeoutSecs=timeoutSecs,
                                                doSummary=False)
            # hack

            elapsed = time.time() - start
            print "train parse end on ", csvPathname, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "train parse result:", parseTrainResult['destination_key']

            # Logging to a benchmark file
            algo = "Parse"
            l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format(
                len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename,
                elapsed)
            print l
            h2o.cloudPerfH2O.message(l)

            inspect = h2o_cmd.runInspect(
                key=parseTrainResult['destination_key'])
            print "\n" + csvPathname, \
                "    numRows:", "{:,}".format(inspect['numRows']), \
                "    numCols:", "{:,}".format(inspect['numCols'])
            numRows = inspect['numRows']
            numCols = inspect['numCols']
            ### h2o_cmd.runSummary(key=parseTrainResult['destination_key'])

            # GBM(train iterate)****************************************
            ntrees = 5
            prefixList = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H']
            # for max_depth in [5,10,20,40]:
            for max_depth in [5, 10, 20]:

                # PARSE a new header****************************************
                print "Creating new header", hdrPathname
                prefix = prefixList.pop(0)
                write_syn_header(hdrPathname, rowCount, colCount, prefix)

                # upload and parse the header to a hex

                hdr_hex_key = prefix + "_hdr.hex"
                parseHdrResult = h2i.import_parse(
                    bucket=None,
                    path=hdrPathname,
                    schema='put',
                    header=1,  # REQUIRED! otherwise will interpret as enums
                    hex_key=hdr_hex_key,
                    timeoutSecs=timeoutSecs,
                    doSummary=False)
                # Set Column Names (before autoframe is created)
                h2o.nodes[0].set_column_names(source=hex_key,
                                              copy_from=hdr_hex_key)

                # GBM
                print "response col name is changing each iteration: parsing a new header"
                params = {
                    'learn_rate': .2,
                    'nbins': 1024,
                    'ntrees': ntrees,
                    'max_depth': max_depth,
                    'min_rows': 10,
                    'response': prefix + "_response",
                    'ignored_cols_by_name': None,
                }

                print "Using these parameters for GBM: ", params
                kwargs = params.copy()

                trainStart = time.time()
                gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult,
                                                timeoutSecs=timeoutSecs,
                                                destination_key=modelKey,
                                                **kwargs)
                trainElapsed = time.time() - trainStart
                print "GBM training completed in", trainElapsed, "seconds. On dataset: ", csvPathname

                # Logging to a benchmark file
                algo = "GBM " + " ntrees=" + str(ntrees) + " max_depth=" + str(
                    max_depth)
                l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format(
                    len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo,
                    csvFilename, trainElapsed)
                print l
                h2o.cloudPerfH2O.message(l)

                gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey)
                # errs from end of list? is that the last tree?
                errsLast = gbmTrainView['gbm_model']['errs'][-1]
                print "GBM 'errsLast'", errsLast

                cm = gbmTrainView['gbm_model']['cms'][-1][
                    '_arr']  # use the last one
                pctWrongTrain = h2o_gbm.pp_cm_summary(cm)
                print "\nTrain\n==========\n"
                print h2o_gbm.pp_cm(cm)

                # xList.append(ntrees)
                xList.append(max_depth)
                eList.append(pctWrongTrain)
                fList.append(trainElapsed)

                # works if you delete the autoframe
                ### h2o_import.delete_keys_at_all_nodes(pattern='autoframe')

        # just plot the last one
        if DO_PLOT:
            xLabel = 'max_depth'
            eLabel = 'pctWrong'
            fLabel = 'trainElapsed'
            eListTitle = ""
            fListTitle = ""
            h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel,
                              fListTitle, fList, fLabel)
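
# A small sketch of what the xList/eList/fList collected above hold (max_depth,
# classification error, training time) beyond feeding h2o_gbm.plotLists: a quick
# text summary per max_depth. Purely illustrative; the names are the lists built
# in the loop above.
def print_gbm_summary_sketch(xList, eList, fList):
    print "max_depth  pctWrong  trainElapsed(s)"
    for depth, pctWrong, elapsed in zip(xList, eList, fList):
        print "%9s  %8s  %15.2f" % (depth, pctWrong, elapsed)

# usage sketch:
# print_gbm_summary_sketch(xList, eList, fList)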
Esempio n. 50
0
    def test_GLM2_twovalues(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        csvFilename = "syn_twovalues.csv"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        # H2O might not do whitespace stripping on numbers correctly, when , is {SEP}
        # GLM will auto expand categoricals..so if we have more coefficients than expected
        # that means it didn't parse right
        # mix in space/tab combos
        # just done like this for readability
        rowDataTrueRaw = \
            "<sp>1,\
            0<sp>,\
            <tab>65,\
            1<tab>,\
            <sp><tab>2,\
            1<sp><tab>,\
            <tab><sp>1,\
            4<tab><sp>,\
            <tab><tab>1,\
            4<tab><tab>,\
            <sp><sp>1,\
            4<sp><sp>"

        rowDataTrue = re.sub("<sp>"," ", rowDataTrueRaw)
        rowDataTrue = re.sub("<tab>","  ", rowDataTrue)

        rowDataFalse = \
            "0,\
            1,\
            0,\
            -1,\
            -2,\
            -1,\
            -1,\
            -4,\
            -1,\
            -4,\
            -1,\
            -3"

        twoValueList = [
            # (0,1,0, 12),
            # (0,1,1, 12),
            # ('A','B',0, 12),
            # ('A','B',1, 12),
            (-1,1,-1, 12),
            (-1,1,1, 12),
            (-1e1,1e1,1e1, 12),
            (-1e1,1e1,-1e1, 12),
            ]

        trial = 0
        for (outputTrue, outputFalse, case, expectedCoeffNum) in twoValueList:
            write_syn_dataset(csvPathname, 20, 
                rowDataTrue, rowDataFalse, str(outputTrue), str(outputFalse))

            hex_key = csvFilename + "_" + str(trial)
            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key)

            # maybe go back to simpler exec here. this was from when Exec failed unless this was used
            execExpr="A.hex=%s" % hex_key
            h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)
            execExpr="A.hex[,%s]=(A.hex[,%s]==%s)" % (13, 13, case)
            h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)
            aHack = {'destination_key': 'A.hex'}

            start = time.time()
            kwargs = {
                'n_folds': 0,
                'response': 'C13', 
                'family': 'binomial', 
                'alpha': 0.0, 
                'lambda': 0, 
                'beta_epsilon': 0.0002
            }

            # default takes 39 iterations? play with alpha/beta
            print "using outputTrue: %s outputFalse: %s" % (outputTrue, outputFalse)
            glm = h2o_cmd.runGLM(parseResult=aHack, **kwargs)
            (warnings, coefficients, intercept) = h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)

            # check that the number of entries in coefficients is right (12 with intercept)

            coefficients_names = glm['glm_model']['coefficients_names']
            print "coefficients_names:", coefficients_names

            # subtract one for intercept
            actualCoeffNum = len(glm['glm_model']['submodels'][0]['beta']) - 1
            if (actualCoeffNum!=expectedCoeffNum):
                raise Exception("Should be %s expected coefficients in result. actual: %s" % (expectedCoeffNum, actualCoeffNum))

            print "trial #", trial, "glm end on ", csvFilename, 'took', time.time() - start, 'seconds'
            h2o.check_sandbox_for_errors()
            trial += 1
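
# A plain-Python sketch of what the exec expression above is assumed to do:
# A.hex[,13] = (A.hex[,13] == case) overwrites column 13 (1-based in the exec
# syntax) with a 0/1 indicator of equality with `case`, so GLM sees a binomial
# response column. The list-of-rows version below is only illustrative.
def recode_response_sketch(rows, colIdx, case):
    # rows: list of row lists; colIdx: 0-based column index to recode in place
    for row in rows:
        row[colIdx] = 1 if row[colIdx] == case else 0
    return rows

# usage sketch:
# recode_response_sketch([[65, -1], [65, 1]], 1, 1)  # -> [[65, 0], [65, 1]]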
Esempio n. 51
0
    def test_NN_twovalues(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        csvFilename = "syn_twovalues.csv"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        rowDataTrue = "1, 0, 65, 1, 2, 1, 1, 4, 1, 4, 1, 4"
        rowDataFalse = "0, 1, 0, -1, -2, -1, -1, -4, -1, -4, -1, -4"

        twoValueList = [
            ('A', 'B', 0, 14),
            ('A', 'B', 1, 14),
            (0, 1, 0, 12),
            (0, 1, 1, 12),
            (0, 1, 'NaN', 12),
            (1, 0, 'NaN', 12),
            (-1, 1, 0, 12),
            (-1, 1, 1, 12),
            (-1e1, 1e1, 1e1, 12),
            (-1e1, 1e1, -1e1, 12),
        ]

        trial = 0
        for (outputTrue, outputFalse, case, coeffNum) in twoValueList:
            write_syn_dataset(csvPathname, 20, rowDataTrue, rowDataFalse,
                              str(outputTrue), str(outputFalse))

            start = time.time()
            hex_key = csvFilename + "_" + str(trial)
            model_key = 'trial_' + str(trial) + '.hex'

            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='put',
                                           hex_key=hex_key)
            print "using outputTrue: %s outputFalse: %s" % (outputTrue,
                                                            outputFalse)

            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvPathname, \
                "    numRows:", "{:,}".format(inspect['numRows']), \
                "    numCols:", "{:,}".format(inspect['numCols'])
            response = inspect['numCols'] - 1

            kwargs = {
                'ignored_cols': None,
                'response': 'C' + str(response),
                'classification': 1,
                'mode': 'SingleThread',
                'activation': 'Tanh',
                #'input_dropout_ratio'          : 0.2,
                'hidden': '500',
                'rate': 0.01,
                'rate_annealing': 1e-6,
                'momentum_start': 0,
                'momentum_ramp': 0,
                'momentum_stable': 0,
                'l1': 0.0,
                'l2': 1e-4,
                'seed': 80023842348,
                'loss': 'CrossEntropy',
                #'max_w2'                       : 15,
                #'warmup_samples'               : 0,
                'initial_weight_distribution': 'UniformAdaptive',
                #'initial_weight_scale'         : 0.01,
                'epochs': 1.0,
                'destination_key': model_key,
                'validation': hex_key,
            }

            timeoutSecs = 60
            start = time.time()
            h2o.beta_features = True
            h2o_cmd.runNNet(parseResult=parseResult,
                            timeoutSecs=timeoutSecs,
                            **kwargs)
            print "trial #", trial, "NN end on ", csvFilename, ' took', time.time(
            ) - start, 'seconds'

            #### Now score using the model, and check the validation error
            expectedErr = 0.0
            relTol = 0.01
            kwargs = {
                'source': hex_key,
                'max_rows': 0,
                'response': 'C' + str(response),
                'ignored_cols':
                None,  # this is not consistent with ignored_cols_by_name
                'classification': 1,
                'destination_key': 'score' + str(trial) + '.hex',
                'model': model_key
            }

            nnScoreResult = h2o_cmd.runNNetScore(
                key=parseResult['destination_key'],
                timeoutSecs=timeoutSecs,
                **kwargs)
            h2o_nn.checkScoreResult(self, nnScoreResult, expectedErr, relTol,
                                    **kwargs)

            h2o.check_sandbox_for_errors()

            trial += 1
Esempio n. 52
0
    def setUpClass(cls):
        global SEED
        SEED = h2o.setup_random_seed()
        h2o.init()
        global SYNDATASETS_DIR
        SYNDATASETS_DIR = h2o.make_syn_dir()
Esempio n. 53
0
    def test_GLM_enums_unbalanced(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        n = 2000
        tryList = [
            (n, 1, 'cD', 300),
            (n, 2, 'cE', 300),
            (n, 4, 'cF', 300),
            (n, 8, 'cG', 300),
            (n, 16, 'cH', 300),
            (n, 32, 'cI', 300),
        ]

        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            # using the comma is nice to ensure no craziness
            colSepHexString = '2c'  # comma
            colSepChar = colSepHexString.decode('hex')
            colSepInt = int(colSepHexString, base=16)
            print "colSepChar:", colSepChar

            rowSepHexString = '0a'  # newline
            rowSepChar = rowSepHexString.decode('hex')
            print "rowSepChar:", rowSepChar

            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str(
                colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            csvScoreFilename = 'syn_enums_score_' + str(rowCount) + 'x' + str(
                colCount) + '.csv'
            csvScorePathname = SYNDATASETS_DIR + '/' + csvScoreFilename

            enumList = create_enum_list()
            # use half of the enums for creating the scoring dataset
            enumListForScore = random.sample(enumList, 5)

            print "Creating random", csvPathname, "for glm model building"
            write_syn_dataset(csvPathname,
                              enumList,
                              rowCount,
                              colCount,
                              SEEDPERFILE,
                              colSepChar=colSepChar,
                              rowSepChar=rowSepChar)

            print "Creating random", csvScorePathname, "for glm scoring with prior model (using enum subset)"
            write_syn_dataset(csvScorePathname,
                              enumListForScore,
                              rowCount,
                              colCount,
                              SEEDPERFILE,
                              colSepChar=colSepChar,
                              rowSepChar=rowSepChar)

            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='put',
                                           hex_key=hex_key,
                                           timeoutSecs=30,
                                           separator=colSepInt)
            print csvFilename, 'parse time:', parseResult['response']['time']
            print "Parse result['destination_key']:", parseResult[
                'destination_key']

            print "\n" + csvFilename
            (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \
                h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=True)

            y = colCount
            kwargs = {
                'y': y,
                'max_iter': 200,
                'family': 'binomial',
                'n_folds': 10,
                'alpha': 0,
                'lambda': 0,
                'thresholds': 0.5,
                # 'case_mode': '=',
                # 'case': 0,
            }

            start = time.time()

            updateList = [
                {
                    'alpha': 0.5,
                    'lambda': 1e-4
                },
                {
                    'alpha': 0.25,
                    'lambda': 1e-6
                },
                {
                    'alpha': 0.0,
                    'lambda': 1e-8
                },
                {
                    'alpha': 0.5,
                    'lambda': 0.0
                },
                {
                    'alpha': 0.0,
                    'lambda': 0.0
                },
            ]

            # Try each one
            for updateDict in updateList:
                print "\n#################################################################"
                print updateDict
                kwargs.update(updateDict)
                glm = h2o_cmd.runGLM(parseResult=parseResult,
                                     timeoutSecs=timeoutSecs,
                                     pollTimeoutSecs=180,
                                     **kwargs)
                print "glm end on ", parseResult[
                    'destination_key'], 'took', time.time() - start, 'seconds'

                GLMModel = glm['GLMModel']
                # submodels0 = GLMModel['submodels'][0]
                iterations = GLMModel['iterations']
                modelKey = GLMModel['model_key']

                h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
                if iterations > 20:
                    raise Exception(
                        "Why take so many iterations:  %s in this glm training?"
                        % iterations)

                parseResult = h2i.import_parse(path=csvScorePathname,
                                               schema='put',
                                               hex_key="score_" + hex_key,
                                               timeoutSecs=30,
                                               separator=colSepInt)

                start = time.time()
                # score with same dataset (will change to recreated dataset with one less enum)
                glmScore = h2o_cmd.runGLMScore(
                    key=parseResult['destination_key'],
                    model_key=modelKey,
                    thresholds="0.5",
                    timeoutSecs=timeoutSecs)
                print "glm end on ", parseResult[
                    'destination_key'], 'took', time.time() - start, 'seconds'
                ### print h2o.dump_json(glmScore)
                classErr = glmScore['validation']['classErr']
                auc = glmScore['validation']['auc']
                err = glmScore['validation']['err']
                nullDev = glmScore['validation']['nullDev']
                resDev = glmScore['validation']['resDev']
                h2o_glm.simpleCheckGLMScore(self, glmScore, **kwargs)

                print "classErr:", classErr
                print "err:", err
                print "auc:", auc
                print "resDev:", resDev
                print "nullDev:", nullDev
                if math.isnan(resDev):
                    emsg = "Why is this resDev = 'nan'?? %6s %s" % (
                        "resDev:\t", validation['resDev'])
                    raise Exception(emsg)

                # what is reasonable?
                # self.assertAlmostEqual(err, 0.3, delta=0.15, msg="actual err: %s not close enough to 0.3" % err)
                self.assertAlmostEqual(
                    auc,
                    0.5,
                    delta=0.15,
                    msg="actual auc: %s not close enough to 0.5" % auc)

                if math.isnan(err):
                    emsg = "Why is this err = 'nan'?? %6s %s" % ("err:\t", err)
                    raise Exception(emsg)

                if math.isnan(resDev):
                    emsg = "Why is this resDev = 'nan'?? %6s %s" % (
                        "resDev:\t", resDev)
                    raise Exception(emsg)

                if math.isnan(nullDev):
                    emsg = "Why is this nullDev = 'nan'?? %6s %s" % (
                        "nullDev:\t", nullDev)
    def test_GLM_many_cols_tridist(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (10000, 10, 'cA', 300),
            (10000, 20, 'cB', 300),
            (10000, 30, 'cC', 300),
            (10000, 40, 'cD', 300),
            (10000, 50, 'cE', 300),
            (10000, 60, 'cF', 300),
            (10000, 70, 'cG', 300),
            (10000, 80, 'cH', 300),
            (10000, 90, 'cI', 300),
            (10000, 100, 'cJ', 300),
            (10000, 200, 'cK', 300),
            (10000, 300, 'cL', 300),
            (10000, 400, 'cM', 300),
            (10000, 500, 'cN', 300),
            (10000, 600, 'cO', 300),
        ]

        ### h2b.browseTheCloud()
        for (rowCount, colCount, key2, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(
                rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "\nCreating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

            parseKey = h2o_cmd.parseFile(None,
                                         csvPathname,
                                         key2=key2,
                                         timeoutSecs=30)
            print csvFilename, 'parse time:', parseKey['response']['time']
            print "\nParse result['destination_key']:", parseKey[
                'destination_key']

            # We should be able to see the parse result?
            inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
            print "\n" + csvFilename

            paramDict2 = {}
            for k in paramDict:
                paramDict2[k] = paramDict[k][0]

            y = colCount
            kwargs = {'y': y}
            kwargs.update(paramDict2)

            start = time.time()
            glm = h2o_cmd.runGLMOnly(parseKey=parseKey,
                                     timeoutSecs=timeoutSecs,
                                     **kwargs)
            print "glm end on ", csvPathname, 'took', time.time(
            ) - start, 'seconds'
            h2o_glm.simpleCheckGLM(self, glm, 8, **kwargs)

            if not h2o.browse_disable:
                h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
                time.sleep(5)
Esempio n. 55
0
    def test_GLM_enums_unbalanced(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()

        n = 2000
        tryList = [
            (n, 1, 'cD', 300), 
            (n, 2, 'cE', 300), 
            (n, 4, 'cF', 300), 
            (n, 8, 'cG', 300), 
            (n, 16, 'cH', 300), 
            (n, 32, 'cI', 300), 
            ]

        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            # using the comma is nice to ensure no craziness
            colSepHexString = '2c' # comma
            colSepChar = colSepHexString.decode('hex')
            colSepInt = int(colSepHexString, base=16)
            print "colSepChar:", colSepChar

            rowSepHexString = '0a' # newline
            rowSepChar = rowSepHexString.decode('hex')
            print "rowSepChar:", rowSepChar

            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            csvScoreFilename = 'syn_enums_score_' + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvScorePathname = SYNDATASETS_DIR + '/' + csvScoreFilename

            enumList = create_enum_list(listSize=10)
            # use half of the enums for creating the scoring dataset
            enumListForScore = random.sample(enumList,5)

            print "Creating random", csvPathname, "for glm2 model building"
            write_syn_dataset(csvPathname, enumList, rowCount, colCount, SEEDPERFILE, 
                colSepChar=colSepChar, rowSepChar=rowSepChar)

            print "Creating another random", csvScorePathname, "for glm2 scoring with prior model (using enum subset)"
            write_syn_dataset(csvScorePathname, enumListForScore, rowCount, colCount, SEEDPERFILE, 
                colSepChar=colSepChar, rowSepChar=rowSepChar)

            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, 
                timeoutSecs=30, separator=colSepInt)
            print "Parse result['destination_key']:", parseResult['destination_key']

            print "\n" + csvFilename
            (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \
                h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=True)

            testDataKey = "score_" + hex_key
            parseResult = h2i.import_parse(path=csvScorePathname, schema='put', hex_key=testDataKey,
                timeoutSecs=30, separator=colSepInt)

            y = colCount
            modelKey = 'glm_model'
            kwargs = {
                'standardize': 0,
                'destination_key': modelKey,
                'response': 'C' + str(y+1), 
                'max_iter': 200, 
                'family': 'binomial',
                'n_folds': 0, 
                'alpha': 0, 
                'lambda': 0, 
                }

            start = time.time()

            updateList= [ 
                {'alpha': 0.5, 'lambda': 1e-4},
                {'alpha': 0.25, 'lambda': 1e-6},
                {'alpha': 0.0, 'lambda': 1e-12},
                {'alpha': 0.5, 'lambda': 1e-12},
                {'alpha': 0.0, 'lambda': 1e-12},
                {'alpha': 0.0, 'lambda': 0},
            ]

            # Try each one
            for updateDict in updateList:
                print "\n#################################################################"
                print updateDict
                kwargs.update(updateDict)
                print "If we poll, we get a message saying it was cancelled by user??"
                glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=180, **kwargs)
                print "glm2 end on ", parseResult['destination_key'], 'took', time.time() - start, 'seconds'

                glm_model = glm['glm_model']
                _names = glm_model['_names']
                modelKey = glm_model['_key']
                coefficients_names = glm_model['coefficients_names']
                submodels = glm_model['submodels'][0]

                beta = submodels['beta']
                norm_beta = submodels['norm_beta']
                iteration = submodels['iteration']

                validation = submodels['validation']

                auc = validation['auc']
                aic = validation['aic']
                null_deviance = validation['null_deviance']
                residual_deviance = validation['residual_deviance']

                print '_names', _names
                print 'coefficients_names', coefficients_names
                # did beta get shortened? the simple check confirms names/beta/norm_beta are same length

                print 'beta', beta
                print 'iteration', iteration
                print 'auc', auc

                h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
                if iteration > 30:
                    raise Exception("Why take so many iterations:  %s in this glm2 training?" % iteration)

                # Score **********************************************
                print "Problems with test data having different enums than train? just use train for now"
                testDataKey = hex_key
                h2o_cmd.runScore(dataKey=testDataKey, modelKey=modelKey, vactual=y, vpredict=1, expectedAuc=0.5)
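
# A minimal sketch of the length check the comment above alludes to ("did beta
# get shortened?"): coefficients_names, beta and norm_beta are assumed to be
# parallel lists, so their lengths should agree. Not h2o_glm.simpleCheckGLM's
# implementation, just an illustration of that invariant.
def check_beta_lengths_sketch(coefficients_names, beta, norm_beta):
    assert len(beta) == len(coefficients_names), \
        "beta length %s != names length %s" % (len(beta), len(coefficients_names))
    assert len(norm_beta) == len(coefficients_names), \
        "norm_beta length %s != names length %s" % (len(norm_beta), len(coefficients_names))

# usage sketch:
# check_beta_lengths_sketch(coefficients_names, beta, norm_beta)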
Esempio n. 56
0
    def test_GLM_enums_unbalanced(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        n = 2000
        tryList = [
            (n, 1, 'cD', 300),
            (n, 2, 'cE', 300),
            (n, 4, 'cF', 300),
            (n, 8, 'cG', 300),
            (n, 16, 'cH', 300),
            (n, 32, 'cI', 300),
        ]

        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            # using the comma is nice to ensure no craziness
            colSepHexString = '2c'  # comma
            colSepChar = colSepHexString.decode('hex')
            colSepInt = int(colSepHexString, base=16)
            print "colSepChar:", colSepChar

            rowSepHexString = '0a'  # newline
            rowSepChar = rowSepHexString.decode('hex')
            print "rowSepChar:", rowSepChar

            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str(
                colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            csvScoreFilename = 'syn_enums_score_' + str(rowCount) + 'x' + str(
                colCount) + '.csv'
            csvScorePathname = SYNDATASETS_DIR + '/' + csvScoreFilename

            enumList = create_enum_list(listSize=10)
            # use half of the enums for creating the scoring dataset
            enumListForScore = random.sample(enumList, 5)

            print "Creating random", csvPathname, "for glm2 model building"
            write_syn_dataset(csvPathname,
                              enumList,
                              rowCount,
                              colCount,
                              SEEDPERFILE,
                              colSepChar=colSepChar,
                              rowSepChar=rowSepChar)

            print "Creating another random", csvScorePathname, "for glm2 scoring with prior model (using enum subset)"
            write_syn_dataset(csvScorePathname,
                              enumListForScore,
                              rowCount,
                              colCount,
                              SEEDPERFILE,
                              colSepChar=colSepChar,
                              rowSepChar=rowSepChar)

            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='put',
                                           hex_key=hex_key,
                                           timeoutSecs=30,
                                           separator=colSepInt)
            print csvFilename, 'parse time:', parseResult['response']['time']
            print "Parse result['destination_key']:", parseResult[
                'destination_key']

            print "\n" + csvFilename
            (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \
                h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=True)

            testDataKey = "score_" + hex_key
            parseResult = h2i.import_parse(path=csvScorePathname,
                                           schema='put',
                                           hex_key=testDataKey,
                                           timeoutSecs=30,
                                           separator=colSepInt)

            y = colCount
            modelKey = 'glm_model'
            kwargs = {
                'standardize': 0,
                'destination_key': modelKey,
                'response': 'C' + str(y + 1),
                'max_iter': 200,
                'family': 'binomial',
                'n_folds': 0,
                'alpha': 0,
                'lambda': 0,
            }

            start = time.time()

            updateList = [
                {
                    'alpha': 0.5,
                    'lambda': 1e-4
                },
                {
                    'alpha': 0.25,
                    'lambda': 1e-6
                },
                {
                    'alpha': 0.0,
                    'lambda': 1e-12
                },
                {
                    'alpha': 0.5,
                    'lambda': 1e-12
                },
                {
                    'alpha': 0.0,
                    'lambda': 1e-12
                },
                {
                    'alpha': 0.0,
                    'lambda': 0
                },
            ]
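            # Aside (illustrative sketch, assuming GLM2 uses the usual glmnet-style
            # elastic-net penalty; this is not the H2O implementation): the updateList
            # above sweeps the two knobs of
            #   lambda * (alpha * ||beta||_1 + (1 - alpha) / 2 * ||beta||_2 ** 2)
            # so {'alpha': 0.0, 'lambda': 0} at the end turns regularization off.
            def elastic_net_penalty(beta, alpha, lam):
                l1 = sum(abs(b) for b in beta)
                l2 = sum(b * b for b in beta)
                return lam * (alpha * l1 + (1.0 - alpha) / 2.0 * l2)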

            # Try each one
            h2o.beta_features = True
            for updateDict in updateList:
                print "\n#################################################################"
                print updateDict
                kwargs.update(updateDict)
                print "If we poll, we get a message saying it was cancelled by user??"
                glm = h2o_cmd.runGLM(parseResult=parseResult,
                                     timeoutSecs=timeoutSecs,
                                     pollTimeoutSecs=180,
                                     noPoll=True,
                                     **kwargs)
                h2j.pollWaitJobs(timeoutSecs=300,
                                 pollTimeoutSecs=300,
                                 retryDelaySecs=5,
                                 errorIfCancelled=True)
                glm = h2o.nodes[0].glm_view(_modelKey=modelKey)
                print "glm2 end on ", parseResult[
                    'destination_key'], 'took', time.time() - start, 'seconds'

                glm_model = glm['glm_model']
                _names = glm_model['_names']
                modelKey = glm_model['_key']
                coefficients_names = glm_model['coefficients_names']
                submodels = glm_model['submodels'][0]

                beta = submodels['beta']
                norm_beta = submodels['norm_beta']
                iteration = submodels['iteration']

                validation = submodels['validation']

                if not validation or 'avg_err' not in validation:
                    raise Exception("glm: %s" % h2o.dump_json(glm) + \
                        "\nNo avg_err in validation." + \
                        "\nLikely if you look back, the job was cancelled, so there's no cross validation.")

                avg_err = validation['avg_err']
                auc = validation['auc']
                aic = validation['aic']
                null_deviance = validation['null_deviance']
                residual_deviance = validation['residual_deviance']

                print '_names', _names
                print 'coefficients_names', coefficients_names
                # did beta get shortened? the simple check confirms names/beta/norm_beta are same length
                print 'beta', beta
                print 'iteration', iteration
                print 'avg_err', avg_err
                print 'auc', auc

                h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
                if iteration > 20:
                    raise Exception("Why take so many iterations:  %s in this glm2 training?" % iteration)

                # Score **********************************************
                print "Problems with test data having different enums than train? just use train for now"
                testDataKey = hex_key
                predictKey = 'Predict.hex'
                start = time.time()

                predictResult = h2o_cmd.runPredict(data_key=testDataKey,
                                                   model_key=modelKey,
                                                   destination_key=predictKey,
                                                   timeoutSecs=timeoutSecs)

                predictCMResult = h2o.nodes[0].predict_confusion_matrix(
                    actual=testDataKey,
                    vactual='C' + str(y),
                    predict=predictKey,
                    vpredict='predict',
                )

                cm = predictCMResult['cm']

                # These will move into the h2o_gbm.py
                pctWrong = h2o_gbm.pp_cm_summary(cm)
                self.assertLess(pctWrong, 8,
                    "Should see less than 8 pct error (class = 4): %s" % pctWrong)

                print "\nTest\n==========\n"
                print h2o_gbm.pp_cm(cm)
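                # Aside (minimal sketch, not h2o_gbm's implementation): for a plain
                # square confusion matrix, the percent-wrong summary boils down to
                # the off-diagonal mass over the total, e.g.
                #   total = sum(sum(row) for row in cm)
                #   right = sum(cm[i][i] for i in range(len(cm)))
                #   pct_wrong = 100.0 * (total - right) / total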

                if 1 == 0:
                    # stuff from GLM1

                    classErr = glmScore['validation']['classErr']
                    auc = glmScore['validation']['auc']
                    err = glmScore['validation']['err']
                    nullDev = glmScore['validation']['nullDev']
                    resDev = glmScore['validation']['resDev']
                    h2o_glm.simpleCheckGLMScore(self, glmScore, **kwargs)

                    print "score classErr:", classErr
                    print "score err:", err
                    print "score auc:", auc
                    print "score resDev:", resDev
                    print "score nullDev:", nullDev

                    if math.isnan(resDev):
                        emsg = "Why is this resDev = 'nan'?? %6s %s" % (
                            "resDev:\t", validation['resDev'])
                        raise Exception(emsg)

                    # what is reasonable?
                    # self.assertAlmostEqual(err, 0.3, delta=0.15, msg="actual err: %s not close enough to 0.3" % err)
                    self.assertAlmostEqual(
                        auc,
                        0.5,
                        delta=0.15,
                        msg="actual auc: %s not close enough to 0.5" % auc)

                    if math.isnan(err):
                        emsg = "Why is this err = 'nan'?? %6s %s" % ("err:\t",
                                                                     err)
                        raise Exception(emsg)

                    if math.isnan(resDev):
                        emsg = "Why is this resDev = 'nan'?? %6s %s" % (
                            "resDev:\t", resDev)
                        raise Exception(emsg)

                    if math.isnan(nullDev):
                        emsg = "Why is this nullDev = 'nan'?? %6s %s" % ("nullDev:\t", nullDev)
                        raise Exception(emsg)
Esempio n. 57
0
    def test_summary2_percentile(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (100000, 1, 'cD', 300),
            (100000, 2, 'cE', 300),
        ]

        timeoutSecs = 10
        trial = 1
        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            print 'Trial:', trial
            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            legalValues = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10} # set. http://docs.python.org/2/library/stdtypes.html#set
            expectedMin = min(legalValues)
            expectedMax = max(legalValues)
            expectedUnique = (expectedMax - expectedMin) + 1
            mode = 0.5 # rounding to nearest int will shift us from this for expected mean
            expectedMean = 0.5
            expectedSigma = 0.5
            write_syn_dataset(csvPathname, rowCount, colCount, 
                low=expectedMin, high=expectedMax, mode=mode,
                SEED=SEEDPERFILE)

            csvPathnameFull = h2i.find_folder_and_filename('.', csvPathname, returnFullPath=True)
            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, 
                timeoutSecs=10, doSummary=False)
            print "Parse result['destination_key']:", parseResult['destination_key']

            # We should be able to see the parse result?
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvFilename

            summaryResult = h2o_cmd.runSummary(key=hex_key)
            if h2o.verbose:
                print "summaryResult:", h2o.dump_json(summaryResult)

            summaries = summaryResult['summaries']
            scipyCol = 0
            for column in summaries:
                colname = column['colname']
                coltype = column['type']
                nacnt = column['nacnt']

                stats = column['stats']
                stattype= stats['type']
                mean = stats['mean']
                sd = stats['sd']
                zeros = stats['zeros']
                mins = stats['mins']
                maxs = stats['maxs']
                pct = stats['pct']
                pctile = stats['pctile']

                hstart = column['hstart']
                hstep = column['hstep']
                hbrk = column['hbrk']
                hcnt = column['hcnt']
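                # Aside (not in the original test, hedged reading of the summary2
                # fields): for a numeric column the histogram bin left edges can
                # presumably be rebuilt as
                #   [hstart + i * hstep for i in range(len(hcnt))]
                # while hbrk, checked just below, appears to carry the distinct
                # column values here.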

                for b in hbrk:
                    self.assertIn(int(b), legalValues)
                self.assertEqual(len(hbrk), len(legalValues))

                # self.assertAlmostEqual(hcnt[0], 0.5 * rowCount, delta=.01*rowCount)
                # self.assertAlmostEqual(hcnt[1], 0.5 * rowCount, delta=.01*rowCount)

                print "pctile:", pctile
                print "maxs:", maxs
                # we round to int, so we may introduce up to 0.5 rounding error? compared to "mode" target
                self.assertAlmostEqual(maxs[0], expectedMax, delta=0.01)
                print "mins:", mins
                self.assertAlmostEqual(mins[0], expectedMin, delta=0.01)

                for v in pctile:
                    self.assertTrue(v >= expectedMin,
                        "Percentile value %s should all be >= the min dataset value %s" % (v, expectedMin))
                    self.assertTrue(v <= expectedMax,
                        "Percentile value %s should all be <= the max dataset value %s" % (v, expectedMax))

                eV1 = [1.0, 1.0, 1.0, 3.0, 4.0, 5.0, 7.0, 8.0, 9.0, 10.0, 10.0]
                if expectedMin==1:
                    eV = eV1
                elif expectedMin==0:
                    eV = [e-1 for e in eV1]
                elif expectedMin==2:
                    eV = [e+1 for e in eV1]
                else:
                    raise Exception("Test doesn't have the expected percentileValues for expectedMin: %s" % expectedMin)

                if colname!='':
                    # don't do for enums
                    # also get the median with a sort (h2o_summ.percentileOnSortedlist())
                    h2o_summ.quantile_comparisons(
                        csvPathnameFull,
                        skipHeader=True,
                        col=scipyCol,
                        datatype='float',
                        quantile=0.5 if DO_MEDIAN else 0.999,
                        h2oSummary2=pctile[5 if DO_MEDIAN else 10],
                        # h2oQuantilesApprox=qresult_single,
                        # h2oQuantilesExact=qresult,
                        )
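                    # Aside (illustrative, not part of the test): the sort-based
                    # cross-check presumably boils down to linear interpolation
                    # between the two nearest order statistics, e.g. for quantile q
                    # on a sorted list s:
                    #   pos = q * (len(s) - 1)
                    #   lo, hi = int(math.floor(pos)), int(math.ceil(pos))
                    #   value = s[lo] + (pos - lo) * (s[hi] - s[lo])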

                scipyCol += 1
Esempio n. 58
0
    def test_GLM_convergence_1(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (100, 50,  'cD', 300),
            (100, 100, 'cE', 300),
            (100, 200, 'cF', 300),
            (100, 300, 'cG', 300),
            (100, 400, 'cH', 300),
            (100, 500, 'cI', 300),
        ]

        ### h2b.browseTheCloud()
        lenNodes = len(h2o.nodes)

        USEKNOWNFAILURE = True
        for (rowCount, colCount, key2, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = 'syn_%s_%sx%s.csv' % (SEEDPERFILE,rowCount,colCount)
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            print "\nCreating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

            if USEKNOWNFAILURE:
                csvFilename = 'failtoconverge_100x50.csv'
                csvPathname = h2o.find_file('smalldata/logreg/' + csvFilename)

            parseKey = h2o_cmd.parseFile(None, csvPathname, key2=key2, timeoutSecs=10)
            print csvFilename, 'parse time:', parseKey['response']['time']
            print "Parse result['destination_key']:", parseKey['destination_key']
            inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
            print "\n" + csvFilename

            y = colCount
            kwargs = {
                    'max_iter': 10, 
                    'weight': 1.0,
                    'link': 'familyDefault',
                    'n_folds': 2,
                    'beta_epsilon': 1e-4,
                    'lambda': '1e-8:1e-3:1e2',
                    'alpha': '0,0.5,.75',
                    'thresholds': '0,1,0.2'
                    }
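            # Aside (hedged reading, not confirmed against the GLM1 API): the
            # string-valued 'lambda', 'alpha' and 'thresholds' above look like grid
            # specs (start:end:step sequences or comma-separated lists), which is
            # why a single runGLMGridOnly call below sweeps several
            # lambda/alpha/threshold combinations.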

            if USEKNOWNFAILURE:
                kwargs['y'] = 50
            else:
                kwargs['y'] = y

            emsg = None
            for i in range(2):
                start = time.time()
                # get rid of the Jstack polling
                glm = h2o_cmd.runGLMGridOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs)
                print 'glm #', i, 'end on', csvPathname, 'took', time.time() - start, 'seconds'
                # we can pass the warning, without stopping in the test, so we can 
                # redo it in the browser for comparison
                warnings = h2o_glm.simpleCheckGLMGrid(self, glm, None, allowFailWarning=True, **kwargs)

                # gets the failed to converge, here, after we see it in the browser too
                x = re.compile("[Ff]ailed")
                if warnings:
                    for w in warnings:
                        if (re.search(x,w)): 
                            # first
                            if emsg is None: emsg = w
                            print w
                if emsg: break
        
            if not h2o.browse_disable:
                h2b.browseJsonHistoryAsUrlLastMatch("GLMGridProgress")
                time.sleep(5)

            # gets the failed to converge, here, after we see it in the browser too
            if emsg is not None:
                raise Exception(emsg)
Esempio n. 59
0
    def test_parse_bounds_csv_fvec(self):
        print "Random 0/1 for col1. Last has max col = 1, All have zeros for class."
        # h2b.browseTheCloud()
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (1000, 50, 'cC', 300),
            (1000, 999, 'cC', 300),
            (1000, 1000, 'cA', 300),
            # (1000, 100000, 'cB', 300),
        ]

        # h2b.browseTheCloud()
        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = "syn_%s_%s_%s.csv" % (SEEDPERFILE, rowCount,
                                                colCount)
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            # dict of col sums for comparison to exec col sums below
            synSumList = write_syn_dataset(csvPathname, rowCount, colCount,
                                           SEEDPERFILE)

            # PARSE**********************
            parseResult = h2i.import_parse(path=csvPathname,
                                           hex_key=hex_key,
                                           schema='put',
                                           timeoutSecs=timeoutSecs,
                                           doSummary=False)
            print "Parse result['destination_key']:", parseResult[
                'destination_key']

            # INSPECT*******************
            inspect = h2o_cmd.runInspect(None,
                                         parseResult['destination_key'],
                                         timeoutSecs=timeoutSecs)
            numCols = inspect['numCols']
            numRows = inspect['numRows']
            print "\n" + csvPathname, \
                "    numRows:", "{:,}".format(numRows), \
                "    numCols:", "{:,}".format(numCols)

            iCols = inspect['cols']
            iStats = []
            for stats in iCols:
                iName = stats['name']
                # just touching to make sure they are there
                iNaCnt = stats['naCnt']
                iMin = float(stats['min'])
                iMax = float(stats['max'])
                iMean = float(stats['mean'])
                iStats.append({
                    'name': iName,
                    'naCnt': iNaCnt,
                    'min': iMin,
                    'max': iMax,
                    'mean': iMean,
                })

            # SUMMARY********************************
            summaryResult = h2o_cmd.runSummary(key=hex_key,
                                               max_ncols=colCount,
                                               timeoutSecs=timeoutSecs)
            h2o_cmd.infoFromSummary(summaryResult, noPrint=True)

            self.assertEqual(rowCount,
                             numRows,
                             msg="generated %s rows, parsed to %s rows" %
                             (rowCount, numRows))

            columnsList = summaryResult['summaries']
            self.assertEqual(
                colCount,
                len(columnsList),
                msg=
                "generated %s cols (including output).  summary has %s columns"
                % (colCount, len(columnsList)))

            c = 0
            for column in columnsList:
                # get info from the inspect col for comparison
                iMin = iStats[c]['min']
                iMax = iStats[c]['max']
                iMean = iStats[c]['mean']
                iNaCnt = iStats[c]['naCnt']
                c += 1

                colname = column['colname']
                stats = column['stats']
                stype = column['type']
                hstep = column['hstep']
                hbrk = column['hbrk']
                hstart = column['hstart']

                smax = stats['maxs']
                smin = stats['mins']
                sd = stats['sd']
                smean = stats['mean']
                # no zeroes if enum, but we're not enum here
                zeros = stats['zeros']

                self.assertEqual(
                    iMin, smin[0],
                    "inspect min %s != summary min %s" % (iMin, smin))
                self.assertEqual(
                    iMax, smax[0],
                    "inspect max %s != summary max %s" % (iMax, smax))
                self.assertEqual(
                    iMean, smean,
                    "inspect mean %s != summary mean %s" % (iMean, smean))
                # no comparison for 'zeros'

                # now, also compare expected values
                if colname == "V1":
                    synNa = 0
                    # can reverse-engineer the # of zeroes, since data is always 1
                    synSum = synSumList[1]  # could get the same sum for all cols
                    synZeros = numRows - synSum
                    synSigma = 0.50
                    synMean = (synSum + 0.0) / numRows
                    synMin = [0.0, 1.0]
                    synMax = [1.0, 0.0]

                elif colname == "V2":
                    synSum = 0
                    synSigma = 0
                    synMean = 0
                    if DO_NAN:
                        synZeros = 0
                        synNa = numRows
                        synMin = []
                        synMax = []
                    else:
                        synZeros = numRows
                        synNa = 0
                        synMin = [0.0]
                        synMax = [0.0]

                # a single 1 in the last col
                elif colname == "V" + str(colCount -
                                          1):  # h2o puts a "V" prefix
                    synNa = 0
                    synSum = synSumList[colCount - 1]
                    synZeros = numRows - 1
                    # stddev.p (population standard deviation); see the worked check after this if/elif chain
                    # http://office.microsoft.com/en-us/excel-help/stdev-p-function-HP010335772.aspx

                    synMean = 1.0 / numRows  # why does this need to be a 1 entry list
                    synSigma = math.sqrt(pow((synMean - 1), 2) / numRows)
                    print "last col with single 1. synSigma:", synSigma
                    synMin = [0.0, 1.0]
                    synMax = [1.0, 0.0]

                else:
                    synNa = 0
                    synSum = 0
                    synZeros = numRows
                    synSigma = 0.0
                    synMean = 0.0
                    synMin = [0.0]
                    synMax = [0.0]
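                # Aside (worked check, not in the original test): for the single-1
                # column handled above, the exact population standard deviation of
                # one 1 and (numRows - 1) zeros is sqrt(numRows - 1) / numRows, which
                # for large numRows is very close to the
                # sqrt(pow(synMean - 1, 2) / numRows) approximation used in that branch.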

                if DO_MEAN:
                    self.assertAlmostEqual(
                        float(smean),
                        synMean,
                        places=6,
                        msg='col %s mean %s is not equal to generated mean %s'
                        % (colname, smean, synMean))

                # why are min/max one-entry lists in summary result. Oh..it puts N min, N max
                self.assertTrue(
                    smin >= synMin,
                    msg='col %s min %s is not >= generated min %s' %
                    (colname, smin, synMin))

                self.assertTrue(
                    smax <= synMax,
                    msg='col %s max %s is not <= generated max %s' %
                    (colname, smax, synMax))

                # reverse engineered the number of zeroes, knowing data was always 1 if present?
                if colname == "V65536" or colname == "V65537":
                    print "columns around possible zeros mismatch:", h2o.dump_json(
                        columns)

                self.assertEqual(
                    zeros,
                    synZeros,
                    msg='col %s zeros %s is not equal to generated zeros %s' %
                    (colname, zeros, synZeros))
Esempio n. 60
0
    def test_rf_predict3_fvec(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()

        timeoutSecs = 600
        predictHexKey = 'predict_0.hex'
        predictCsv = 'predict_0.csv'
        actualCsv = 'actual_0.csv'

        if 1 == 1:
            y = 4  # last col
            response = 'response'
            skipSrcOutputHeader = 1
            skipPredictHeader = 1
            trees = 40
            bucket = 'smalldata'
            csvPathname = 'iris/iris2.csv'
            hexKey = 'iris2.csv.hex'
            # translate = {'setosa': 0.0, 'versicolor': 1.0, 'virginica': 2.0}
            # No translate because we're using an Exec to get the data out?, and that loses the encoding?
            translate = None
            expectedPctWrong = 0.0

        elif 1 == 0:
            y = 54  # last col
            response = 'C55'
            skipSrcOutputHeader = 1
            skipPredictHeader = 1
            trees = 6
            # try smaller data set compared to covtype
            bucket = 'home-0xdiag-datasets'
            csvPathname = 'standard/covtype.shuffled.10pct.data'
            hexKey = 'covtype.shuffled.10pct.data.hex'
            translate = {
                '1': 1,
                '2': 2,
                '3': 3,
                '4': 4,
                '5': 5,
                '6': 6,
                '7': 7
            }
            expectedPctWrong = 0.7
        elif 1 == 0:
            y = 54  # last col
            response = 'C55'
            skipSrcOutputHeader = 1
            skipPredictHeader = 1
            trees = 40
            # try smaller data set compared to covtype
            bucket = 'home-0xdiag-datasets'
            csvPathname = 'standard/covtype.shuffled.10pct.data'
            hexKey = 'covtype.shuffled.10pct.data.hex'
            # translate = {1: 0.0, 2: 1.0, 3: 1.0, 4: 1.0, 5: 1.0, 6: 1.0, 7: 1.0}
            translate = {
                '1': 1,
                '2': 2,
                '3': 3,
                '4': 4,
                '5': 5,
                '6': 6,
                '7': 7
            }
            expectedPctWrong = 0.7
        elif 1 == 0:
            y = 54  # last col
            response = 'C55'
            skipSrcOutputHeader = 1
            skipPredictHeader = 1
            trees = 6
            bucket = 'home-0xdiag-datasets'
            csvPathname = 'standard/covtype.data'
            hexKey = 'covtype.data.hex'
            translate = {
                '1': 1,
                '2': 2,
                '3': 3,
                '4': 4,
                '5': 5,
                '6': 6,
                '7': 7
            }
            expectedPctWrong = 0.7
        else:
            y = 0  # first col
            response = 'C1'
            skipSrcOutputHeader = 1
            skipPredictHeader = 1
            trees = 6
            bucket = 'home-0xdiag-datasets'
            csvPathname = 'mnist/mnist_training.csv.gz'
            hexKey = 'mnist_training.hex'
            translate = { \
                '0': 0, '1': 1, '2': 2, '3': 3, '4': 4,\
                '5': 5, '6': 6, '7': 7, '8': 8, '9': 9 }
            expectedPctWrong = 0.7
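        # Aside (not in the original test): 'translate' is presumably the map applied
        # to the original response labels so they line up with the numeric classes in
        # the downloaded prediction CSV before the row-by-row comparison in
        # predict_and_compare_csvs below.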

        csvPredictPathname = SYNDATASETS_DIR + "/" + predictCsv
        csvSrcOutputPathname = SYNDATASETS_DIR + "/" + actualCsv
        # for using below in csv reader
        csvFullname = h2i.find_folder_and_filename(bucket,
                                                   csvPathname,
                                                   schema='put',
                                                   returnFullPath=True)

        def predict_and_compare_csvs(model_key, hex_key, translate=None, y=0):
            # have to slice out col 0 (the output) and feed result to predict
            # cols are 0:784 (1 output plus 784 input features)
            # h2e.exec_expr(execExpr="P.hex="+hex_key+"[1:784]", timeoutSecs=30)
            dataKey = "P.hex"
            h2e.exec_expr(execExpr=dataKey + "=" + hex_key,
                          timeoutSecs=30)  # unneeded but interesting
            if skipSrcOutputHeader:
                print "Has header in dataset, so should be able to chop out col 0 for predict and get right answer"
                print "hack for now, can't chop out col 0 in Exec currently"
                dataKey = hex_key
            else:
                print "No header in dataset, can't chop out cols, since col numbers are used for names"
                dataKey = hex_key

            # +1 col index because R-like
            h2e.exec_expr(execExpr="Z.hex=" + hex_key + "[," + str(y + 1) +
                          "]",
                          timeoutSecs=30)
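            # Aside (illustrative, using this example's 1 == 1 branch): with y=4 and
            # hex_key='iris2.csv.hex' the expression built above is
            # "Z.hex=iris2.csv.hex[,5]", i.e. the response column picked out with
            # R-like 1-based indexing.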

            start = time.time()
            predict = h2o.nodes[0].generate_predictions(
                model_key=model_key,
                data_key=hexKey,
                destination_key=predictHexKey)
            print "generate_predictions end on ", hexKey, " took", time.time(
            ) - start, 'seconds'
            h2o.check_sandbox_for_errors()
            inspect = h2o_cmd.runInspect(key=predictHexKey)
            h2o_cmd.infoFromInspect(inspect, 'predict.hex')

            h2o.nodes[0].csv_download(src_key="Z.hex",
                                      csvPathname=csvSrcOutputPathname)
            h2o.nodes[0].csv_download(src_key=predictHexKey,
                                      csvPathname=csvPredictPathname)
            h2o.check_sandbox_for_errors()

            print "Do a check of the original output col against predicted output"
            (rowNum1, originalOutput) = compare_csv_at_one_col(
                csvSrcOutputPathname,
                msg="Original",
                colIndex=0,
                translate=translate,
                skipHeader=skipSrcOutputHeader)
            (rowNum2, predictOutput) = compare_csv_at_one_col(
                csvPredictPathname,
                msg="Predicted",
                colIndex=0,
                skipHeader=skipPredictHeader)

            # no header on source
            if ((rowNum1 - skipSrcOutputHeader) !=
                (rowNum2 - skipPredictHeader)):
                raise Exception(
                    "original rowNum1: %s - %d not same as downloaded predict: rowNum2: %s - %d" %
                    (rowNum1, skipSrcOutputHeader, rowNum2, skipPredictHeader))

            wrong = 0
            for rowNum, (o, p) in enumerate(zip(originalOutput,
                                                predictOutput)):
                # if float(o)!=float(p):
                if str(o) != str(p):
                    if wrong == 10:
                        print "Not printing any more mismatches\n"
                    elif wrong < 10:
                        msg = "Comparing original output col vs predicted. row %s differs. \
                            original: %s predicted: %s" % (rowNum, o, p)
                        print msg
                    wrong += 1

            print "\nTotal wrong:", wrong
            print "Total:", len(originalOutput)
            pctWrong = (100.0 * wrong) / len(originalOutput)
            print "wrong/Total * 100 ", pctWrong
            # I looked at what h2o can do for modelling with binomial and it should get better than 25% error?
            if pctWrong > 2.0:
                raise Exception(
                    "pctWrong too high. Expect < 2% error because it's reusing training data"
                )
            return pctWrong

        #*****************************************************************************

        parseResult = h2i.import_parse(bucket=bucket,
                                       path=csvPathname,
                                       schema='put',
                                       hex_key=hexKey)
        kwargs = {
            'destination_key': 'rf_model',
            'response': response,
            'ntrees': trees,
            'classification': 1,
        }

        rfResult = h2o_cmd.runRF(parseResult=parseResult,
                                 timeoutSecs=timeoutSecs,
                                 **kwargs)
        (classification_error, classErrorPctList,
         totalScores) = h2o_rf.simpleCheckRFView(rfv=rfResult)

        print "Use H2O GeneratePredictionsPage with a H2O generated model and the same data key."
        print "Does this work? (feeding in same data key)if you're predicting, "
        print "don't you need one less column (the last is output?)"
        print "WARNING: max_iter set to 8 for benchmark comparisons"
        print "y=", y
        pctWrong = predict_and_compare_csvs(model_key='rf_model',
                                            hex_key=hexKey,
                                            translate=translate,
                                            y=y)

        # we are predicting using training data...so error is really low
        # self.assertAlmostEqual(pctWrong, classification_error, delta = 0.2,
        #     msg="predicted pctWrong: %s should be close to training classification error %s" % (pctWrong, classification_error))
        self.assertAlmostEqual(
            pctWrong,
            expectedPctWrong,
            delta=0.2,
            msg=
            "predicted pctWrong: %s should be small because we're predicting with training data"
            % pctWrong)