def test_1mx10_hastie_10_2_cat_and_shuffle(self):
        # gunzip it and cat it to create 2x and 4x replications in SYNDATASETS_DIR
        # FIX! eventually we'll compare the 1x, 2x and 4x results like we do
        # in other tests. (catdata?)

        # This test also adds file shuffling, to see that row order doesn't matter
        csvFilename = "1mx10_hastie_10_2.data.gz"
        bucket = 'datasets'
        csvPathname = 'logreg' + '/' + csvFilename
        fullPathname = h2i.find_folder_and_filename(bucket, csvPathname, returnFullPath=True)

        glm_doit(self, csvFilename, bucket, csvPathname, timeoutSecs=30)

        filename1x = "hastie_1x.data"
        pathname1x = SYNDATASETS_DIR + '/' + filename1x
        h2o_util.file_gunzip(fullPathname, pathname1x)
        
        filename1xShuf = "hastie_1x.data_shuf"
        pathname1xShuf = SYNDATASETS_DIR + '/' + filename1xShuf
        h2o_util.file_shuffle(pathname1x, pathname1xShuf)

        filename2x = "hastie_2x.data"
        pathname2x = SYNDATASETS_DIR + '/' + filename2x
        h2o_util.file_cat(pathname1xShuf, pathname1xShuf, pathname2x)

        filename2xShuf = "hastie_2x.data_shuf"
        pathname2xShuf = SYNDATASETS_DIR + '/' + filename2xShuf
        h2o_util.file_shuffle(pathname2x, pathname2xShuf)
        glm_doit(self, filename2xShuf, None, pathname2xShuf, timeoutSecs=45)

        # too big to shuffle?
        filename4x = "hastie_4x.data"
        pathname4x = SYNDATASETS_DIR + '/' + filename4x
        h2o_util.file_cat(pathname2xShuf,pathname2xShuf,pathname4x)
        glm_doit(self,filename4x, None, pathname4x, timeoutSecs=120)
    def test_GLM_hastie_shuffle(self):
        # gunzip it and cat it to create 2x and 4x replications in SYNDATASETS_DIR
        # FIX! eventually we'll compare the 1x, 2x and 4x results like we do
        # in other tests. (catdata?)

        # This test also adds file shuffling, to see that row order doesn't matter
        csvFilename = "1mx10_hastie_10_2.data.gz"
        bucket = 'home-0xdiag-datasets'
        csvPathname = 'standard' + '/' + csvFilename
        fullPathname = h2i.find_folder_and_filename(bucket, csvPathname, returnFullPath=True)

        glm_doit(self, csvFilename, bucket, csvPathname, timeoutSecs=30)

        filename1x = "hastie_1x.data"
        pathname1x = SYNDATASETS_DIR + '/' + filename1x
        h2o_util.file_gunzip(fullPathname, pathname1x)

        filename1xShuf = "hastie_1x.data_shuf"
        pathname1xShuf = SYNDATASETS_DIR + '/' + filename1xShuf
        h2o_util.file_shuffle(pathname1x, pathname1xShuf)

        filename2x = "hastie_2x.data"
        pathname2x = SYNDATASETS_DIR + '/' + filename2x
        h2o_util.file_cat(pathname1xShuf, pathname1xShuf, pathname2x)

        filename2xShuf = "hastie_2x.data_shuf"
        pathname2xShuf = SYNDATASETS_DIR + '/' + filename2xShuf
        h2o_util.file_shuffle(pathname2x, pathname2xShuf)
        glm_doit(self, filename2xShuf, None, pathname2xShuf, timeoutSecs=45)

        # too big to shuffle?
        filename4x = "hastie_4x.data"
        pathname4x = SYNDATASETS_DIR + '/' + filename4x
        h2o_util.file_cat(pathname2xShuf,pathname2xShuf,pathname4x)
        glm_doit(self,filename4x, None, pathname4x, timeoutSecs=120)
Beispiel #3
0
    def test_GLM2_score_same(self):
        # gunzip it and cat it to create 2x and 4x replications in SYNDATASETS_DIR
        bucket = 'home-0xdiag-datasets'
        csvFilename = "1mx10_hastie_10_2.data.gz"
        csvPathname = 'standard' + '/' + csvFilename

        y = "10"
        kwargs = {'response':  y, 'alpha': 0, 'family': 'gaussian'}
        (modelKey, validation1, parseResult) = glm_doit(self, csvFilename, bucket, csvPathname, 
            timeoutSecs=60, pollTimeoutSecs=60, **kwargs) 
        print "Use", modelKey, "model on 2x and 4x replications and compare results to 1x"

        filename1x = "hastie_1x.data"
        pathname1x = SYNDATASETS_DIR + '/' + filename1x

        fullPathname = h2i.find_folder_and_filename(bucket, csvPathname, returnFullPath=True)
        h2o_util.file_gunzip(fullPathname, pathname1x)

        filename2x = "hastie_2x.data"
        pathname2x = SYNDATASETS_DIR + '/' + filename2x
        modelPathname = SYNDATASETS_DIR + '/model_' + filename2x
        bucket = None
        h2o_util.file_cat(pathname1x,pathname1x,pathname2x)
        glm_score(self,filename2x, bucket, pathname2x, modelKey, modelPathname, timeoutSecs=60, pollTimeoutSecs=60)

        filename4x = "hastie_4x.data"
        pathname4x = SYNDATASETS_DIR + '/' + filename4x
        modelPathname = SYNDATASETS_DIR + '/model_' + filename4x
        h2o_util.file_cat(pathname2x, pathname2x, pathname4x)
        
        print "Iterating 3 times on this last one"
        for i in range(3):
            print "\nTrial #", i, "of", filename4x
            glm_score(self,filename4x, bucket, pathname4x, modelKey, modelPathname, timeoutSecs=60, pollTimeoutSecs=60)
    def test_KMeans_hastie_shuffle_fvec(self):
        # gunzip it and cat it to create 2x and 4x replications in SYNDATASETS_DIR
        # FIX! eventually we'll compare the 1x, 2x and 4x results like we do
        # in other tests. (catdata?)

        # This test also adds file shuffling, to see that row order doesn't matter
        csvFilename = "1mx10_hastie_10_2.data.gz"
        csvPathname = 'standard/' + csvFilename
        bucket = 'home-0xdiag-datasets'
        kmeans_doit(self, csvFilename, bucket, csvPathname, numRows=1000000, timeoutSecs=60)
        fullPathname = h2i.find_folder_and_filename(bucket, csvPathname, returnFullPath=True)

        filename1x = "hastie_1x.data"
        pathname1x = SYNDATASETS_DIR + '/' + filename1x
        h2o_util.file_gunzip(fullPathname, pathname1x)
        
        filename1xShuf = "hastie_1x.data_shuf"
        pathname1xShuf = SYNDATASETS_DIR + '/' + filename1xShuf
        h2o_util.file_shuffle(pathname1x, pathname1xShuf)

        filename2x = "hastie_2x.data"
        pathname2x = SYNDATASETS_DIR + '/' + filename2x
        h2o_util.file_cat(pathname1xShuf, pathname1xShuf, pathname2x)

        filename2xShuf = "hastie_2x.data_shuf"
        pathname2xShuf = SYNDATASETS_DIR + '/' + filename2xShuf
        h2o_util.file_shuffle(pathname2x, pathname2xShuf)
        kmeans_doit(self, filename2xShuf, None, pathname2xShuf, numRows=2000000, timeoutSecs=90)

        # too big to shuffle?
        filename4x = "hastie_4x.data"
        pathname4x = SYNDATASETS_DIR + '/' + filename4x
        h2o_util.file_cat(pathname2xShuf, pathname2xShuf, pathname4x)
        kmeans_doit(self, filename4x, None, pathname4x, numRows=4000000, timeoutSecs=120)
Beispiel #5
0
    def test_GLM2_score_same(self):
        h2o.beta_features = True
        # gunzip it and cat it to create 2x and 4x replications in SYNDATASETS_DIR
        bucket = 'home-0xdiag-datasets'
        csvFilename = "1mx10_hastie_10_2.data.gz"
        csvPathname = 'standard' + '/' + csvFilename

        y = "10"
        kwargs = {'response':  y, 'alpha': 0, 'family': 'gaussian'}
        (modelKey, validation1, parseResult) = glm_doit(self, csvFilename, bucket, csvPathname, 
            timeoutSecs=60, pollTimeoutSecs=60, **kwargs) 
        print "Use", modelKey, "model on 2x and 4x replications and compare results to 1x"

        filename1x = "hastie_1x.data"
        pathname1x = SYNDATASETS_DIR + '/' + filename1x

        fullPathname = h2i.find_folder_and_filename(bucket, csvPathname, returnFullPath=True)
        h2o_util.file_gunzip(fullPathname, pathname1x)

        filename2x = "hastie_2x.data"
        pathname2x = SYNDATASETS_DIR + '/' + filename2x
        modelPathname = SYNDATASETS_DIR + '/model_' + filename2x
        bucket = None
        h2o_util.file_cat(pathname1x,pathname1x,pathname2x)
        glm_score(self,filename2x, bucket, pathname2x, modelKey, modelPathname, timeoutSecs=60, pollTimeoutSecs=60)

        filename4x = "hastie_4x.data"
        pathname4x = SYNDATASETS_DIR + '/' + filename4x
        modelPathname = SYNDATASETS_DIR + '/model_' + filename4x
        h2o_util.file_cat(pathname2x, pathname2x, pathname4x)
        
        print "Iterating 3 times on this last one"
        for i in range(3):
            print "\nTrial #", i, "of", filename4x
            glm_score(self,filename4x, bucket, pathname4x, modelKey, modelPathname, timeoutSecs=60, pollTimeoutSecs=60)
Beispiel #6
0
    def test_GLM_hastie(self):
        # gunzip it and cat it to create 2x and 4x replications in SYNDATASETS_DIR
        # FIX! eventually we'll compare the 1x, 2x and 4x results like we do
        # in other tests. (catdata?)
        bucket = 'home-0xdiag-datasets'
        csvFilename = "1mx10_hastie_10_2.data.gz"
        csvPathname = 'standard' + '/' + csvFilename
        glm_doit(self, csvFilename, bucket, csvPathname, timeoutSecs=75)
        fullPathname = h2i.find_folder_and_filename(bucket,
                                                    csvPathname,
                                                    returnFullPath=True)

        filename1x = "hastie_1x.data"
        pathname1x = SYNDATASETS_DIR + '/' + filename1x
        h2o_util.file_gunzip(fullPathname, pathname1x)

        filename2x = "hastie_2x.data"
        pathname2x = SYNDATASETS_DIR + '/' + filename2x
        h2o_util.file_cat(pathname1x, pathname1x, pathname2x)
        glm_doit(self, filename2x, None, pathname2x, timeoutSecs=75)

        filename4x = "hastie_4x.data"
        pathname4x = SYNDATASETS_DIR + '/' + filename4x
        h2o_util.file_cat(pathname2x, pathname2x, pathname4x)

        print "Iterating 3 times on this last one for perf compare"
        for i in range(3):
            print "\nTrial #", i, "of", filename4x
            glm_doit(self, filename4x, None, pathname4x, timeoutSecs=150)
Beispiel #7
0
    def test_GLM_hastie(self):
        # gunzip it and cat it to create 2x and 4x replications in SYNDATASETS_DIR
        # FIX! eventually we'll compare the 1x, 2x and 4x results like we do
        # in other tests. (catdata?)
        bucket = 'home-0xdiag-datasets'
        csvFilename = "1mx10_hastie_10_2.data.gz"
        csvPathname = 'standard' + '/' + csvFilename
        glm_doit(self, csvFilename, bucket, csvPathname, timeoutSecs=75)
        fullPathname = h2i.find_folder_and_filename(bucket, csvPathname, returnFullPath=True)

        filename1x = "hastie_1x.data"
        pathname1x = SYNDATASETS_DIR + '/' + filename1x
        h2o_util.file_gunzip(fullPathname, pathname1x)

        filename2x = "hastie_2x.data"
        pathname2x = SYNDATASETS_DIR + '/' + filename2x
        h2o_util.file_cat(pathname1x,pathname1x,pathname2x)
        glm_doit(self,filename2x, None, pathname2x, timeoutSecs=75)

        filename4x = "hastie_4x.data"
        pathname4x = SYNDATASETS_DIR + '/' + filename4x
        h2o_util.file_cat(pathname2x,pathname2x,pathname4x)
        
        print "Iterating 3 times on this last one for perf compare"
        for i in range(3):
            print "\nTrial #", i, "of", filename4x
            glm_doit(self, filename4x, None, pathname4x, timeoutSecs=150)
Beispiel #8
0
    def test_B_randomdata2_1_lineend(self):
        csvPathname = 'datagen1.csv'
        # change lineend, case 1
        csvPathname1 = h2i.find_folder_and_filename('smalldata',
                                                    csvPathname,
                                                    returnFullPath=True)
        print "Using datagen1.csv to create", SYNDATASETS_DIR, "/datagen1.csv with different line ending"
        csvPathname2 = SYNDATASETS_DIR + '/datagen1_crlf.csv'

        infile = open(csvPathname1, 'r')
        outfile = open(csvPathname2, 'w')  # existing file gets erased

        # assume all the test files are unix lineend.
        # I guess there shouldn't be any "in-between" ones
        # okay if they change I guess.
        for line in infile.readlines():
            outfile.write(line.strip("\n") + "\r")
        infile.close()
        outfile.close()

        parseResult = h2i.import_parse(path=csvPathname2,
                                       schema='put',
                                       timeoutSecs=10,
                                       header=0,
                                       separator=44)

        inspect = h2o_cmd.runInspect(key=parseResult['destination_key'])
        numCols = inspect['numCols']
        h2o_cmd.runRF(parseResult=parseResult,
                      trees=1,
                      response='C' + str(numCols),
                      timeoutSecs=20)
    def test_B_randomdata2_1_lineend(self):
        csvPathname = 'datagen1.csv'
        # change lineend, case 1
        csvPathname1 = h2i.find_folder_and_filename('smalldata', csvPathname, returnFullPath=True)
        print "Using datagen1.csv to create", SYNDATASETS_DIR, "/datagen1.csv with different line ending" 
        csvPathname2 = SYNDATASETS_DIR + '/datagen1_crlf.csv'

        infile = open(csvPathname1, 'r') 
        outfile = open(csvPathname2,'w') # existing file gets erased

        # assume all the test files are unix lineend. 
        # I guess there shouldn't be any "in-between" ones
        # okay if they change I guess.
        for line in infile.readlines():
            outfile.write(line.strip("\n") + "\r")
        infile.close()
        outfile.close()

        parseResult = h2i.import_parse(path=csvPathname2, schema='put', 
            timeoutSecs=10, header=0, separator=44)

        inspect = h2o_cmd.runInspect(key=parseResult['destination_key'])
        numCols = inspect['numCols']
        h2o_cmd.runRF(parseResult=parseResult, 
            trees=1, 
            response_variable='C'+str(numCols),
            timeoutSecs=20)
Beispiel #10
0
    def test_GLM2_princeton(self):
        # filename, y, timeoutSecs
        # these are all counts? using gaussian?
        csvFilenameList = [
            ('cuse.dat', 'gaussian', 3, 10), # notUsing
            ('cuse.dat', 'gaussian', 4, 10), # using
            ('copen.dat', 'gaussian', 4, 10),
            ('housing.raw', 'gaussian', 4, 10),
            ]

        trial = 0
        for (csvFilename, family, y, timeoutSecs) in csvFilenameList:
            csvPathname1 = 'logreg/princeton/' + csvFilename
            fullPathname1 = h2i.find_folder_and_filename('smalldata', csvPathname1, returnFullPath=True)
            csvPathname2 = SYNDATASETS_DIR + '/' + csvFilename + '_stripped.csv'
            h2o_util.file_strip_trailing_spaces(fullPathname1, csvPathname2)

            parseResult = h2i.import_parse(path=csvPathname2, schema='put', timeoutSecs=timeoutSecs)
            start = time.time()
            kwargs = {'n_folds': 0, 'family': family, 'response': y}
            glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
            print "glm end (w/check) on ", csvPathname2, 'took', time.time() - start, 'seconds'
            trial += 1
            print "\nTrial #", trial
Beispiel #11
0
def file_to_put():
    # kbn fails 10/15/12
    # return 'smalldata/poker/poker-hand-testing.data'
    a = h2i.find_folder_and_filename('smalldata',
                                     'poker/poker1000',
                                     schema='put',
                                     returnFullPath=True)
    print "\nfind_folder_and_filename:", a
    return a
Beispiel #12
0
 def test_A_inspect_poker1000(self):
     csvPathname = "poker/poker1000"
     res = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put')
     ary  = h2o_cmd.runInspect(key=res['destination_key'])
     # count lines in input file - there is no header for poker 1000
     fullPathname = h2i.find_folder_and_filename('smalldata', csvPathname, returnFullPath=True)
     rows = wcl(fullPathname)
     self.assertEqual(rows, ary['numRows'])
     self.assertEqual(11, ary['numCols'])
Beispiel #13
0
    def test_exec2_xorsum(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()

        tryList = [
            (10000, 1, 'r1', 0, 10, None),
        ]

        ullResultList = []
        for (rowCount, colCount, hex_key, expectedMin, expectedMax, expected) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            # dynamic range of the data may be useful for estimating error
            maxDelta = expectedMax - expectedMin

            csvFilename = 'syn_real_' + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True)
            print "Creating random", csvPathname
            (expectedUll, expectedFpSum)  = write_syn_dataset(csvPathname, 
                rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE)

            parseResult = h2i.import_parse(path=csvPathname, schema='local', hex_key=hex_key, 
                timeoutSecs=3000, retryDelaySecs=2)
            inspect = h2o_cmd.runInspect(key=hex_key)
            print "numRows:", inspect['numRows']
            print "numCols:", inspect['numCols']
            inspect = h2o_cmd.runInspect(key=hex_key, offset=-1)
            print "inspect offset = -1:", h2o.dump_json(inspect)

            
            # looking at the 8 bytes of bits for the h2o doubles
            # xorsum will zero out the sign and exponent
            for execExpr in exprList:
                start = time.time()
                (execResult, fpResult) = h2e.exec_expr(h2o.nodes[0], execExpr, 
                    resultKey=None, timeoutSecs=300)
                print 'exec took', time.time() - start, 'seconds'
                print "execResult:", h2o.dump_json(execResult)
                print ""
                print "%30s" % "fpResult:", "%.15f" % fpResult
                ullResult = h2o_util.doubleToUnsignedLongLong(fpResult)
                print "%30s" % "bitResult (0.16x):", "0x%0.16x" % ullResult
                print "%30s" % "expectedUll (0.16x):", "0x%0.16x" % expectedUll
                # print "%30s" % "hex(bitResult):", hex(ullResult)
                ullResultList.append((ullResult, fpResult))

            h2o.check_sandbox_for_errors()

            print "first result was from a sum. others are xorsum"
            print "ullResultList:"
            for ullResult, fpResult in ullResultList:
                print "%30s" % "ullResult (0.16x):", "0x%0.16x   %s" % (ullResult, fpResult)
            expectedUllAsDouble = h2o_util.unsignedLongLongToDouble(expectedUll)
            print "%30s" % "expectedUll (0.16x):", "0x%0.16x   %s" % (expectedUll, expectedUllAsDouble)
            expectedFpSumAsLongLong = h2o_util.doubleToUnsignedLongLong(expectedFpSum)
            print "%30s" % "expectedFpSum (0.16x):", "0x%0.16x   %s" % (expectedFpSumAsLongLong, expectedFpSum)
Beispiel #14
0
 def test_A_inspect_poker1000(self):
     h2o.beta_features = True
     csvPathname = "poker/poker1000"
     res = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put')
     ary  = h2o_cmd.runInspect(key=res['destination_key'])
     # count lines in input file - there is no header for poker 1000
     fullPathname = h2i.find_folder_and_filename('smalldata', csvPathname, returnFullPath=True)
     rows = wcl(fullPathname)
     self.assertEqual(rows, ary['numRows'])
     self.assertEqual(11, ary['numCols'])
Beispiel #15
0
    def test_nulls_fvec(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        # we're going to insert <NUL> (0x0) in between every byte!
        # and then use it. move to a large file. I suppose
        # we could compare the results to a non-munged file with the same algo
        # I suppose the <NUL> are thrown away by parse, so doesn't change
        # chunk boundary stuff. (i.e. not interesting test for RF)
        csvFilename = 'poker1000'
        csvPathname = 'poker/' + csvFilename
        fullPathname = h2i.find_folder_and_filename('smalldata',
                                                    csvPathname,
                                                    returnFullPath=True)

        nulFilename = "syn_nul.data"
        nulPathname = SYNDATASETS_DIR + '/' + nulFilename

        piece_size = 4096  # 4 KiB

        with open(fullPathname, "rb") as in_file:
            with open(nulPathname, "wb") as out_file:
                while True:
                    piece = in_file.read(103)
                    if piece == "":
                        break  # end of file

                    # we could just extend piece?
                    # start with a null
                    withNuls = bytearray(piece)
                    # FIX! we'll eventually stick a <NUL> after every byte!
                    withNuls.extend(bytearray.fromhex('00'))
                    out_file.write(withNuls)

        for trials in xrange(1, 2):
            trees = 6
            for x in xrange(161, 240, 40):
                y = 10000 * x
                print "\nTrial:", trials, ", y:", y

                timeoutSecs = 20 + 5 * (len(h2o.nodes))
                model_key = csvFilename + "_" + str(trials)

                parseResult = h2i.import_parse(path=nulPathname, schema='put')
                h2o_cmd.runRF(parseResult=parseResult,
                              trees=trees,
                              destination_key=model_key,
                              timeoutSecs=timeoutSecs,
                              retryDelaySecs=1)
                sys.stdout.write('.')
                sys.stdout.flush()

                # partial clean, so we can look at tree builds from this run if hang
                h2o.clean_sandbox_stdout_stderr()
Beispiel #16
0
    def test_exec2_sum(self):
        h2o.beta_features = True
        print "Replicating covtype.data by 2x for results comparison to 1x"
        filename1x = 'covtype.data'
        pathname1x = h2i.find_folder_and_filename('home-0xdiag-datasets',
                                                  'standard/covtype.data',
                                                  returnFullPath=True)
        filename2x = "covtype_2x.data"
        pathname2x = SYNDATASETS_DIR + '/' + filename2x
        h2o_util.file_cat(pathname1x, pathname1x, pathname2x)

        csvAll = [
            (pathname1x, "cA", 5, 1),
            (pathname2x, "cB", 5, 2),
            (pathname2x, "cC", 5, 2),
        ]

        h2b.browseTheCloud()
        lenNodes = len(h2o.nodes)

        firstDone = False
        for (csvPathname, hex_key, timeoutSecs, resultMult) in csvAll:
            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='put',
                                           hex_key=hex_key,
                                           timeoutSecs=2000)
            print "Parse result['Key']:", parseResult['destination_key']

            # We should be able to see the parse result?
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])

            print "\n" + csvPathname
            h2o_exec.exec_zero_list(zeroList)
            colResultList = h2o_exec.exec_expr_list_across_cols(
                lenNodes,
                exprList,
                hex_key,
                maxCol=54,
                timeoutSecs=timeoutSecs)
            print "\ncolResultList", colResultList

            if not firstDone:
                colResultList0 = list(colResultList)
                good = [float(x) for x in colResultList0]
                firstDone = True
            else:
                print "\n", colResultList0, "\n", colResultList
                # create the expected answer...i.e. N * first
                compare = [float(x) / resultMult for x in colResultList]
                print "\n", good, "\n", compare
                self.assertEqual(
                    good, compare,
                    'compare is not equal to good (first try * resultMult)')
Beispiel #17
0
    def test_file_with_nul_chars_inserted(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        # we're going to insert <NUL> (0x0) in between every byte!
        # and then use it. move to a large file. I suppose
        # we could compare the results to a non-munged file with the same algo
        # I suppose the <NUL> are thrown away by parse, so doesn't change
        # chunk boundary stuff. (i.e. not interesting test for RF)
        csvFilename = "poker1000"
        csvPathname = "poker/" + csvFilename
        fullPathname = h2i.find_folder_and_filename("smalldata", csvPathname, returnFullPath=True)

        nulFilename = "syn_nul.data"
        nulPathname = SYNDATASETS_DIR + "/" + nulFilename

        piece_size = 4096  # 4 KiB

        with open(fullPathname, "rb") as in_file:
            with open(nulPathname, "wb") as out_file:
                while True:
                    piece = in_file.read(103)
                    if piece == "":
                        break  # end of file

                    # we could just extend piece?
                    # start with a null
                    withNuls = bytearray(piece)
                    # FIX! we'll eventually stick a <NUL> after every byte!
                    withNuls.extend(bytearray.fromhex("00"))
                    out_file.write(withNuls)

        for trials in xrange(1, 2):
            trees = 6
            for x in xrange(161, 240, 40):
                y = 10000 * x
                print "\nTrial:", trials, ", y:", y

                timeoutSecs = 20 + 5 * (len(h2o.nodes))
                model_key = csvFilename + "_" + str(trials)

                parseResult = h2i.import_parse(path=nulPathname, schema="put")
                h2o_cmd.runRF(
                    parseResult=parseResult, trees=trees, model_key=model_key, timeoutSecs=timeoutSecs, retryDelaySecs=1
                )
                sys.stdout.write(".")
                sys.stdout.flush()

                # partial clean, so we can look at tree builds from this run if hang
                h2o.clean_sandbox_stdout_stderr()
Beispiel #18
0
    def test_exec2_sum(self):
        print "Replicating covtype.data by 2x for results comparison to 1x"
        filename1x = 'covtype.data'
        pathname1x = h2i.find_folder_and_filename('home-0xdiag-datasets',
                                                  'standard/covtype.data',
                                                  returnFullPath=True)
        filename2x = "covtype_2x.data"
        pathname2x = SYNDATASETS_DIR + '/' + filename2x
        h2o_util.file_cat(pathname1x, pathname1x, pathname2x)

        csvAll = [
            (pathname1x, "cA", 5, 1),
            (pathname2x, "cB", 5, 2),
            (pathname2x, "cC", 5, 2),
        ]

        # h2b.browseTheCloud()
        lenNodes = len(h2o.nodes)

        firstDone = False
        for (csvPathname, hex_key, timeoutSecs, resultMult) in csvAll:
            parseResultA = h2i.import_parse(path=csvPathname,
                                            schema='put',
                                            hex_key=hex_key)
            pA = h2o_cmd.ParseObj(parseResultA)
            print pA.numRows
            print pA.numCols
            print pA.parse_key
            iA = h2o_cmd.InspectObj(pA.parse_key)

            k = Key(hex_key)
            colResultList = []
            for i in range(pA.numCols):
                result = Expr(Fcn('sum', k[:, i], True)).result
                colResultList.append(result)
            print "\ncolResultList", colResultList

            if not firstDone:
                colResultList0 = list(colResultList)
                good = [float(x) for x in colResultList0]
                firstDone = True
            else:
                print "\n", colResultList0, "\n", colResultList
                # create the expected answer...i.e. N * first
                compare = [float(x) / resultMult for x in colResultList]
                print "\n", good, "\n", compare
                self.assertEqual(
                    good, compare,
                    'compare is not equal to good (first try * resultMult)')
Beispiel #19
0
    def test_GLM2grid_hastie(self):
        # gunzip it and cat it to create 2x and 4x replications in SYNDATASETS_DIR
        # FIX! eventually we'll compare the 1x, 2x and 4x results like we do
        # in other tests. (catdata?)
        bucket = "home-0xdiag-datasets"
        csvFilename = "1mx10_hastie_10_2.data.gz"
        csvPathname = "standard" + "/" + csvFilename
        glm_doit(self, csvFilename, bucket, csvPathname, timeoutSecs=300)

        fullPathname = h2i.find_folder_and_filename("home-0xdiag-datasets", csvPathname, returnFullPath=True)
        filename1x = "hastie_1x.data"
        pathname1x = SYNDATASETS_DIR + "/" + filename1x
        h2o_util.file_gunzip(fullPathname, pathname1x)

        filename2x = "hastie_2x.data"
        pathname2x = SYNDATASETS_DIR + "/" + filename2x
        h2o_util.file_cat(pathname1x, pathname1x, pathname2x)
        glm_doit(self, filename2x, None, pathname2x, timeoutSecs=300)
Beispiel #20
0
    def test_A_1mx10_hastie_10_2(self):
        # gunzip it and cat it to create 2x and 4x replications in SYNDATASETS_DIR
        # FIX! eventually we'll compare the 1x, 2x and 4x results like we do
        # in other tests. (catdata?)
        bucket = 'home-0xdiag-datasets'
        csvFilename = "1mx10_hastie_10_2.data.gz"
        csvPathname = 'standard' + '/' + csvFilename
        glm_doit(self, csvFilename, bucket, csvPathname, timeoutSecs=300)

        fullPathname = h2i.find_folder_and_filename('home-0xdiag-datasets', csvPathname, returnFullPath=True)
        filename1x = "hastie_1x.data"
        pathname1x = SYNDATASETS_DIR + '/' + filename1x
        h2o_util.file_gunzip(fullPathname, pathname1x)

        filename2x = "hastie_2x.data"
        pathname2x = SYNDATASETS_DIR + '/' + filename2x
        h2o_util.file_cat(pathname1x,pathname1x,pathname2x)
        glm_doit(self, filename2x, None, pathname2x, timeoutSecs=300)
    def test_exec2_sum(self):
        print "Replicating covtype.data by 2x for results comparison to 1x"
        filename1x = 'covtype.data'
        pathname1x = h2i.find_folder_and_filename('home-0xdiag-datasets', 'standard/covtype.data', returnFullPath=True)
        filename2x = "covtype_2x.data"
        pathname2x = SYNDATASETS_DIR + '/' + filename2x
        h2o_util.file_cat(pathname1x, pathname1x, pathname2x)

        csvAll = [
            (pathname1x, "cA", 5,  1),
            (pathname2x, "cB", 5,  2),
            (pathname2x, "cC", 5,  2),
        ]

        # h2b.browseTheCloud()
        lenNodes = len(h2o.nodes)

        firstDone = False
        for (csvPathname, hex_key, timeoutSecs, resultMult) in csvAll:
            parseResultA = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key)
            pA = h2o_cmd.ParseObj(parseResultA)
            print pA.numRows
            print pA.numCols
            print pA.parse_key
            iA = h2o_cmd.InspectObj(pA.parse_key)

            k = Key(hex_key)
            colResultList = []
            for i in range(pA.numCols):
                result = Expr(Fcn('sum', k[:,i], True)).result
                colResultList.append(result)
            print "\ncolResultList", colResultList

            if not firstDone:
                colResultList0 = list(colResultList)
                good = [float(x) for x in colResultList0] 
                firstDone = True
            else:
                print "\n", colResultList0, "\n", colResultList
                # create the expected answer...i.e. N * first
                compare = [float(x)/resultMult for x in colResultList] 
                print "\n", good, "\n", compare
                self.assertEqual(good, compare, 'compare is not equal to good (first try * resultMult)')
Beispiel #22
0
    def test_exec2_sum(self):
        h2o.beta_features = True
        print "Replicating covtype.data by 2x for results comparison to 1x"
        filename1x = 'covtype.data'
        pathname1x = h2i.find_folder_and_filename('datasets', 'UCI/UCI-large/covtype/covtype.data', returnFullPath=True)
        filename2x = "covtype_2x.data"
        pathname2x = SYNDATASETS_DIR + '/' + filename2x
        h2o_util.file_cat(pathname1x, pathname1x, pathname2x)

        csvAll = [
            (pathname1x, "cA", 5,  1),
            (pathname2x, "cB", 5,  2),
            (pathname2x, "cC", 5,  2),
        ]

        h2b.browseTheCloud()
        lenNodes = len(h2o.nodes)

        firstDone = False
        for (csvPathname, hex_key, timeoutSecs, resultMult) in csvAll:
            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=2000)
            print "Parse result['Key']:", parseResult['destination_key']

            # We should be able to see the parse result?
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])

            print "\n" + csvPathname
            h2o_exec.exec_zero_list(zeroList)
            colResultList = h2o_exec.exec_expr_list_across_cols(lenNodes, exprList, hex_key, maxCol=54, 
                timeoutSecs=timeoutSecs)
            print "\ncolResultList", colResultList

            if not firstDone:
                colResultList0 = list(colResultList)
                good = [float(x) for x in colResultList0] 
                firstDone = True
            else:
                print "\n", colResultList0, "\n", colResultList
                # create the expected answer...i.e. N * first
                compare = [float(x)/resultMult for x in colResultList] 
                print "\n", good, "\n", compare
                self.assertEqual(good, compare, 'compare is not equal to good (first try * resultMult)')
    def test_GLM_score_same(self):
        # gunzip it and cat it to create 2x and 4x replications in SYNDATASETS_DIR
        bucket = 'datasets'
        csvFilename = "1mx10_hastie_10_2.data.gz"
        csvPathname = 'logreg' + '/' + csvFilename

        y = "10"
        x = ""
        kwargs = {'x': x, 'y':  y, 'case': -1, 'thresholds': 0.5}
        (modelKey, validations1, parseResult) = glm_doit(self, csvFilename, bucket, csvPathname, 
            timeoutSecs=60, pollTimeoutSecs=60, **kwargs)

        print "Use", modelKey, "model on 2x and 4x replications and compare results to 1x"

        filename1x = "hastie_1x.data"
        pathname1x = SYNDATASETS_DIR + '/' + filename1x

        fullPathname = h2i.find_folder_and_filename(bucket, csvPathname, returnFullPath=True)
        h2o_util.file_gunzip(fullPathname, pathname1x)

        filename2x = "hastie_2x.data"
        pathname2x = SYNDATASETS_DIR + '/' + filename2x
        bucket = None
        h2o_util.file_cat(pathname1x,pathname1x,pathname2x)
        glm_score(self,filename2x, bucket, pathname2x, modelKey, thresholds="0.5",
            timeoutSecs=60, pollTimeoutSecs=60)

        filename4x = "hastie_4x.data"
        pathname4x = SYNDATASETS_DIR + '/' + filename4x
        h2o_util.file_cat(pathname2x, pathname2x, pathname4x)
        
        print "Iterating 3 times on this last one"
        for i in range(3):
            print "\nTrial #", i, "of", filename4x
            glm_score(self,filename4x, bucket, pathname4x, modelKey, thresholds="0.5",
                timeoutSecs=60, pollTimeoutSecs=60)
Beispiel #24
0
    def test_summary2_small(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            # colname, (min, 25th, 50th, 75th, max)
            # if rowCount is None, we'll just use  the data values
            # None in expected values means no compare
            (None, 1, 'x.hex', [-1, 0, 1], ('C1', None, None, 0, None, None)),
            (None, 2, 'x.hex', [-1, 0, 1], ('C1', None, None, 0, None, None)),
            (None, 10, 'x.hex', [-1, 0, 1], ('C1', None, None, 0, None, None)),
            (None, 100, 'x.hex', [-1, 0,
                                  1], ('C1', None, None, 0, None, None)),
            (None, 1000, 'x.hex', [-1, 0,
                                   1], ('C1', None, None, 0, None, None)),
            # (None, 10000, 'x.hex', [-1,0,1],        ('C1',  None, None, 0, None, None)),
            # (COLS, 1, 'x.hex', [1,0,-1],        ('C1',  None, None, None, None, None)),
        ]

        timeoutSecs = 10
        trial = 1
        n = h2o.nodes[0]
        lenNodes = len(h2o.nodes)

        x = 0
        timeoutSecs = 60
        for (rowCount, colCount, hex_key, values, expected) in tryList:
            # max error = half the bin size?

            expectedMax = max(values)
            expectedMin = min(values)
            maxDelta = ((expectedMax - expectedMin) / 20.0) / 2.0
            # add 5% for fp errors?
            maxDelta = 1.05 * maxDelta

            # hmm...say we should be 100% accurate for these tests?
            maxDelta = 0

            SEEDPERFILE = random.randint(0, sys.maxint)
            x += 1

            if not rowCount:
                rowFile = len(values)
            else:
                rowFile = rowCount
            csvFilename = 'syn_' + "binary" + "_" + str(rowFile) + 'x' + str(
                colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, values,
                              SEEDPERFILE)

            csvPathnameFull = h2i.find_folder_and_filename(None,
                                                           csvPathname,
                                                           returnFullPath=True)
            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='put',
                                           hex_key=hex_key,
                                           timeoutSecs=30,
                                           doSummary=False)
            print "Parse result['destination_key']:", parseResult[
                'destination_key']

            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvFilename

            numRows = inspect["numRows"]
            numCols = inspect["numCols"]

            summaryResult = h2o_cmd.runSummary(key=hex_key,
                                               max_qbins=MAX_QBINS,
                                               timeoutSecs=45)
            h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult))

            quantile = 0.5 if DO_MEDIAN else .999
            q = h2o.nodes[0].quantiles(source_key=hex_key,
                                       column=0,
                                       interpolation_type=7,
                                       quantile=quantile,
                                       max_qbins=MAX_QBINS,
                                       multiple_pass=2)
            qresult = q['result']
            qresult_single = q['result_single']
            qresult_iterations = q['iterations']
            qresult_interpolated = q['interpolated']
            h2p.blue_print("h2o quantiles result:", qresult)
            h2p.blue_print("h2o quantiles result_single:", qresult_single)
            h2p.blue_print("h2o quantiles iterations:", qresult_iterations)
            h2p.blue_print("h2o quantiles interpolated:", qresult_interpolated)
            print h2o.dump_json(q)

            self.assertLess(
                qresult_iterations,
                16,
                msg=
                "h2o does max of 16 iterations. likely no result_single if we hit max. is bins=1?"
            )

            # only one column
            column = summaryResult['summaries'][0]

            colname = column['colname']

            coltype = column['type']
            nacnt = column['nacnt']

            stats = column['stats']
            stattype = stats['type']

            # FIX! we should compare mean and sd to expected?
            mean = stats['mean']
            sd = stats['sd']

            print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(
                mean)
            print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals(
                sd)

            zeros = stats['zeros']
            mins = stats['mins']
            maxs = stats['maxs']
            pct = stats['pct']
            # the thresholds h2o used, should match what we expected
            expectedPct = [
                0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99
            ]

            pctile = stats['pctile']
            print "pctile:", pctile
            if expected[0]:
                self.assertEqual(colname, expected[0])
            if expected[1]:
                h2o_util.assertApproxEqual(mins[0],
                                           expected[1],
                                           tol=maxDelta,
                                           msg='min is not approx. expected')
            if expected[2]:
                h2o_util.assertApproxEqual(
                    pctile[3],
                    expected[2],
                    tol=maxDelta,
                    msg='25th percentile is not approx. expected')
            if expected[3]:
                h2o_util.assertApproxEqual(
                    pctile[5],
                    expected[3],
                    tol=maxDelta,
                    msg='50th percentile (median) is not approx. expected')
            if expected[4]:
                h2o_util.assertApproxEqual(
                    pctile[7],
                    expected[4],
                    tol=maxDelta,
                    msg='75th percentile is not approx. expected')
            if expected[5]:
                h2o_util.assertApproxEqual(maxs[0],
                                           expected[5],
                                           tol=maxDelta,
                                           msg='max is not approx. expected')

            hstart = column['hstart']
            hstep = column['hstep']
            hbrk = column['hbrk']
            hcnt = column['hcnt']

            print "pct:", pct
            print ""

            print "hcnt:", hcnt
            print "len(hcnt)", len(hcnt)

            # don't check the last bin
            for b in hcnt[1:-1]:
                # should we be able to check for a uniform distribution in the files?
                e = numRows / len(
                    hcnt
                )  # expect 21 thresholds, so 20 bins. each 5% of rows (uniform distribution)
                # don't check the edge bins
                self.assertAlmostEqual(b,
                                       numRows / len(hcnt),
                                       delta=1 + .01 * numRows,
                                       msg="Bins not right. b: %s e: %s" %
                                       (b, e))

            pt = h2o_util.twoDecimals(pctile)
            mx = h2o_util.twoDecimals(maxs)
            mn = h2o_util.twoDecimals(mins)
            print "colname:", colname, "pctile (2 places):", pt
            print "colname:", colname, "maxs: (2 places):", mx
            print "colname:", colname, "mins: (2 places):", mn

            # FIX! we should do an exec and compare using the exec quantile too
            compareActual = mn[0], pt[3], pt[5], pt[7], mx[0]
            h2p.green_print("min/25/50/75/max colname:", colname,
                            "(2 places):", compareActual)
            print "maxs colname:", colname, "(2 places):", mx
            print "mins colname:", colname, "(2 places):", mn

            trial += 1
            h2o.nodes[0].remove_all_keys()

            scipyCol = 0

            # don't check if colname is empty..means it's a string and scipy doesn't parse right?
            if colname != '':
                # don't do for enums
                # also get the median with a sort (h2o_summ.percentileOnSortedlist()
                h2o_summ.quantile_comparisons(
                    csvPathnameFull,
                    col=scipyCol,  # what col to extract from the csv
                    datatype='float',
                    quantile=0.5 if DO_MEDIAN else 0.999,
                    h2oSummary2=pctile[5 if DO_MEDIAN else 10],
                    # h2oQuantilesApprox=qresult_single,
                    h2oQuantilesExact=qresult,
                )
Beispiel #25
0
    def test_GLM_both(self):
        if (1==1):
            csvFilenameList = [
                ('logreg', 'benign.csv', 'binomial', 3, 10),
                # col is zero based
                # FIX! what's wrong here? index error
                ## ('uis.dat', 'binomial', 8, 5, False),
                ## ('pros.dat', 'binomial', 1, 10, False),
                ## ('chdage.dat', 'binomial', 2, 5, True),
                ## ('icu.dat', 'binomial', 1, 10, False),
                # how to ignore 6? '1,2,3,4,5', False),
                ## ('clslowbwt.dat', 'binomial', 7, 10, False),
                # ('cgd.dat', 'gaussian', 12, 5, False),
                # ('meexp.dat', 'gaussian', 3, 10, None),
            ]
        else:
            csvFilenameList = [
                # leave out ID and birth weight
                ('logreg', 'benign.csv', 'gaussian', 3, 10),
                (None, 'icu.dat', 'binomial', 1, 10),
                # need to exclude col 0 (ID) and col 10 (bwt)
                # but -x doesn't work..so do 2:9...range doesn't work? FIX!
                (None, 'nhanes3.dat', 'binomial', 15, 10),
                (None, 'lowbwt.dat', 'binomial', 1, 10),
                (None, 'lowbwtm11.dat', 'binomial', 1, 10),
                (None, 'meexp.dat', 'gaussian', 3, 10),
                # FIX! does this one hang in R?
                (None, 'nhanes3.dat', 'binomial', 15, 10),
                (None, 'pbc.dat', 'gaussian', 1, 10),
                (None, 'pharynx.dat', 'gaussian', 12, 10),
                (None, 'uis.dat', 'binomial', 8, 10),
            ]

        trial = 0
        for (offset, csvFilename, family, y, timeoutSecs) in csvFilenameList:

            # FIX! do something about this file munging
            if offset:
                csvPathname1 = offset + "/" + csvFilename
            else:
                csvPathname1 = 'logreg/umass_statdata/' + csvFilename

            fullPathname = h2i.find_folder_and_filename('smalldata', csvPathname1, returnFullPath=True)

            csvPathname2 = SYNDATASETS_DIR + '/' + csvFilename + '_2.csv'
            h2o_util.file_clean_for_R(fullPathname, csvPathname2)

            # we can inspect this to get the number of cols in the dataset (trust H2O here)
            parseResult = h2i.import_parse(path=csvPathname2, schema='put', hex_key=csvFilename, timeoutSecs=10)
            # we could specify key2 above but this is fine
            destination_key = parseResult['destination_key']
            inspect = h2o_cmd.runInspect(None, destination_key)

            num_cols = inspect['numCols']
            num_rows = inspect['numRows']
            print "num_cols", num_cols, "num_rows", num_rows
            ##  print h2o.dump_json(inspect)

            # create formula and the x for H2O GLM
            formula = "V" + str(y+1) + " ~ "
            x = None
            col_names = ""
            for c in range(0,num_cols):
                if csvFilename=='clslowbwt.dat' and c==6:
                    print "Not including col 6 for this dataset from x"
                if csvFilename=='benign.csv' and (c==0 or c==1):
                    print "Not including col 0,1 for this dataset from x"
                else:
                    # don't add the output col to the RHS of formula
                    if x is None: 
                        col_names += "V" + str(c+1)
                    else: 
                        col_names += ",V" + str(c+1)

                    if c!=y:
                        if x is None: 
                            x = str(c)
                            formula += "V" + str(c+1)
                        else: 
                            x += "," + str(c)
                            formula += "+V" + str(c+1)

            print 'formula:', formula
            print 'col_names:', col_names

        
            print 'x:', x

            kwargs = { 
                'n_folds': 0, 
                'response': y, 
                # what about x?
                'family': family, 
                'alpha': 0, 
                'lambda': 0,
                'beta_epsilon': 1.0E-4, 
                'max_iter': 50 }

            if csvFilename=='benign.csv':
                kwargs['ignored_cols'] = '0,1'

            if csvFilename=='clslowbwt.dat':
                kwargs['ignored_cols'] = '6'

            
            start = time.time()
            glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)

            print "glm end (w/check) on ", csvPathname2, 'took', time.time()-start, 'seconds'
            h2oResults = h2o_glm.simpleCheckGLM(self, glm, None, prettyPrint=True, **kwargs)
            # now do it thru R and compare
            (warningsR, cListR, interceptR) = glm_R_and_compare(self, csvPathname2, family, formula, y, h2oResults=h2oResults)

            trial += 1
            print "\nTrial #", trial
Beispiel #26
0
    def test_exec2_xorsum(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        tryList = [
            (ROWS, 1, 'r1', 0, 10, None),
        ]

        for trial in range(10):
            ullResultList = []
            for (rowCount, colCount, hex_key, expectedMin, expectedMax, expected) in tryList:
                SEEDPERFILE = random.randint(0, sys.maxint)
                # dynamic range of the data may be useful for estimating error
                maxDelta = expectedMax - expectedMin

                csvFilename = 'syn_real_' + str(rowCount) + 'x' + str(colCount) + '.csv'
                csvPathname = SYNDATASETS_DIR + '/' + csvFilename
                csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True)
                print "Creating random", csvPathname
                (expectedUllSum, expectedFpSum)  = write_syn_dataset(csvPathname, 
                    rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE)
                expectedUllSumAsDouble = h2o_util.unsignedLongLongToDouble(expectedUllSum)
                expectedFpSumAsLongLong = h2o_util.doubleToUnsignedLongLong(expectedFpSum)

                parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, 
                    timeoutSecs=3000, retryDelaySecs=2)
                numRows, numCols, parse_key = h2o_cmd.infoFromParse(parseResult)
                assert parse_key == hex_key
                assert numCols == colCount
                assert numRows == rowCount

                inspect = h2o_cmd.runInspect(key=hex_key)
                missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(inspect)
                assert len(missingList) == 0

                # looking at the 8 bytes of bits for the h2o doubles
                # xorsum will zero out the sign and exponent
                for execExpr in exprList:
                    for r in range(10):
                        start = time.time()
                        execResult = h2o_cmd.runExec(ast=execExpr, timeoutSecs=30)
                        fpResult = execResult['scalar']
                        # (execResult, fpResult) = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey='h', timeoutSecs=300)
                        print r, 'exec took', time.time() - start, 'seconds'
                        print r, "execResult:", h2o.dump_json(execResult)
                        h2o_cmd.runStoreView()

                        ullResult = h2o_util.doubleToUnsignedLongLong(fpResult)
                        ullResultList.append((ullResult, fpResult))

                        print "%30s" % "ullResult (0.16x):", "0x%0.16x   %s" % (ullResult, fpResult)
                        print "%30s" % "expectedUllSum (0.16x):", "0x%0.16x   %s" % (expectedUllSum, expectedUllSumAsDouble)
                        print "%30s" % "expectedFpSum (0.16x):", "0x%0.16x   %s" % (expectedFpSumAsLongLong, expectedFpSum)

                        # allow diff of the lsb..either way
                        # if ullResult!=expectedUllSum and abs((ullResult-expectedUllSum)>3):
                        if ullResult!=expectedUllSum:
                            raise Exception("h2o didn't get the same xorsum as python. 0x%0.16x 0x%0.16x" % (ullResult, expectedUllSum))
                            print "h2o didn't get the same xorsum as python. 0x%0.16x 0x%0.16x" % (ullResult, expectedUllSum)

                h2o.check_sandbox_for_errors()

                print "first result was from a sum. others are xorsum"
                print "ullResultList:"
                for ullResult, fpResult in ullResultList:
                    print "%30s" % "ullResult (0.16x):", "0x%0.16x   %s" % (ullResult, fpResult)

                print "%30s" % "expectedUllSum (0.16x):", "0x%0.16x   %s" % (expectedUllSum, expectedUllSumAsDouble)
                print "%30s" % "expectedFpSum (0.16x):", "0x%0.16x   %s" % (expectedFpSumAsLongLong, expectedFpSum)
    def test_exec2_quant_cmp_uniform(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            # colname, (min, 25th, 50th, 75th, max)
            (500000, 1, 'x.hex', 1, 20000, ('C1', 1.10, 5000.0, 10000.0,
                                            15000.0, 20000.00)),
            (500000, 1, 'x.hex', -5000, 0, ('C1', -5001.00, -3750.0, -2445,
                                            -1200.0, 99)),
            (100000, 1, 'x.hex', -100000, 100000, ('C1', -100001.0, -50000.0,
                                                   1613.0, 50000.0, 100000.0)),
            (100000, 1, 'x.hex', -1, 1, ('C1', -1.05, -0.48, 0.0087, 0.50,
                                         1.00)),
            (100000, 1, 'A.hex', 1, 100, ('C1', 1.05, 26.00, 51.00, 76.00,
                                          100.0)),
            (100000, 1, 'A.hex', -99, 99, ('C1', -99, -50.0, 0, 50.00, 99)),
            (100000, 1, 'B.hex', 1, 10000, ('C1', 1.05, 2501.00, 5001.00,
                                            7501.00, 10000.00)),
            (100000, 1, 'B.hex', -100, 100, ('C1', -100.10, -50.0, 0.85, 51.7,
                                             100, 00)),
            (100000, 1, 'C.hex', 1, 100000, ('C1', 1.05, 25002.00, 50002.00,
                                             75002.00, 100000.00)),
            (100000, 1, 'C.hex', -101, 101, ('C1', -100.10, -50.45, -1.18,
                                             49.28, 100.00)),
        ]

        timeoutSecs = 10
        trial = 1
        n = h2o.nodes[0]
        lenNodes = len(h2o.nodes)

        x = 0
        timeoutSecs = 60
        for (rowCount, colCount, hex_key, expectedMin, expectedMax,
             expected) in tryList:
            # max error = half the bin size?

            maxDelta = ((expectedMax - expectedMin) / 20.0) / 2.0
            # add 5% for fp errors?
            maxDelta = 1.05 * maxDelta

            SEEDPERFILE = random.randint(0, sys.maxint)
            x += 1
            csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(
                colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, expectedMin,
                              expectedMax, SEEDPERFILE)
            csvPathnameFull = h2i.find_folder_and_filename(None,
                                                           csvPathname,
                                                           returnFullPath=True)
            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='put',
                                           hex_key=hex_key,
                                           timeoutSecs=10,
                                           doSummary=False)
            print "Parse result['destination_key']:", parseResult[
                'destination_key']

            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvFilename
            numRows = inspect["numRows"]
            numCols = inspect["numCols"]

            summaryResult = h2o_cmd.runSummary(key=hex_key)
            h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult))

            # only one column
            column = summaryResult['summaries'][0]

            colname = column['colname']
            self.assertEqual(colname, expected[0])

            coltype = column['type']
            nacnt = column['nacnt']
            stats = column['stats']
            stattype = stats['type']

            # FIX! we should compare mean and sd to expected?
            mean = stats['mean']
            sd = stats['sd']
            print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(
                mean)
            print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals(
                sd)

            zeros = stats['zeros']
            mins = stats['mins']
            h2o_util.assertApproxEqual(mins[0],
                                       expected[1],
                                       tol=maxDelta,
                                       msg='min is not approx. expected')
            maxs = stats['maxs']
            h2o_util.assertApproxEqual(maxs[0],
                                       expected[5],
                                       tol=maxDelta,
                                       msg='max is not approx. expected')

            pct = stats['pct']
            # the thresholds h2o used, should match what we expected
            expectedPct = [
                0.001, 0.01, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.99, 0.999
            ]
            pctile = stats['pctile']
            h2o_util.assertApproxEqual(
                pctile[3],
                expected[2],
                tol=maxDelta,
                msg='25th percentile is not approx. expected')
            h2o_util.assertApproxEqual(
                pctile[5],
                expected[3],
                tol=maxDelta,
                msg='50th percentile (median) is not approx. expected')
            h2o_util.assertApproxEqual(
                pctile[7],
                expected[4],
                tol=maxDelta,
                msg='75th percentile is not approx. expected')

            hstart = column['hstart']
            hstep = column['hstep']
            hbrk = column['hbrk']
            hcnt = column['hcnt']

            print "pct:", pct
            print "hcnt:", hcnt
            print "len(hcnt)", len(hcnt)

            # don't check the last bin
            for b in hcnt[1:-1]:
                # should we be able to check for a uniform distribution in the files?
                e = numRows / len(hcnt)
                # apparently we're not able to estimate for these datasets
                # self.assertAlmostEqual(b, rowCount/len(hcnt), delta=.01*rowCount,
                #     msg="Bins not right. b: %s e: %s" % (b, e))

            pt = h2o_util.twoDecimals(pctile)
            mx = h2o_util.twoDecimals(maxs)
            mn = h2o_util.twoDecimals(mins)
            print "colname:", colname, "pctile (2 places):", pt
            print "colname:", colname, "maxs: (2 places):", mx
            print "colname:", colname, "mins: (2 places):", mn

            # FIX! we should do an exec and compare using the exec quantile too
            compareActual = mn[0], pt[3], pt[5], pt[7], mx[0]
            print "min/25/50/75/max colname:", colname, "(2 places):", compareActual
            print "maxs colname:", colname, "(2 places):", mx
            print "mins colname:", colname, "(2 places):", mn

            trial += 1
            h2p.blue_print("\nTrying exec quantile")
            # thresholds = "c(0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99)"
            # do the equivalent exec quantile?
            # execExpr = "quantile(%s[,1],%s);" % (hex_key, thresholds)

            print "Comparing (two places) each of the summary2 threshold quantile results, to single exec quantile"
            for i, threshold in enumerate(thresholds):
                # FIX! do two of the same?..use same one for the 2nd
                if i != 0:
                    # execExpr = "r2=c(1); r2=quantile(%s[,4],c(0,.05,0.3,0.55,0.7,0.95,0.99))" % hex_key
                    execExpr = "r2=c(1); r2=quantile(%s[,1], c(%s,%s));" % (
                        hex_key, threshold, threshold)
                    (resultExec, result) = h2e.exec_expr(execExpr=execExpr,
                                                         timeoutSecs=30)
                    h2p.green_print("\nresultExec: %s" %
                                    h2o.dump_json(resultExec))
                    h2p.blue_print(
                        "\nthreshold: %.2f Exec quantile: %s Summary2: %s" %
                        (threshold, result, pt[i]))
                    if not result:
                        raise Exception(
                            "exec result: %s for quantile: %s is bad" %
                            (result, threshold))
                    h2o_util.assertApproxEqual(
                        result,
                        pctile[i],
                        tol=maxDelta,
                        msg=
                        'exec percentile: %s too different from expected: %s' %
                        (result, pctile[i]))
                # for now, do one with all, but no checking
                else:
                    # This seemed to "work" but how do I get the key name for the list of values returned
                    # the browser result field seemed right, but nulls in the key
                    if 1 == 0:
                        execExpr = "r2=c(1); r2=quantile(%s[,1], c(%s));" % (
                            hex_key, ",".join(map(str, thresholds)))
                    else:
                        # does this way work (column getting)j
                        execExpr = "r2=c(1); r2=quantile(%s$C1, c(%s));" % (
                            hex_key, ",".join(map(str, thresholds)))
                    (resultExec, result) = h2e.exec_expr(execExpr=execExpr,
                                                         timeoutSecs=30)
                    inspect = h2o_cmd.runInspect(key='r2')
                    numCols = inspect['numCols']
                    numRows = inspect['numRows']

                    self.assertEqual(numCols, 1)
                    self.assertEqual(numRows, len(thresholds))
                    # FIX! should run thru the values in the col? how to get

            # compare the last one
            if colname != '':
                # don't do for enums
                # also get the median with a sort (h2o_summ.percentileOnSortedlist()
                h2o_summ.quantile_comparisons(
                    csvPathnameFull,
                    col=0,  # what col to extract from the csv
                    datatype='float',
                    quantile=thresholds[-1],
                    # h2oSummary2=pctile[-1],
                    # h2oQuantilesApprox=result, # from exec
                    h2oExecQuantiles=result,
                )

            h2o.nodes[0].remove_all_keys()
Beispiel #28
0
    def test_exec2_xorsum2(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (ROWS, 1, 'r1', 0, 10, None),
        ]

        for trial in range(20):
            ullResultList = []
            NUM_FORMAT_CASES = h2o_util.fp_format()
            for (rowCount, colCount, hex_key, expectedMin, expectedMax, expected) in tryList:
                SEEDPERFILE = random.randint(0, sys.maxint)
                # dynamic range of the data may be useful for estimating error
                maxDelta = expectedMax - expectedMin

                csvFilename = 'syn_real_' + str(rowCount) + 'x' + str(colCount) + '.csv'
                csvPathname = SYNDATASETS_DIR + '/' + csvFilename
                csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True)
                print "Creating random", csvPathname

                sel = random.randint(0, NUM_FORMAT_CASES-1)
                (expectedUllSum, expectedFpSum)  = write_syn_dataset(csvPathname, 
                    rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE, sel)
                expectedUllSumAsDouble = h2o_util.unsignedLongLongToDouble(expectedUllSum)
                expectedFpSumAsLongLong = h2o_util.doubleToUnsignedLongLong(expectedFpSum)

                parseResult = h2i.import_parse(path=csvPathname, schema='local', hex_key=hex_key, 
                    timeoutSecs=3000, retryDelaySecs=2)
                inspect = h2o_cmd.runInspect(key=hex_key)
                print "numRows:", inspect['numRows']
                print "numCols:", inspect['numCols']
                inspect = h2o_cmd.runInspect(key=hex_key, offset=-1)
                print "inspect offset = -1:", h2o.dump_json(inspect)

                
                # looking at the 8 bytes of bits for the h2o doubles
                # xorsum will zero out the sign and exponent
                for execExpr in exprList:
                    start = time.time()
                    (execResult, fpResult) = h2e.exec_expr(h2o.nodes[0], execExpr, 
                        resultKey=None, timeoutSecs=300)
                    print 'exec took', time.time() - start, 'seconds'
                    print "execResult:", h2o.dump_json(execResult)
                    ullResult = h2o_util.doubleToUnsignedLongLong(fpResult)
                    ullResultList.append((ullResult, fpResult))

                    print "%30s" % "ullResult (0.16x):", "0x%0.16x   %s" % (ullResult, fpResult)
                    print "%30s" % "expectedUllSum (0.16x):", "0x%0.16x   %s" % (expectedUllSum, expectedUllSumAsDouble)
                    print "%30s" % "expectedFpSum (0.16x):", "0x%0.16x   %s" % (expectedFpSumAsLongLong, expectedFpSum)

                    # allow diff of the lsb..either way. needed when integers are parsed

                    # okay for a couple of lsbs to be wrong, due to conversion from stringk
                    # ullResult (0.16x): 0x02c1a21f923cee96   2.15698793923e-295
                    # expectedUllSum (0.16x): 0x02c1a21f923cee97   2.15698793923e-295
                    # expectedFpSum (0.16x): 0x42f054af32b3c408   2.87294442126e+14

                    # ullResult and expectedUllSum are Q ints, (64-bit) so can subtract them.
                    # I guess we don't even care about sign, since we zero the first 4 bits (xorsum) to avoid nan/inf issues
                    ALLOWED_BIT_ERR = 0x1f # seeing this amount of error!
                    if ullResult!=expectedUllSum and (abs(ullResult-expectedUllSum)>ALLOWED_BIT_ERR):
                        raise Exception("h2o didn't get the same xorsum as python. 0x%0.16x 0x%0.16x" % (ullResult, expectedUllSum))
                        print "h2o didn't get the same xorsum as python. 0x%0.16x 0x%0.16x" % (ullResult, expectedUllSum)

                    # print "%30s" % "hex(bitResult):", hex(ullResult)

                h2o.check_sandbox_for_errors()

                print "first result was from a sum. others are xorsum"
                print "ullResultList:"
                for ullResult, fpResult in ullResultList:
                    print "%30s" % "ullResult (0.16x):", "0x%0.16x   %s" % (ullResult, fpResult)

                print "%30s" % "expectedUllSum (0.16x):", "0x%0.16x   %s" % (expectedUllSum, expectedUllSumAsDouble)
                print "%30s" % "expectedFpSum (0.16x):", "0x%0.16x   %s" % (expectedFpSumAsLongLong, expectedFpSum)
Beispiel #29
0
    def test_GLM1_GLM2_predict(self):
        # h2b.browseTheCloud()
        SYNDATASETS_DIR = h2o.make_syn_dir()

        trees = 15
        timeoutSecs = 120
        predictHexKey = 'predict_0.hex'
        predictCsv = 'predict_0.csv'
        actualCsv = 'actual_0.csv'

        if 1 == 0:
            skipSrcOutputHeader = 1
            skipPredictHeader = 1
            bucket = 'home-0xdiag-datasets'
            csvPathname = 'standard/covtype.data'
            hexKey = 'covtype.data.hex'
            y = 54
            expectedPctWrong = 0

        if 1 == 0:
            skipSrcOutputHeader = 1
            skipPredictHeader = 1
            bucket = 'home-0xdiag-datasets'
            csvPathname = 'standard/covtype.shuffled.10pct.data'
            hexKey = 'covtype.shuffled.10pct.data.hex'
            y = 54
            expectedPctWrong = 0

        if 1 == 1:
            skipSrcOutputHeader = 1
            skipPredictHeader = 1
            bucket = 'smalldata'
            # no header
            csvPathname = 'iris/iris.csv'
            hexKey = 'iris.hex'
            y = 4
            expectedPctWrong = 26

        csvPredictPathname = SYNDATASETS_DIR + "/" + predictCsv
        csvSrcOutputPathname = SYNDATASETS_DIR + "/" + actualCsv
        # for using below in csv reader
        csvFullname = h2i.find_folder_and_filename(bucket,
                                                   csvPathname,
                                                   schema='put',
                                                   returnFullPath=True)

        parseResult = h2i.import_parse(bucket=bucket,
                                       path=csvPathname,
                                       schema='put',
                                       hex_key=hexKey)
        h2o_cmd.runSummary(key=hexKey)

        # do the binomial conversion with Exec2, for both training and test (h2o won't work otherwise)
        trainKey = parseResult['destination_key']

        # just to check. are there any NA/constant cols?
        ignore_x = h2o_glm.goodXFromColumnInfo(
            y, key=parseResult['destination_key'], timeoutSecs=300)

        #**************************************************************************
        # first glm1
        CLASS = 1
        # try ignoring the constant col to see if it makes a diff
        kwargs = {
            'lsm_solver': LSM_SOLVER,
            'standardize': STANDARDIZE,
            'y': 'C' + str(y + 1),
            'family': FAMILY,
            'n_folds': 0,
            'max_iter': MAX_ITER,
            'beta_epsilon': BETA_EPSILON,
            'case': CLASS,
            'case_mode': '=',
        }

        timeoutSecs = 120
        kwargs.update({'alpha': TRY_ALPHA, 'lambda': TRY_LAMBDA})
        start = time.time()
        glm = h2o_cmd.runGLM(parseResult=parseResult,
                             timeoutSecs=timeoutSecs,
                             **kwargs)
        # hack. fix bad 'family' ('link' is bad too)..so h2o_glm.py works right
        glm['GLMModel']['GLMParams']['family'] = FAMILY
        print "glm1 end on ", csvPathname, 'took', time.time(
        ) - start, 'seconds'
        (warnings, coefficients1,
         intercept1) = h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
        iterations1 = glm['GLMModel']['iterations']
        err1 = glm['GLMModel']['validations'][0]['err']
        nullDev1 = glm['GLMModel']['validations'][0]['nullDev']
        resDev1 = glm['GLMModel']['validations'][0]['resDev']

        if FAMILY == 'binomial':
            classErr1 = glm['GLMModel']['validations'][0]['classErr']
            auc1 = glm['GLMModel']['validations'][0]['auc']

        #**************************************************************************
        # then glm2
        kwargs = {
            # 'ignored_cols': 'C29',
            'standardize': STANDARDIZE,
            'response': 'C' + str(y + 1),
            'family': FAMILY,
            'n_folds': 0,
            'max_iter': MAX_ITER,
            'beta_epsilon': BETA_EPSILON
        }

        timeoutSecs = 120

        # class 1=1, all else 0
        if FAMILY == 'binomial':
            execExpr = "B.hex=%s; B.hex[,%s]=(%s[,%s]==%s)" % (
                trainKey, y + 1, trainKey, y + 1, CLASS)
            h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)
            bHack = {'destination_key': 'B.hex'}
        else:
            bHack = parseResult
        kwargs.update({'alpha': TRY_ALPHA, 'lambda': TRY_LAMBDA})

        #        kwargs.update({'alpha': 0.0, 'lambda': 0})
        # kwargs.update({'alpha': 0.5, 'lambda': 1e-4})
        # kwargs.update({'alpha': 0.5, 'lambda': 1e-4})
        # bad model (auc=0.5)
        # kwargs.update({'alpha': 0.0, 'lambda': 0.0})
        start = time.time()
        glm = h2o_cmd.runGLM(parseResult=bHack,
                             timeoutSecs=timeoutSecs,
                             **kwargs)
        print "glm2 end on ", csvPathname, 'took', time.time(
        ) - start, 'seconds'
        (warnings, coefficients,
         intercept) = h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)

        #**************************************************************************
        modelKey = glm['glm_model']['_key']
        submodels = glm['glm_model']['submodels']
        # hackery to make it work when there's just one
        validation = submodels[-1]['validation']
        iteration = submodels[-1]['iteration']

        resDev = validation['residual_deviance']
        nullDev = validation['null_deviance']
        if FAMILY == 'binomial':
            auc = validation['auc']

        self.assertLess(iterations1,
                        MAX_ITER - 1,
                        msg="GLM1: Too many iterations, didn't converge %s" %
                        iterations1)
        self.assertLess(iteration,
                        MAX_ITER - 1,
                        msg="GLM2: Too many iterations, didn't converge %s" %
                        iteration)

        nullDevExpected = nullDev1
        # self.assertAlmostEqual(nullDev, nullDevExpected, delta=2,
        #     msg='GLM2 nullDev %s is too different from GLM1 %s' % (nullDev, nullDevExpected))

        iterationExpected = iterations1
        # self.assertAlmostEqual(iteration, iterationExpected, delta=2,
        #     msg='GLM2 iteration %s is too different from GLM1 %s' % (iteration, iterationExpected))

        # coefficients is a list.
        coeff0 = coefficients[0]
        coeff0Expected = coefficients1[0]
        print "coeff0 pct delta:", "%0.3f" % (
            100.0 * (abs(coeff0) - abs(coeff0Expected)) / abs(coeff0Expected))
        self.assertTrue(
            h2o_util.approxEqual(coeff0, coeff0Expected, rel=0.5),
            msg='GLM2 coefficient 0 %s is too different from GLM1 %s' %
            (coeff0, coeff0Expected))

        coeff2 = coefficients[2]
        coeff2Expected = coefficients1[2]
        print "coeff2 pct delta:", "%0.3f" % (
            100.0 * (abs(coeff2) - abs(coeff2Expected)) / abs(coeff2Expected))
        self.assertTrue(
            h2o_util.approxEqual(coeff2, coeff2Expected, rel=0.5),
            msg='GLM2 coefficient 2 %s is too different from GLM1 %s' %
            (coeff2, coeff2Expected))

        # compare to known values GLM1 got for class 1 case, with these parameters
        # aucExpected = 0.8428
        if FAMILY == 'binomial':
            aucExpected = auc1
            self.assertAlmostEqual(
                auc,
                aucExpected,
                delta=10,
                msg='GLM2 auc %s is too different from GLM1 %s' %
                (auc, aucExpected))

        interceptExpected = intercept1
        print "intercept pct delta:", 100.0 * (
            abs(intercept) - abs(interceptExpected)) / abs(interceptExpected)
        self.assertTrue(h2o_util.approxEqual(intercept,
                                             interceptExpected,
                                             rel=0.5),
                        msg='GLM2 intercept %s is too different from GLM1 %s' %
                        (intercept, interceptExpected))

        # avg_errExpected = 0.2463
        avg_errExpected = err1
        # self.assertAlmostEqual(avg_err, avg_errExpected, delta=0.50*avg_errExpected,
        #     msg='GLM2 avg_err %s is too different from GLM1 %s' % (avg_err, avg_errExpected))

        # self.assertAlmostEqual(best_threshold, 0.35, delta=0.10*best_threshold,
        #     msg='GLM2 best_threshold %s is too different from GLM1 %s' % (best_threshold, 0.35))

        #********************
        # Print comparison
        #********************
        interceptDelta = abs(abs(intercept1) - abs(intercept))
        cDelta = [
            abs(abs(a) - abs(b)) for a, b in zip(coefficients1, coefficients)
        ]

        def printit(self, a, b, c, d):
            pctDiff = abs(d / c) * 100
            print "%-20s %-20.5e %8s %5.2f%% %10s %-20.5e" % \
                ("GLM2: " + a + " " + b + ":", c, "pct. diff:", pctDiff, "abs diff:", d)
            # self.assertLess(pctDiff,1,"Expect <1% difference between H2O and R coefficient/intercept")

        printit(self, "intercept", "", intercept1, interceptDelta)
        print "compare lengths coefficients1, coefficients, cDelta:", len(
            coefficients1), len(coefficients), len(cDelta)
        print "GLM1:", coefficients1
        print "GLM2:", coefficients
        print "cDelta:", cDelta

        for i, cValue in enumerate(coefficients):
            printit(self, "coefficient", "C" + str(i), cValue, cDelta[i])

        hexKey = 'B.hex'
        pctWrong = h2o_rf.predict_and_compare_csvs(modelKey,
                                                   hexKey,
                                                   predictHexKey,
                                                   csvSrcOutputPathname,
                                                   csvPredictPathname,
                                                   skipSrcOutputHeader,
                                                   skipPredictHeader,
                                                   translate=None,
                                                   y=y)

        # we are predicting using training data...so error is really low
        # self.assertAlmostEqual(pctWrong, classification_error, delta = 0.2,
        #     msg="predicted pctWrong: %s should be close to training classification error %s" % (pctWrong, classification_error))
        self.assertAlmostEqual(
            pctWrong,
            expectedPctWrong,
            delta=2.0,
            msg=
            "predicted pctWrong: %s should be small because we're predicting with training data %s"
            % (pctWrong, expectedPctWrong))
Beispiel #30
0
    def test_exec2_xorsum(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()

        tryList = [
            (ROWS, 1, 'r1', 0, 10, None),
        ]

        for trial in range(10):
            ullResultList = []
            for (rowCount, colCount, hex_key, expectedMin, expectedMax,
                 expected) in tryList:
                SEEDPERFILE = random.randint(0, sys.maxint)
                # dynamic range of the data may be useful for estimating error
                maxDelta = expectedMax - expectedMin

                csvFilename = 'syn_real_' + str(rowCount) + 'x' + str(
                    colCount) + '.csv'
                csvPathname = SYNDATASETS_DIR + '/' + csvFilename
                csvPathnameFull = h2i.find_folder_and_filename(
                    None, csvPathname, returnFullPath=True)
                print "Creating random", csvPathname
                (expectedUllSum,
                 expectedFpSum) = write_syn_dataset(csvPathname, rowCount,
                                                    colCount, expectedMin,
                                                    expectedMax, SEEDPERFILE)
                expectedUllSumAsDouble = h2o_util.unsignedLongLongToDouble(
                    expectedUllSum)
                expectedFpSumAsLongLong = h2o_util.doubleToUnsignedLongLong(
                    expectedFpSum)

                parseResult = h2i.import_parse(path=csvPathname,
                                               schema='put',
                                               hex_key=hex_key,
                                               timeoutSecs=3000,
                                               retryDelaySecs=2)
                inspect = h2o_cmd.runInspect(key=hex_key)
                print "numRows:", inspect['numRows']
                print "numCols:", inspect['numCols']
                inspect = h2o_cmd.runInspect(key=hex_key, offset=-1)
                print "inspect offset = -1:", h2o.dump_json(inspect)

                # looking at the 8 bytes of bits for the h2o doubles
                # xorsum will zero out the sign and exponent
                for execExpr in exprList:
                    for r in range(10):
                        start = time.time()
                        (execResult, fpResult) = h2e.exec_expr(h2o.nodes[0],
                                                               execExpr,
                                                               resultKey='h',
                                                               timeoutSecs=300)
                        print r, 'exec took', time.time() - start, 'seconds'
                        print r, "execResult:", h2o.dump_json(execResult)
                        h2o_cmd.runStoreView()
                        ullResult = h2o_util.doubleToUnsignedLongLong(fpResult)
                        ullResultList.append((ullResult, fpResult))

                        print "%30s" % "ullResult (0.16x):", "0x%0.16x   %s" % (
                            ullResult, fpResult)
                        print "%30s" % "expectedUllSum (0.16x):", "0x%0.16x   %s" % (
                            expectedUllSum, expectedUllSumAsDouble)
                        print "%30s" % "expectedFpSum (0.16x):", "0x%0.16x   %s" % (
                            expectedFpSumAsLongLong, expectedFpSum)

                        # allow diff of the lsb..either way
                        # if ullResult!=expectedUllSum and abs((ullResult-expectedUllSum)>3):
                        if ullResult != expectedUllSum:
                            raise Exception(
                                "h2o didn't get the same xorsum as python. 0x%0.16x 0x%0.16x"
                                % (ullResult, expectedUllSum))
                            print "h2o didn't get the same xorsum as python. 0x%0.16x 0x%0.16x" % (
                                ullResult, expectedUllSum)

                h2o.check_sandbox_for_errors()

                print "first result was from a sum. others are xorsum"
                print "ullResultList:"
                for ullResult, fpResult in ullResultList:
                    print "%30s" % "ullResult (0.16x):", "0x%0.16x   %s" % (
                        ullResult, fpResult)

                print "%30s" % "expectedUllSum (0.16x):", "0x%0.16x   %s" % (
                    expectedUllSum, expectedUllSumAsDouble)
                print "%30s" % "expectedFpSum (0.16x):", "0x%0.16x   %s" % (
                    expectedFpSumAsLongLong, expectedFpSum)
Beispiel #31
0
    def test_enums_with_0_NA(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()

        choicesList = [
            (' a', ' b',  'NA'),
            (' a', ' b',  '"NA"'),
            # only one enum?
            # the NA count has to get flipped if just one enum and 0
            (' a', ' b',  ''),
            (' a', ' a',  ''),
            # (' a', 'a',  '0'), # doesn't match my "single enum' check above
            (' a', ' b', ' 0'),
            # what about mixed NA and 0? doesn't happen?
            ('N', 'Y', '0'),
            ('n', 'y', '0'),
            ('F', 'T', '0'),
            ('f', 't', '0'),
            (' N', ' Y', ' 0'),
            (' n', ' y', ' 0'),
            (' F', ' T', ' 0'),
            (' f', ' t', ' 0'),
        ]

        # white space is stripped
        expectedList = [
            # only one enum?
            (' a', ' b',  ''),
            (' a', ' b',  ''),
            ('a', 'b', ''),
            ('a', 'a', ''),
            # ('a', 'a', '0'),
            ('a', 'b', '0'),
            ('N', 'Y', '0'),
            ('n', 'y', '0'),
            ('F', 'T', '0'),
            ('f', 't', '0'),
            ('N', 'Y', '0'),
            ('n', 'y', '0'),
            ('F', 'T', '0'),
            ('f', 't', '0'),
        ]

        tryList = [
            # colname, (min, 25th, 50th, 75th, max)
            (1000, 5, 'x.hex', choicesList[4], expectedList[4]),
            (1000, 5, 'x.hex', choicesList[5], expectedList[5]),
            (1000, 5, 'x.hex', choicesList[6], expectedList[6]),
            (1000, 5, 'x.hex', choicesList[7], expectedList[7]),
            (1000, 5, 'x.hex', choicesList[3], expectedList[3]),
            (1000, 5, 'x.hex', choicesList[2], expectedList[2]),
            (1000, 5, 'x.hex', choicesList[1], expectedList[1]),
            (1000, 5, 'x.hex', choicesList[0], expectedList[0]),
        ]

        timeoutSecs = 10
        trial = 1
        n = h2o.nodes[0]
        lenNodes = len(h2o.nodes)

        x = 0
        timeoutSecs = 60
        for (rowCount, colCount, hex_key, choices, expected) in tryList:
            # max error = half the bin size?
            print "choices:", choices
        
            SEEDPERFILE = random.randint(0, sys.maxint)
            x += 1

            csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True)

            print "Creating random", csvPathname
            expectedNaCnt = write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, choices)
            # force header=0 so the T/F strings don't get deduced to be headers
            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, header=0, timeoutSecs=10, doSummary=False)
            print "Parse result['destination_key']:", parseResult['destination_key']

            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvFilename

            numRows = inspect["numRows"]
            numCols = inspect["numCols"]

            summaryResult = h2o_cmd.runSummary(key=hex_key, noPrint=False, numRows=numRows, numCols=numCols)
            h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult))

            # only one column

            for i in range(colCount):
                column = summaryResult['summaries'][i]

                colname = column['colname']
                coltype = column['type']
                nacnt = column['nacnt']
                # if it's just 0's with a single enum, the enums become NA, so the count is flipped
                self.assertEqual(nacnt, expectedNaCnt[i], "Column %s Expected %s. nacnt %s incorrect. choices: %s" % (i, expectedNaCnt[i], nacnt, choices))

                stats = column['stats']
                stattype= stats['type']
                self.assertEqual(stattype, 'Enum')

                # FIX! we should compare mean and sd to expected?
                cardinality = stats['cardinality']

                hstart = column['hstart']
                hstep = column['hstep']
                hbrk = column['hbrk']
                # cover the hacky two equal expected values
                hcnt = column['hcnt']
                if expected[0]==expected[1]:
                    self.assertEqual(hbrk, [expected[0]])
                    hcntTotal = hcnt[0]
                else: 
                    self.assertEqual(hbrk, [expected[0], expected[1]])
                    hcntTotal = hcnt[0] + hcnt[1]


                self.assertEqual(hcntTotal, rowCount - expectedNaCnt[i])

                self.assertEqual(rowCount, numRows, 
                    msg="numRows %s should be %s" % (numRows, rowCount))

            trial += 1
Beispiel #32
0
    def test_summary2_unifiles2(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        # new with 1000 bins. copy expected from R
        tryList = [
            # colname, (min, 25th, 50th, 75th, max)
            ('breadth.csv', 'b.hex', False, [ ('C1', None, None, None, None, None)], 'smalldata', 'quantiles'),
            # ('wonkysummary.csv', 'b.hex', False, [ ('X1', 7, 22, 876713, 100008, 1000046)], 'smalldata', None),
            ('wonkysummary.csv', 'b.hex', True, [ ('X1', 7.00, None, None, None, 1000046.0)], 'smalldata', None),
            ('covtype.data', 'c.hex', False, [ ('C1', None, None, None, None, None)], 'home-0xdiag-datasets', 'standard'),

        ]

        timeoutSecs = 10
        trial = 1
        n = h2o.nodes[0]
        lenNodes = len(h2o.nodes)

        x = 0
        timeoutSecs = 60
        for (csvFilename, hex_key, skipHeader, expectedCols, bucket, pathPrefix) in tryList:

            if pathPrefix:
                csvPathname = pathPrefix + "/" + csvFilename
            else:
                csvPathname = csvFilename

            csvPathnameFull = h2i.find_folder_and_filename(bucket, csvPathname, returnFullPath=True)
            if skipHeader:
                header = 1
            else:
                header = 0
            parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, 
                schema='put', header=header, hex_key=hex_key, timeoutSecs=10, doSummary=False)

            print "Parse result['destination_key']:", parseResult['destination_key']

            # We should be able to see the parse result?
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvFilename

            numRows = inspect["numRows"]
            numCols = inspect["numCols"]

            # okay to get more cols than we want
            summaryResult = h2o_cmd.runSummary(key=hex_key, max_qbins=MAX_QBINS)
            h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult))

            summaries = summaryResult['summaries']

            scipyCol = 0
            for expected, column in zip(expectedCols, summaries):
                colname = column['colname']
                if expected[0]:
                    self.assertEqual(colname, expected[0])

                quantile = 0.5 if DO_MEDIAN else OTHER_Q
                q = h2o.nodes[0].quantiles(source_key=hex_key, column=scipyCol,
                    quantile=quantile, max_qbins=MAX_QBINS, multiple_pass=2)
                qresult = q['result']
                qresult_single = q['result_single']
                qresult_iterations = q['iterations']
                qresult_interpolated = q['interpolated']
                h2p.blue_print("h2o quantiles result:", qresult)
                h2p.blue_print("h2o quantiles result_single:", qresult_single)
                h2p.blue_print("h2o quantiles iterations:", qresult_iterations)
                h2p.blue_print("h2o quantiles interpolated:", qresult_interpolated)
                print h2o.dump_json(q)

                self.assertLess(qresult_iterations, 16, 
                    msg="h2o does max of 16 iterations. likely no result_single if we hit max. is bins=1?")

                # ('',  '1.00', '25002.00', '50002.00', '75002.00', '100000.00'),
                coltype = column['type']
                nacnt = column['nacnt']

                stats = column['stats']
                stattype= stats['type']
                print stattype

                # FIX! we should compare mean and sd to expected?
                # enums don't have mean or sd?
                if stattype!='Enum':
                    mean = stats['mean']
                    sd = stats['sd']
                    zeros = stats['zeros']
                    mins = stats['mins']
                    maxs = stats['maxs']

                    print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(mean)
                    print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals(sd)

                    pct = stats['pct']
                    print "pct:", pct
                    print ""

                    # the thresholds h2o used, should match what we expected
                    pctile = stats['pctile']

                # hack..assume just one None is enough to ignore for cars.csv
                if expected[1]:
                    h2o_util.assertApproxEqual(mins[0], expected[1], rel=0.02, msg='min is not approx. expected')
                if expected[2]:
                    h2o_util.assertApproxEqual(pctile[3], expected[2], rel=0.02, msg='25th percentile is not approx. expected')
                if expected[3]:
                    h2o_util.assertApproxEqual(pctile[5], expected[3], rel=0.02, msg='50th percentile (median) is not approx. expected')
                if expected[4]:
                    h2o_util.assertApproxEqual(pctile[7], expected[4], rel=0.02, msg='75th percentile is not approx. expected')
                if expected[5]:
                    h2o_util.assertApproxEqual(maxs[0], expected[5], rel=0.02, msg='max is not approx. expected')


                # figure out the expected max error
                # use this for comparing to sklearn/sort
                if expected[1] and expected[5]:
                    expectedRange = expected[5] - expected[1]
                    # because of floor and ceil effects due we potentially lose 2 bins (worst case)
                    # the extra bin for the max value, is an extra bin..ignore
                    expectedBin = expectedRange/(MAX_QBINS-2)
                    maxErr = 0.5 * expectedBin # should we have some fuzz for fp?
                    # hack?
                    maxErr = maxErr * 2
                    print "maxErr:", maxErr

                else:
                    print "Test won't calculate max expected error"
                    maxErr = 0

                hstart = column['hstart']
                hstep = column['hstep']
                hbrk = column['hbrk']
                hcnt = column['hcnt']

                for b in hcnt:
                    # should we be able to check for a uniform distribution in the files?
                    e = .1 * numRows
                    # self.assertAlmostEqual(b, .1 * rowCount, delta=.01*rowCount,
                    #     msg="Bins not right. b: %s e: %s" % (b, e))

                if stattype!='Enum':
                    pt = h2o_util.twoDecimals(pctile)
                    print "colname:", colname, "pctile (2 places):", pt
                    mx = h2o_util.twoDecimals(maxs)
                    mn = h2o_util.twoDecimals(mins)
                    print "colname:", colname, "maxs: (2 places):", mx
                    print "colname:", colname, "mins: (2 places):", mn

                    # FIX! we should do an exec and compare using the exec quantile too
                    actual = mn[0], pt[3], pt[5], pt[7], mx[0]
                    print "min/25/50/75/max colname:", colname, "(2 places):", actual
                    print "maxs colname:", colname, "(2 places):", mx
                    print "mins colname:", colname, "(2 places):", mn

                    ## ignore for blank colnames, issues with quoted numbers
                    # covtype is too big to do in scipy
                    if colname!='' and expected[scipyCol] and csvFilename!= 'covtype.data':
                        # don't do for enums
                        # also get the median with a sort (h2o_summ.percentileOnSortedlist()
                        h2o_summ.quantile_comparisons(
                            csvPathnameFull,
                            skipHeader=skipHeader, # important!!
                            col=scipyCol,
                            datatype='float',
                            quantile=0.5 if DO_MEDIAN else OTHER_Q,
                            h2oSummary2=pctile[5 if DO_MEDIAN else OTHER_Q_SUMM_INDEX],
                            h2oQuantilesApprox=qresult_single,
                            h2oQuantilesExact=qresult,
                            h2oSummary2MaxErr=maxErr,
                            )

                scipyCol += 1

            trial += 1
    def test_summary2_percentile2(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (500000, 2, 'cD', 300, 0, 9), # expectedMin/Max must cause 10 values
            (500000, 2, 'cE', 300, 1, 10), # expectedMin/Max must cause 10 values
            (500000, 2, 'cF', 300, 2, 11), # expectedMin/Max must cause 10 values
        ]

        timeoutSecs = 10
        trial = 1
        n = h2o.nodes[0]
        lenNodes = len(h2o.nodes)

        x = 0
        for (rowCount, colCount, hex_key, timeoutSecs, expectedMin, expectedMax) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            x += 1

            csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            legalValues = {}
            for x in range(expectedMin, expectedMax):
                legalValues[x] = x
        
            write_syn_dataset(csvPathname, rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE)
            h2o.beta_features = False
            csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True)
            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30, doSummary=False)
            print "Parse result['destination_key']:", parseResult['destination_key']

            # We should be able to see the parse result?
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvFilename

            summaryResult = h2o_cmd.runSummary(key=hex_key, cols=0, max_ncols=1)
            if h2o.verbose:
                print "summaryResult:", h2o.dump_json(summaryResult)

            summaries = summaryResult['summaries']
            scipyCol = 0
            for column in summaries:
                colname = column['colname']
                coltype = column['type']
                nacnt = column['nacnt']

                stats = column['stats']
                stattype= stats['type']
                mean = stats['mean']
                sd = stats['sd']
                zeros = stats['zeros']
                mins = stats['mins']
                maxs = stats['maxs']
                pct = stats['pct']
                pctile = stats['pctile']

                hstart = column['hstart']
                hstep = column['hstep']
                hbrk = column['hbrk']
                hcnt = column['hcnt']

                for b in hcnt:
                    e = .1 * rowCount
                    self.assertAlmostEqual(b, .1 * rowCount, delta=.01*rowCount, 
                        msg="Bins not right. b: %s e: %s" % (b, e))

                print "pctile:", pctile
                print "maxs:", maxs
                self.assertEqual(maxs[0], expectedMax)
                print "mins:", mins
                self.assertEqual(mins[0], expectedMin)

                for v in pctile:
                    self.assertTrue(v >= expectedMin, 
                        "Percentile value %s should all be >= the min dataset value %s" % (v, expectedMin))
                    self.assertTrue(v <= expectedMax, 
                        "Percentile value %s should all be <= the max dataset value %s" % (v, expectedMax))
            
                eV1 = [1.0, 1.0, 1.0, 3.0, 4.0, 5.0, 7.0, 8.0, 9.0, 10.0, 10.0]
                if expectedMin==1:
                    eV = eV1
                elif expectedMin==0:
                    eV = [e-1 for e in eV1]
                elif expectedMin==2:
                    eV = [e+1 for e in eV1]
                else:
                    raise Exception("Test doesn't have the expected percentileValues for expectedMin: %s" % expectedMin)

            trial += 1

            # if colname!='' and expected[scipyCol]:
            if colname!='':
                # don't do for enums
                # also get the median with a sort (h2o_summ.percentileOnSortedlist()
                h2o_summ.quantile_comparisons(
                    csvPathnameFull,
                    skipHeader=True,
                    col=scipyCol,
                    datatype='float',
                    quantile=0.5 if DO_MEDIAN else 0.999,
                    h2oSummary2=pctile[5 if DO_MEDIAN else 10],
                    # h2oQuantilesApprox=qresult_single,
                    # h2oQuantilesExact=qresult,
                    )
            scipyCol += 1
Beispiel #34
0
    def test_summary2_unifiles(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()

        # new with 1000 bins. copy expected from R
        tryList = [
            (
                'cars.csv',
                'c.hex',
                [
                    (None, None, None, None, None, None),
                    ('economy (mpg)', None, None, None, None, None),
                    ('cylinders', None, None, None, None, None),
                ],
            ),
            (
                'runifA.csv',
                'A.hex',
                [
                    (None, 1.00, 25.00, 50.00, 75.00, 100.0),
                    ('x', -99.9, -44.7, 8.26, 58.00, 91.7),
                ],
            ),
            # colname, (min, 25th, 50th, 75th, max)
            (
                'runif.csv',
                'x.hex',
                [
                    (None, 1.00, 5000.0, 10000.0, 15000.0, 20000.00),
                    ('D', -5000.00, -3735.0, -2443, -1187.0, 99.8),
                    ('E', -100000.0, -49208.0, 1783.8, 50621.9, 100000.0),
                    ('F', -1.00, -0.4886, 0.00868, 0.5048, 1.00),
                ],
            ),
            (
                'runifB.csv',
                'B.hex',
                [
                    (None, 1.00, 2501.00, 5001.00, 7501.00, 10000.00),
                    ('x', -100.00, -50.1, 0.974, 51.7, 100, 00),
                ],
            ),
            (
                'runifC.csv',
                'C.hex',
                [
                    (None, 1.00, 25002.00, 50002.00, 75002.00, 100000.00),
                    ('x', -100.00, -50.45, -1.135, 49.28, 100.00),
                ],
            ),
        ]

        timeoutSecs = 15
        trial = 1
        n = h2o.nodes[0]
        lenNodes = len(h2o.nodes)

        timeoutSecs = 60
        for (csvFilename, hex_key, expectedCols) in tryList:

            csvPathname = csvFilename
            csvPathnameFull = h2i.find_folder_and_filename('smalldata',
                                                           csvPathname,
                                                           returnFullPath=True)
            parseResult = h2i.import_parse(bucket='smalldata',
                                           path=csvPathname,
                                           schema='put',
                                           hex_key=hex_key,
                                           timeoutSecs=10,
                                           doSummary=False)

            print "Parse result['destination_key']:", parseResult[
                'destination_key']

            # We should be able to see the parse result?
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvFilename

            numRows = inspect["numRows"]
            numCols = inspect["numCols"]

            # okay to get more cols than we want
            # okay to vary MAX_QBINS because we adjust the expected accuracy
            summaryResult = h2o_cmd.runSummary(key=hex_key,
                                               max_qbins=MAX_QBINS)
            h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult))
            summaries = summaryResult['summaries']

            scipyCol = 0
            for expected, column in zip(expectedCols, summaries):
                colname = column['colname']
                if expected[0]:
                    self.assertEqual(colname,
                                     expected[0]), colname, expected[0]
                else:
                    # if the colname is None, skip it (so we don't barf on strings on the h2o quantile page
                    scipyCol += 1
                    continue

                quantile = 0.5 if DO_MEDIAN else .999
                # h2o has problem if a list of columns (or dictionary) is passed to 'column' param
                q = h2o.nodes[0].quantiles(
                    source_key=hex_key,
                    column=column['colname'],
                    quantile=quantile,
                    max_qbins=MAX_QBINS,
                    multiple_pass=2,
                    interpolation_type=7)  # for comparing to summary2
                qresult = q['result']
                qresult_single = q['result_single']
                h2p.blue_print("h2o quantiles result:", qresult)
                h2p.blue_print("h2o quantiles result_single:", qresult_single)
                h2p.blue_print("h2o quantiles iterations:", q['iterations'])
                h2p.blue_print("h2o quantiles interpolated:",
                               q['interpolated'])
                print h2o.dump_json(q)

                # ('',  '1.00', '25002.00', '50002.00', '75002.00', '100000.00'),

                coltype = column['type']
                nacnt = column['nacnt']

                stats = column['stats']
                stattype = stats['type']
                print stattype

                # FIX! we should compare mean and sd to expected?
                # enums don't have mean or sd?
                if stattype != 'Enum':
                    mean = stats['mean']
                    sd = stats['sd']
                    zeros = stats['zeros']
                    mins = stats['mins']
                    maxs = stats['maxs']

                    print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(
                        mean)
                    print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals(
                        sd)

                    pct = stats['pct']
                    print "pct:", pct
                    print ""

                    # the thresholds h2o used, should match what we expected
                    expectedPct = [
                        0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9,
                        0.95, 0.99
                    ]
                    pctile = stats['pctile']

                # figure out the expected max error
                # use this for comparing to sklearn/sort
                if expected[1] and expected[5]:
                    expectedRange = expected[5] - expected[1]
                    # because of floor and ceil effects due we potentially lose 2 bins (worst case)
                    # the extra bin for the max value, is an extra bin..ignore
                    expectedBin = expectedRange / (MAX_QBINS - 2)
                    maxErr = 0.5 * expectedBin  # should we have some fuzz for fp?

                else:
                    print "Test won't calculate max expected error"
                    maxErr = 0

                # hack..assume just one None is enough to ignore for cars.csv
                if expected[1]:
                    h2o_util.assertApproxEqual(
                        mins[0],
                        expected[1],
                        tol=maxErr,
                        msg='min is not approx. expected')
                if expected[2]:
                    h2o_util.assertApproxEqual(
                        pctile[3],
                        expected[2],
                        tol=maxErr,
                        msg='25th percentile is not approx. expected')
                if expected[3]:
                    h2o_util.assertApproxEqual(
                        pctile[5],
                        expected[3],
                        tol=maxErr,
                        msg='50th percentile (median) is not approx. expected')
                if expected[4]:
                    h2o_util.assertApproxEqual(
                        pctile[7],
                        expected[4],
                        tol=maxErr,
                        msg='75th percentile is not approx. expected')
                if expected[5]:
                    h2o_util.assertApproxEqual(
                        maxs[0],
                        expected[5],
                        tol=maxErr,
                        msg='max is not approx. expected')

                hstart = column['hstart']
                hstep = column['hstep']
                hbrk = column['hbrk']
                hcnt = column['hcnt']

                for b in hcnt:
                    # should we be able to check for a uniform distribution in the files?
                    e = .1 * numRows
                    # self.assertAlmostEqual(b, .1 * rowCount, delta=.01*rowCount,
                    #     msg="Bins not right. b: %s e: %s" % (b, e))

                if stattype != 'Enum':
                    pt = h2o_util.twoDecimals(pctile)
                    print "colname:", colname, "pctile (2 places):", pt
                    mx = h2o_util.twoDecimals(maxs)
                    mn = h2o_util.twoDecimals(mins)
                    print "colname:", colname, "maxs: (2 places):", mx
                    print "colname:", colname, "mins: (2 places):", mn

                    # FIX! we should do an exec and compare using the exec quantile too
                    actual = mn[0], pt[3], pt[5], pt[7], mx[0]
                    print "min/25/50/75/max colname:", colname, "(2 places):", actual
                    print "maxs colname:", colname, "(2 places):", mx
                    print "mins colname:", colname, "(2 places):", mn

                    # don't check if colname is empty..means it's a string and scipy doesn't parse right?
                    # need to ignore the car names
                    if colname != '' and expected[scipyCol]:
                        # don't do for enums
                        # also get the median with a sort (h2o_summ.percentileOnSortedlist()
                        h2o_summ.quantile_comparisons(
                            csvPathnameFull,
                            skipHeader=True,
                            col=scipyCol,
                            datatype='float',
                            quantile=0.5 if DO_MEDIAN else 0.999,
                            # FIX! ignore for now
                            h2oSummary2=pctile[5 if DO_MEDIAN else 10],
                            h2oQuantilesApprox=qresult_single,
                            h2oQuantilesExact=qresult,
                            h2oSummary2MaxErr=maxErr,
                        )

                        if False and h2o_util.approxEqual(pctile[5],
                                                          0.990238116744,
                                                          tol=0.002,
                                                          msg='stop here'):
                            raise Exception("stopping to look")

                scipyCol += 1

            trial += 1
    def test_rf_predict3_fvec(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        timeoutSecs = 600
        predictHexKey = 'predict_0.hex'
        predictCsv = 'predict_0.csv'
        actualCsv = 'actual_0.csv'

        if 1 == 1:
            y = 4  # last col
            response = 'response'
            skipSrcOutputHeader = 1
            skipPredictHeader = 1
            trees = 40
            bucket = 'smalldata'
            csvPathname = 'iris/iris2.csv'
            hexKey = 'iris2.csv.hex'
            # translate = {'setosa': 0.0, 'versicolor': 1.0, 'virginica': 2.0}
            # No translate because we're using an Exec to get the data out?, and that loses the encoding?
            translate = None
            # one wrong will be 0.66667. I guess with random, that can happen?
            expectedPctWrong = 0.7

        elif 1 == 0:
            y = 54  # last col
            response = 'C55'
            skipSrcOutputHeader = 1
            skipPredictHeader = 1
            trees = 6
            # try smaller data set compared to covtype
            bucket = 'home-0xdiag-datasets'
            csvPathname = 'standard/covtype.shuffled.10pct.data'
            hexKey = 'covtype.shuffled.10pct.data.hex'
            translate = {
                '1': 1,
                '2': 2,
                '3': 3,
                '4': 4,
                '5': 5,
                '6': 6,
                '7': 7
            }
            expectedPctWrong = 0.7
        elif 1 == 0:
            y = 54  # last col
            response = 'C55'
            skipSrcOutputHeader = 1
            skipPredictHeader = 1
            trees = 40
            # try smaller data set compared to covtype
            bucket = 'home-0xdiag-datasets'
            csvPathname = 'standard/covtype.shuffled.10pct.data'
            hexKey = 'covtype.shuffled.10pct.data.hex'
            # translate = {1: 0.0, 2: 1.0, 3: 1.0, 4: 1.0, 5: 1.0, 6: 1.0, 7: 1.0}
            translate = {
                '1': 1,
                '2': 2,
                '3': 3,
                '4': 4,
                '5': 5,
                '6': 6,
                '7': 7
            }
            expectedPctWrong = 0.7
        elif 1 == 0:
            y = 54  # last col
            response = 'C55'
            skipSrcOutputHeader = 1
            skipPredictHeader = 1
            trees = 6
            bucket = 'home-0xdiag-datasets'
            csvPathname = 'standard/covtype.data'
            hexKey = 'covtype.data.hex'
            translate = {
                '1': 1,
                '2': 2,
                '3': 3,
                '4': 4,
                '5': 5,
                '6': 6,
                '7': 7
            }
            expectedPctWrong = 0.7
        else:
            y = 0  # first col
            response = 'C1'
            skipSrcOutputHeader = 1
            skipPredictHeader = 1
            trees = 6
            bucket = 'home-0xdiag-datasets'
            csvPathname = 'mnist/mnist_training.csv.gz'
            hexKey = 'mnist_training.hex'
            translate = { \
                '0': 0, '1': 1, '2': 2, '3': 3, '4': 4, \
                '5': 5, '6': 6, '7': 7, '8': 8, '9': 9 }
            expectedPctWrong = 0.7

        csvPredictPathname = SYNDATASETS_DIR + "/" + predictCsv
        csvSrcOutputPathname = SYNDATASETS_DIR + "/" + actualCsv
        # for using below in csv reader
        csvFullname = h2i.find_folder_and_filename(bucket,
                                                   csvPathname,
                                                   schema='put',
                                                   returnFullPath=True)

        def predict_and_compare_csvs(model_key, hex_key, translate=None, y=0):
            # have to slice out col 0 (the output) and feed result to predict
            # cols are 0:784 (1 output plus 784 input features
            # h2e.exec_expr(execExpr="P.hex="+hex_key+"[1:784]", timeoutSecs=30)
            dataKey = "P.hex"
            h2e.exec_expr(execExpr=dataKey + "=" + hex_key,
                          timeoutSecs=30)  # unneeded but interesting
            if skipSrcOutputHeader:
                print "Has header in dataset, so should be able to chop out col 0 for predict and get right answer"
                print "hack for now, can't chop out col 0 in Exec currently"
                dataKey = hex_key
            else:
                print "No header in dataset, can't chop out cols, since col numbers are used for names"
                dataKey = hex_key

            # +1 col index because R-like
            h2e.exec_expr(execExpr="Z.hex=" + hex_key + "[," + str(y + 1) +
                          "]",
                          timeoutSecs=30)

            start = time.time()
            predict = h2o.nodes[0].generate_predictions(
                model_key=model_key,
                data_key=hexKey,
                destination_key=predictHexKey)
            print "generate_predictions end on ", hexKey, " took", time.time(
            ) - start, 'seconds'
            h2o.check_sandbox_for_errors()
            inspect = h2o_cmd.runInspect(key=predictHexKey)
            h2o_cmd.infoFromInspect(inspect, 'predict.hex')

            h2o.nodes[0].csv_download(src_key="Z.hex",
                                      csvPathname=csvSrcOutputPathname)
            h2o.nodes[0].csv_download(src_key=predictHexKey,
                                      csvPathname=csvPredictPathname)
            h2o.check_sandbox_for_errors()

            print "Do a check of the original output col against predicted output"
            (rowNum1, originalOutput) = compare_csv_at_one_col(
                csvSrcOutputPathname,
                msg="Original",
                colIndex=0,
                translate=translate,
                skipHeader=skipSrcOutputHeader)
            (rowNum2, predictOutput) = compare_csv_at_one_col(
                csvPredictPathname,
                msg="Predicted",
                colIndex=0,
                skipHeader=skipPredictHeader)

            # no header on source
            if ((rowNum1 - skipSrcOutputHeader) !=
                (rowNum2 - skipPredictHeader)):
                raise Exception(
                    "original rowNum1: %s - %d not same as downloaded predict: rowNum2: %s - %d \
                    %s" %
                    (rowNum1, skipSrcOutputHeader, rowNum2, skipPredictHeader))

            wrong = 0
            for rowNum, (o, p) in enumerate(zip(originalOutput,
                                                predictOutput)):
                # if float(o)!=float(p):
                if str(o) != str(p):
                    if wrong == 10:
                        print "Not printing any more mismatches\n"
                    elif wrong < 10:
                        msg = "Comparing original output col vs predicted. row %s differs. \
                            original: %s predicted: %s" % (rowNum, o, p)
                        print msg
                    wrong += 1

            print "\nTotal wrong:", wrong
            print "Total:", len(originalOutput)
            pctWrong = (100.0 * wrong) / len(originalOutput)
            print "wrong/Total * 100 ", pctWrong
            # I looked at what h2o can do for modelling with binomial and it should get better than 25% error?
            if pctWrong > 2.0:
                raise Exception(
                    "pctWrong too high. Expect < 2% error because it's reusing training data"
                )
            return pctWrong

        #*****************************************************************************

        parseResult = h2i.import_parse(bucket=bucket,
                                       path=csvPathname,
                                       schema='put',
                                       hex_key=hexKey)
        kwargs = {
            'destination_key': 'rf_model',
            'response': response,
            'ntrees': trees,
            'classification': 1,
        }

        rfResult = h2o_cmd.runSpeeDRF(parseResult=parseResult,
                                      timeoutSecs=timeoutSecs,
                                      **kwargs)
        rfResult["drf_model"] = rfResult.pop("speedrf_model")
        (classification_error, classErrorPctList,
         totalScores) = h2o_rf.simpleCheckRFView(rfv=rfResult)

        print "Use H2O GeneratePredictionsPage with a H2O generated model and the same data key."
        print "Does this work? (feeding in same data key)if you're predicting, "
        print "don't you need one less column (the last is output?)"
        print "WARNING: max_iter set to 8 for benchmark comparisons"
        print "y=", y
        pctWrong = predict_and_compare_csvs(model_key='rf_model',
                                            hex_key=hexKey,
                                            translate=translate,
                                            y=y)

        # we are predicting using training data...so error is really low
        # self.assertAlmostEqual(pctWrong, classification_error, delta = 0.2,
        #     msg="predicted pctWrong: %s should be close to training classification error %s" % (pctWrong, classification_error))
        # can be zero if memorized (iris is either 0 or 0.667?)
        # just make delta 0.7 for now
        self.assertAlmostEqual(
            pctWrong,
            expectedPctWrong,
            delta=0.7,
            msg=
            "predicted pctWrong: %s should be small because we're predicting with training data"
            % pctWrong)
Beispiel #36
0
    def test_quant_cmp_uniform(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (5 * ROWS, 1, 'x.hex', 1, 20000,
             ['C1', 1.10, 5000.0, 10000.0, 15000.0, 20000.00]),
            (5 * ROWS, 1, 'x.hex', -5000, 0,
             ['C1', -5001.00, -3750.0, -2445, -1200.0, 99]),
            (1 * ROWS, 1, 'x.hex', -100000, 100000,
             ['C1', -100001.0, -50000.0, 1613.0, 50000.0, 100000.0]),
            (1 * ROWS, 1, 'x.hex', -1, 1,
             ['C1', -1.05, -0.48, 0.0087, 0.50, 1.00]),
            (1 * ROWS, 1, 'A.hex', 1, 100,
             ['C1', 1.05, 26.00, 51.00, 76.00, 100.0]),
            (1 * ROWS, 1, 'A.hex', -99, 99, ['C1', -99, -50.0, 0, 50.00, 99]),
            (1 * ROWS, 1, 'B.hex', 1, 10000,
             ['C1', 1.05, 2501.00, 5001.00, 7501.00, 10000.00]),
            (1 * ROWS, 1, 'B.hex', -100, 100,
             ['C1', -100.10, -50.0, 0.85, 51.7, 100, 00]),
            (1 * ROWS, 1, 'C.hex', 1, 100000,
             ['C1', 1.05, 25002.00, 50002.00, 75002.00, 100000.00]),
            (1 * ROWS, 1, 'C.hex', -101, 101,
             ['C1', -100.10, -50.45, -1.18, 49.28, 100.00]),
        ]

        timeoutSecs = 10
        trial = 1
        n = h2o.nodes[0]
        lenNodes = len(h2o.nodes)

        x = 0
        timeoutSecs = 60
        for (rowCount, colCount, hex_key, expectedMin, expectedMax,
             expected) in tryList:
            # max error = half the bin size?
            colname = expected[0]
            maxDelta = ((expectedMax - expectedMin) / 1000.0) / 2.0

            # add 5% for fp errors?
            maxDelta = 1.05 * maxDelta

            SEEDPERFILE = random.randint(0, sys.maxint)
            x += 1
            csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(
                colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, expectedMin,
                              expectedMax, SEEDPERFILE)
            # need the full pathname when python parses the csv for numpy/sort
            csvPathnameFull = h2i.find_folder_and_filename(None,
                                                           csvPathname,
                                                           returnFullPath=True)

            #***************************
            # Parse
            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='put',
                                           hex_key=hex_key,
                                           timeoutSecs=30,
                                           doSummary=False)
            pA = h2o_cmd.ParseObj(parseResult,
                                  expectedNumRows=rowCount,
                                  expectedNumCols=colCount)
            numRows = pA.numRows
            numCols = pA.numCols
            parse_key = pA.parse_key
            # this guy can take json object as first thing, or re-read with key
            iA = h2o_cmd.InspectObj(parse_key,
                                    expectedNumRows=rowCount,
                                    expectedNumCols=colCount,
                                    expectedMissinglist=[])

            #***************************
            # Summary
            co = h2o_cmd.runSummary(key=parse_key)
            default_pctiles = co.default_pctiles

            coList = [
                co.base,
                len(co.bins),
                len(co.data), co.domain, co.label, co.maxs, co.mean, co.mins,
                co.missing, co.ninfs, co.pctiles, co.pinfs, co.precision,
                co.sigma, co.str_data, co.stride, co.type, co.zeros
            ]
            for c in coList:
                print c

            print "len(co.bins):", len(co.bins)
            print "co.label:", co.label, "mean (2 places):", h2o_util.twoDecimals(
                co.mean)
            print "co.label:", co.label, "std dev. (2 places):", h2o_util.twoDecimals(
                co.sigma)

            print "FIX! hacking the co.pctiles because it's short by two"
            summ_pctiles = [0] + co.pctiles + [0]

            pt = h2o_util.twoDecimals(summ_pctiles)
            mx = h2o_util.twoDecimals(co.maxs)
            mn = h2o_util.twoDecimals(co.mins)
            exp = h2o_util.twoDecimals(expected[1:])

            print "co.label:", co.label, "co.pctiles (2 places):", pt
            print "default_pctiles:", default_pctiles
            print "co.label:", co.label, "co.maxs: (2 places):", mx
            print "co.label:", co.label, "co.mins: (2 places):", mn

            # FIX! we should do an exec and compare using the exec quantile too
            h2p.green_print("min/25/50/75/max co.label:", co.label, "(2 places):",\
                mn[0], pt[3], pt[5], pt[7], mx[0])
            h2p.green_print("min/25/50/75/max co.label:", co.label, "(2 places):",\
                exp[0], exp[1], exp[2], exp[3], exp[4])

            #***************************
            # Quantile
            # the thresholds h2o used, should match what we expected

            # using + here seems to result in an odd tuple..doesn't look right to h2o param
            # so went with this. Could add '[' and ']' to the list first, before the join.
            probsStr = "[%s]" % ",".join(map(str, probsList))
            parameters = {
                'model_id': "a.hex",
                'training_frame': parse_key,
                'validation_frame': parse_key,
                'ignored_columns': None,
                'probs': probsStr,
            }

            model_key = 'qhex'
            bmResult = h2o.n0.build_model(algo='quantile',
                                          model_id=model_key,
                                          training_frame=parse_key,
                                          parameters=parameters,
                                          timeoutSecs=10)
            bm = OutputObj(bmResult, 'bm')

            msec = bm.jobs[0]['msec']
            print "bm msec", msec

            # quantile result is just a job result to a key
            modelResult = h2o.n0.models(key=model_key)
            model = OutputObj(modelResult['models'][0], 'model')

            print "model.output:", model.output
            print "model.output:['quantiles']", model.output['quantiles']
            print "model.output:['iterations']", model.output['iterations']
            print "model.output:['names']", model.output['names']
            quantiles = model.output['quantiles'][
                0]  # why is this a double array
            iterations = model.output['iterations']
            assert iterations == 11, iterations
            print "quantiles: ", quantiles
            print "iterations: ", iterations

            # cmmResult = h2o.n0.compute_model_metrics(model=model_key, frame=parse_key, timeoutSecs=60)
            # cmm = OutputObj(cmmResult, 'cmm')

            # mmResult = h2o.n0.model_metrics(model=model_key, frame=parse_key, timeoutSecs=60)
            # mm = OutputObj(mmResult, 'mm')

            # prResult = h2o.n0.predict(model=model_key, frame=parse_key, timeoutSecs=60)
            # pr = OutputObj(prResult['model_metrics'][0]['predictions'], 'pr')
            h2o_cmd.runStoreView()

            trial += 1
            # compare the last threshold
            if colname != '':
                # don't do for enums
                # also get the median with a sort (h2o_summ.percentileOnSortedlist()
                h2o_summ.quantile_comparisons(
                    csvPathnameFull,
                    col=0,  # what col to extract from the csv
                    datatype='float',
                    quantile=CHECK_PCTILE,
                    # h2oSummary2=pctile[-1],
                    # h2oQuantilesApprox=result, # from exec
                    h2oExecQuantiles=quantiles[CHECK_PCTILE_INDEX],
                )
            h2o.nodes[0].remove_all_keys()
Beispiel #37
0
    def test_summary2_NY0(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()

        choicesList = [
            # only one enum?
            # the NA count has to get flipped if just one enum and 0
            (' a', ' b', ''),
            (' a', ' a', ''),
            # (' a', 'a',  '0'), # doesn't match my "single enum' check above
            (' a', ' b', ' 0'),
            # what about mixed NA and 0? doesn't happen?
            ('N', 'Y', '0'),
            ('n', 'y', '0'),
            ('F', 'T', '0'),
            ('f', 't', '0'),
            (' N', ' Y', ' 0'),
            (' n', ' y', ' 0'),
            (' F', ' T', ' 0'),
            (' f', ' t', ' 0'),
        ]

        # white space is stripped
        expectedList = [
            # only one enum?
            ('a', 'b', ''),
            ('a', 'a', ''),
            # ('a', 'a', '0'),
            ('a', 'b', '0'),
            ('N', 'Y', '0'),
            ('n', 'y', '0'),
            ('F', 'T', '0'),
            ('f', 't', '0'),
            ('N', 'Y', '0'),
            ('n', 'y', '0'),
            ('F', 'T', '0'),
            ('f', 't', '0'),
        ]

        tryList = [
            # colname, (min, 25th, 50th, 75th, max)
            (1000, 5, 'x.hex', choicesList[4], expectedList[4]),
            (1000, 5, 'x.hex', choicesList[5], expectedList[5]),
            (1000, 5, 'x.hex', choicesList[6], expectedList[6]),
            (1000, 5, 'x.hex', choicesList[7], expectedList[7]),
            (1000, 5, 'x.hex', choicesList[3], expectedList[3]),
            (1000, 5, 'x.hex', choicesList[2], expectedList[2]),
            (1000, 5, 'x.hex', choicesList[1], expectedList[1]),
            (1000, 5, 'x.hex', choicesList[0], expectedList[0]),
        ]

        timeoutSecs = 10
        trial = 1
        n = h2o.nodes[0]
        lenNodes = len(h2o.nodes)

        x = 0
        timeoutSecs = 60
        for (rowCount, colCount, hex_key, choices, expected) in tryList:
            # max error = half the bin size?

            SEEDPERFILE = random.randint(0, sys.maxint)
            x += 1

            csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(
                colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            csvPathnameFull = h2i.find_folder_and_filename(None,
                                                           csvPathname,
                                                           returnFullPath=True)

            print "Creating random", csvPathname
            expectedNaCnt = write_syn_dataset(csvPathname, rowCount, colCount,
                                              SEEDPERFILE, choices)
            # force header=0 so the T/F strings don't get deduced to be headers
            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='put',
                                           hex_key=hex_key,
                                           header=0,
                                           timeoutSecs=10,
                                           doSummary=False)
            print "Parse result['destination_key']:", parseResult[
                'destination_key']

            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvFilename

            numRows = inspect["numRows"]
            numCols = inspect["numCols"]

            summaryResult = h2o_cmd.runSummary(key=hex_key,
                                               noPrint=False,
                                               numRows=numRows,
                                               numCols=numCols)
            h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult))

            # only one column

            for i in range(colCount):
                column = summaryResult['summaries'][i]

                colname = column['colname']
                coltype = column['type']
                nacnt = column['nacnt']
                # if it's just 0's with a single enum, the enums become NA, so the count is flipped
                self.assertEqual(
                    nacnt, expectedNaCnt[i],
                    "Column %s Expected %s. nacnt %s incorrect. choices: %s" %
                    (i, expectedNaCnt[i], nacnt, choices))

                stats = column['stats']
                stattype = stats['type']
                self.assertEqual(stattype, 'Enum')

                # FIX! we should compare mean and sd to expected?
                cardinality = stats['cardinality']

                hstart = column['hstart']
                hstep = column['hstep']
                hbrk = column['hbrk']
                # cover the hacky two equal expected values
                hcnt = column['hcnt']
                if expected[0] == expected[1]:
                    self.assertEqual(hbrk, [expected[0]])
                    hcntTotal = hcnt[0]
                else:
                    self.assertEqual(hbrk, [expected[0], expected[1]])
                    hcntTotal = hcnt[0] + hcnt[1]

                self.assertEqual(hcntTotal, rowCount - expectedNaCnt[i])

                self.assertEqual(rowCount,
                                 numRows,
                                 msg="numRows %s should be %s" %
                                 (numRows, rowCount))

            trial += 1
Beispiel #38
0
    def test_GLM1_GLM2_train_pred_fvec(self):
        h2o.beta_features = False
        SYNDATASETS_DIR = h2o.make_syn_dir()

        trees = 15
        timeoutSecs = 120

        if 1 == 0:
            bucket = "home-0xdiag-datasets"
            csvPathname = "standard/covtype.data"
            hexKey = "covtype.data.hex"
            y = 54

        if 1 == 1:
            bucket = "home-0xdiag-datasets"
            csvPathname = "standard/covtype.shuffled.10pct.data"
            hexKey = "covtype.shuffled.10pct.data.hex"
            y = 54

        if 1 == 0:
            bucket = "smalldata"
            # no header
            csvPathname = "iris/iris.csv"
            y = 4

        predictHexKey = "predict.hex"
        predictCsv = "predict.csv"

        execHexKey = "A.hex"
        execCsv = "exec.csv"

        csvPredictPathname = SYNDATASETS_DIR + "/" + predictCsv
        csvExecPathname = SYNDATASETS_DIR + "/" + execCsv
        # for using below in csv reader
        csvFullname = h2i.find_folder_and_filename(bucket, csvPathname, schema="put", returnFullPath=True)

        def predict_and_compare_csvs(model_key):
            start = time.time()
            predict = h2o_cmd.runPredict(model_key=model_key, data_key=hexKey, destination_key=predictHexKey)
            print "runPredict end on ", hexKey, " took", time.time() - start, "seconds"
            h2o.check_sandbox_for_errors()
            inspect = h2o_cmd.runInspect(key=predictHexKey)
            h2o_cmd.infoFromInspect(inspect, "predict.hex")

            h2o.nodes[0].csv_download(src_key=predictHexKey, csvPathname=csvPredictPathname)
            h2o.nodes[0].csv_download(src_key=execHexKey, csvPathname=csvExecPathname)
            h2o.check_sandbox_for_errors()

            print "Do a check of the original output col against predicted output"
            translate = {1: 0.0, 2: 1.0, 3: 1.0, 4: 1.0, 5: 1.0, 6: 1.0, 7: 1.0}
            (rowNum1, originalOutput) = compare_csv_last_col(
                csvExecPathname, msg="Original, after being exec'ed", skipHeader=True
            )
            (rowNum2, predictOutput) = compare_csv_last_col(csvPredictPathname, msg="Predicted", skipHeader=True)

            # no header on source
            if rowNum1 != rowNum2:
                raise Exception(
                    "original rowNum1: %s not same as downloaded predict (w/header) rowNum2: \
                    %s"
                    % (rowNum1, rowNum2)
                )

            wrong = 0
            wrong0 = 0
            wrong1 = 0
            for rowNum, (o, p) in enumerate(zip(originalOutput, predictOutput)):
                o = float(o)
                p = float(p)
                if o != p:
                    msg = (
                        "Comparing original output col vs predicted. row %s differs. \
                        original: %s predicted: %s"
                        % (rowNum, o, p)
                    )
                    if p == 0.0 and wrong0 == 10:
                        print "Not printing any more predicted=0 mismatches"
                    elif p == 0.0 and wrong0 < 10:
                        print msg
                    if p == 1.0 and wrong1 == 10:
                        print "Not printing any more predicted=1 mismatches"
                    elif p == 1.0 and wrong1 < 10:
                        print msg

                    if p == 0.0:
                        wrong0 += 1
                    elif p == 1.0:
                        wrong1 += 1

                    wrong += 1

            print "wrong0:", wrong0
            print "wrong1:", wrong1
            print "\nTotal wrong:", wrong
            print "Total:", len(originalOutput)
            pctWrong = (100.0 * wrong) / len(originalOutput)
            print "wrong/Total * 100 ", pctWrong
            # I looked at what h2o can do for modelling with binomial and it should get better than 25% error?
            if pctWrong > 16.0:
                raise Exception("pct wrong: %s too high. Expect < 16 pct error" % pctWrong)

        # *************************************************************************
        parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema="put", hex_key=hexKey)
        h2o_cmd.runSummary(key=hexKey)

        # do the binomial conversion with Exec2, for both training and test (h2o won't work otherwise)
        trainKey = parseResult["destination_key"]
        CLASS = 1

        # just to check. are there any NA/constant cols?
        ignore_x = h2o_glm.goodXFromColumnInfo(y, key=parseResult["destination_key"], timeoutSecs=300)

        # **************************************************************************
        # first glm1
        h2o.beta_features = False
        # try ignoring the constant col to see if it makes a diff
        kwargs = {
            "lsm_solver": LSM_SOLVER,
            "standardize": STANDARDIZE,
            # 'y': 'C' + str(y),
            "y": "C" + str(y + 1),
            "family": FAMILY,
            "n_folds": 1,
            "max_iter": MAX_ITER,
            "beta_epsilon": BETA_EPSILON,
        }

        if USE_EXEC:
            # maybe go back to simpler exec here. this was from when Exec failed unless this was used
            execExpr = "A.hex=%s" % trainKey
            h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)
            # class 1=1, all else 0
            if FAMILY == "binomial":
                execExpr = "A.hex[,%s]=(A.hex[,%s]==%s)" % (y + 1, y + 1, CLASS)
                h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)
            aHack = {"destination_key": "A.hex"}
        else:
            # since we're not using predict, we can use case_mode/val to get the binomial output class
            if FAMILY == "binomial":
                kwargs.update({"case_mode": "=", "case": 1})
            aHack = {"destination_key": hexKey}

        timeoutSecs = 120
        kwargs.update({"case_mode": "=", "case": 1})

        kwargs.update({"alpha": TRY_ALPHA, "lambda": TRY_LAMBDA})
        # kwargs.update({'alpha': 0.5, 'lambda': 1e-4})
        # bad model (auc=0.5)
        # kwargs.update({'alpha': 0.0, 'lambda': 0.0})
        start = time.time()
        glm = h2o_cmd.runGLM(parseResult=aHack, timeoutSecs=timeoutSecs, **kwargs)
        # hack. fix bad 'family' ('link' is bad too)..so h2o_glm.py works right
        glm["GLMModel"]["GLMParams"]["family"] = FAMILY
        print "glm1 end on ", csvPathname, "took", time.time() - start, "seconds"
        (warnings, coefficients1, intercept1) = h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
        iterations1 = glm["GLMModel"]["iterations"]
        err1 = glm["GLMModel"]["validations"][0]["err"]
        nullDev1 = glm["GLMModel"]["validations"][0]["nullDev"]
        resDev1 = glm["GLMModel"]["validations"][0]["resDev"]

        if FAMILY == "binomial":
            classErr1 = glm["GLMModel"]["validations"][0]["classErr"]
            auc1 = glm["GLMModel"]["validations"][0]["auc"]

        # **************************************************************************
        # then glm2
        h2o.beta_features = True
        kwargs = {
            # 'ignored_cols': 'C29',
            "standardize": STANDARDIZE,
            "classification": 1 if FAMILY == "binomial" else 0,
            # 'response': 'C' + str(y),
            "response": "C" + str(y + 1),
            "family": FAMILY,
            "n_folds": 1,
            "max_iter": MAX_ITER,
            "beta_epsilon": BETA_EPSILON,
        }

        timeoutSecs = 120

        if USE_EXEC:
            # maybe go back to simpler exec here. this was from when Exec failed unless this was used
            execExpr = "B.hex=%s" % trainKey
            h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)
            # class 1=1, all else 0
            if FAMILY == "binomial":
                execExpr = "B.hex[,%s]=(B.hex[,%s]==%s)" % (y + 1, y + 1, CLASS)
                h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)
            bHack = {"destination_key": "B.hex"}
        else:
            # since we're not using predict, we can use case_mode/val to get the binomial output class
            if FAMILY == "binomial":
                kwargs.update({"case_mode": "=", "case_val": 1})
            bHack = {"destination_key": hexKey}

        kwargs.update({"alpha": TRY_ALPHA, "lambda": TRY_LAMBDA})

        #        kwargs.update({'alpha': 0.0, 'lambda': 0})
        # kwargs.update({'alpha': 0.5, 'lambda': 1e-4})
        # kwargs.update({'alpha': 0.5, 'lambda': 1e-4})
        # bad model (auc=0.5)
        # kwargs.update({'alpha': 0.0, 'lambda': 0.0})
        start = time.time()
        glm = h2o_cmd.runGLM(parseResult=bHack, timeoutSecs=timeoutSecs, **kwargs)
        print "glm2 end on ", csvPathname, "took", time.time() - start, "seconds"
        (warnings, coefficients, intercept) = h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)

        # **************************************************************************

        modelKey = glm["glm_model"]["_key"]
        avg_err = glm["glm_model"]["submodels"][0]["validation"]["avg_err"]
        best_threshold = glm["glm_model"]["submodels"][0]["validation"]["best_threshold"]
        iteration = glm["glm_model"]["submodels"][0]["iteration"]
        resDev = glm["glm_model"]["submodels"][0]["validation"]["residual_deviance"]
        nullDev = glm["glm_model"]["submodels"][0]["validation"]["null_deviance"]
        if FAMILY == "binomial":
            auc = glm["glm_model"]["submodels"][0]["validation"]["auc"]

        self.assertLess(iterations1, MAX_ITER - 1, msg="GLM1: Too many iterations, didn't converge %s" % iterations1)
        self.assertLess(iteration, MAX_ITER - 1, msg="GLM2: Too many iterations, didn't converge %s" % iteration)

        nullDevExpected = nullDev1
        self.assertAlmostEqual(
            nullDev,
            nullDevExpected,
            delta=2,
            msg="GLM2 nullDev %s is too different from GLM1 %s" % (nullDev, nullDevExpected),
        )

        iterationExpected = iterations1
        self.assertAlmostEqual(
            iteration,
            iterationExpected,
            delta=2,
            msg="GLM2 iteration %s is too different from GLM1 %s" % (iteration, iterationExpected),
        )

        # coefficients is a list.
        coeff0 = coefficients[0]
        coeff0Expected = coefficients1[0]
        print "coeff0 pct delta:", "%0.3f" % (100.0 * (abs(coeff0) - abs(coeff0Expected)) / abs(coeff0Expected))
        self.assertTrue(
            h2o_util.approx_equal(coeff0, coeff0Expected, 0.01),
            msg="GLM2 coefficient 0 %s is too different from GLM1 %s" % (coeff0, coeff0Expected),
        )

        coeff2 = coefficients[2]
        coeff2Expected = coefficients1[2]
        print "coeff2 pct delta:", "%0.3f" % (100.0 * (abs(coeff2) - abs(coeff2Expected)) / abs(coeff2Expected))
        self.assertTrue(
            h2o_util.approx_equal(coeff2, coeff2Expected, 0.01),
            msg="GLM2 coefficient 2 %s is too different from GLM1 %s" % (coeff2, coeff2Expected),
        )

        # compare to known values GLM1 got for class 1 case, with these parameters
        # aucExpected = 0.8428
        if FAMILY == "binomial":
            aucExpected = auc1
            self.assertAlmostEqual(
                auc, aucExpected, delta=10, msg="GLM2 auc %s is too different from GLM1 %s" % (auc, aucExpected)
            )

        interceptExpected = intercept1
        print "intercept pct delta:", 100.0 * (abs(intercept) - abs(interceptExpected)) / abs(interceptExpected)
        self.assertTrue(
            h2o_util.approx_equal(intercept, interceptExpected, 0.01),
            msg="GLM2 intercept %s is too different from GLM1 %s" % (intercept, interceptExpected),
        )

        # avg_errExpected = 0.2463
        avg_errExpected = err1
        self.assertAlmostEqual(
            avg_err,
            avg_errExpected,
            delta=0.05 * avg_errExpected,
            msg="GLM2 avg_err %s is too different from GLM1 %s" % (avg_err, avg_errExpected),
        )

        self.assertAlmostEqual(
            best_threshold,
            0.35,
            delta=0.01 * best_threshold,
            msg="GLM2 best_threshold %s is too different from GLM1 %s" % (best_threshold, 0.35),
        )

        predict_and_compare_csvs(model_key=modelKey)
Beispiel #39
0
    def test_mixed_int_enum_many(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()

        # this should be a sorted list for comparing to hbrk in the histogram in h2o summary?
        enumList = ['abc', 'def', 'ghi']
        # numbers 1 and 2 may not be counted as NAs correctly? what about blank space?
        intList = [0, 1, 2, '']
        expectedList = ['abc', 'def', 'ghi']

        tryList = [
            # not sure about this case
            # some of the cases interpret as ints now (not as enum)
            (ROWS, COLS, 'a.hex', enumList[0:1], expectedList[0:1],
             intList[0:2], False),
            # colname, (min, COLS5th, 50th, 75th, max)
            (ROWS, COLS, 'b.hex', enumList[0:2], expectedList[0:2],
             intList[0:1], True),
            # fails this case
            (ROWS, COLS, 'c.hex', enumList[0:1], expectedList[0:1],
             intList[0:1], True),
            (ROWS, COLS, 'd.hex', enumList[0:], expectedList[0:], intList[0:1],
             True),
            (ROWS, COLS, 'e.hex', enumList[0:2], expectedList[0:2],
             intList[0:2], True),
            # this case seems to fail
            (ROWS, COLS, 'f.hex', enumList[0:1], expectedList[0:1],
             intList[0:2], True),
            # this seems wrong also
            (ROWS, COLS, 'g.hex', enumList[0:], expectedList[0:], intList[0:2],
             True),
        ]

        timeoutSecs = 10
        trial = 1
        n = h2o.nodes[0]
        lenNodes = len(h2o.nodes)

        x = 0
        timeoutSecs = 60
        for (rowCount, colCount, hex_key, enumChoices, enumExpected,
             intChoices, resultIsEnum) in tryList:
            # max error = half the bin size?

            SEEDPERFILE = random.randint(0, sys.maxint)
            x += 1

            csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(
                colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            csvPathnameFull = h2i.find_folder_and_filename(None,
                                                           csvPathname,
                                                           returnFullPath=True)

            print "Creating random", csvPathname
            expectedNaCnt = write_syn_dataset(csvPathname, rowCount, colCount,
                                              SEEDPERFILE, enumChoices,
                                              intChoices)
            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='put',
                                           header=0,
                                           hex_key=hex_key,
                                           timeoutSecs=10,
                                           doSummary=False)
            print "Parse result['destination_key']:", parseResult[
                'destination_key']

            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\nTrial:", trial, csvFilename

            numRows = inspect["numRows"]
            numCols = inspect["numCols"]

            summaryResult = h2o_cmd.runSummary(key=hex_key,
                                               noPrint=False,
                                               numRows=numRows,
                                               numCols=numCols)
            h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult))

            for i in range(colCount):
                column = summaryResult['summaries'][i]

                colname = column['colname']
                coltype = column['type']

                stats = column['stats']
                stattype = stats['type']
                if ENABLE_ASSERTS and resultIsEnum:
                    self.assertEqual(
                        stattype, 'Enum',
                        "trial %s: Expecting summaries/stats/type to be Enum for %s col colname %s"
                        % (trial, i, colname))

                # FIX! we should compare mean and sd to expected?
                # assume enough rows to hit all of the small # of choices
                if ENABLE_ASSERTS and resultIsEnum:
                    # not always there
                    cardinality = stats['cardinality']
                    self.assertEqual(
                        cardinality,
                        len(enumChoices),
                        msg="trial %s: cardinality %s should be %s" %
                        (trial, cardinality, len(enumChoices)))

                hstart = column['hstart']
                hstep = column['hstep']
                hbrk = column['hbrk']
                # assume I create the list above in the same order that h2o will show the order. sorted?
                if ENABLE_ASSERTS and resultIsEnum:
                    self.assertEqual(hbrk, enumChoices)

                hcnt = column['hcnt']

                hcntTotal = sum(hcnt)
                numRowsCreated = rowCount + len(intChoices)
                if ENABLE_ASSERTS and resultIsEnum:
                    self.assertEqual(hcntTotal,
                                     numRowsCreated - expectedNaCnt[i])

                self.assertEqual(numRows,
                                 numRowsCreated,
                                 msg="trial %s: numRows %s should be %s" %
                                 (trial, numRows, numRowsCreated))

                nacnt = column['nacnt']
                if ENABLE_ASSERTS and resultIsEnum:
                    self.assertEqual(
                        nacnt, expectedNaCnt[i],
                        "trial %s: Column %s Expected %s. nacnt %s incorrect" %
                        (trial, i, expectedNaCnt[i], nacnt))

                # FIX! no checks for the case where it got parsed as int column!
            trial += 1
Beispiel #40
0
    def test_GLM_both(self):
        h2o.beta_features = True
        if (1==1):
            csvFilenameList = [
                ('logreg', 'benign.csv', 'binomial', 3, 10),
                # col is zero based
                # FIX! what's wrong here? index error
                ## ('uis.dat', 'binomial', 8, 5, False),
                ## ('pros.dat', 'binomial', 1, 10, False),
                ## ('chdage.dat', 'binomial', 2, 5, True),
                ## ('icu.dat', 'binomial', 1, 10, False),
                # how to ignore 6? '1,2,3,4,5', False),
                ## ('clslowbwt.dat', 'binomial', 7, 10, False),
                # ('cgd.dat', 'gaussian', 12, 5, False),
                # ('meexp.dat', 'gaussian', 3, 10, None),
            ]
        else:
            csvFilenameList = [
                # leave out ID and birth weight
                ('logreg', 'benign.csv', 'gaussian', 3, 10),
                (None, 'icu.dat', 'binomial', 1, 10),
                # need to exclude col 0 (ID) and col 10 (bwt)
                # but -x doesn't work..so do 2:9...range doesn't work? FIX!
                (None, 'nhanes3.dat', 'binomial', 15, 10),
                (None, 'lowbwt.dat', 'binomial', 1, 10),
                (None, 'lowbwtm11.dat', 'binomial', 1, 10),
                (None, 'meexp.dat', 'gaussian', 3, 10),
                # FIX! does this one hang in R?
                (None, 'nhanes3.dat', 'binomial', 15, 10),
                (None, 'pbc.dat', 'gaussian', 1, 10),
                (None, 'pharynx.dat', 'gaussian', 12, 10),
                (None, 'uis.dat', 'binomial', 8, 10),
            ]

        trial = 0
        for (offset, csvFilename, family, y, timeoutSecs) in csvFilenameList:

            # FIX! do something about this file munging
            if offset:
                csvPathname1 = offset + "/" + csvFilename
            else:
                csvPathname1 = 'logreg/umass_statdata/' + csvFilename

            fullPathname = h2i.find_folder_and_filename('smalldata', csvPathname1, returnFullPath=True)

            csvPathname2 = SYNDATASETS_DIR + '/' + csvFilename + '_2.csv'
            h2o_util.file_clean_for_R(fullPathname, csvPathname2)

            # we can inspect this to get the number of cols in the dataset (trust H2O here)
            parseResult = h2i.import_parse(path=csvPathname2, schema='put', hex_key=csvFilename, timeoutSecs=10)
            # we could specify key2 above but this is fine
            destination_key = parseResult['destination_key']
            inspect = h2o_cmd.runInspect(None, destination_key)

            if h2o.beta_features:
                num_cols = inspect['numCols']
                num_rows = inspect['numRows']
            else:
                num_cols = inspect['num_cols']
                num_rows = inspect['num_rows']

            print "num_cols", num_cols, "num_rows", num_rows
            ##  print h2o.dump_json(inspect)

            # create formula and the x for H2O GLM
            formula = "V" + str(y+1) + " ~ "
            x = None
            col_names = ""
            for c in range(0,num_cols):
                if csvFilename=='clslowbwt.dat' and c==6:
                    print "Not including col 6 for this dataset from x"
                if csvFilename=='benign.csv' and (c==0 or c==1):
                    print "Not including col 0,1 for this dataset from x"
                else:
                    # don't add the output col to the RHS of formula
                    if x is None: 
                        col_names += "V" + str(c+1)
                    else: 
                        col_names += ",V" + str(c+1)

                    if c!=y:
                        if x is None: 
                            x = str(c)
                            formula += "V" + str(c+1)
                        else: 
                            x += "," + str(c)
                            formula += "+V" + str(c+1)

            print 'formula:', formula
            print 'col_names:', col_names

        
            print 'x:', x

            if h2o.beta_features:
                kwargs = { 
                    'n_folds': 0, 
                    'response': y, 
                    # what about x?
                    'family': family, 
                    'alpha': 0, 
                    'lambda': 0,
                    'beta_epsilon': 1.0E-4, 
                    'max_iter': 50 }
            else:
                kwargs = { 
                    'n_folds': 0, 
                    'y': y, 
                    'x': x,
                    'family': family, 
                    'alpha': 0, 
                    'lambda': 1e-4,
                    'beta_epsilon': 1.0E-4, 
                    'max_iter': 50 }

            if csvFilename=='benign.csv':
                kwargs['ignored_cols'] = '0,1'

            if csvFilename=='clslowbwt.dat':
                kwargs['ignored_cols'] = '6'

            
            start = time.time()
            glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)

            print "glm end (w/check) on ", csvPathname2, 'took', time.time()-start, 'seconds'
            h2oResults = h2o_glm.simpleCheckGLM(self, glm, None, prettyPrint=True, **kwargs)
            # now do it thru R and compare
            (warningsR, cListR, interceptR) = glm_R_and_compare(self, csvPathname2, family, formula, y, h2oResults=h2oResults)

            trial += 1
            print "\nTrial #", trial
    def test_summary2_exp(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        LAMBD = random.uniform(0.005, 0.5)
        tryList = [
            # co.label, (min, 25th, 50th, 75th, max)
            # parse setup error ? supposedly fixed now
            # (1,     1, 'x.hex', 1, 20000,        ['C1', None, None, None, None, None]),
            (5, 1, 'x.hex', 1, 20000, ['C1', None, None, None, None, None]),
            (10, 1, 'x.hex', 1, 20000, ['C1', None, None, None, None, None]),
            (100, 1, 'x.hex', 1, 20000, ['C1', None, None, None, None, None]),
            (1000, 1, 'x.hex', -5000, 0, ['C1', None, None, None, None, None]),
            (10000, 1, 'x.hex', -100000, 100000,
             ['C1', None, None, None, None, None]),
            (100000, 1, 'x.hex', -1, 1, ['C1', None, None, None, None, None]),
            (1000000, 1, 'A.hex', 1, 100, ['C1', None, None, None, None,
                                           None]),
        ]

        timeoutSecs = 10
        trial = 1
        n = h2o.nodes[0]
        lenNodes = len(h2o.nodes)

        x = 0
        timeoutSecs = 60

        for (rowCount, colCount, hex_key, rangeMin, rangeMax,
             expected) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            x += 1

            csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(
                colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname, "lambd:", LAMBD
            (expectedMin, expectedMax) = write_syn_dataset(csvPathname,
                                                           rowCount,
                                                           colCount,
                                                           lambd=LAMBD,
                                                           SEED=SEEDPERFILE)
            print "expectedMin:", expectedMin, "expectedMax:", expectedMax
            maxErr = ((expectedMax - expectedMin) / 20.0) / 2.0
            # add 5% for fp errors?
            maxErr = 1.05 * maxErr

            expected[1] = expectedMin
            expected[5] = expectedMax

            csvPathnameFull = h2i.find_folder_and_filename(None,
                                                           csvPathname,
                                                           returnFullPath=True)
            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='put',
                                           hex_key=hex_key,
                                           timeoutSecs=30,
                                           doSummary=False)
            pA = h2o_cmd.ParseObj(parseResult,
                                  expectedNumRows=rowCount,
                                  expectedNumCols=colCount)
            print pA.numRows, pA.numCols, pA.parse_key

            iA = h2o_cmd.InspectObj(pA.parse_key,
                                    expectedNumRows=rowCount,
                                    expectedNumCols=colCount,
                                    expectedMissinglist=[])
            print iA.missingList, iA.labelList, iA.numRows, iA.numCols

            # column 0 not used here
            assert len(expected) == 6
            co = h2o_cmd.runSummary(key=hex_key,
                                    column=0,
                                    expected=expected[1:],
                                    maxDelta=maxErr)
            trial += 1
            h2o.nodes[0].remove_all_keys()

            scipyCol = 0
            print "maxErr", maxErr
            if co.label != '' and expected[scipyCol]:
                # don't do for enums
                # also get the median with a sort (h2o_summ.percentileOnSortedlist()
                h2o_summ.quantile_comparisons(
                    csvPathnameFull,
                    skipHeader=False,
                    col=scipyCol,
                    datatype='float',
                    quantile=0.5 if DO_MEDIAN else 0.99,
                    h2oSummary2=co.percentiles[5 if DO_MEDIAN else 9],

                    # h2oQuantilesApprox=qresult_single,
                    # h2oQuantilesExact=qresult,
                    h2oSummary2MaxErr=maxErr,
                )
Beispiel #42
0
    def test_summary2_small(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            # colname, (min, 25th, 50th, 75th, max)
            # if rowCount is None, we'll just use  the data values
            # None in expected values means no compare
            (None, 1, 'x.hex', [-1,0,1],        ('C1',  None, None, -1, None, None)),
            (None, 2, 'x.hex', [-1,0,1],        ('C1',  None, None, -1, None, None)),
            (None, 10, 'x.hex', [-1,0,1],        ('C1',  None, None, -1, None, None)),
            (None, 100, 'x.hex', [-1,0,1],        ('C1',  None, None, -1, None, None)),
            (None, 1000, 'x.hex', [-1,0,1],        ('C1',  None, None, -1, None, None)),
            (None, 10000, 'x.hex', [-1,0,1],        ('C1',  None, None, -1, None, None)),
            # (COLS, 1, 'x.hex', [1,0,-1],        ('C1',  None, None, None, None, None)),
        ]

        timeoutSecs = 10
        trial = 1
        n = h2o.nodes[0]
        lenNodes = len(h2o.nodes)

        x = 0
        timeoutSecs = 60
        for (rowCount, colCount, hex_key, values, expected) in tryList:
            # max error = half the bin size?
        
            expectedMax = max(values)
            expectedMin = min(values)
            maxDelta = ((expectedMax - expectedMin)/20.0) / 2.0
            # add 5% for fp errors?
            maxDelta = 1.05 * maxDelta


            # hmm...say we should be 100% accurate for these tests?
            maxDelta = 0

            h2o.beta_features = False
            SEEDPERFILE = random.randint(0, sys.maxint)
            x += 1

            if not rowCount:
                rowFile = len(values)
            else:
                rowFile = rowCount
            csvFilename = 'syn_' + "binary" + "_" + str(rowFile) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, values, SEEDPERFILE)

            h2o.beta_features = False
            csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True)
            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30, doSummary=False)
            print "Parse result['destination_key']:", parseResult['destination_key']

            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvFilename

            numRows = inspect["num_rows"]
            numCols = inspect["num_cols"]

            h2o.beta_features = True
            summaryResult = h2o_cmd.runSummary(key=hex_key, max_qbins=MAX_QBINS, timeoutSecs=45)
            h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult))

            quantile = 0.5 if DO_MEDIAN else .999
            q = h2o.nodes[0].quantiles(source_key=hex_key, column=0, interpolation_type=7,
                quantile=quantile, max_qbins=MAX_QBINS, multiple_pass=1)
            qresult = q['result']
            qresult_single = q['result_single']
            qresult_iterations = q['iterations']
            qresult_interpolated = q['interpolated']
            h2p.blue_print("h2o quantiles result:", qresult)
            h2p.blue_print("h2o quantiles result_single:", qresult_single)
            h2p.blue_print("h2o quantiles iterations:", qresult_iterations)
            h2p.blue_print("h2o quantiles interpolated:", qresult_interpolated)
            print h2o.dump_json(q)

            self.assertLess(qresult_iterations, 16,
                msg="h2o does max of 16 iterations. likely no result_single if we hit max. is bins=1?")


            # only one column
            column = summaryResult['summaries'][0]

            colname = column['colname']

            coltype = column['type']
            nacnt = column['nacnt']

            stats = column['stats']
            stattype= stats['type']

            # FIX! we should compare mean and sd to expected?
            mean = stats['mean']
            sd = stats['sd']

            print "colname:", colname, "mean (2 places):", twoDecimals(mean)
            print "colname:", colname, "std dev. (2 places):", twoDecimals(sd)

            zeros = stats['zeros']
            mins = stats['mins']
            maxs = stats['maxs']
            pct = stats['pct']
            # the thresholds h2o used, should match what we expected
            expectedPct= [0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99]

            pctile = stats['pctile']
            if expected[0]:
                self.assertEqual(colname, expected[0])
            if expected[1]:
                h2o_util.assertApproxEqual(mins[0], expected[1], tol=maxDelta, msg='min is not approx. expected')
            if expected[2]:
                h2o_util.assertApproxEqual(pctile[3], expected[2], tol=maxDelta, msg='25th percentile is not approx. expected')
            if expected[3]:
                h2o_util.assertApproxEqual(pctile[5], expected[3], tol=maxDelta, msg='50th percentile (median) is not approx. expected')
            if expected[4]:
                h2o_util.assertApproxEqual(pctile[7], expected[4], tol=maxDelta, msg='75th percentile is not approx. expected')
            if expected[5]:
                h2o_util.assertApproxEqual(maxs[0], expected[5], tol=maxDelta, msg='max is not approx. expected')

            hstart = column['hstart']
            hstep = column['hstep']
            hbrk = column['hbrk']
            hcnt = column['hcnt']

            print "pct:", pct
            print ""

            print "hcnt:", hcnt
            print "len(hcnt)", len(hcnt)

            # don't check the last bin
            for b in hcnt[1:-1]:
                # should we be able to check for a uniform distribution in the files?
                e = numRows/len(hcnt) # expect 21 thresholds, so 20 bins. each 5% of rows (uniform distribution)
                # don't check the edge bins
                self.assertAlmostEqual(b, rowCount/len(hcnt), delta=.01*rowCount, 
                    msg="Bins not right. b: %s e: %s" % (b, e))

            pt = twoDecimals(pctile)
            mx = twoDecimals(maxs)
            mn = twoDecimals(mins)
            print "colname:", colname, "pctile (2 places):", pt
            print "colname:", colname, "maxs: (2 places):", mx
            print "colname:", colname, "mins: (2 places):", mn

            # FIX! we should do an exec and compare using the exec quantile too
            compareActual = mn[0], pt[3], pt[5], pt[7], mx[0]
            h2p.green_print("min/25/50/75/max colname:", colname, "(2 places):", compareActual)
            print "maxs colname:", colname, "(2 places):", mx
            print "mins colname:", colname, "(2 places):", mn

            trial += 1
            h2o.nodes[0].remove_all_keys()

            scipyCol = 0
            if DO_TRY_SCIPY and colname!='':
                # don't do for enums
                # also get the median with a sort (h2o_summ.percentileOnSortedlist()
                print scipyCol, pctile[10]
                generate_scipy_comparison(csvPathnameFull, col=scipyCol,
                     # h2oMedian=pctile[5 if DO_MEDIAN else 10], result_single)
                    h2oMedian=pctile[5 if DO_MEDIAN else 10], h2oMedian2=qresult)



            h2i.delete_keys_at_all_nodes()
Beispiel #43
0
    def test_summary2_int2B(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            # colname, (min, 25th, 50th, 75th, max)
            (100000, 1, 'B.hex', 2533255332, 2633256000,   ('C1',  None, None, None, None, None)),
        ]

        timeoutSecs = 10
        trial = 1
        n = h2o.nodes[0]
        lenNodes = len(h2o.nodes)

        x = 0
        timeoutSecs = 60
        for (rowCount, colCount, hex_key, expectedMin, expectedMax, expected) in tryList:
            # max error = half the bin size?
        
            maxDelta = ((expectedMax - expectedMin)/(MAX_QBINS + 0.0)) 
            # add 5% for fp errors?
            maxDelta = 1.05 * maxDelta
            # also need to add some variance due to random distribution?
            # maybe a percentage of the mean
            distMean = (expectedMax - expectedMin) / 2
            maxShift = distMean * .01
            maxDelta = maxDelta + maxShift

            SEEDPERFILE = random.randint(0, sys.maxint)
            x += 1

            csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE)
            csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True)
            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=60, doSummary=False)
            print "Parse result['destination_key']:", parseResult['destination_key']

            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvFilename

            numRows = inspect["numRows"]
            numCols = inspect["numCols"]

            summaryResult = h2o_cmd.runSummary(key=hex_key, max_qbins=MAX_QBINS)
            h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult))

            # only one column
            column = summaryResult['summaries'][0]
            colname = column['colname']
            if expected[0]:
                self.assertEqual(colname, expected[0])

            coltype = column['type']
            nacnt = column['nacnt']

            stats = column['stats']
            stattype= stats['type']

            # FIX! we should compare mean and sd to expected?
            mean = stats['mean']
            sd = stats['sd']

            print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(mean)
            print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals(sd)

            zeros = stats['zeros']
            mins = stats['mins']
            maxs = stats['maxs']

            pct = stats['pct']
            # the thresholds h2o used, should match what we expected
            expectedPct= [0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99]

            pctile = stats['pctile']
            if expected[1]:
                h2o_util.assertApproxEqual(mins[0], expected[1], tol=maxDelta, msg='min is not approx. expected')
                h2o_util.assertApproxEqual(pctile[3], expected[2], tol=maxDelta, msg='25th percentile is not approx. expected')
                h2o_util.assertApproxEqual(pctile[5], expected[3], tol=maxDelta, msg='50th percentile (median) is not approx. expected')
                h2o_util.assertApproxEqual(pctile[7], expected[4], tol=maxDelta, msg='75th percentile is not approx. expected')

                h2o_util.assertApproxEqual(maxs[0], expected[5], tol=maxDelta, msg='max is not approx. expected')

            hstart = column['hstart']
            hstep = column['hstep']
            hbrk = column['hbrk']
            hcnt = column['hcnt']

            print "pct:", pct
            print "hcnt:", hcnt
            print "len(hcnt)", len(hcnt)

            # don't check the last bin
            for b in hcnt[1:-1]:
                # should we be able to check for a uniform distribution in the files?
                e = numRows/len(hcnt) # expect 21 thresholds, so 20 bins. each 5% of rows (uniform distribution)
                # apparently we can't estimate any more
                # self.assertAlmostEqual(b, rowCount/len(hcnt), delta=.01*rowCount, 
                #     msg="Bins not right. b: %s e: %s" % (b, e))

            pt = h2o_util.twoDecimals(pctile)
            mx = h2o_util.twoDecimals(maxs)
            mn = h2o_util.twoDecimals(mins)
            print "colname:", colname, "pctile (2 places):", pt
            print "colname:", colname, "maxs: (2 places):", mx
            print "colname:", colname, "mins: (2 places):", mn

            # FIX! we should do an exec and compare using the exec quantile too
            compareActual = mn[0], pt[3], pt[5], pt[7], mx[0]
            h2p.green_print("min/25/50/75/max colname:", colname, "(2 places):", compareActual)
            print "maxs colname:", colname, "(2 places):", mx
            print "mins colname:", colname, "(2 places):", mn

            trial += 1

            scipyCol = 0 
    def test_summary2_NY0(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        choicesList = [
            ('N', 'Y', '0'),
            ('n', 'y', '0'),
            ('F', 'T', '0'),
            ('f', 't', '0'),
            (' N', ' Y', ' 0'),
            (' n', ' y', ' 0'),
            (' F', ' T', ' 0'),
            (' f', ' t', ' 0'),
        ]

        # white space is stripped
        expectedList = [
            ('N', 'Y', '0'),
            ('n', 'y', '0'),
            ('F', 'T', '0'),
            ('f', 't', '0'),
            ('N', 'Y', '0'),
            ('n', 'y', '0'),
            ('F', 'T', '0'),
            ('f', 't', '0'),
        ]

        tryList = [
            # colname, (min, 25th, 50th, 75th, max)
            (100, 200, 'x.hex', choicesList[4], expectedList[4]),
            (100, 200, 'x.hex', choicesList[5], expectedList[5]),
            (100, 200, 'x.hex', choicesList[6], expectedList[6]),
            (100, 200, 'x.hex', choicesList[7], expectedList[7]),
            (100, 200, 'x.hex', choicesList[3], expectedList[3]),
            (1000, 200, 'x.hex', choicesList[2], expectedList[2]),
            (10000, 200, 'x.hex', choicesList[1], expectedList[1]),
            (100000, 200, 'x.hex', choicesList[0], expectedList[0]),
        ]

        timeoutSecs = 10
        trial = 1
        n = h2o.nodes[0]
        lenNodes = len(h2o.nodes)

        x = 0
        timeoutSecs = 60
        for (rowCount, colCount, hex_key, choices, expected) in tryList:
            # max error = half the bin size?

            SEEDPERFILE = random.randint(0, sys.maxint)
            x += 1

            csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(
                colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            csvPathnameFull = h2i.find_folder_and_filename(None,
                                                           csvPathname,
                                                           returnFullPath=True)

            print "Creating random", csvPathname
            expectedNaCnt = write_syn_dataset(csvPathname, rowCount, colCount,
                                              SEEDPERFILE, choices)

            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='put',
                                           hex_key=hex_key,
                                           timeoutSecs=10,
                                           doSummary=False)
            pA = h2o_cmd.ParseObj(parseResult,
                                  expectedNumRows=rowCount,
                                  expectedNumCols=colCount)
            print pA.numRows, pA.numCols, pA.parse_key

            iA = h2o_cmd.InspectObj(pA.parse_key,
                                    expectedNumRows=rowCount,
                                    expectedNumCols=colCount,
                                    expectedMissinglist=[])
            print iA.missingList, iA.labelList, iA.numRows, iA.numCols

            for i in range(colCount):
                # walks across the columns triggering a summary on the col desired
                # runSummary returns a column object now. inspect and parse don't. They return json.
                # maybe eventually will make them return object? But I also pass expected stuff to them
                # should I pass expected to summary? no, more complex?
                co = h2o_cmd.runSummary(key=hex_key, column=i)
                print co.label, co.type, co.missing_count, co.domain, sum(
                    co.histogram_bins)

                print "\nComparing column %s to expected" % i
                self.assertEqual(expectedNaCnt[i], co.missing_count, "Column %s Expected %s. missing: %s is incorrect" % \
                    (i, expectedNaCnt[i], co.missing_count))
                self.assertEqual(rowCount - expectedNaCnt[i],
                                 sum(co.histogram_bins))

            h2p.green_print("\nDone with trial", trial)
            trial += 1

            h2i.delete_keys_at_all_nodes()
Beispiel #45
0
    def test_GLM1_GLM2_predict(self):
        # h2b.browseTheCloud()
        h2o.beta_features = False
        SYNDATASETS_DIR = h2o.make_syn_dir()

        trees = 15
        timeoutSecs = 120
        predictHexKey = 'predict_0.hex'
        predictCsv = 'predict_0.csv'
        actualCsv = 'actual_0.csv'

        if 1==0:
            skipSrcOutputHeader = 1
            skipPredictHeader = 1
            bucket = 'home-0xdiag-datasets'
            csvPathname = 'standard/covtype.data'
            hexKey = 'covtype.data.hex'
            y = 54
            expectedPctWrong = 0
        
        if 1==0:
            skipSrcOutputHeader = 1
            skipPredictHeader = 1
            bucket = 'home-0xdiag-datasets'
            csvPathname = 'standard/covtype.shuffled.10pct.data'
            hexKey = 'covtype.shuffled.10pct.data.hex'
            y = 54
            expectedPctWrong = 0

        if 1==1:
            skipSrcOutputHeader = 1
            skipPredictHeader = 1
            bucket = 'smalldata'
            # no header
            csvPathname = 'iris/iris.csv'
            hexKey = 'iris.hex'
            y = 4
            expectedPctWrong = 26
            

        csvPredictPathname = SYNDATASETS_DIR + "/" + predictCsv
        csvSrcOutputPathname = SYNDATASETS_DIR + "/" + actualCsv
        # for using below in csv reader
        csvFullname = h2i.find_folder_and_filename(bucket, csvPathname, schema='put', returnFullPath=True)

        parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey)
        h2o_cmd.runSummary(key=hexKey)

        # do the binomial conversion with Exec2, for both training and test (h2o won't work otherwise)
        trainKey = parseResult['destination_key']

        # just to check. are there any NA/constant cols?
        ignore_x = h2o_glm.goodXFromColumnInfo(y, key=parseResult['destination_key'], timeoutSecs=300)

        #**************************************************************************
        # first glm1
        h2o.beta_features = False
        CLASS = 1
        # try ignoring the constant col to see if it makes a diff
        kwargs = {
            'lsm_solver': LSM_SOLVER,
            'standardize': STANDARDIZE,
            'y': 'C' + str(y+1),
            'family': FAMILY,
            'n_folds': 0,
            'max_iter': MAX_ITER,
            'beta_epsilon': BETA_EPSILON,
            'case': CLASS,
            'case_mode': '=',
         }

        timeoutSecs = 120
        kwargs.update({'alpha': TRY_ALPHA, 'lambda': TRY_LAMBDA})
        start = time.time()
        glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
        # hack. fix bad 'family' ('link' is bad too)..so h2o_glm.py works right
        glm['GLMModel']['GLMParams']['family'] = FAMILY
        print "glm1 end on ", csvPathname, 'took', time.time() - start, 'seconds'
        (warnings, coefficients1, intercept1) = h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
        iterations1 = glm['GLMModel']['iterations']
        err1 = glm['GLMModel']['validations'][0]['err']
        nullDev1 = glm['GLMModel']['validations'][0]['nullDev']
        resDev1 = glm['GLMModel']['validations'][0]['resDev']

        if FAMILY == 'binomial':
            classErr1 = glm['GLMModel']['validations'][0]['classErr']
            auc1 = glm['GLMModel']['validations'][0]['auc']

        #**************************************************************************
        # then glm2
        h2o.beta_features = True
        kwargs = {
            # 'ignored_cols': 'C29',
            'standardize': STANDARDIZE,
            'response': 'C' + str(y+1),
            'family': FAMILY,
            'n_folds': 0,
            'max_iter': MAX_ITER,
            'beta_epsilon': BETA_EPSILON}

        timeoutSecs = 120

        # class 1=1, all else 0
        if FAMILY == 'binomial':
            execExpr="B.hex=%s; B.hex[,%s]=(%s[,%s]==%s)" % (trainKey, y+1, trainKey, y+1, CLASS)
            h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)
            bHack = {'destination_key': 'B.hex'}
        else:
            bHack = parseResult
        kwargs.update({'alpha': TRY_ALPHA, 'lambda': TRY_LAMBDA})

#        kwargs.update({'alpha': 0.0, 'lambda': 0})
        # kwargs.update({'alpha': 0.5, 'lambda': 1e-4})
        # kwargs.update({'alpha': 0.5, 'lambda': 1e-4})
        # bad model (auc=0.5)
        # kwargs.update({'alpha': 0.0, 'lambda': 0.0})
        start = time.time()
        glm = h2o_cmd.runGLM(parseResult=bHack, timeoutSecs=timeoutSecs, **kwargs)
        print "glm2 end on ", csvPathname, 'took', time.time() - start, 'seconds'
        (warnings, coefficients, intercept) = h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)

        #**************************************************************************
        modelKey = glm['glm_model']['_key']
        submodels = glm['glm_model']['submodels']
        # hackery to make it work when there's just one
        validation = submodels[-1]['validation']
        iteration = submodels[-1]['iteration']

        resDev = validation['residual_deviance']
        nullDev = validation['null_deviance']
        if FAMILY == 'binomial':
            auc = validation['auc']

        self.assertLess(iterations1, MAX_ITER-1, msg="GLM1: Too many iterations, didn't converge %s" % iterations1) 
        self.assertLess(iteration, MAX_ITER-1, msg="GLM2: Too many iterations, didn't converge %s" % iteration)

        nullDevExpected = nullDev1
        # self.assertAlmostEqual(nullDev, nullDevExpected, delta=2, 
        #     msg='GLM2 nullDev %s is too different from GLM1 %s' % (nullDev, nullDevExpected))

        iterationExpected = iterations1
        # self.assertAlmostEqual(iteration, iterationExpected, delta=2, 
        #     msg='GLM2 iteration %s is too different from GLM1 %s' % (iteration, iterationExpected))


        # coefficients is a list.
        coeff0 = coefficients[0]
        coeff0Expected = coefficients1[0]
        print "coeff0 pct delta:", "%0.3f" % (100.0 * (abs(coeff0) - abs(coeff0Expected))/abs(coeff0Expected))
        self.assertTrue(h2o_util.approxEqual(coeff0, coeff0Expected, rel=0.5),
            msg='GLM2 coefficient 0 %s is too different from GLM1 %s' % (coeff0, coeff0Expected))

        
        coeff2 = coefficients[2]
        coeff2Expected = coefficients1[2]
        print "coeff2 pct delta:", "%0.3f" % (100.0 * (abs(coeff2) - abs(coeff2Expected))/abs(coeff2Expected))
        self.assertTrue(h2o_util.approxEqual(coeff2, coeff2Expected, rel=0.5),
            msg='GLM2 coefficient 2 %s is too different from GLM1 %s' % (coeff2, coeff2Expected))

        # compare to known values GLM1 got for class 1 case, with these parameters
        # aucExpected = 0.8428
        if FAMILY == 'binomial':
            aucExpected = auc1
            self.assertAlmostEqual(auc, aucExpected, delta=10, 
                msg='GLM2 auc %s is too different from GLM1 %s' % (auc, aucExpected))

        interceptExpected = intercept1
        print "intercept pct delta:", 100.0 * (abs(intercept) - abs(interceptExpected))/abs(interceptExpected)
        self.assertTrue(h2o_util.approxEqual(intercept, interceptExpected, rel=0.5),
            msg='GLM2 intercept %s is too different from GLM1 %s' % (intercept, interceptExpected))


        # avg_errExpected = 0.2463
        avg_errExpected = err1
        # self.assertAlmostEqual(avg_err, avg_errExpected, delta=0.50*avg_errExpected, 
        #     msg='GLM2 avg_err %s is too different from GLM1 %s' % (avg_err, avg_errExpected))

        # self.assertAlmostEqual(best_threshold, 0.35, delta=0.10*best_threshold, 
        #     msg='GLM2 best_threshold %s is too different from GLM1 %s' % (best_threshold, 0.35))

        #********************
        # Print comparison
        #********************
        interceptDelta = abs(abs(intercept1) - abs(intercept))
        cDelta = [abs(abs(a) - abs(b)) for a,b in zip(coefficients1, coefficients)]

        def printit(self, a, b, c, d):
            pctDiff = abs(d/c)*100
            print "%-20s %-20.5e %8s %5.2f%% %10s %-20.5e" % \
                ("GLM2: " + a + " " + b + ":", c, "pct. diff:", pctDiff, "abs diff:", d)
            # self.assertLess(pctDiff,1,"Expect <1% difference between H2O and R coefficient/intercept")

        printit(self, "intercept", "", intercept1, interceptDelta)
        print "compare lengths coefficients1, coefficients, cDelta:", len(coefficients1), len(coefficients), len(cDelta)
        print "GLM1:", coefficients1
        print "GLM2:", coefficients
        print "cDelta:", cDelta

        for i,cValue in enumerate(coefficients):
            printit(self , "coefficient", "C"+str(i), cValue, cDelta[i])

        hexKey = 'B.hex'
        pctWrong = h2o_rf.predict_and_compare_csvs(modelKey, hexKey, predictHexKey, 
            csvSrcOutputPathname, csvPredictPathname, 
            skipSrcOutputHeader, skipPredictHeader,
            translate=None, y=y)

        # we are predicting using training data...so error is really low
        # self.assertAlmostEqual(pctWrong, classification_error, delta = 0.2, 
        #     msg="predicted pctWrong: %s should be close to training classification error %s" % (pctWrong, classification_error))
        self.assertAlmostEqual(pctWrong, expectedPctWrong, delta = 2.0,
            msg="predicted pctWrong: %s should be small because we're predicting with training data %s" % (pctWrong, expectedPctWrong))
Beispiel #46
0
def do_scipy_glm(self, bucket, csvPathname, L, family='binomial'):
    
    h2p.red_print("Now doing sklearn")
    h2p.red_print("\nsee http://scikit-learn.org/0.11/modules/generated/sklearn.linear_model.LogisticRegression.html#sklearn.linear_model.LogisticRegression")

    import numpy as np
    import scipy as sp
    from sklearn.linear_model import LogisticRegression
    from numpy import loadtxt

    csvPathnameFull = h2i.find_folder_and_filename(bucket, csvPathname, returnFullPath=True)

    # make sure it does fp divide
    C = 1/(L+0.0)
    print "C regularization:", C
    dataset = np.loadtxt( 
        open(csvPathnameFull,'r'),
        skiprows=1, # skip the header
        delimiter=',',
        dtype='float');

    print "\ncsv read for training, done"

    n_features = len(dataset[0]) - 1;
    print "n_features:", n_features

    # don't want ID (col 0) or CAPSULE (col 1)
    # get CAPSULE
    target = [x[1] for x in dataset]
    # slice off the first 2
    train  = np.array ( [x[2:] for x in dataset] )


    n_samples, n_features = train.shape
    print "n_samples:", n_samples, "n_features:",  n_features

    print "histogram of target"
    print sp.histogram(target,3)

    print "len(train):",  len(train)
    print "len(target):", len(target)
    print "dataset shape:", dataset.shape

    if family!='binomial':
        raise Exception("Only have binomial logistic for scipy")
    print "\nTrying l2"
    clf2 = LogisticRegression(
        C=C,
        dual=False, 
        fit_intercept=True, 
        intercept_scaling=1, 
        penalty='l2', 
        tol=0.0001);

    # train the classifier
    start = time.time()
    clf2.fit(train, target)
    print "L2 fit took", time.time() - start, "seconds"

    # print "coefficients:", clf2.coef_
    cstring = "".join([("%.5e  " % c) for c in clf2.coef_[0]])
    h2p.green_print("sklearn L2 C", C)
    h2p.green_print("sklearn coefficients:", cstring)
    h2p.green_print("sklearn intercept:", "%.5e" % clf2.intercept_[0])
    h2p.green_print("sklearn score:", clf2.score(train,target))

    print "\nTrying l1"
    clf1 = LogisticRegression(
        C=C,
        dual=False, 
        fit_intercept=True, 
        intercept_scaling=1, 
        penalty='l1', 
        tol=0.0001);

    # train the classifier
    start = time.time()
    clf1.fit(train, target)
    print "L1 fit took", time.time() - start, "seconds"

    # print "coefficients:", clf1.coef_
    cstring = "".join([("%.5e  " % c) for c in clf1.coef_[0]])
    h2p.green_print("sklearn L1 C", C)
    h2p.green_print("sklearn coefficients:", cstring)
    h2p.green_print("sklearn intercept:", "%.5e" % clf1.intercept_[0])
    h2p.green_print("sklearn score:", clf1.score(train,target))

    # attributes are accessed in the normal python way
    dx = clf1.__dict__
    dx.keys()
Beispiel #47
0
    def test_rf_predict3_fvec(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()

        timeoutSecs = 600
        predictHexKey = 'predict_0.hex'
        predictCsv = 'predict_0.csv'
        actualCsv = 'actual_0.csv'

        if 1==1:
            y = 4 # last col
            response = 'response'
            skipSrcOutputHeader = 1
            skipPredictHeader = 1
            trees = 40
            bucket = 'smalldata'
            csvPathname = 'iris/iris2.csv'
            hexKey = 'iris2.csv.hex'
            # translate = {'setosa': 0.0, 'versicolor': 1.0, 'virginica': 2.0}
            # No translate because we're using an Exec to get the data out?, and that loses the encoding?
            translate = None
            # one wrong will be 0.66667. I guess with random, that can happen?
            expectedPctWrong = 0.7

        elif 1==0:
            y = 54 # last col
            response = 'C55'
            skipSrcOutputHeader = 1
            skipPredictHeader = 1
            trees = 6
            # try smaller data set compared to covtype
            bucket = 'home-0xdiag-datasets'
            csvPathname = 'standard/covtype.shuffled.10pct.data'
            hexKey = 'covtype.shuffled.10pct.data.hex'
            translate = {'1': 1, '2': 2, '3': 3, '4': 4, '5': 5, '6': 6, '7': 7}
            expectedPctWrong = 0.7
        elif 1==0:
            y = 54 # last col
            response = 'C55'
            skipSrcOutputHeader = 1
            skipPredictHeader = 1
            trees = 40
            # try smaller data set compared to covtype
            bucket = 'home-0xdiag-datasets'
            csvPathname = 'standard/covtype.shuffled.10pct.data'
            hexKey = 'covtype.shuffled.10pct.data.hex'
            # translate = {1: 0.0, 2: 1.0, 3: 1.0, 4: 1.0, 5: 1.0, 6: 1.0, 7: 1.0}
            translate = {'1': 1, '2': 2, '3': 3, '4': 4, '5': 5, '6': 6, '7': 7}
            expectedPctWrong = 0.7
        elif 1==0:
            y = 54 # last col
            response = 'C55'
            skipSrcOutputHeader = 1
            skipPredictHeader = 1
            trees = 6
            bucket = 'home-0xdiag-datasets'
            csvPathname = 'standard/covtype.data'
            hexKey = 'covtype.data.hex'
            translate = {'1': 1, '2': 2, '3': 3, '4': 4, '5': 5, '6': 6, '7': 7}
            expectedPctWrong = 0.7
        else:
            y = 0 # first col
            response = 'C1'
            skipSrcOutputHeader = 1
            skipPredictHeader = 1
            trees = 6
            bucket = 'home-0xdiag-datasets'
            csvPathname = 'mnist/mnist_training.csv.gz'
            hexKey = 'mnist_training.hex'
            translate = { \
                '0': 0, '1': 1, '2': 2, '3': 3, '4': 4, \
                '5': 5, '6': 6, '7': 7, '8': 8, '9': 9 }
            expectedPctWrong = 0.7

        csvPredictPathname = SYNDATASETS_DIR + "/" + predictCsv
        csvSrcOutputPathname = SYNDATASETS_DIR + "/" + actualCsv
        # for using below in csv reader
        csvFullname = h2i.find_folder_and_filename(bucket, csvPathname, schema='put', returnFullPath=True)

        def predict_and_compare_csvs(model_key, hex_key, translate=None, y=0):
            # have to slice out col 0 (the output) and feed result to predict
            # cols are 0:784 (1 output plus 784 input features
            # h2e.exec_expr(execExpr="P.hex="+hex_key+"[1:784]", timeoutSecs=30)
            dataKey = "P.hex"
            h2e.exec_expr(execExpr=dataKey+"="+hex_key, timeoutSecs=30) # unneeded but interesting
            if skipSrcOutputHeader:
                print "Has header in dataset, so should be able to chop out col 0 for predict and get right answer"
                print "hack for now, can't chop out col 0 in Exec currently"
                dataKey = hex_key
            else:
                print "No header in dataset, can't chop out cols, since col numbers are used for names"
                dataKey = hex_key

            # +1 col index because R-like
            h2e.exec_expr(execExpr="Z.hex="+hex_key+"[,"+str(y+1)+"]", timeoutSecs=30)

            start = time.time()
            predict = h2o.nodes[0].generate_predictions(model_key=model_key,
                                                        data_key=hexKey, destination_key=predictHexKey)
            print "generate_predictions end on ", hexKey, " took", time.time() - start, 'seconds'
            h2o.check_sandbox_for_errors()
            inspect = h2o_cmd.runInspect(key=predictHexKey)
            h2o_cmd.infoFromInspect(inspect, 'predict.hex')

            h2o.nodes[0].csv_download(src_key="Z.hex", csvPathname=csvSrcOutputPathname)
            h2o.nodes[0].csv_download(src_key=predictHexKey, csvPathname=csvPredictPathname)
            h2o.check_sandbox_for_errors()

            print "Do a check of the original output col against predicted output"
            (rowNum1, originalOutput) = compare_csv_at_one_col(csvSrcOutputPathname,
                                                               msg="Original", colIndex=0, translate=translate, skipHeader=skipSrcOutputHeader)
            (rowNum2, predictOutput)  = compare_csv_at_one_col(csvPredictPathname,
                                                               msg="Predicted", colIndex=0, skipHeader=skipPredictHeader)

            # no header on source
            if ((rowNum1-skipSrcOutputHeader) != (rowNum2-skipPredictHeader)):
                raise Exception("original rowNum1: %s - %d not same as downloaded predict: rowNum2: %s - %d \
                    %s" % (rowNum1, skipSrcOutputHeader, rowNum2, skipPredictHeader))

            wrong = 0
            for rowNum,(o,p) in enumerate(zip(originalOutput, predictOutput)):
                # if float(o)!=float(p):
                if str(o)!=str(p):
                    if wrong==10:
                        print "Not printing any more mismatches\n"
                    elif wrong<10:
                        msg = "Comparing original output col vs predicted. row %s differs. \
                            original: %s predicted: %s"  % (rowNum, o, p)
                        print msg
                    wrong += 1

            print "\nTotal wrong:", wrong
            print "Total:", len(originalOutput)
            pctWrong = (100.0 * wrong)/len(originalOutput)
            print "wrong/Total * 100 ", pctWrong
            # I looked at what h2o can do for modelling with binomial and it should get better than 25% error?
            if pctWrong > 2.0:
                raise Exception("pctWrong too high. Expect < 2% error because it's reusing training data")
            return pctWrong

        #*****************************************************************************

        parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey)
        kwargs = {
            'destination_key': 'rf_model',
            'response': response,
            'ntrees': trees,
            'classification': 1,
            }


        rfResult = h2o_cmd.runSpeeDRF(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
        rfResult["drf_model"] = rfResult.pop("speedrf_model")
        (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfResult)

        print "Use H2O GeneratePredictionsPage with a H2O generated model and the same data key."
        print "Does this work? (feeding in same data key)if you're predicting, "
        print "don't you need one less column (the last is output?)"
        print "WARNING: max_iter set to 8 for benchmark comparisons"
        print "y=", y
        pctWrong = predict_and_compare_csvs(model_key='rf_model', hex_key=hexKey, translate=translate, y=y)

        # we are predicting using training data...so error is really low
        # self.assertAlmostEqual(pctWrong, classification_error, delta = 0.2, 
        #     msg="predicted pctWrong: %s should be close to training classification error %s" % (pctWrong, classification_error))
        # can be zero if memorized (iris is either 0 or 0.667?)
        # just make delta 0.7 for now
        self.assertAlmostEqual(pctWrong, expectedPctWrong, delta = 0.7,
                               msg="predicted pctWrong: %s should be small because we're predicting with training data" % pctWrong)
Beispiel #48
0
    def test_summary2_uniform(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            # colname, (min, 25th, 50th, 75th, max)
            (ROWS, 1, 'x.hex', 1, 20000,        ('C1',  1.10, 5000.0, 10000.0, 15000.0, 20000.00)),
            (ROWS, 1, 'x.hex', -5000, 0,        ('C1', -5001.00, -3750.0, -2445, -1200.0, 99)),
            (ROWS, 1, 'x.hex', -100000, 100000, ('C1',  -100001.0, -50000.0, 1613.0, 50000.0, 100000.0)),
            (ROWS, 1, 'x.hex', -1, 1,           ('C1',  -1.05, -0.48, 0.0087, 0.50, 1.00)),

            (ROWS, 1, 'A.hex', 1, 100,          ('C1',   1.05, 26.00, 51.00, 76.00, 100.0)),
            (ROWS, 1, 'A.hex', -99, 99,         ('C1',  -99, -50.0, 0, 50.00, 99)),

            (ROWS, 1, 'B.hex', 1, 10000,        ('C1',   1.05, 2501.00, 5001.00, 7501.00, 10000.00)),
            (ROWS, 1, 'B.hex', -100, 100,       ('C1',  -100.10, -50.0, 0.85, 51.7, 100,00)),

            (ROWS, 1, 'C.hex', 1, 100000,       ('C1',   1.05, 25002.00, 50002.00, 75002.00, 100000.00)),
            (ROWS, 1, 'C.hex', -101, 101,       ('C1',  -100.10, -50.45, -1.18, 49.28, 100.00)),
        ]

        timeoutSecs = 10
        trial = 1
        n = h2o.nodes[0]
        lenNodes = len(h2o.nodes)

        x = 0
        timeoutSecs = 60
        for (rowCount, colCount, hex_key, expectedMin, expectedMax, expected) in tryList:
            # max error = half the bin size?
        
            maxDelta = ((expectedMax - expectedMin)/20.0) / 2.0
            # add 5% for fp errors?
            maxDelta = 1.05 * maxDelta

            h2o.beta_features = False
            SEEDPERFILE = random.randint(0, sys.maxint)
            x += 1

            csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE)
            h2o.beta_features = False
            csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True)
            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30, doSummary=False)
            print "Parse result['destination_key']:", parseResult['destination_key']

            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvFilename

            numRows = inspect["num_rows"]
            numCols = inspect["num_cols"]
            h2o.beta_features = True
            summaryResult = h2o_cmd.runSummary(key=hex_key, max_qbins=MAX_QBINS)
            h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult))

            # only one column
            column = summaryResult['summaries'][0]
            colname = column['colname']
            self.assertEqual(colname, expected[0])

            quantile = 0.5 if DO_MEDIAN else .999
            # get both answers since we feed both below for checking
            q = h2o.nodes[0].quantiles(source_key=hex_key, column=column['colname'],
                quantile=quantile, max_qbins=MAX_QBINS, multiple_pass=2, interpolation_type=7) # linear
            qresult = q['result']
            qresult_single = q['result_single']
            h2p.blue_print("h2o quantiles result:", qresult)
            h2p.blue_print("h2o quantiles result_single:", qresult_single)
            h2p.blue_print("h2o quantiles iterations:", q['iterations'])
            h2p.blue_print("h2o quantiles interpolated:", q['interpolated'])
            print h2o.dump_json(q)

            coltype = column['type']
            nacnt = column['nacnt']
            stats = column['stats']
            stattype= stats['type']

            # FIX! we should compare mean and sd to expected?
            mean = stats['mean']
            sd = stats['sd']
            print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(mean)
            print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals(sd)

            zeros = stats['zeros']
            mins = stats['mins']
            h2o_util.assertApproxEqual(mins[0], expected[1], tol=maxDelta, msg='min is not approx. expected')
            maxs = stats['maxs']
            h2o_util.assertApproxEqual(maxs[0], expected[5], tol=maxDelta, msg='max is not approx. expected')

            pct = stats['pct']
            # the thresholds h2o used, should match what we expected
            expectedPct= [0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99]

            pctile = stats['pctile']
            h2o_util.assertApproxEqual(pctile[3], expected[2], tol=maxDelta, msg='25th percentile is not approx. expected')
            h2o_util.assertApproxEqual(pctile[5], expected[3], tol=maxDelta, msg='50th percentile (median) is not approx. expected')
            h2o_util.assertApproxEqual(pctile[7], expected[4], tol=maxDelta, msg='75th percentile is not approx. expected')

            hstart = column['hstart']
            hstep = column['hstep']
            hbrk = column['hbrk']
            hcnt = column['hcnt']

            print "pct:", pct
            print "hcnt:", hcnt
            print "len(hcnt)", len(hcnt)

            # don't check the last bin
            # too hard to estimate when there are ints now, due to floor/ceil int alignment?
            # don't check the last two bins
            for b in hcnt[1:(-2 if len(hcnt)>2 else -1)]:
                # should we be able to check for a uniform distribution in the files?
                e = numRows/len(hcnt)
                self.assertAlmostEqual(b, rowCount/len(hcnt), delta=.01*rowCount, 
                msg="Bins not right. b: %s e: %s" % (b, e))

            pt = h2o_util.twoDecimals(pctile)
            mx = h2o_util.twoDecimals(maxs)
            mn = h2o_util.twoDecimals(mins)
            print "colname:", colname, "pctile (2 places):", pt
            print "colname:", colname, "maxs: (2 places):", mx
            print "colname:", colname, "mins: (2 places):", mn

            # FIX! we should do an exec and compare using the exec quantile too
            compareActual = mn[0], pt[3], pt[5], pt[7], mx[0]
            h2p.green_print("min/25/50/75/max colname:", colname, "(2 places):", compareActual)
            print "maxs colname:", colname, "(2 places):", mx
            print "mins colname:", colname, "(2 places):", mn

            trial += 1

            # don't check if colname is empty..means it's a string and scipy doesn't parse right?
            if colname!='':
                # don't do for enums
                # also get the median with a sort (h2o_summ.percentileOnSortedlist()
                h2o_summ.quantile_comparisons(
                    csvPathnameFull, 
                    col=0, # what col to extract from the csv
                    datatype='float',
                    quantile=0.5 if DO_MEDIAN else 0.999,
                    h2oSummary2=pctile[5 if DO_MEDIAN else 10], 
                    h2oQuantilesApprox=qresult_single,
                    h2oQuantilesExact=qresult,
                    )

            h2o.nodes[0].remove_all_keys()
Beispiel #49
0
    def test_summary2_unifiles(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()

        # new with 1000 bins. copy expected from R
        tryList = [
            ('cars.csv', 'c.hex', [
                (None, None,None,None,None,None),
                ('economy (mpg)', None,None,None,None,None),
                ('cylinders', None,None,None,None,None),
            ],
            ),
            ('runifA.csv', 'A.hex', [
                (None,  1.00, 25.00, 50.00, 75.00, 100.0),
                ('x', -99.9, -44.7, 8.26, 58.00, 91.7),
            ],
            ),
            # colname, (min, 25th, 50th, 75th, max)
            ('runif.csv', 'x.hex', [
                (None,  1.00, 5000.0, 10000.0, 15000.0, 20000.00),
                ('D', -5000.00, -3735.0, -2443, -1187.0, 99.8),
                ('E', -100000.0, -49208.0, 1783.8, 50621.9, 100000.0),
                ('F', -1.00, -0.4886, 0.00868, 0.5048, 1.00),
            ],
            ),
            ('runifB.csv', 'B.hex', [
                (None,  1.00, 2501.00, 5001.00, 7501.00, 10000.00),
                ('x', -100.00, -50.1, 0.974, 51.7, 100,00),
            ],
            ),

            ('runifC.csv', 'C.hex', [
                (None,  1.00, 25002.00, 50002.00, 75002.00, 100000.00),
                ('x', -100.00, -50.45, -1.135, 49.28, 100.00),
            ],
            ),
        ]


        timeoutSecs = 15
        trial = 1
        n = h2o.nodes[0]
        lenNodes = len(h2o.nodes)

        timeoutSecs = 60
        for (csvFilename, hex_key, expectedCols) in tryList:

            csvPathname = csvFilename
            csvPathnameFull = h2i.find_folder_and_filename('smalldata', csvPathname, returnFullPath=True)
            parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname,
                schema='put', hex_key=hex_key, timeoutSecs=10, doSummary=False)

            print "Parse result['destination_key']:", parseResult['destination_key']

            # We should be able to see the parse result?
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvFilename

            numRows = inspect["numRows"]
            numCols = inspect["numCols"]

            # okay to get more cols than we want
            # okay to vary MAX_QBINS because we adjust the expected accuracy
            summaryResult = h2o_cmd.runSummary(key=hex_key, max_qbins=MAX_QBINS)
            h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult))
            summaries = summaryResult['summaries']

            scipyCol = 0
            for expected, column in zip(expectedCols, summaries):
                colname = column['colname']
                if expected[0]:
                    self.assertEqual(colname, expected[0]), colname, expected[0]
                else:
                    # if the colname is None, skip it (so we don't barf on strings on the h2o quantile page
                    scipyCol += 1
                    continue

                quantile = 0.5 if DO_MEDIAN else .999
                # h2o has problem if a list of columns (or dictionary) is passed to 'column' param
                q = h2o.nodes[0].quantiles(source_key=hex_key, column=column['colname'],
                    quantile=quantile, max_qbins=MAX_QBINS, multiple_pass=2, interpolation_type=7) # for comparing to summary2
                qresult = q['result']
                qresult_single = q['result_single']
                h2p.blue_print("h2o quantiles result:", qresult)
                h2p.blue_print("h2o quantiles result_single:", qresult_single)
                h2p.blue_print("h2o quantiles iterations:", q['iterations'])
                h2p.blue_print("h2o quantiles interpolated:", q['interpolated'])
                print h2o.dump_json(q)

                # ('',  '1.00', '25002.00', '50002.00', '75002.00', '100000.00'),

                coltype = column['type']
                nacnt = column['nacnt']

                stats = column['stats']
                stattype= stats['type']
                print stattype

                # FIX! we should compare mean and sd to expected?
                # enums don't have mean or sd?
                if stattype!='Enum':
                    mean = stats['mean']
                    sd = stats['sd']
                    zeros = stats['zeros']
                    mins = stats['mins']
                    maxs = stats['maxs']

                    print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(mean)
                    print "colname:", colname, "std dev. (2 places):",  h2o_util.twoDecimals(sd)

                    pct = stats['pct']
                    print "pct:", pct
                    print ""

                    # the thresholds h2o used, should match what we expected
                    expectedPct= [0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99]
                    pctile = stats['pctile']


                # figure out the expected max error
                # use this for comparing to sklearn/sort
                if expected[1] and expected[5]:
                    expectedRange = expected[5] - expected[1]
                    # because of floor and ceil effects due we potentially lose 2 bins (worst case)
                    # the extra bin for the max value, is an extra bin..ignore
                    expectedBin = expectedRange/(MAX_QBINS-2)
                    maxErr = 0.5 * expectedBin # should we have some fuzz for fp?

                else:
                    print "Test won't calculate max expected error"
                    maxErr = 0
                    

                # hack..assume just one None is enough to ignore for cars.csv
                if expected[1]:
                    h2o_util.assertApproxEqual(mins[0], expected[1], tol=maxErr, msg='min is not approx. expected')
                if expected[2]:
                    h2o_util.assertApproxEqual(pctile[3], expected[2], tol=maxErr, msg='25th percentile is not approx. expected')
                if expected[3]:
                    h2o_util.assertApproxEqual(pctile[5], expected[3], tol=maxErr, msg='50th percentile (median) is not approx. expected')
                if expected[4]:
                    h2o_util.assertApproxEqual(pctile[7], expected[4], tol=maxErr, msg='75th percentile is not approx. expected')
                if expected[5]:
                    h2o_util.assertApproxEqual(maxs[0], expected[5], tol=maxErr, msg='max is not approx. expected')

                hstart = column['hstart']
                hstep = column['hstep']
                hbrk = column['hbrk']
                hcnt = column['hcnt']

                for b in hcnt:
                    # should we be able to check for a uniform distribution in the files?
                    e = .1 * numRows
                    # self.assertAlmostEqual(b, .1 * rowCount, delta=.01*rowCount,
                    #     msg="Bins not right. b: %s e: %s" % (b, e))

                if stattype!='Enum':
                    pt = h2o_util.twoDecimals(pctile)
                    print "colname:", colname, "pctile (2 places):", pt
                    mx = h2o_util.twoDecimals(maxs)
                    mn = h2o_util.twoDecimals(mins)
                    print "colname:", colname, "maxs: (2 places):", mx
                    print "colname:", colname, "mins: (2 places):", mn

                    # FIX! we should do an exec and compare using the exec quantile too
                    actual = mn[0], pt[3], pt[5], pt[7], mx[0]
                    print "min/25/50/75/max colname:", colname, "(2 places):", actual
                    print "maxs colname:", colname, "(2 places):", mx
                    print "mins colname:", colname, "(2 places):", mn

                    # don't check if colname is empty..means it's a string and scipy doesn't parse right?
                    # need to ignore the car names
                    if colname!='' and expected[scipyCol]:
                        # don't do for enums
                        # also get the median with a sort (h2o_summ.percentileOnSortedlist()
                        h2o_summ.quantile_comparisons(
                            csvPathnameFull,
                            skipHeader=True,
                            col=scipyCol,
                            datatype='float',
                            quantile=0.5 if DO_MEDIAN else 0.999,
                            # FIX! ignore for now
                            h2oSummary2=pctile[5 if DO_MEDIAN else 10],
                            h2oQuantilesApprox=qresult_single,
                            h2oQuantilesExact=qresult,
                            h2oSummary2MaxErr=maxErr,
                            )

                        if False and h2o_util.approxEqual(pctile[5], 0.990238116744, tol=0.002, msg='stop here'):
                            raise Exception("stopping to look")
                                


                scipyCol += 1

            trial += 1
    def test_summary2_uniform_w_NA(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            # colname, (min, 25th, 50th, 75th, max)
            (ROWS, 1, 'x.hex', 1, 20000,        ('C1',  1.10, 5000.0, 10000.0, 15000.0, 20000.00)),
            (ROWS, 1, 'x.hex', -5000, 0,        ('C1', -5001.00, -3750.0, -2445, -1200.0, 99)),
            (ROWS, 1, 'x.hex', -100000, 100000, ('C1',  -100001.0, -50000.0, 1613.0, 50000.0, 100000.0)),
            (ROWS, 1, 'x.hex', -1, 1,           ('C1',  -1.05, -0.48, 0.0087, 0.50, 1.00)),

            (ROWS, 1, 'A.hex', 1, 100,          ('C1',   1.05, 26.00, 51.00, 76.00, 100.0)),
            (ROWS, 1, 'A.hex', -99, 99,         ('C1',  -99, -50.0, 0, 50.00, 99)),

            (ROWS, 1, 'B.hex', 1, 10000,        ('C1',   1.05, 2501.00, 5001.00, 7501.00, 10000.00)),
            (ROWS, 1, 'B.hex', -100, 100,       ('C1',  -100.10, -50.0, 0.85, 51.7, 100,00)),

            (ROWS, 1, 'C.hex', 1, 100000,       ('C1',   1.05, 25002.00, 50002.00, 75002.00, 100000.00)),
            (ROWS, 1, 'C.hex', -101, 101,       ('C1',  -100.10, -50.45, -1.18, 49.28, 100.00)),
        ]

        timeoutSecs = 10
        trial = 1
        n = h2o.nodes[0]
        lenNodes = len(h2o.nodes)

        x = 0
        timeoutSecs = 60
        for (rowCount, colCount, hex_key, expectedMin, expectedMax, expected) in tryList:
            # max error = half the bin size?
        
            maxDelta = ((expectedMax - expectedMin)/20.0) / 2.0
            # add 5% for fp errors?
            maxDelta = 1.05 * maxDelta

            SEEDPERFILE = random.randint(0, sys.maxint)
            x += 1

            csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True)

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE)
            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=10, doSummary=False)
            print "Parse result['destination_key']:", parseResult['destination_key']

            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvFilename

            numRows = inspect["numRows"]
            numCols = inspect["numCols"]

            summaryResult = h2o_cmd.runSummary(key=hex_key, noPrint=False, max_qbins=MAX_QBINS, numRows=numRows, numCols=numCols)
            h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult))

            # only one column
            column = summaryResult['summaries'][0]

            colname = column['colname']
            self.assertEqual(colname, expected[0])

            coltype = column['type']
            nacnt = column['nacnt']

            stats = column['stats']
            stattype= stats['type']

            # FIX! we should compare mean and sd to expected?
            mean = stats['mean']
            sd = stats['sd']

            print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(mean)
            print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals(sd)

            zeros = stats['zeros']

            mins = stats['mins']
            h2o_util.assertApproxEqual(mins[0], expected[1], tol=maxDelta, msg='min is not approx. expected')

            maxs = stats['maxs']
            h2o_util.assertApproxEqual(maxs[0], expected[5], tol=maxDelta, msg='max is not approx. expected')

            pct = stats['pct']
            # the thresholds h2o used, should match what we expected
            expectedPct= [0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99]

            pctile = stats['pctile']
            h2o_util.assertApproxEqual(pctile[3], expected[2], tol=maxDelta, msg='25th percentile is not approx. expected')
            h2o_util.assertApproxEqual(pctile[5], expected[3], tol=maxDelta, msg='50th percentile (median) is not approx. expected')
            h2o_util.assertApproxEqual(pctile[7], expected[4], tol=maxDelta, msg='75th percentile is not approx. expected')

            hstart = column['hstart']
            hstep = column['hstep']
            hbrk = column['hbrk']
            hcnt = column['hcnt']

            print "pct:", pct
            print ""

            print "hcnt:", hcnt
            print "len(hcnt)", len(hcnt)

            print "numRows:", numRows, "rowCount: ", rowCount
            self.assertEqual((1+NA_ROW_RATIO) * rowCount, numRows, 
                msg="numRows %s should be %s" % (numRows, (1+NA_ROW_RATIO) * rowCount))


            # don't check the last bin
            # we sometimes get a messed up histogram for all NA cols? just don't let them go thru here
            for b in hcnt[1:-1]:
                # should we be able to check for a uniform distribution in the files?
                
                e = rowCount/len(hcnt) # expect 21 thresholds, so 20 bins. each 5% of rows (uniform distribution)
                # don't check the edge bins
                # NA rows should be ignored
                self.assertAlmostEqual(b, e, delta=2*e,
                    msg="Bins not right. b: %s e: %s" % (b, e))

            pt = h2o_util.twoDecimals(pctile)
            mx = h2o_util.twoDecimals(maxs)
            mn = h2o_util.twoDecimals(mins)
            print "colname:", colname, "pctile (2 places):", pt
            print "colname:", colname, "maxs: (2 places):", mx
            print "colname:", colname, "mins: (2 places):", mn

            # FIX! we should do an exec and compare using the exec quantile too
            compareActual = mn[0], pt[3], pt[5], pt[7], mx[0]
            h2p.green_print("min/25/50/75/max colname:", colname, "(2 places):", compareActual)
            print "maxs colname:", colname, "(2 places):", mx
            print "mins colname:", colname, "(2 places):", mn

            trial += 1
            scipyCol = 1

            h2i.delete_keys_at_all_nodes()
Beispiel #51
0
    def test_summary2_uniform_w_NA(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            # colname, (min, 25th, 50th, 75th, max)
            (ROWS, 1, 'x.hex', 1, 20000, ('C1', 1.10, 5000.0, 10000.0, 15000.0,
                                          20000.00)),
            (ROWS, 1, 'x.hex', -5000, 0, ('C1', -5001.00, -3750.0, -2445,
                                          -1200.0, 99)),
            (ROWS, 1, 'x.hex', -100000, 100000, ('C1', -100001.0, -50000.0,
                                                 1613.0, 50000.0, 100000.0)),
            (ROWS, 1, 'x.hex', -1, 1, ('C1', -1.05, -0.48, 0.0087, 0.50,
                                       1.00)),
            (ROWS, 1, 'A.hex', 1, 100, ('C1', 1.05, 26.00, 51.00, 76.00,
                                        100.0)),
            (ROWS, 1, 'A.hex', -99, 99, ('C1', -99, -50.0, 0, 50.00, 99)),
            (ROWS, 1, 'B.hex', 1, 10000, ('C1', 1.05, 2501.00, 5001.00,
                                          7501.00, 10000.00)),
            (ROWS, 1, 'B.hex', -100, 100, ('C1', -100.10, -50.0, 0.85, 51.7,
                                           100, 00)),
            (ROWS, 1, 'C.hex', 1, 100000, ('C1', 1.05, 25002.00, 50002.00,
                                           75002.00, 100000.00)),
            (ROWS, 1, 'C.hex', -101, 101, ('C1', -100.10, -50.45, -1.18, 49.28,
                                           100.00)),
        ]

        timeoutSecs = 10
        trial = 1
        n = h2o.nodes[0]
        lenNodes = len(h2o.nodes)

        x = 0
        timeoutSecs = 60
        for (rowCount, colCount, hex_key, expectedMin, expectedMax,
             expected) in tryList:
            # max error = half the bin size?

            maxDelta = ((expectedMax - expectedMin) / 20.0) / 2.0
            # add 5% for fp errors?
            maxDelta = 1.05 * maxDelta

            h2o.beta_features = False
            SEEDPERFILE = random.randint(0, sys.maxint)
            x += 1

            csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(
                colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            csvPathnameFull = h2i.find_folder_and_filename(None,
                                                           csvPathname,
                                                           returnFullPath=True)

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, expectedMin,
                              expectedMax, SEEDPERFILE)
            h2o.beta_features = False
            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='put',
                                           hex_key=hex_key,
                                           timeoutSecs=10,
                                           doSummary=False)
            print "Parse result['destination_key']:", parseResult[
                'destination_key']

            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvFilename

            numRows = inspect["num_rows"]
            numCols = inspect["num_cols"]

            h2o.beta_features = True
            summaryResult = h2o_cmd.runSummary(key=hex_key,
                                               noPrint=False,
                                               max_qbins=MAX_QBINS,
                                               numRows=numRows,
                                               numCols=numCols)
            h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult))

            # only one column
            column = summaryResult['summaries'][0]

            colname = column['colname']
            self.assertEqual(colname, expected[0])

            coltype = column['type']
            nacnt = column['nacnt']

            stats = column['stats']
            stattype = stats['type']

            # FIX! we should compare mean and sd to expected?
            mean = stats['mean']
            sd = stats['sd']

            print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(
                mean)
            print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals(
                sd)

            zeros = stats['zeros']

            mins = stats['mins']
            h2o_util.assertApproxEqual(mins[0],
                                       expected[1],
                                       tol=maxDelta,
                                       msg='min is not approx. expected')

            maxs = stats['maxs']
            h2o_util.assertApproxEqual(maxs[0],
                                       expected[5],
                                       tol=maxDelta,
                                       msg='max is not approx. expected')

            pct = stats['pct']
            # the thresholds h2o used, should match what we expected
            expectedPct = [
                0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99
            ]

            pctile = stats['pctile']
            h2o_util.assertApproxEqual(
                pctile[3],
                expected[2],
                tol=maxDelta,
                msg='25th percentile is not approx. expected')
            h2o_util.assertApproxEqual(
                pctile[5],
                expected[3],
                tol=maxDelta,
                msg='50th percentile (median) is not approx. expected')
            h2o_util.assertApproxEqual(
                pctile[7],
                expected[4],
                tol=maxDelta,
                msg='75th percentile is not approx. expected')

            hstart = column['hstart']
            hstep = column['hstep']
            hbrk = column['hbrk']
            hcnt = column['hcnt']

            print "pct:", pct
            print ""

            print "hcnt:", hcnt
            print "len(hcnt)", len(hcnt)

            print "numRows:", numRows, "rowCount: ", rowCount
            self.assertEqual((1 + NA_ROW_RATIO) * rowCount,
                             numRows,
                             msg="numRows %s should be %s" %
                             (numRows, (1 + NA_ROW_RATIO) * rowCount))

            # don't check the last bin
            # we sometimes get a messed up histogram for all NA cols? just don't let them go thru here
            for b in hcnt[1:-1]:
                # should we be able to check for a uniform distribution in the files?

                e = rowCount / len(
                    hcnt
                )  # expect 21 thresholds, so 20 bins. each 5% of rows (uniform distribution)
                # don't check the edge bins
                # NA rows should be ignored
                self.assertAlmostEqual(b,
                                       e,
                                       delta=2 * e,
                                       msg="Bins not right. b: %s e: %s" %
                                       (b, e))

            pt = h2o_util.twoDecimals(pctile)
            mx = h2o_util.twoDecimals(maxs)
            mn = h2o_util.twoDecimals(mins)
            print "colname:", colname, "pctile (2 places):", pt
            print "colname:", colname, "maxs: (2 places):", mx
            print "colname:", colname, "mins: (2 places):", mn

            # FIX! we should do an exec and compare using the exec quantile too
            compareActual = mn[0], pt[3], pt[5], pt[7], mx[0]
            h2p.green_print("min/25/50/75/max colname:", colname,
                            "(2 places):", compareActual)
            print "maxs colname:", colname, "(2 places):", mx
            print "mins colname:", colname, "(2 places):", mn

            trial += 1
            scipyCol = 1

            h2i.delete_keys_at_all_nodes()
    def test_exec2_quant_cmp_uniform(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            # colname, (min, 25th, 50th, 75th, max)
            (500000, 1, 'x.hex', 1, 20000,        ('C1',  1.10, 5000.0, 10000.0, 15000.0, 20000.00)),
            (500000, 1, 'x.hex', -5000, 0,        ('C1', -5001.00, -3750.0, -2445, -1200.0, 99)),
            (100000, 1, 'x.hex', -100000, 100000, ('C1',  -100001.0, -50000.0, 1613.0, 50000.0, 100000.0)),
            (100000, 1, 'x.hex', -1, 1,           ('C1',  -1.05, -0.48, 0.0087, 0.50, 1.00)),

            (100000, 1, 'A.hex', 1, 100,          ('C1',   1.05, 26.00, 51.00, 76.00, 100.0)),
            (100000, 1, 'A.hex', -99, 99,         ('C1',  -99, -50.0, 0, 50.00, 99)),

            (100000, 1, 'B.hex', 1, 10000,        ('C1',   1.05, 2501.00, 5001.00, 7501.00, 10000.00)),
            (100000, 1, 'B.hex', -100, 100,       ('C1',  -100.10, -50.0, 0.85, 51.7, 100,00)),

            (100000, 1, 'C.hex', 1, 100000,       ('C1',   1.05, 25002.00, 50002.00, 75002.00, 100000.00)),
            (100000, 1, 'C.hex', -101, 101,       ('C1',  -100.10, -50.45, -1.18, 49.28, 100.00)),
        ]

        timeoutSecs = 10
        trial = 1
        n = h2o.nodes[0]
        lenNodes = len(h2o.nodes)

        x = 0
        timeoutSecs = 60
        for (rowCount, colCount, hex_key, expectedMin, expectedMax, expected) in tryList:
            # max error = half the bin size?
        
            maxDelta = ((expectedMax - expectedMin)/20.0) / 2.0
            # add 5% for fp errors?
            maxDelta = 1.05 * maxDelta

            SEEDPERFILE = random.randint(0, sys.maxint)
            x += 1
            csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE)
            csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True)
            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30, doSummary=False)
            print "Parse result['destination_key']:", parseResult['destination_key']

            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvFilename
            numRows = inspect["numRows"]
            numCols = inspect["numCols"]

            summaryResult = h2o_cmd.runSummary(key=hex_key)
            h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult))

            # only one column
            column = summaryResult['summaries'][0]

            colname = column['colname']
            self.assertEqual(colname, expected[0])

            coltype = column['type']
            nacnt = column['nacnt']
            stats = column['stats']
            stattype= stats['type']

            # FIX! we should compare mean and sd to expected?
            mean = stats['mean']
            sd = stats['sd']
            print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(mean)
            print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals(sd)

            zeros = stats['zeros']
            mins = stats['mins']
            h2o_util.assertApproxEqual(mins[0], expected[1], tol=maxDelta, msg='min is not approx. expected')
            maxs = stats['maxs']
            h2o_util.assertApproxEqual(maxs[0], expected[5], tol=maxDelta, msg='max is not approx. expected')

            pct = stats['pct']
            # the thresholds h2o used, should match what we expected
            expectedPct = [0.001, 0.01, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.99, 0.999]
            pctile = stats['pctile']
            h2o_util.assertApproxEqual(pctile[3], expected[2], tol=maxDelta, msg='25th percentile is not approx. expected')
            h2o_util.assertApproxEqual(pctile[5], expected[3], tol=maxDelta, msg='50th percentile (median) is not approx. expected')
            h2o_util.assertApproxEqual(pctile[7], expected[4], tol=maxDelta, msg='75th percentile is not approx. expected')

            hstart = column['hstart']
            hstep = column['hstep']
            hbrk = column['hbrk']
            hcnt = column['hcnt']

            print "pct:", pct
            print "hcnt:", hcnt
            print "len(hcnt)", len(hcnt)

            # don't check the last bin
            for b in hcnt[1:-1]:
                # should we be able to check for a uniform distribution in the files?
                e = numRows/len(hcnt) 
                # apparently we're not able to estimate for these datasets
                # self.assertAlmostEqual(b, rowCount/len(hcnt), delta=.01*rowCount, 
                #     msg="Bins not right. b: %s e: %s" % (b, e))

            pt = h2o_util.twoDecimals(pctile)
            mx = h2o_util.twoDecimals(maxs)
            mn = h2o_util.twoDecimals(mins)
            print "colname:", colname, "pctile (2 places):", pt
            print "colname:", colname, "maxs: (2 places):", mx
            print "colname:", colname, "mins: (2 places):", mn

            # FIX! we should do an exec and compare using the exec quantile too
            compareActual = mn[0], pt[3], pt[5], pt[7], mx[0]
            print "min/25/50/75/max colname:", colname, "(2 places):", compareActual
            print "maxs colname:", colname, "(2 places):", mx
            print "mins colname:", colname, "(2 places):", mn

            trial += 1
            h2p.blue_print("\nTrying exec quantile")
            # thresholds = "c(0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99)"
            # do the equivalent exec quantile?
            # execExpr = "quantile(%s[,1],%s);" % (hex_key, thresholds)

            print "Comparing (two places) each of the summary2 threshold quantile results, to single exec quantile"
            for i, threshold in enumerate(thresholds):
                # FIX! do two of the same?..use same one for the 2nd
                if i!=0:
                    # execExpr = "r2=c(1); r2=quantile(%s[,4],c(0,.05,0.3,0.55,0.7,0.95,0.99))" % hex_key
                    execExpr = "r2=c(1); r2=quantile(%s[,1], c(%s,%s));" % (hex_key, threshold, threshold)
                    (resultExec, result) = h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)
                    h2p.green_print("\nresultExec: %s" % h2o.dump_json(resultExec))
                    h2p.blue_print("\nthreshold: %.2f Exec quantile: %s Summary2: %s" % (threshold, result, pt[i]))
                    if not result:
                        raise Exception("exec result: %s for quantile: %s is bad" % (result, threshold))
                    h2o_util.assertApproxEqual(result, pctile[i], tol=maxDelta, 
                        msg='exec percentile: %s too different from expected: %s' % (result, pctile[i]))
                # for now, do one with all, but no checking
                else:
                    # This seemed to "work" but how do I get the key name for the list of values returned
                    # the browser result field seemed right, but nulls in the key
                    if 1==0:
                        execExpr = "r2=c(1); r2=quantile(%s[,1], c(%s));" % (hex_key, ",".join(map(str,thresholds)))
                    else:
                        # does this way work (column getting)j
                        execExpr = "r2=c(1); r2=quantile(%s$C1, c(%s));" % (hex_key, ",".join(map(str,thresholds)))
                    (resultExec, result) = h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)
                    inspect = h2o_cmd.runInspect(key='r2') 
                    numCols = inspect['numCols']
                    numRows = inspect['numRows']

                    self.assertEqual(numCols,1)
                    self.assertEqual(numRows,len(thresholds))
                    # FIX! should run thru the values in the col? how to get

            # compare the last one
            if colname!='':
                # don't do for enums
                # also get the median with a sort (h2o_summ.percentileOnSortedlist()
                h2o_summ.quantile_comparisons(
                    csvPathnameFull,
                    col=0, # what col to extract from the csv
                    datatype='float',
                    quantile=thresholds[-1],
                    # h2oSummary2=pctile[-1],
                    # h2oQuantilesApprox=result, # from exec
                    h2oExecQuantiles=result,
                    )

            h2o.nodes[0].remove_all_keys()
Beispiel #53
0
def do_statsmodels_glm(self, bucket, csvPathname, L, family='gaussian'):
    
    h2p.red_print("Now doing statsmodels")
    h2p.red_print("http://statsmodels.sourceforge.net/devel/glm.html#module-reference")
    h2p.red_print("http://statsmodels.sourceforge.net/devel/generated/statsmodels.genmod.generalized_linear_model.GLM.html")

    import numpy as np
    import scipy as sp
    from numpy import loadtxt
    import statsmodels as sm

    csvPathnameFull = h2i.find_folder_and_filename(bucket, csvPathname, returnFullPath=True)

    if 1==1:
        dataset = np.loadtxt( 
            open(csvPathnameFull,'r'),
            skiprows=1, # skip the header
            delimiter=',',
            dtype='float');

    # skipping cols from the begining... (ID is col 1)
    # In newer versions of Numpy, np.genfromtxt can take an iterable argument, 
    # so you can wrap the file you're reading in a generator that generates lines, 
    # skipping the first N columns. If your numbers are comma-separated, that's something like
    if 1==0:
        f = open(csvPathnameFull,'r'),
        np.genfromtxt(
            (",".join(ln.split()[1:]) for ln in f),
            skiprows=1, # skip the header
            delimiter=',',
            dtype='float');

    print "\ncsv read for training, done"

    # data is last column
    # drop the output
    n_features = len(dataset[0]) - 1;
    print "n_features:", n_features

    # don't want ID (col 0) or CAPSULE (col 1)
    # get CAPSULE
    target = [x[1] for x in dataset]
    # slice off the first 2
    train  = np.array ( [x[2:] for x in dataset] )

    n_samples, n_features = train.shape
    print "n_samples:", n_samples, "n_features:",  n_features

    print "histogram of target"
    print sp.histogram(target,3)

    print "len(train):",  len(train)
    print "len(target):", len(target)
    print "dataset shape:", dataset.shape

    if family!='gaussian':
        raise Exception("Only have gaussian logistic for scipy")

    # train the classifier
    gauss_log = sm_api.GLM(target, train, family=sm_api.families.Gaussian(sm_api.families.links.log))
    start = time.time()
    gauss_log_results = gauss_log.fit()
    print "sm_api.GLM took", time.time() - start, "seconds"
    print gauss_log_results.summary()
    def test_glm_predict3(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        trees = 15
        timeoutSecs = 120
        if 1==1:
            csvPathname = 'mnist/mnist_training.csv.gz'
            hexKey = 'mnist.hex'
        else:
            # try smaller data set
            csvPathname = 'mnist/mnist_testing.csv.gz'
            hexKey = 'mnist.hex'

        predictHexKey = 'predict_0.hex'
        predictCsv = 'predict_0.csv'
        actualCsv = 'actual_0.csv'
        bucket = 'home-0xdiag-datasets'

        csvPredictPathname = SYNDATASETS_DIR + "/" + predictCsv
        csvSrcOutputPathname = SYNDATASETS_DIR + "/" + actualCsv
        # for using below in csv reader
        csvFullname = h2i.find_folder_and_filename(bucket, csvPathname, schema='put', returnFullPath=True)


        def predict_and_compare_csvs(model_key, hex_key, translate):
            # have to slice out col 0 (the output) and feed result to predict
            # cols are 0:784 (1 output plus 784 input features
            # h2e.exec_expr(execExpr="P.hex="+hex_key+"[1:784]", timeoutSecs=30)
            dataKey = "P.hex"
            h2e.exec_expr(execExpr=dataKey+"="+hex_key, timeoutSecs=30) # unneeded but interesting
            if HAS_HEADER:
                print "Has header in dataset, so should be able to chop out col 0 for predict and get right answer"
                print "hack for now, can't chop out col 0 in Exec currently"
                dataKey = hex_key
            else:
                print "No header in dataset, can't chop out cols, since col numbers are used for names"
                dataKey = hex_key
            
            h2e.exec_expr(execExpr="Z.hex="+hex_key+"[0]", timeoutSecs=30)
            start = time.time()
            # predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key="P.hex",
            predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key=dataKey,
                destination_key=predictHexKey)
            print "generate_predictions end on ", hexKey, " took", time.time() - start, 'seconds'
            h2o.check_sandbox_for_errors()
            inspect = h2o_cmd.runInspect(key=predictHexKey)
            h2o_cmd.infoFromInspect(inspect, 'predict.hex')

            h2o.nodes[0].csv_download(src_key="Z.hex", csvPathname=csvSrcOutputPathname)
            h2o.nodes[0].csv_download(src_key=predictHexKey, csvPathname=csvPredictPathname)
            h2o.check_sandbox_for_errors()

            # depending what you use, may need to set these to 0 or 1
            skipSrcOutputHeader = 1
            skipPredictHeader= 1
            print "Do a check of the original output col against predicted output"
            (rowNum1, originalOutput) = compare_csv_at_one_col(csvSrcOutputPathname,
                msg="Original", colIndex=0, translate=translate, skipHeader=skipSrcOutputHeader)
            (rowNum2, predictOutput)  = compare_csv_at_one_col(csvPredictPathname, 
                msg="Predicted", colIndex=0, skipHeader=skipPredictHeader)

            # no header on source
            if ((rowNum1-skipSrcOutputHeader) != (rowNum2-skipPredictHeader)):
                raise Exception("original rowNum1: %s - %d not same as downloaded predict: rowNum2: %s - %d \
                    %s" % (rowNum1, skipSrcOutputHeader, rowNum2, skipPredictHeader))

            wrong = 0
            wrong0 = 0
            wrong1 = 0
            for rowNum,(o,p) in enumerate(zip(originalOutput, predictOutput)):
                o = float(o)
                p = float(p)
                if o!=p:
                    msg = "Comparing original output col vs predicted. row %s differs. \
                        original: %s predicted: %s"  % (rowNum, o, p)
                    if p==0.0 and wrong0==10:
                        print "Not printing any more predicted=0 mismatches"
                    elif p==0.0 and wrong0<10:
                        print msg
                    if p==1.0 and wrong1==10:
                        print "Not printing any more predicted=1 mismatches"
                    elif p==1.0 and wrong1<10:
                        print msg

                    if p==0.0:
                        wrong0 += 1
                    elif p==1.0:
                        wrong1 += 1

                    wrong += 1

            print "wrong0:", wrong0
            print "wrong1:", wrong1
            print "\nTotal wrong:", wrong
            print "Total:", len(originalOutput)
            pctWrong = (100.0 * wrong)/len(originalOutput)
            print "wrong/Total * 100 ", pctWrong
            # digit 3 with no regularization got around 5%. 0 gets < 2%
            if pctWrong > 6.0:
                raise Exception("pct wrong too high. Expect < 6% error")

        #*************************************************************************
        parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey)

        print "Use H2O GeneratePredictionsPage with a H2O generated model and the same data key." 
        print "Does this work? (feeding in same data key)if you're predicting, "
        print "don't you need one less column (the last is output?)"
        print "WARNING: max_iter set to 8 for benchmark comparisons"

        # excessive, trying each digit one at a time, but hey a good test
        # limit the ierations so it doesn't take so long. 4 minutes per digit?
        max_iter = 4
        # for y in [0,1,2,3,4,5,6,7,8,9]:
        # for y in [0,3,7]:
        for case in [0,3,7]:

            translate = {}
            for i in range(10):
                if i == case: 
                    translate[i] = 1.0
                else:
                    translate[i] = 0.0

            print "translate:", translate
            kwargs = {
                'x': "",
                'y': 0,
                'family': 'binomial',
                'link': 'logit',
                'n_folds': 1,
                'case_mode': '=',
                'case': case, # zero should predict to 1, 2-9 should predict to 0
                'max_iter': max_iter,
                'beta_epsilon': 1e-3}

            timeoutSecs = 120


            # L2 
            start = time.time()
            kwargs.update({'alpha': 0, 'lambda': 0})
            glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
            print "glm (L2) end on ", csvPathname, 'took', time.time() - start, 'seconds'
            h2o_glm.simpleCheckGLM(self, glm, 13, **kwargs)
            predict_and_compare_csvs(model_key=glm['destination_key'], hex_key=hexKey, translate=translate)

            # Elastic
            kwargs.update({'alpha': 0.5, 'lambda': 1e-4})
            start = time.time()
            glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
            print "glm (Elastic) end on ", csvPathname, 'took', time.time() - start, 'seconds'
            # okay for some coefficients to go to zero!
            h2o_glm.simpleCheckGLM(self, glm, 13, allowZeroCoeff=True, **kwargs)
            predict_and_compare_csvs(model_key=glm['destination_key'], hex_key=hexKey, translate=translate)

            # L1
            kwargs.update({'alpha': 1, 'lambda': 1e-4})
            start = time.time()
            glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
            print "glm (L1) end on ", csvPathname, 'took', time.time() - start, 'seconds'
            # okay for some coefficients to go to zero!
            h2o_glm.simpleCheckGLM(self, glm, 13, allowZeroCoeff=True, **kwargs)
            predict_and_compare_csvs(model_key=glm['destination_key'], hex_key=hexKey, translate=translate)
Beispiel #55
0
def do_statsmodels_glm(self, bucket, csvPathname, L, family='gaussian'):

    h2p.red_print("Now doing statsmodels")
    h2p.red_print(
        "http://statsmodels.sourceforge.net/devel/glm.html#module-reference")
    h2p.red_print(
        "http://statsmodels.sourceforge.net/devel/generated/statsmodels.genmod.generalized_linear_model.GLM.html"
    )

    import numpy as np
    import scipy as sp
    from numpy import loadtxt
    import statsmodels as sm

    csvPathnameFull = h2i.find_folder_and_filename(bucket,
                                                   csvPathname,
                                                   returnFullPath=True)

    if 1 == 1:
        dataset = np.loadtxt(
            open(csvPathnameFull, 'r'),
            skiprows=1,  # skip the header
            delimiter=',',
            dtype='float')

    # skipping cols from the begining... (ID is col 1)
    # In newer versions of Numpy, np.genfromtxt can take an iterable argument,
    # so you can wrap the file you're reading in a generator that generates lines,
    # skipping the first N columns. If your numbers are comma-separated, that's something like
    if 1 == 0:
        f = open(csvPathnameFull, 'r'),
        np.genfromtxt(
            (",".join(ln.split()[1:]) for ln in f),
            skiprows=1,  # skip the header
            delimiter=',',
            dtype='float')

    print "\ncsv read for training, done"

    # data is last column
    # drop the output
    n_features = len(dataset[0]) - 1
    print "n_features:", n_features

    # don't want ID (col 0) or CAPSULE (col 1)
    # get CAPSULE
    target = [x[1] for x in dataset]
    # slice off the first 2
    train = np.array([x[2:] for x in dataset])

    n_samples, n_features = train.shape
    print "n_samples:", n_samples, "n_features:", n_features

    print "histogram of target"
    print sp.histogram(target, 3)

    print "len(train):", len(train)
    print "len(target):", len(target)
    print "dataset shape:", dataset.shape

    if family != 'gaussian':
        raise Exception("Only have gaussian logistic for scipy")

    # train the classifier
    gauss_log = sm_api.GLM(target,
                           train,
                           family=sm_api.families.Gaussian(
                               sm_api.families.links.log))
    start = time.time()
    gauss_log_results = gauss_log.fit()
    print "sm_api.GLM took", time.time() - start, "seconds"
    print gauss_log_results.summary()
Beispiel #56
0
    def test_summary2_small(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            # colname, (min, 25th, 50th, 75th, max)
            # if rowCount is None, we'll just use  the data values
            # None in expected values means no compare
            (None, 1, "x.hex", [-1, 0, 1], ("C1", None, None, 0, None, None)),
            (None, 2, "x.hex", [-1, 0, 1], ("C1", None, None, 0, None, None)),
            (None, 10, "x.hex", [-1, 0, 1], ("C1", None, None, 0, None, None)),
            (None, 100, "x.hex", [-1, 0, 1], ("C1", None, None, 0, None, None)),
            (None, 1000, "x.hex", [-1, 0, 1], ("C1", None, None, 0, None, None)),
            # (None, 10000, 'x.hex', [-1,0,1],        ('C1',  None, None, 0, None, None)),
            # (COLS, 1, 'x.hex', [1,0,-1],        ('C1',  None, None, None, None, None)),
        ]

        timeoutSecs = 10
        trial = 1
        n = h2o.nodes[0]
        lenNodes = len(h2o.nodes)

        x = 0
        timeoutSecs = 60
        for (rowCount, colCount, hex_key, values, expected) in tryList:
            # max error = half the bin size?

            expectedMax = max(values)
            expectedMin = min(values)
            maxDelta = ((expectedMax - expectedMin) / 20.0) / 2.0
            # add 5% for fp errors?
            maxDelta = 1.05 * maxDelta

            # hmm...say we should be 100% accurate for these tests?
            maxDelta = 0

            SEEDPERFILE = random.randint(0, sys.maxint)
            x += 1

            if not rowCount:
                rowFile = len(values)
            else:
                rowFile = rowCount
            csvFilename = "syn_" + "binary" + "_" + str(rowFile) + "x" + str(colCount) + ".csv"
            csvPathname = SYNDATASETS_DIR + "/" + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, values, SEEDPERFILE)

            csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True)
            parseResult = h2i.import_parse(
                path=csvPathname, schema="put", hex_key=hex_key, timeoutSecs=30, doSummary=False
            )
            print "Parse result['destination_key']:", parseResult["destination_key"]

            inspect = h2o_cmd.runInspect(None, parseResult["destination_key"])
            print "\n" + csvFilename

            numRows = inspect["numRows"]
            numCols = inspect["numCols"]

            summaryResult = h2o_cmd.runSummary(key=hex_key, max_qbins=MAX_QBINS, timeoutSecs=45)
            h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult))

            quantile = 0.5 if DO_MEDIAN else 0.999
            q = h2o.nodes[0].quantiles(
                source_key=hex_key,
                column=0,
                interpolation_type=7,
                quantile=quantile,
                max_qbins=MAX_QBINS,
                multiple_pass=2,
            )
            qresult = q["result"]
            qresult_single = q["result_single"]
            qresult_iterations = q["iterations"]
            qresult_interpolated = q["interpolated"]
            h2p.blue_print("h2o quantiles result:", qresult)
            h2p.blue_print("h2o quantiles result_single:", qresult_single)
            h2p.blue_print("h2o quantiles iterations:", qresult_iterations)
            h2p.blue_print("h2o quantiles interpolated:", qresult_interpolated)
            print h2o.dump_json(q)

            self.assertLess(
                qresult_iterations,
                16,
                msg="h2o does max of 16 iterations. likely no result_single if we hit max. is bins=1?",
            )

            # only one column
            column = summaryResult["summaries"][0]

            colname = column["colname"]

            coltype = column["type"]
            nacnt = column["nacnt"]

            stats = column["stats"]
            stattype = stats["type"]

            # FIX! we should compare mean and sd to expected?
            mean = stats["mean"]
            sd = stats["sd"]

            print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(mean)
            print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals(sd)

            zeros = stats["zeros"]
            mins = stats["mins"]
            maxs = stats["maxs"]
            pct = stats["pct"]
            # the thresholds h2o used, should match what we expected
            expectedPct = [0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99]

            pctile = stats["pctile"]
            print "pctile:", pctile
            if expected[0]:
                self.assertEqual(colname, expected[0])
            if expected[1]:
                h2o_util.assertApproxEqual(mins[0], expected[1], tol=maxDelta, msg="min is not approx. expected")
            if expected[2]:
                h2o_util.assertApproxEqual(
                    pctile[3], expected[2], tol=maxDelta, msg="25th percentile is not approx. expected"
                )
            if expected[3]:
                h2o_util.assertApproxEqual(
                    pctile[5], expected[3], tol=maxDelta, msg="50th percentile (median) is not approx. expected"
                )
            if expected[4]:
                h2o_util.assertApproxEqual(
                    pctile[7], expected[4], tol=maxDelta, msg="75th percentile is not approx. expected"
                )
            if expected[5]:
                h2o_util.assertApproxEqual(maxs[0], expected[5], tol=maxDelta, msg="max is not approx. expected")

            hstart = column["hstart"]
            hstep = column["hstep"]
            hbrk = column["hbrk"]
            hcnt = column["hcnt"]

            print "pct:", pct
            print ""

            print "hcnt:", hcnt
            print "len(hcnt)", len(hcnt)

            # don't check the last bin
            for b in hcnt[1:-1]:
                # should we be able to check for a uniform distribution in the files?
                e = numRows / len(hcnt)  # expect 21 thresholds, so 20 bins. each 5% of rows (uniform distribution)
                # don't check the edge bins
                self.assertAlmostEqual(
                    b, numRows / len(hcnt), delta=1 + 0.01 * numRows, msg="Bins not right. b: %s e: %s" % (b, e)
                )

            pt = h2o_util.twoDecimals(pctile)
            mx = h2o_util.twoDecimals(maxs)
            mn = h2o_util.twoDecimals(mins)
            print "colname:", colname, "pctile (2 places):", pt
            print "colname:", colname, "maxs: (2 places):", mx
            print "colname:", colname, "mins: (2 places):", mn

            # FIX! we should do an exec and compare using the exec quantile too
            compareActual = mn[0], pt[3], pt[5], pt[7], mx[0]
            h2p.green_print("min/25/50/75/max colname:", colname, "(2 places):", compareActual)
            print "maxs colname:", colname, "(2 places):", mx
            print "mins colname:", colname, "(2 places):", mn

            trial += 1
            h2o.nodes[0].remove_all_keys()

            scipyCol = 0

            # don't check if colname is empty..means it's a string and scipy doesn't parse right?
            if colname != "":
                # don't do for enums
                # also get the median with a sort (h2o_summ.percentileOnSortedlist()
                h2o_summ.quantile_comparisons(
                    csvPathnameFull,
                    col=scipyCol,  # what col to extract from the csv
                    datatype="float",
                    quantile=0.5 if DO_MEDIAN else 0.999,
                    h2oSummary2=pctile[5 if DO_MEDIAN else 10],
                    # h2oQuantilesApprox=qresult_single,
                    h2oQuantilesExact=qresult,
                )
Beispiel #57
0
    def test_summary2_exp(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        LAMBD = random.uniform(0.005, 0.5)
        tryList = [
            # colname, (min, 25th, 50th, 75th, max)
            (10, 1, 'x.hex', 1, 20000, ('C1', None, None, None, None, None)),
            (100, 1, 'x.hex', 1, 20000, ('C1', None, None, None, None, None)),
            (1000, 1, 'x.hex', -5000, 0, ('C1', None, None, None, None, None)),
            (10000, 1, 'x.hex', -100000, 100000, ('C1', None, None, None, None,
                                                  None)),
            (100000, 1, 'x.hex', -1, 1, ('C1', None, None, None, None, None)),
            (1000000, 1, 'A.hex', 1, 100, ('C1', None, None, None, None,
                                           None)),
        ]

        timeoutSecs = 10
        trial = 1
        n = h2o.nodes[0]
        lenNodes = len(h2o.nodes)

        x = 0
        timeoutSecs = 60
        # rangeMin and rangeMax are not used right now
        for (rowCount, colCount, hex_key, rangeMin, rangeMax,
             expected) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            x += 1

            csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(
                colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname, "lambd:", LAMBD
            (expectedMin, expectedMax) = write_syn_dataset(csvPathname,
                                                           rowCount,
                                                           colCount,
                                                           lambd=LAMBD,
                                                           SEED=SEEDPERFILE)
            print "expectedMin:", expectedMin, "expectedMax:", expectedMax
            maxDelta = ((expectedMax - expectedMin) / 20.0) / 2.0
            # add 5% for fp errors?
            maxDelta = 1.05 * maxDelta

            csvPathnameFull = h2i.find_folder_and_filename(None,
                                                           csvPathname,
                                                           returnFullPath=True)
            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='put',
                                           hex_key=hex_key,
                                           timeoutSecs=30,
                                           doSummary=False)
            print "Parse result['destination_key']:", parseResult[
                'destination_key']

            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvFilename

            numRows = inspect["numRows"]
            numCols = inspect["numCols"]

            summaryResult = h2o_cmd.runSummary(key=hex_key,
                                               max_qbins=MAX_QBINS)
            h2o.verboseprint("Summary2 summaryResult:",
                             h2o.dump_json(summaryResult))

            # only one column
            column = summaryResult['summaries'][0]
            colname = column['colname']
            coltype = column['type']
            nacnt = column['nacnt']
            stats = column['stats']
            stattype = stats['type']

            # FIX! we should compare mean and sd to expected?
            mean = stats['mean']
            sd = stats['sd']

            print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(
                mean)
            print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals(
                sd)

            zeros = stats['zeros']
            mins = stats['mins']
            maxs = stats['maxs']
            pct = stats['pct']
            expectedPct = [
                0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99
            ]
            pctile = stats['pctile']
            # the thresholds h2o used, should match what we expected
            if expected[0]:
                self.assertEqual(colname, expected[0])
            if expected[1]:
                h2o_util.assertApproxEqual(mins[0],
                                           expected[1],
                                           tol=maxDelta,
                                           msg='min is not approx. expected')
            if expected[2]:
                h2o_util.assertApproxEqual(
                    pctile[3],
                    expected[2],
                    tol=maxDelta,
                    msg='25th percentile is not approx. expected')
            if expected[3]:
                h2o_util.assertApproxEqual(
                    pctile[5],
                    expected[3],
                    tol=maxDelta,
                    msg='50th percentile (median) is not approx. expected')
            if expected[4]:
                h2o_util.assertApproxEqual(
                    pctile[7],
                    expected[4],
                    tol=maxDelta,
                    msg='75th percentile is not approx. expected')
            if expected[5]:
                h2o_util.assertApproxEqual(maxs[0],
                                           expected[5],
                                           tol=maxDelta,
                                           msg='max is not approx. expected')

            hstart = column['hstart']
            hstep = column['hstep']
            hbrk = column['hbrk']
            hcnt = column['hcnt']

            print "pct:", pct
            print ""

            print "hcnt:", hcnt
            print "len(hcnt)", len(hcnt)

            print "Can't estimate the bin distribution"

            pt = h2o_util.twoDecimals(pctile)
            mx = h2o_util.twoDecimals(maxs)
            mn = h2o_util.twoDecimals(mins)
            print "colname:", colname, "pctile (2 places):", pt
            print "colname:", colname, "maxs: (2 places):", mx
            print "colname:", colname, "mins: (2 places):", mn

            # FIX! we should do an exec and compare using the exec quantile too
            compareActual = mn[0], pt[3], pt[5], pt[7], mx[0]
            h2p.green_print("min/25/50/75/max colname:", colname,
                            "(2 places):", compareActual)
            print "maxs colname:", colname, "(2 places):", mx
            print "mins colname:", colname, "(2 places):", mn

            trial += 1
            h2o.nodes[0].remove_all_keys()

            scipyCol = 0
            if colname != '' and expected[scipyCol]:
                # don't do for enums
                # also get the median with a sort (h2o_summ.percentileOnSortedlist()
                h2o_summ.quantile_comparisons(
                    csvPathnameFull,
                    skipHeader=True,
                    col=scipyCol,
                    datatype='float',
                    quantile=0.5 if DO_MEDIAN else 0.999,
                    h2oSummary2=pctile[5 if DO_MEDIAN else 10],
                    # h2oQuantilesApprox=qresult_single,
                    # h2oQuantilesExact=qresult,
                )