Example #1
    def test_summary2_exp(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        LAMBD = random.uniform(0.005, 0.5)
        tryList = [
            # colname, (min, 25th, 50th, 75th, max)
            (10, 1, "x.hex", 1, 20000, ("C1", None, None, None, None, None)),
            (100, 1, "x.hex", 1, 20000, ("C1", None, None, None, None, None)),
            (1000, 1, "x.hex", -5000, 0, ("C1", None, None, None, None, None)),
            (10000, 1, "x.hex", -100000, 100000, ("C1", None, None, None, None, None)),
            (100000, 1, "x.hex", -1, 1, ("C1", None, None, None, None, None)),
            (1000000, 1, "A.hex", 1, 100, ("C1", None, None, None, None, None)),
        ]

        timeoutSecs = 10
        trial = 1
        n = h2o.nodes[0]
        lenNodes = len(h2o.nodes)

        x = 0
        timeoutSecs = 60
        # rangeMin and rangeMax are not used right now
        for (rowCount, colCount, hex_key, rangeMin, rangeMax, expected) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            x += 1

            csvFilename = "syn_" + "binary" + "_" + str(rowCount) + "x" + str(colCount) + ".csv"
            csvPathname = SYNDATASETS_DIR + "/" + csvFilename

            print "Creating random", csvPathname, "lambd:", LAMBD
            (expectedMin, expectedMax) = write_syn_dataset(
                csvPathname, rowCount, colCount, lambd=LAMBD, SEED=SEEDPERFILE
            )
            print "expectedMin:", expectedMin, "expectedMax:", expectedMax
            maxDelta = ((expectedMax - expectedMin) / 20.0) / 2.0
            # add 5% for fp errors?
            maxDelta = 1.05 * maxDelta

            csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True)
            parseResult = h2i.import_parse(
                path=csvPathname, schema="put", hex_key=hex_key, timeoutSecs=30, doSummary=False
            )
            print "Parse result['destination_key']:", parseResult["destination_key"]

            inspect = h2o_cmd.runInspect(None, parseResult["destination_key"])
            print "\n" + csvFilename

            numRows = inspect["numRows"]
            numCols = inspect["numCols"]

            summaryResult = h2o_cmd.runSummary(key=hex_key, max_qbins=MAX_QBINS)
            h2o.verboseprint("Summary2 summaryResult:", h2o.dump_json(summaryResult))

            # only one column
            column = summaryResult["summaries"][0]
            colname = column["colname"]
            coltype = column["type"]
            nacnt = column["nacnt"]
            stats = column["stats"]
            stattype = stats["type"]

            # FIX! we should compare mean and sd to expected?
            mean = stats["mean"]
            sd = stats["sd"]

            print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(mean)
            print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals(sd)

            zeros = stats["zeros"]
            mins = stats["mins"]
            maxs = stats["maxs"]
            pct = stats["pct"]
            expectedPct = [0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99]
            pctile = stats["pctile"]
            # the thresholds h2o used, should match what we expected
            if expected[0]:
                self.assertEqual(colname, expected[0])
            if expected[1]:
                h2o_util.assertApproxEqual(mins[0], expected[1], tol=maxDelta, msg="min is not approx. expected")
            if expected[2]:
                h2o_util.assertApproxEqual(
                    pctile[3], expected[2], tol=maxDelta, msg="25th percentile is not approx. expected"
                )
            if expected[3]:
                h2o_util.assertApproxEqual(
                    pctile[5], expected[3], tol=maxDelta, msg="50th percentile (median) is not approx. expected"
                )
            if expected[4]:
                h2o_util.assertApproxEqual(
                    pctile[7], expected[4], tol=maxDelta, msg="75th percentile is not approx. expected"
                )
            if expected[5]:
                h2o_util.assertApproxEqual(maxs[0], expected[5], tol=maxDelta, msg="max is not approx. expected")

            hstart = column["hstart"]
            hstep = column["hstep"]
            hbrk = column["hbrk"]
            hcnt = column["hcnt"]

            print "pct:", pct
            print ""

            print "hcnt:", hcnt
            print "len(hcnt)", len(hcnt)

            print "Can't estimate the bin distribution"

            pt = h2o_util.twoDecimals(pctile)
            mx = h2o_util.twoDecimals(maxs)
            mn = h2o_util.twoDecimals(mins)
            print "colname:", colname, "pctile (2 places):", pt
            print "colname:", colname, "maxs: (2 places):", mx
            print "colname:", colname, "mins: (2 places):", mn

            # FIX! we should do an exec and compare using the exec quantile too
            compareActual = mn[0], pt[3], pt[5], pt[7], mx[0]
            h2p.green_print("min/25/50/75/max colname:", colname, "(2 places):", compareActual)
            print "maxs colname:", colname, "(2 places):", mx
            print "mins colname:", colname, "(2 places):", mn

            trial += 1
            h2o.nodes[0].remove_all_keys()

            scipyCol = 0
            if colname != "" and expected[scipyCol]:
                # don't do for enums
                # also get the median with a sort (h2o_summ.percentileOnSortedlist())
                h2o_summ.quantile_comparisons(
                    csvPathnameFull,
                    skipHeader=True,
                    col=scipyCol,
                    datatype="float",
                    quantile=0.5 if DO_MEDIAN else 0.999,
                    h2oSummary2=pctile[5 if DO_MEDIAN else 10],
                    # h2oQuantilesApprox=qresult_single,
                    # h2oQuantilesExact=qresult,
                )
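
A minimal sketch of the tolerance arithmetic used above (the names are illustrative, not part of the test harness): the comparison tolerance is half of one of roughly 20 histogram bins, widened by 5% for floating-point noise.

    def summary_tolerance(expected_min, expected_max, n_bins=20, fp_fuzz=1.05):
        # half a bin width, plus 5% slack for fp error, mirroring maxDelta above
        half_bin = ((expected_max - expected_min) / float(n_bins)) / 2.0
        return fp_fuzz * half_bin

    # e.g. for a column spanning -5000..0 the tolerance works out to about 131.25
    assert abs(summary_tolerance(-5000, 0) - 131.25) < 1e-9
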
Example #2
    def test_quant_cmp_uniform(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (5*ROWS, 1, 'x.hex', 1, 20000,        ['C1',  1.10, 5000.0, 10000.0, 15000.0, 20000.00]),
            (5*ROWS, 1, 'x.hex', -5000, 0,        ['C1', -5001.00, -3750.0, -2445, -1200.0, 99]),
            (1*ROWS, 1, 'x.hex', -100000, 100000, ['C1',  -100001.0, -50000.0, 1613.0, 50000.0, 100000.0]),
            (1*ROWS, 1, 'x.hex', -1, 1,           ['C1',  -1.05, -0.48, 0.0087, 0.50, 1.00]),

            (1*ROWS, 1, 'A.hex', 1, 100,          ['C1',   1.05, 26.00, 51.00, 76.00, 100.0]),
            (1*ROWS, 1, 'A.hex', -99, 99,         ['C1',  -99, -50.0, 0, 50.00, 99]),

            (1*ROWS, 1, 'B.hex', 1, 10000,        ['C1',   1.05, 2501.00, 5001.00, 7501.00, 10000.00]),
            (1*ROWS, 1, 'B.hex', -100, 100,       ['C1',  -100.10, -50.0, 0.85, 51.7, 100.00]),

            (1*ROWS, 1, 'C.hex', 1, 100000,       ['C1',   1.05, 25002.00, 50002.00, 75002.00, 100000.00]),
            (1*ROWS, 1, 'C.hex', -101, 101,       ['C1',  -100.10, -50.45, -1.18, 49.28, 100.00]),
        ]

        timeoutSecs = 10
        trial = 1
        n = h2o.nodes[0]
        lenNodes = len(h2o.nodes)

        x = 0
        timeoutSecs = 60
        for (rowCount, colCount, hex_key, expectedMin, expectedMax, expected) in tryList:
            # max error = half the bin size?
            colname = expected[0]
            maxDelta = ((expectedMax - expectedMin)/1000.0) / 2.0

            # add 5% for fp errors?
            maxDelta = 1.05 * maxDelta

            SEEDPERFILE = random.randint(0, sys.maxint)
            x += 1
            csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE)
            # need the full pathname when python parses the csv for numpy/sort
            csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True)

            #***************************
            # Parse
            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30, doSummary=False)
            pA = h2o_cmd.ParseObj(parseResult, expectedNumRows=rowCount, expectedNumCols=colCount)
            numRows = pA.numRows
            numCols = pA.numCols
            parse_key = pA.parse_key
            # this guy can take json object as first thing, or re-read with key
            iA = h2o_cmd.InspectObj(parse_key,
                expectedNumRows=rowCount, expectedNumCols=colCount, expectedMissinglist=[])

            #***************************
            # Summary
            co = h2o_cmd.runSummary(key=parse_key)
            default_pctiles = co.default_pctiles

            coList = [ co.base, len(co.bins), len(co.data), co.domain,
                co.label, co.maxs, co.mean, co.mins, co.missing, co.ninfs, co.pctiles,
                co.pinfs, co.precision, co.sigma, co.str_data, co.stride, co.type, co.zeros]
            for c in coList:
                print c

            print "len(co.bins):", len(co.bins)
            print "co.label:", co.label, "mean (2 places):", h2o_util.twoDecimals(co.mean)
            print "co.label:", co.label, "std dev. (2 places):", h2o_util.twoDecimals(co.sigma)

            print "FIX! hacking the co.pctiles because it's short by two"
            summ_pctiles = [0] + co.pctiles + [0]

            pt = h2o_util.twoDecimals(summ_pctiles)
            mx = h2o_util.twoDecimals(co.maxs)
            mn = h2o_util.twoDecimals(co.mins)
            exp = h2o_util.twoDecimals(expected[1:])

            print "co.label:", co.label, "co.pctiles (2 places):", pt
            print "default_pctiles:", default_pctiles
            print "co.label:", co.label, "co.maxs: (2 places):", mx
            print "co.label:", co.label, "co.mins: (2 places):", mn

            # FIX! we should do an exec and compare using the exec quantile too
            h2p.green_print("min/25/50/75/max co.label:", co.label, "(2 places):",\
                mn[0], pt[3], pt[5], pt[7], mx[0])
            h2p.green_print("min/25/50/75/max co.label:", co.label, "(2 places):",\
                exp[0], exp[1], exp[2], exp[3], exp[4])

            #***************************
            # Quantile
            # the thresholds h2o used, should match what we expected

            # Using + to concatenate here seems to produce an odd tuple that doesn't look
            # right as an h2o param, so build the string with a join instead. Could also add
            # '[' and ']' to the list first, before the join.
            probsStr = "[%s]" % ",".join(map(str, probsList))
            parameters = {
                'model_id': "a.hex",
                'training_frame': parse_key,
                'validation_frame': parse_key,
                'ignored_columns': None,
                'probs': probsStr,
            }

            model_key = 'qhex'
            bmResult = h2o.n0.build_model(
                algo='quantile',
                model_id=model_key,
                training_frame=parse_key,
                parameters=parameters,
                timeoutSecs=10)
            bm = OutputObj(bmResult, 'bm')

            msec = bm.jobs[0]['msec']
            print "bm msec", msec

            # quantile result is just a job result to a key
            modelResult = h2o.n0.models(key=model_key)
            model = OutputObj(modelResult['models'][0], 'model')

            print "model.output:", model.output
            print "model.output:['quantiles']", model.output['quantiles']
            print "model.output:['iterations']", model.output['iterations']
            print "model.output:['names']", model.output['names']
            quantiles = model.output['quantiles'][0] # why is this a double array
            iterations = model.output['iterations']
            assert iterations == 11, iterations
            print "quantiles: ", quantiles
            print "iterations: ", iterations

            # cmmResult = h2o.n0.compute_model_metrics(model=model_key, frame=parse_key, timeoutSecs=60)
            # cmm = OutputObj(cmmResult, 'cmm')

            # mmResult = h2o.n0.model_metrics(model=model_key, frame=parse_key, timeoutSecs=60)
            # mm = OutputObj(mmResult, 'mm')

            # prResult = h2o.n0.predict(model=model_key, frame=parse_key, timeoutSecs=60)
            # pr = OutputObj(prResult['model_metrics'][0]['predictions'], 'pr')
            h2o_cmd.runStoreView()

            trial += 1
            # compare the last threshold
            if colname!='':
                # don't do for enums
                # also get the median with a sort (h2o_summ.percentileOnSortedlist())
                h2o_summ.quantile_comparisons(
                    csvPathnameFull,
                    col=0, # what col to extract from the csv
                    datatype='float',
                    quantile=CHECK_PCTILE,
                    # h2oSummary2=pctile[-1],
                    # h2oQuantilesApprox=result, # from exec
                    h2oExecQuantiles=quantiles[CHECK_PCTILE_INDEX],
                    )
            h2o.nodes[0].remove_all_keys()
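
The quantile model build above serializes probsList into a bracketed string for the 'probs' parameter. A small sketch of just that step, with a hypothetical probsList (the real list is defined elsewhere in the test module):

    probsList = [0.001, 0.01, 0.1, 0.25, 0.5, 0.75, 0.9, 0.99, 0.999]  # assumed values
    probsStr = "[%s]" % ",".join(map(str, probsList))
    # probsStr is "[0.001,0.01,0.1,0.25,0.5,0.75,0.9,0.99,0.999]", which avoids the
    # odd tuple you can get by concatenating list pieces with +
    assert probsStr.startswith("[") and probsStr.endswith("]")
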
Example #3
    def test_summary2_uniform(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            # colname, (min, 25th, 50th, 75th, max)
            (ROWS, 1, 'x.hex', 1, 20000,        ('C1',  1.10, 5000.0, 10000.0, 15000.0, 20000.00)),
            (ROWS, 1, 'x.hex', -5000, 0,        ('C1', -5001.00, -3750.0, -2445, -1200.0, 99)),
            (ROWS, 1, 'x.hex', -100000, 100000, ('C1',  -100001.0, -50000.0, 1613.0, 50000.0, 100000.0)),
            (ROWS, 1, 'x.hex', -1, 1,           ('C1',  -1.05, -0.48, 0.0087, 0.50, 1.00)),

            (ROWS, 1, 'A.hex', 1, 100,          ('C1',   1.05, 26.00, 51.00, 76.00, 100.0)),
            (ROWS, 1, 'A.hex', -99, 99,         ('C1',  -99, -50.0, 0, 50.00, 99)),

            (ROWS, 1, 'B.hex', 1, 10000,        ('C1',   1.05, 2501.00, 5001.00, 7501.00, 10000.00)),
            (ROWS, 1, 'B.hex', -100, 100,       ('C1',  -100.10, -50.0, 0.85, 51.7, 100.00)),

            (ROWS, 1, 'C.hex', 1, 100000,       ('C1',   1.05, 25002.00, 50002.00, 75002.00, 100000.00)),
            (ROWS, 1, 'C.hex', -101, 101,       ('C1',  -100.10, -50.45, -1.18, 49.28, 100.00)),
        ]

        timeoutSecs = 10
        trial = 1
        n = h2o.nodes[0]
        lenNodes = len(h2o.nodes)

        x = 0
        timeoutSecs = 60
        for (rowCount, colCount, hex_key, expectedMin, expectedMax, expected) in tryList:
            # max error = half the bin size?
        
            maxDelta = ((expectedMax - expectedMin)/20.0) / 2.0
            # add 5% for fp errors?
            maxDelta = 1.05 * maxDelta

            h2o.beta_features = False
            SEEDPERFILE = random.randint(0, sys.maxint)
            x += 1

            csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE)
            h2o.beta_features = False
            csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True)
            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30, doSummary=False)
            print "Parse result['destination_key']:", parseResult['destination_key']

            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvFilename

            numRows = inspect["num_rows"]
            numCols = inspect["num_cols"]
            h2o.beta_features = True
            summaryResult = h2o_cmd.runSummary(key=hex_key, max_qbins=MAX_QBINS)
            h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult))

            # only one column
            column = summaryResult['summaries'][0]
            colname = column['colname']
            self.assertEqual(colname, expected[0])

            quantile = 0.5 if DO_MEDIAN else .999
            # get both answers since we feed both below for checking
            q = h2o.nodes[0].quantiles(source_key=hex_key, column=column['colname'],
                quantile=quantile, max_qbins=MAX_QBINS, multiple_pass=2, interpolation_type=7) # linear
            qresult = q['result']
            qresult_single = q['result_single']
            h2p.blue_print("h2o quantiles result:", qresult)
            h2p.blue_print("h2o quantiles result_single:", qresult_single)
            h2p.blue_print("h2o quantiles iterations:", q['iterations'])
            h2p.blue_print("h2o quantiles interpolated:", q['interpolated'])
            print h2o.dump_json(q)

            coltype = column['type']
            nacnt = column['nacnt']
            stats = column['stats']
            stattype= stats['type']

            # FIX! we should compare mean and sd to expected?
            mean = stats['mean']
            sd = stats['sd']
            print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(mean)
            print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals(sd)

            zeros = stats['zeros']
            mins = stats['mins']
            h2o_util.assertApproxEqual(mins[0], expected[1], tol=maxDelta, msg='min is not approx. expected')
            maxs = stats['maxs']
            h2o_util.assertApproxEqual(maxs[0], expected[5], tol=maxDelta, msg='max is not approx. expected')

            pct = stats['pct']
            # the thresholds h2o used, should match what we expected
            expectedPct= [0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99]

            pctile = stats['pctile']
            h2o_util.assertApproxEqual(pctile[3], expected[2], tol=maxDelta, msg='25th percentile is not approx. expected')
            h2o_util.assertApproxEqual(pctile[5], expected[3], tol=maxDelta, msg='50th percentile (median) is not approx. expected')
            h2o_util.assertApproxEqual(pctile[7], expected[4], tol=maxDelta, msg='75th percentile is not approx. expected')

            hstart = column['hstart']
            hstep = column['hstep']
            hbrk = column['hbrk']
            hcnt = column['hcnt']

            print "pct:", pct
            print "hcnt:", hcnt
            print "len(hcnt)", len(hcnt)

            # skip the first bin and the last bin or two
            # too hard to estimate those when there are ints now, due to floor/ceil int alignment?
            for b in hcnt[1:(-2 if len(hcnt)>2 else -1)]:
                # should we be able to check for a uniform distribution in the files?
                e = numRows/len(hcnt)
                self.assertAlmostEqual(b, rowCount/len(hcnt), delta=.01*rowCount,
                    msg="Bins not right. b: %s e: %s" % (b, e))

            pt = h2o_util.twoDecimals(pctile)
            mx = h2o_util.twoDecimals(maxs)
            mn = h2o_util.twoDecimals(mins)
            print "colname:", colname, "pctile (2 places):", pt
            print "colname:", colname, "maxs: (2 places):", mx
            print "colname:", colname, "mins: (2 places):", mn

            # FIX! we should do an exec and compare using the exec quantile too
            compareActual = mn[0], pt[3], pt[5], pt[7], mx[0]
            h2p.green_print("min/25/50/75/max colname:", colname, "(2 places):", compareActual)
            print "maxs colname:", colname, "(2 places):", mx
            print "mins colname:", colname, "(2 places):", mn

            trial += 1

            # don't check if colname is empty..means it's a string and scipy doesn't parse right?
            if colname!='':
                # don't do for enums
                # also get the median with a sort (h2o_summ.percentileOnSortedlist())
                h2o_summ.quantile_comparisons(
                    csvPathnameFull, 
                    col=0, # what col to extract from the csv
                    datatype='float',
                    quantile=0.5 if DO_MEDIAN else 0.999,
                    h2oSummary2=pctile[5 if DO_MEDIAN else 10], 
                    h2oQuantilesApprox=qresult_single,
                    h2oQuantilesExact=qresult,
                    )

            h2o.nodes[0].remove_all_keys()
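
The pctile indexing above (pctile[3], pctile[5], pctile[7]) follows from the threshold list recorded in expectedPct, assuming the summary returns one value per threshold. A quick sketch of the mapping:

    expectedPct = [0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99]
    assert expectedPct.index(0.25) == 3   # 25th percentile -> pctile[3]
    assert expectedPct.index(0.5) == 5    # median          -> pctile[5]
    assert expectedPct.index(0.75) == 7   # 75th percentile -> pctile[7]
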
Example #4
    def test_summary2_percentile2(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (500000, 2, 'cD', 300, 0, 9), # expectedMin/Max must cause 10 values
            (500000, 2, 'cE', 300, 1, 10), # expectedMin/Max must cause 10 values
            (500000, 2, 'cF', 300, 2, 11), # expectedMin/Max must cause 10 values
        ]

        timeoutSecs = 10
        trial = 1
        n = h2o.nodes[0]
        lenNodes = len(h2o.nodes)

        x = 0
        for (rowCount, colCount, hex_key, timeoutSecs, expectedMin, expectedMax) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            x += 1

            csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            legalValues = {}
            for x in range(expectedMin, expectedMax):
                legalValues[x] = x
        
            write_syn_dataset(csvPathname, rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE)
            h2o.beta_features = False
            csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True)
            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30, doSummary=False)
            print "Parse result['destination_key']:", parseResult['destination_key']

            # We should be able to see the parse result?
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvFilename

            summaryResult = h2o_cmd.runSummary(key=hex_key, cols=0, max_ncols=1)
            if h2o.verbose:
                print "summaryResult:", h2o.dump_json(summaryResult)

            summaries = summaryResult['summaries']
            scipyCol = 0
            for column in summaries:
                colname = column['colname']
                coltype = column['type']
                nacnt = column['nacnt']

                stats = column['stats']
                stattype= stats['type']
                mean = stats['mean']
                sd = stats['sd']
                zeros = stats['zeros']
                mins = stats['mins']
                maxs = stats['maxs']
                pct = stats['pct']
                pctile = stats['pctile']

                hstart = column['hstart']
                hstep = column['hstep']
                hbrk = column['hbrk']
                hcnt = column['hcnt']

                for b in hcnt:
                    e = .1 * rowCount
                    self.assertAlmostEqual(b, .1 * rowCount, delta=.01*rowCount, 
                        msg="Bins not right. b: %s e: %s" % (b, e))

                print "pctile:", pctile
                print "maxs:", maxs
                self.assertEqual(maxs[0], expectedMax)
                print "mins:", mins
                self.assertEqual(mins[0], expectedMin)

                for v in pctile:
                    self.assertTrue(v >= expectedMin,
                        "Percentile value %s should be >= the min dataset value %s" % (v, expectedMin))
                    self.assertTrue(v <= expectedMax,
                        "Percentile value %s should be <= the max dataset value %s" % (v, expectedMax))
            
                eV1 = [1.0, 1.0, 1.0, 3.0, 4.0, 5.0, 7.0, 8.0, 9.0, 10.0, 10.0]
                if expectedMin==1:
                    eV = eV1
                elif expectedMin==0:
                    eV = [e-1 for e in eV1]
                elif expectedMin==2:
                    eV = [e+1 for e in eV1]
                else:
                    raise Exception("Test doesn't have the expected percentileValues for expectedMin: %s" % expectedMin)

            trial += 1

            # if colname!='' and expected[scipyCol]:
            if colname!='':
                # don't do for enums
                # also get the median with a sort (h2o_summ.percentileOnSortedlist())
                h2o_summ.quantile_comparisons(
                    csvPathnameFull,
                    skipHeader=True,
                    col=scipyCol,
                    datatype='float',
                    quantile=0.5 if DO_MEDIAN else 0.999,
                    h2oSummary2=pctile[5 if DO_MEDIAN else 10],
                    # h2oQuantilesApprox=qresult_single,
                    # h2oQuantilesExact=qresult,
                    )
            scipyCol += 1
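
The comparison helper referenced in the comments gets a reference quantile from a plain sort of the csv column. A minimal sketch of one common convention (linear interpolation between closest ranks); this is an assumption for illustration, not necessarily what h2o_summ.percentileOnSortedlist() implements:

    def percentile_on_sorted(sorted_vals, q):
        # q in [0, 1]; interpolate linearly between the two closest ranks
        pos = q * (len(sorted_vals) - 1)
        lo = int(pos)
        hi = min(lo + 1, len(sorted_vals) - 1)
        frac = pos - lo
        return sorted_vals[lo] * (1 - frac) + sorted_vals[hi] * frac

    assert percentile_on_sorted([1, 2, 3, 4], 0.5) == 2.5
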
Example #5
    def test_summary2_exp(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        LAMBD = random.uniform(0.005, 0.5)
        tryList = [
            # co.label, (min, 25th, 50th, 75th, max)
            # parse setup error ? supposedly fixed now
            # (1,     1, 'x.hex', 1, 20000,        ['C1', None, None, None, None, None]),
            (5, 1, 'x.hex', 1, 20000, ['C1', None, None, None, None, None]),
            (10, 1, 'x.hex', 1, 20000, ['C1', None, None, None, None, None]),
            (100, 1, 'x.hex', 1, 20000, ['C1', None, None, None, None, None]),
            (1000, 1, 'x.hex', -5000, 0, ['C1', None, None, None, None, None]),
            (10000, 1, 'x.hex', -100000, 100000,
             ['C1', None, None, None, None, None]),
            (100000, 1, 'x.hex', -1, 1, ['C1', None, None, None, None, None]),
            (1000000, 1, 'A.hex', 1, 100, ['C1', None, None, None, None,
                                           None]),
        ]

        timeoutSecs = 10
        trial = 1
        n = h2o.nodes[0]
        lenNodes = len(h2o.nodes)

        x = 0
        timeoutSecs = 60

        for (rowCount, colCount, hex_key, rangeMin, rangeMax,
             expected) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            x += 1

            csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(
                colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname, "lambd:", LAMBD
            (expectedMin, expectedMax) = write_syn_dataset(csvPathname,
                                                           rowCount,
                                                           colCount,
                                                           lambd=LAMBD,
                                                           SEED=SEEDPERFILE)
            print "expectedMin:", expectedMin, "expectedMax:", expectedMax
            maxErr = ((expectedMax - expectedMin) / 20.0) / 2.0
            # add 5% for fp errors?
            maxErr = 1.05 * maxErr

            expected[1] = expectedMin
            expected[5] = expectedMax

            csvPathnameFull = h2i.find_folder_and_filename(None,
                                                           csvPathname,
                                                           returnFullPath=True)
            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='put',
                                           hex_key=hex_key,
                                           timeoutSecs=30,
                                           doSummary=False)
            pA = h2o_cmd.ParseObj(parseResult,
                                  expectedNumRows=rowCount,
                                  expectedNumCols=colCount)
            print pA.numRows, pA.numCols, pA.parse_key

            iA = h2o_cmd.InspectObj(pA.parse_key,
                                    expectedNumRows=rowCount,
                                    expectedNumCols=colCount,
                                    expectedMissinglist=[])
            print iA.missingList, iA.labelList, iA.numRows, iA.numCols

            # column 0 not used here
            assert len(expected) == 6
            co = h2o_cmd.runSummary(key=hex_key,
                                    column=0,
                                    expected=expected[1:],
                                    maxDelta=maxErr)
            trial += 1
            h2o.nodes[0].remove_all_keys()

            scipyCol = 0
            print "maxErr", maxErr
            if co.label != '' and expected[scipyCol]:
                # don't do for enums
                # also get the median with a sort (h2o_summ.percentileOnSortedlist())
                h2o_summ.quantile_comparisons(
                    csvPathnameFull,
                    skipHeader=False,
                    col=scipyCol,
                    datatype='float',
                    quantile=0.5 if DO_MEDIAN else 0.99,
                    h2oSummary2=co.percentiles[5 if DO_MEDIAN else 9],

                    # h2oQuantilesApprox=qresult_single,
                    # h2oQuantilesExact=qresult,
                    h2oSummary2MaxErr=maxErr,
                )
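
write_syn_dataset() lives elsewhere in the test module; for an exponential column driven by lambd, a plausible single-column stand-in (purely illustrative, not the real helper, which also takes colCount) might look like this:

    import csv
    import random

    def write_exp_csv(path, row_count, lambd, seed):
        # hypothetical generator: one exponential column, returning (min, max) like the test expects
        random.seed(seed)
        values = [random.expovariate(lambd) for _ in range(row_count)]
        with open(path, "w") as f:
            writer = csv.writer(f)
            for v in values:
                writer.writerow([v])
        return (min(values), max(values))
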
Example #6
    def test_summary2_unifiles(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()

        # new with 1000 bins. copy expected from R
        tryList = [
            ('cars.csv', 'c.hex', [
                (None, None,None,None,None,None),
                ('economy (mpg)', None,None,None,None,None),
                ('cylinders', None,None,None,None,None),
            ],
            ),
            ('runifA.csv', 'A.hex', [
                (None,  1.00, 25.00, 50.00, 75.00, 100.0),
                ('x', -99.9, -44.7, 8.26, 58.00, 91.7),
            ],
            ),
            # colname, (min, 25th, 50th, 75th, max)
            ('runif.csv', 'x.hex', [
                (None,  1.00, 5000.0, 10000.0, 15000.0, 20000.00),
                ('D', -5000.00, -3735.0, -2443, -1187.0, 99.8),
                ('E', -100000.0, -49208.0, 1783.8, 50621.9, 100000.0),
                ('F', -1.00, -0.4886, 0.00868, 0.5048, 1.00),
            ],
            ),
            ('runifB.csv', 'B.hex', [
                (None,  1.00, 2501.00, 5001.00, 7501.00, 10000.00),
                ('x', -100.00, -50.1, 0.974, 51.7, 100.00),
            ],
            ),

            ('runifC.csv', 'C.hex', [
                (None,  1.00, 25002.00, 50002.00, 75002.00, 100000.00),
                ('x', -100.00, -50.45, -1.135, 49.28, 100.00),
            ],
            ),
        ]


        timeoutSecs = 15
        trial = 1
        n = h2o.nodes[0]
        lenNodes = len(h2o.nodes)

        timeoutSecs = 60
        for (csvFilename, hex_key, expectedCols) in tryList:

            csvPathname = csvFilename
            csvPathnameFull = h2i.find_folder_and_filename('smalldata', csvPathname, returnFullPath=True)
            parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname,
                schema='put', hex_key=hex_key, timeoutSecs=10, doSummary=False)

            print "Parse result['destination_key']:", parseResult['destination_key']

            # We should be able to see the parse result?
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvFilename

            numRows = inspect["numRows"]
            numCols = inspect["numCols"]

            # okay to get more cols than we want
            # okay to vary MAX_QBINS because we adjust the expected accuracy
            summaryResult = h2o_cmd.runSummary(key=hex_key, max_qbins=MAX_QBINS)
            h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult))
            summaries = summaryResult['summaries']

            scipyCol = 0
            for expected, column in zip(expectedCols, summaries):
                colname = column['colname']
                if expected[0]:
                    self.assertEqual(colname, expected[0])
                else:
                    # if the colname is None, skip it (so we don't barf on strings on the h2o quantile page)
                    scipyCol += 1
                    continue

                quantile = 0.5 if DO_MEDIAN else .999
                # h2o has a problem if a list of columns (or a dictionary) is passed to the 'column' param
                q = h2o.nodes[0].quantiles(source_key=hex_key, column=column['colname'],
                    quantile=quantile, max_qbins=MAX_QBINS, multiple_pass=2, interpolation_type=7) # for comparing to summary2
                qresult = q['result']
                qresult_single = q['result_single']
                h2p.blue_print("h2o quantiles result:", qresult)
                h2p.blue_print("h2o quantiles result_single:", qresult_single)
                h2p.blue_print("h2o quantiles iterations:", q['iterations'])
                h2p.blue_print("h2o quantiles interpolated:", q['interpolated'])
                print h2o.dump_json(q)

                # ('',  '1.00', '25002.00', '50002.00', '75002.00', '100000.00'),

                coltype = column['type']
                nacnt = column['nacnt']

                stats = column['stats']
                stattype= stats['type']
                print stattype

                # FIX! we should compare mean and sd to expected?
                # enums don't have mean or sd?
                if stattype!='Enum':
                    mean = stats['mean']
                    sd = stats['sd']
                    zeros = stats['zeros']
                    mins = stats['mins']
                    maxs = stats['maxs']

                    print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(mean)
                    print "colname:", colname, "std dev. (2 places):",  h2o_util.twoDecimals(sd)

                    pct = stats['pct']
                    print "pct:", pct
                    print ""

                    # the thresholds h2o used, should match what we expected
                    expectedPct= [0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99]
                    pctile = stats['pctile']


                # figure out the expected max error
                # use this for comparing to sklearn/sort
                if expected[1] and expected[5]:
                    expectedRange = expected[5] - expected[1]
                    # because of floor and ceil effects we potentially lose 2 bins (worst case)
                    # the extra bin for the max value is just an extra bin..ignore it
                    expectedBin = expectedRange/(MAX_QBINS-2)
                    maxErr = 0.5 * expectedBin # should we have some fuzz for fp?

                else:
                    print "Test won't calculate max expected error"
                    maxErr = 0
                    

                # hack..assume just one None is enough to ignore for cars.csv
                if expected[1]:
                    h2o_util.assertApproxEqual(mins[0], expected[1], tol=maxErr, msg='min is not approx. expected')
                if expected[2]:
                    h2o_util.assertApproxEqual(pctile[3], expected[2], tol=maxErr, msg='25th percentile is not approx. expected')
                if expected[3]:
                    h2o_util.assertApproxEqual(pctile[5], expected[3], tol=maxErr, msg='50th percentile (median) is not approx. expected')
                if expected[4]:
                    h2o_util.assertApproxEqual(pctile[7], expected[4], tol=maxErr, msg='75th percentile is not approx. expected')
                if expected[5]:
                    h2o_util.assertApproxEqual(maxs[0], expected[5], tol=maxErr, msg='max is not approx. expected')

                hstart = column['hstart']
                hstep = column['hstep']
                hbrk = column['hbrk']
                hcnt = column['hcnt']

                for b in hcnt:
                    # should we be able to check for a uniform distribution in the files?
                    e = .1 * numRows
                    # self.assertAlmostEqual(b, .1 * rowCount, delta=.01*rowCount,
                    #     msg="Bins not right. b: %s e: %s" % (b, e))

                if stattype!='Enum':
                    pt = h2o_util.twoDecimals(pctile)
                    print "colname:", colname, "pctile (2 places):", pt
                    mx = h2o_util.twoDecimals(maxs)
                    mn = h2o_util.twoDecimals(mins)
                    print "colname:", colname, "maxs: (2 places):", mx
                    print "colname:", colname, "mins: (2 places):", mn

                    # FIX! we should do an exec and compare using the exec quantile too
                    actual = mn[0], pt[3], pt[5], pt[7], mx[0]
                    print "min/25/50/75/max colname:", colname, "(2 places):", actual
                    print "maxs colname:", colname, "(2 places):", mx
                    print "mins colname:", colname, "(2 places):", mn

                    # don't check if colname is empty..means it's a string and scipy doesn't parse right?
                    # need to ignore the car names
                    if colname!='' and expected[scipyCol]:
                        # don't do for enums
                        # also get the median with a sort (h2o_summ.percentileOnSortedlist())
                        h2o_summ.quantile_comparisons(
                            csvPathnameFull,
                            skipHeader=True,
                            col=scipyCol,
                            datatype='float',
                            quantile=0.5 if DO_MEDIAN else 0.999,
                            # FIX! ignore for now
                            h2oSummary2=pctile[5 if DO_MEDIAN else 10],
                            h2oQuantilesApprox=qresult_single,
                            h2oQuantilesExact=qresult,
                            h2oSummary2MaxErr=maxErr,
                            )

                        if False and h2o_util.approxEqual(pctile[5], 0.990238116744, tol=0.002, msg='stop here'):
                            raise Exception("stopping to look")
                                


                scipyCol += 1

            trial += 1
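
The max-error bound used above comes from the expected bin width after binning the column range into MAX_QBINS quantile bins. A small sketch of that arithmetic (the MAX_QBINS value here is an assumption; the real constant is defined at module level):

    MAX_QBINS = 1000  # assumed value for illustration
    expected_min, expected_max = 1.00, 100000.00
    expected_range = expected_max - expected_min
    # floor/ceil effects can cost two bins (worst case), plus an extra bin for the max value
    expected_bin = expected_range / (MAX_QBINS - 2)
    max_err = 0.5 * expected_bin  # half a bin, as in the test above
    assert 49 < max_err < 51
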
Example #7
    def test_summary2_uniform_int_w_NA(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        M = 100
        tryList = [
            # colname, (min, 25th, 50th, 75th, max)
            (ROWS, 1, "B.hex", 1, 1000 * M, ("C1", 1.0 * M, 250.0 * M, 500.0 * M, 750.0 * M, 1000.0 * M)),
            (ROWS, 1, "B.hex", 1, 1000, ("C1", 1.0, 250.0, 500.0, 750.0, 1000.0)),
            (ROWS, 1, "x.hex", 1, 20000, ("C1", 1.0, 5000.0, 10000.0, 15000.0, 20000.0)),
            (ROWS, 1, "x.hex", -5000, 0, ("C1", -5000.00, -3750.0, -2500.0, -1250.0, 0)),
            (ROWS, 1, "x.hex", -100000, 100000, ("C1", -100000.0, -50000.0, 0, 50000.0, 100000.0)),
            # (ROWS, 1, 'A.hex', 1, 101,             ('C1',   1.0, 26.00, 51.00, 76.00, 101.0)),
            # (ROWS, 1, 'A.hex', -99, 99,            ('C1',  -99, -49.0, 0, 49.00, 99)),
            (ROWS, 1, "B.hex", 1, 10000, ("C1", 1.0, 2501.0, 5001.0, 7501.0, 10000.0)),
            (ROWS, 1, "B.hex", -100, 100, ("C1", -100.0, -50.0, 0.0, 50.0, 100.0)),
            (ROWS, 1, "C.hex", 1, 100000, ("C1", 1.0, 25001.0, 50001.0, 75001.0, 100000.0)),
            # (ROWS, 1, 'C.hex', -101, 101,          ('C1',  -101, -51, -1, 49.0, 100.0)),
        ]
        if not DO_REAL:
            # only 3 integer values!
            tryList.append((1000000, 1, "x.hex", -1, 1, ("C1", -1.0, -1, 0.000, 1, 1.00)))

        timeoutSecs = 10
        trial = 1
        n = h2o.nodes[0]
        lenNodes = len(h2o.nodes)

        x = 0
        timeoutSecs = 60
        for (rowCount, colCount, hex_key, expectedMin, expectedMax, expected) in tryList:
            # max error = half the bin size?

            maxDelta = (expectedMax - expectedMin) / (MAX_QBINS + 0.0)
            # add 5% for fp errors?
            maxDelta = 1.05 * maxDelta
            # also need to add some variance due to random distribution?
            # maybe a percentage of the mean
            distMean = (expectedMax - expectedMin) / 2
            maxShift = distMean * 0.01
            maxDelta = maxDelta + maxShift

            SEEDPERFILE = random.randint(0, sys.maxint)
            x += 1

            csvFilename = "syn_" + "binary" + "_" + str(rowCount) + "x" + str(colCount) + ".csv"
            csvPathname = SYNDATASETS_DIR + "/" + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE)
            csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True)
            parseResult = h2i.import_parse(
                path=csvPathname, schema="put", hex_key=hex_key, timeoutSecs=60, doSummary=False
            )
            print "Parse result['destination_key']:", parseResult["destination_key"]

            inspect = h2o_cmd.runInspect(None, parseResult["destination_key"])
            print "\n" + csvFilename

            numRows = inspect["numRows"]
            numCols = inspect["numCols"]

            summaryResult = h2o_cmd.runSummary(key=hex_key, max_qbins=MAX_QBINS)
            h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult))

            # only one column
            column = summaryResult["summaries"][0]
            colname = column["colname"]
            self.assertEqual(colname, expected[0])

            coltype = column["type"]
            nacnt = column["nacnt"]

            stats = column["stats"]
            stattype = stats["type"]

            # FIX! we should compare mean and sd to expected?
            mean = stats["mean"]
            sd = stats["sd"]

            print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(mean)
            print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals(sd)

            zeros = stats["zeros"]
            mins = stats["mins"]
            maxs = stats["maxs"]
            h2o_util.assertApproxEqual(mins[0], expected[1], tol=maxDelta, msg="min is not approx. expected")
            h2o_util.assertApproxEqual(maxs[0], expected[5], tol=maxDelta, msg="max is not approx. expected")

            pct = stats["pct"]
            # the thresholds h2o used, should match what we expected
            expectedPct = [0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99]

            pctile = stats["pctile"]
            h2o_util.assertApproxEqual(
                pctile[3], expected[2], tol=maxDelta, msg="25th percentile is not approx. expected"
            )
            h2o_util.assertApproxEqual(
                pctile[5], expected[3], tol=maxDelta, msg="50th percentile (median) is not approx. expected"
            )
            h2o_util.assertApproxEqual(
                pctile[7], expected[4], tol=maxDelta, msg="75th percentile is not approx. expected"
            )

            hstart = column["hstart"]
            hstep = column["hstep"]
            hbrk = column["hbrk"]
            hcnt = column["hcnt"]

            print "pct:", pct
            print "hcnt:", hcnt
            print "len(hcnt)", len(hcnt)

            # don't check the edge bins (first and last)
            for b in hcnt[1:-1]:
                # should we be able to check for a uniform distribution in the files?
                e = numRows / len(hcnt)  # expect 21 thresholds, so 20 bins. each 5% of rows (uniform distribution)
                # don't check the edge bins
                self.assertAlmostEqual(
                    b, rowCount / len(hcnt), delta=0.01 * rowCount, msg="Bins not right. b: %s e: %s" % (b, e)
                )

            pt = h2o_util.twoDecimals(pctile)
            mx = h2o_util.twoDecimals(maxs)
            mn = h2o_util.twoDecimals(mins)
            print "colname:", colname, "pctile (2 places):", pt
            print "colname:", colname, "maxs: (2 places):", mx
            print "colname:", colname, "mins: (2 places):", mn

            # FIX! we should do an exec and compare using the exec quantile too
            compareActual = mn[0], pt[3], pt[5], pt[7], mx[0]
            h2p.green_print("min/25/50/75/max colname:", colname, "(2 places):", compareActual)
            print "maxs colname:", colname, "(2 places):", mx
            print "mins colname:", colname, "(2 places):", mn

            trial += 1

            scipyCol = 0
            # don't check if colname is empty..means it's a string and scipy doesn't parse right?
            if colname != "":
                # don't do for enums
                # also get the median with a sort (h2o_summ.percentileOnSortedlist())
                h2o_summ.quantile_comparisons(
                    csvPathnameFull,
                    col=0,  # what col to extract from the csv
                    datatype="float",
                    quantile=0.5 if DO_MEDIAN else 0.999,
                    h2oSummary2=pctile[5 if DO_MEDIAN else 10],
                    # h2oQuantilesApprox=qresult_single,
                    # h2oQuantilesExact=qresult,
                )

            h2o.nodes[0].remove_all_keys()
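
The histogram check above expects each interior bin of a uniform column to hold roughly numRows/len(hcnt) rows, within 1% of the row count. A compact restatement of that invariant as a standalone check:

    def check_uniform_bins(hcnt, num_rows, tol_frac=0.01):
        # skip the edge bins; compare each interior count to the uniform expectation
        expected_per_bin = num_rows / float(len(hcnt))
        for count in hcnt[1:-1]:
            assert abs(count - expected_per_bin) <= tol_frac * num_rows, (count, expected_per_bin)

    check_uniform_bins([24, 25, 26, 25, 25, 25], 150)
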
Example #8
    def test_summary2_exp(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        LAMBD = random.uniform(0.005, 0.5)
        tryList = [
            # colname, (min, 25th, 50th, 75th, max)
            (5,     1, 'x.hex', 1, 20000,         ['C1', None, None, None, None, None]),
            (10,     1, 'x.hex', 1, 20000,        ['C1', None, None, None, None, None]),
            (100,    1, 'x.hex', 1, 20000,        ['C1', None, None, None, None, None]),
            (1000,   1, 'x.hex', -5000, 0,        ['C1', None, None, None, None, None]),
            (10000,  1, 'x.hex', -100000, 100000, ['C1', None, None, None, None, None]),
            (100000, 1, 'x.hex', -1, 1,           ['C1', None, None, None, None, None]),
            (1000000, 1, 'A.hex', 1, 100,         ['C1', None, None, None, None, None]),
        ]

        timeoutSecs = 10
        trial = 1
        n = h2o.nodes[0]
        lenNodes = len(h2o.nodes)

        x = 0
        timeoutSecs = 60
        # rangeMin and rangeMax are not used right now
        for (rowCount, colCount, hex_key, rangeMin, rangeMax, expected) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            x += 1

            csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname, "lambd:", LAMBD
            (expectedMin, expectedMax) = write_syn_dataset(csvPathname, 
                rowCount, colCount, lambd=LAMBD, SEED=SEEDPERFILE)
            print "expectedMin:", expectedMin, "expectedMax:", expectedMax
            maxDelta = ((expectedMax - expectedMin)/20.0) / 2.0
            # add 5% for fp errors?
            maxDelta = 1.05 * maxDelta

            expected[1] = expectedMin
            expected[5] = expectedMax

            csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True)
            parseResult = h2i.import_parse(path=csvPathname, schema='put', header=0, hex_key=hex_key, timeoutSecs=30, doSummary=False)
            print "Parse result['destination_key']:", parseResult['destination_key']

            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvFilename

            numRows = inspect["numRows"]
            numCols = inspect["numCols"]

            summaryResult = h2o_cmd.runSummary(key=hex_key, max_qbins=MAX_QBINS)
            h2o.verboseprint("Summary2 summaryResult:", h2o.dump_json(summaryResult))

            # only one column
            column = summaryResult['summaries'][0]
            colname = column['colname']
            coltype = column['type']
            nacnt = column['nacnt']
            stats = column['stats']
            stattype= stats['type']

            # FIX! we should compare mean and sd to expected?
            mean = stats['mean']
            sd = stats['sd']

            print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(mean)
            print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals(sd)

            zeros = stats['zeros']
            mins = stats['mins']
            maxs = stats['maxs']
            pct = stats['pct']
            expectedPct= [0.001, 0.001, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.99, 0.999]
            pctile = stats['pctile']
            # the thresholds h2o used, should match what we expected
            if expected[0]:
                self.assertEqual(colname, expected[0])
            if expected[1]:
                h2o_util.assertApproxEqual(mins[0], expected[1], tol=maxDelta, msg='min is not approx. expected')
            if expected[2]:
                h2o_util.assertApproxEqual(pctile[3], expected[2], tol=maxDelta, msg='25th percentile is not approx. expected')
            if expected[3]:
                h2o_util.assertApproxEqual(pctile[5], expected[3], tol=maxDelta, msg='50th percentile (median) is not approx. expected')
            if expected[4]:
                h2o_util.assertApproxEqual(pctile[7], expected[4], tol=maxDelta, msg='75th percentile is not approx. expected')
            if expected[5]:
                h2o_util.assertApproxEqual(maxs[0], expected[5], tol=maxDelta, msg='max is not approx. expected')

            hstart = column['hstart']
            hstep = column['hstep']
            hbrk = column['hbrk']
            hcnt = column['hcnt']

            print "pct:", pct
            print ""

            print "hcnt:", hcnt
            print "len(hcnt)", len(hcnt)

            print "Can't estimate the bin distribution"

            # figure out the expected max error
            # use this for comparing to sklearn/sort
            if expected[1] and expected[5]:
                expectedRange = expected[5] - expected[1]
                # because of floor and ceil effects we potentially lose 2 bins (worst case)
                # the extra bin for the max value is just an extra bin..ignore it
                expectedBin = expectedRange/(MAX_QBINS-2)
                maxErr = expectedBin # should we have some fuzz for fp?

            else:
                print "Test won't calculate max expected error"
                maxErr = 0


            pt = h2o_util.twoDecimals(pctile)
            mx = h2o_util.twoDecimals(maxs)
            mn = h2o_util.twoDecimals(mins)
            print "colname:", colname, "pctile (2 places):", pt
            print "colname:", colname, "maxs: (2 places):", mx
            print "colname:", colname, "mins: (2 places):", mn

            # FIX! we should do an exec and compare using the exec quantile too
            compareActual = mn[0], pt[3], pt[5], pt[7], mx[0]
            h2p.green_print("min/25/50/75/max colname:", colname, "(2 places):", compareActual)
            print "maxs colname:", colname, "(2 places):", mx
            print "mins colname:", colname, "(2 places):", mn

            trial += 1
            h2o.nodes[0].remove_all_keys()

            scipyCol = 0
            if colname!='' and expected[scipyCol]:
                # don't do for enums
                # also get the median with a sort (h2o_summ.percentileOnSortedlist())
                h2o_summ.quantile_comparisons(
                    csvPathnameFull,
                    skipHeader=False,
                    col=scipyCol,
                    datatype='float',
                    quantile=0.5 if DO_MEDIAN else 0.999,
                    h2oSummary2=pctile[5 if DO_MEDIAN else 10],
                    # h2oQuantilesApprox=qresult_single,
                    # h2oQuantilesExact=qresult,
                    h2oSummary2MaxErr=maxErr,
                    )
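
The min/max/percentile comparisons above rely on an approximate-equality check with an absolute tolerance (maxDelta or maxErr). A sketch of the semantics assumed here; h2o_util.assertApproxEqual itself lives in the harness and may differ in detail:

    def assert_approx_equal(actual, expected, tol, msg=""):
        # absolute-tolerance comparison, as used for the min/max/percentile checks above
        if abs(actual - expected) > tol:
            raise AssertionError("%s actual=%s expected=%s tol=%s" % (msg, actual, expected, tol))

    assert_approx_equal(10050.0, 10000.0, tol=75.0)  # within tolerance, passes
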
Example #9
    def test_summary2_percentile2(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (500000, 2, 'cD', 300, 0, 9), # expectedMin/Max must cause 10 values
            (500000, 2, 'cE', 300, 1, 10), # expectedMin/Max must cause 10 values
            (500000, 2, 'cF', 300, 2, 11), # expectedMin/Max must cause 10 values
        ]

        timeoutSecs = 10
        trial = 1
        n = h2o.nodes[0]
        lenNodes = len(h2o.nodes)

        x = 0
        for (rowCount, colCount, hex_key, timeoutSecs, expectedMin, expectedMax) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            x += 1

            csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            legalValues = {}
            for x in range(expectedMin, expectedMax):
                legalValues[x] = x
        
            write_syn_dataset(csvPathname, rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE)
            h2o.beta_features = False
            csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True)
            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=10, doSummary=False)
            print csvFilename, 'parse time:', parseResult['response']['time']
            print "Parse result['destination_key']:", parseResult['destination_key']

            # We should be able to see the parse result?
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvFilename

            h2o.beta_features = True
            summaryResult = h2o_cmd.runSummary(key=hex_key, cols=0, max_ncols=1)
            if h2o.verbose:
                print "summaryResult:", h2o.dump_json(summaryResult)

            summaries = summaryResult['summaries']
            scipyCol = 0
            for column in summaries:
                colname = column['colname']
                coltype = column['type']
                nacnt = column['nacnt']

                stats = column['stats']
                stattype= stats['type']
                mean = stats['mean']
                sd = stats['sd']
                zeros = stats['zeros']
                mins = stats['mins']
                maxs = stats['maxs']
                pct = stats['pct']
                pctile = stats['pctile']

                hstart = column['hstart']
                hstep = column['hstep']
                hbrk = column['hbrk']
                hcnt = column['hcnt']

                for b in hcnt:
                    e = .1 * rowCount
                    self.assertAlmostEqual(b, .1 * rowCount, delta=.01*rowCount, 
                        msg="Bins not right. b: %s e: %s" % (b, e))

                print "pctile:", pctile
                print "maxs:", maxs
                self.assertEqual(maxs[0], expectedMax)
                print "mins:", mins
                self.assertEqual(mins[0], expectedMin)

                for v in pctile:
                    self.assertTrue(v >= expectedMin,
                        "Percentile value %s should be >= the min dataset value %s" % (v, expectedMin))
                    self.assertTrue(v <= expectedMax,
                        "Percentile value %s should be <= the max dataset value %s" % (v, expectedMax))
            
                eV1 = [1.0, 1.0, 1.0, 3.0, 4.0, 5.0, 7.0, 8.0, 9.0, 10.0, 10.0]
                if expectedMin==1:
                    eV = eV1
                elif expectedMin==0:
                    eV = [e-1 for e in eV1]
                elif expectedMin==2:
                    eV = [e+1 for e in eV1]
                else:
                    raise Exception("Test doesn't have the expected percentileValues for expectedMin: %s" % expectedMin)

            trial += 1

            # if colname!='' and expected[scipyCol]:
            if colname!='':
                # don't do for enums
                # also get the median with a sort (h2o_summ.percentileOnSortedlist())
                h2o_summ.quantile_comparisons(
                    csvPathnameFull,
                    skipHeader=True,
                    col=scipyCol,
                    datatype='float',
                    quantile=0.5 if DO_MEDIAN else 0.999,
                    h2oSummary2=pctile[5 if DO_MEDIAN else 10],
                    # h2oQuantilesApprox=qresult_single,
                    # h2oQuantilesExact=qresult,
                    )
            scipyCol += 1
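
The expected percentile vector above is a base list for expectedMin == 1, shifted element-wise for 0 or 2. A tiny sketch that generalizes the three hard-coded cases via an offset:

    def expected_percentiles(expected_min, base=(1.0, 1.0, 1.0, 3.0, 4.0, 5.0, 7.0, 8.0, 9.0, 10.0, 10.0)):
        # the base vector is built for a minimum of 1; shift by the offset of the actual minimum
        offset = expected_min - 1
        return [e + offset for e in base]

    assert expected_percentiles(0)[0] == 0.0
    assert expected_percentiles(2)[-1] == 11.0
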
Example #10
    def test_summary2_uniform(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            # colname, (min, 25th, 50th, 75th, max)
            (ROWS, 1, 'x.hex', 0.0, 20000.0,
             ['C1', 0, 5000.0, 10000.0, 15000.0, 20000.0]),
            (ROWS, 1, 'x.hex', -5000.0, 0.0,
             ['C1', -5000.0, -3750.0, -2500.0, -1250.0, 0.0]),
            (ROWS, 1, 'x.hex', -100000.0, 100000.0,
             ['C1', -100000.0, -50000.0, 0.0, 50000.0, 100000.0]),
            (ROWS, 1, 'x.hex', -1.0, 1.0, ['C1', -1.0, -0.50, 0.0, 0.50, 1.0]),
            (ROWS, 1, 'A.hex', 1.0, 100.0,
             ['C1', 1.0, 26.0, 51.0, 76.0, 100.0]),
            (ROWS, 1, 'A.hex', -99.0, 99.0,
             ['C1', -99.0, -50.0, 0.0, 50.0, 99.0]),
            (ROWS, 1, 'B.hex', 1.0, 10000.0,
             ['C1', 1.0, 2501.0, 5001.0, 7501.0, 10000.0]),
            (ROWS, 1, 'B.hex', -100.0, 100.0,
             ['C1', -100.0, -50.0, 0.0, 50.0, 100.0]),
            (ROWS, 1, 'C.hex', 1.0, 100000.0,
             ['C1', 1.0, 25001.0, 50001.0, 75001.0, 100000.0]),
            (ROWS, 1, 'C.hex', -100.0, 100.0,
             ['C1', -100.0, -50.0, 0.0, 50.0, 100.0]),
        ]

        timeoutSecs = 10
        trial = 1
        n = h2o.nodes[0]
        lenNodes = len(h2o.nodes)

        x = 0
        timeoutSecs = 60
        for (rowCount, colCount, hex_key, expectedMin, expectedMax,
             expected) in tryList:

            SEEDPERFILE = random.randint(0, sys.maxint)
            x += 1

            csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(
                colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            (actualMax, actualMin) = write_syn_dataset(csvPathname, rowCount,
                                                       colCount, expectedMin,
                                                       expectedMax,
                                                       SEEDPERFILE)
            # adjust the min/max depending on what the min/max actually was!
            # the expected 25%/50%/75% will still be off
            expected[1] = actualMin
            expected[5] = actualMax

            # max error = half the bin size?
            # use this for comparing to sklearn/sort
            expectedRange = expectedMax - expectedMin
            # because of floor and ceil effects, we potentially lose 2 bins (worst case)
            # the extra bin for the max value is ignored
            expectedBin = expectedRange / (MAX_QBINS - 2)
            maxDelta = 1 * expectedBin

            # how much error do we get in the random distribution gen? pain. It's a probability issue
            # smaller error likely with larger # of values.
            # the maxDelta used for the scipy/sort compare can be tighter, since it's looking
            # at actual data
            # this is way too coarse. can't get the distribution tight?
            maxDeltaPlusDistVariance = 10 * maxDelta
            # allow some fuzz in the comparison to scipy/sort
            maxDelta = 1.1 * maxDelta
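            # worked example for the 0..20000 row (assuming MAX_QBINS == 1000, which is
            # defined elsewhere in this file): expectedRange = 20000, expectedBin ~ 20,
            # so maxDeltaPlusDistVariance ~ 200 and the final maxDelta ~ 22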

            csvPathnameFull = h2i.find_folder_and_filename(None,
                                                           csvPathname,
                                                           returnFullPath=True)
            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='put',
                                           hex_key=hex_key,
                                           timeoutSecs=30,
                                           doSummary=False)
            print "Parse result['destination_key']:", parseResult[
                'destination_key']

            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvFilename

            numRows = inspect["numRows"]
            numCols = inspect["numCols"]
            summaryResult = h2o_cmd.runSummary(key=hex_key,
                                               max_qbins=MAX_QBINS)
            h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult))

            # only one column
            column = summaryResult['summaries'][0]
            colname = column['colname']
            self.assertEqual(colname, expected[0])

            quantile = 0.5 if DO_MEDIAN else .999
            # get both answers since we feed both below for checking
            q = h2o.nodes[0].quantiles(source_key=hex_key,
                                       column=column['colname'],
                                       quantile=quantile,
                                       max_qbins=MAX_QBINS,
                                       multiple_pass=2,
                                       interpolation_type=7)  # linear
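            # interpolation_type=7 corresponds to R's quantile type 7 (linear
            # interpolation), which is also numpy.percentile's default behavior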
            qresult = q['result']
            qresult_single = q['result_single']
            h2p.blue_print("h2o quantiles result:", qresult)
            h2p.blue_print("h2o quantiles result_single:", qresult_single)
            h2p.blue_print("h2o quantiles iterations:", q['iterations'])
            h2p.blue_print("h2o quantiles interpolated:", q['interpolated'])
            print h2o.dump_json(q)

            coltype = column['type']
            nacnt = column['nacnt']
            stats = column['stats']
            stattype = stats['type']

            # FIX! we should compare mean and sd to expected?
            mean = stats['mean']
            sd = stats['sd']
            print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(
                mean)
            print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals(
                sd)

            zeros = stats['zeros']
            mins = stats['mins']
            # these should match exactly except for fp compare error?
            h2o_util.assertApproxEqual(mins[0],
                                       expected[1],
                                       rel=.00001,
                                       msg='min is not expected')
            maxs = stats['maxs']
            h2o_util.assertApproxEqual(maxs[0],
                                       expected[5],
                                       rel=.00001,
                                       msg='max is not expected')

            pct = stats['pct']
            # the thresholds h2o used, should match what we expected
            expectedPct = [
                0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99
            ]

            pctile = stats['pctile']
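            # pctile[i] lines up with expectedPct[i]: pctile[3] is the 25th percentile,
            # pctile[5] the 50th (median), and pctile[7] the 75th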
            h2o_util.assertApproxEqual(pctile[3], expected[2], tol=maxDeltaPlusDistVariance,
                msg='25th percentile is not approx. expected for generated uniform range %s %s' %\
                    (expectedMin, expectedMax))
            h2o_util.assertApproxEqual(pctile[5], expected[3], tol=maxDeltaPlusDistVariance,
                msg='50th percentile is not approx. expected for generated uniform range %s %s' %\
                    (expectedMin, expectedMax))
            h2o_util.assertApproxEqual(pctile[7], expected[4], tol=maxDeltaPlusDistVariance,
                msg='75th percentile is not approx. expected for generated uniform range %s %s' %\
                    (expectedMin, expectedMax))

            hstart = column['hstart']
            hstep = column['hstep']
            hbrk = column['hbrk']
            hcnt = column['hcnt']

            print "pct:", pct
            print "hcnt:", hcnt
            print "len(hcnt)", len(hcnt)

            # don't check the last bin
            # too hard to estimate when there are ints now, due to floor/ceil int alignment?
            # don't check the last two bins
            for b in hcnt[1:(-2 if len(hcnt) > 2 else -1)]:
                # should we be able to check for a uniform distribution in the files?
                e = numRows / len(hcnt)
                self.assertAlmostEqual(b,
                                       rowCount / len(hcnt),
                                       delta=.01 * rowCount,
                                       msg="Bins not right. b: %s e: %s" %
                                       (b, e))

            pt = h2o_util.twoDecimals(pctile)
            mx = h2o_util.twoDecimals(maxs)
            mn = h2o_util.twoDecimals(mins)
            print "colname:", colname, "pctile (2 places):", pt
            print "colname:", colname, "maxs: (2 places):", mx
            print "colname:", colname, "mins: (2 places):", mn

            # FIX! we should do an exec and compare using the exec quantile too
            compareActual = mn[0], pt[3], pt[5], pt[7], mx[0]
            h2p.green_print("min/25/50/75/max colname:", colname,
                            "(2 places):", compareActual)
            print "maxs colname:", colname, "(2 places):", mx
            print "mins colname:", colname, "(2 places):", mn

            trial += 1

            # don't check if colname is empty..means it's a string and scipy doesn't parse right?
            if colname != '':
                # don't do for enums
                # also get the median with a sort (h2o_summ.percentileOnSortedlist()
                h2o_summ.quantile_comparisons(
                    csvPathnameFull,
                    col=0,  # what col to extract from the csv
                    datatype='float',
                    quantile=0.5 if DO_MEDIAN else 0.999,
                    h2oSummary2=pctile[5 if DO_MEDIAN else 10],
                    h2oQuantilesApprox=qresult_single,
                    h2oQuantilesExact=qresult,
                    h2oSummary2MaxErr=maxDelta,
                )

            h2o.nodes[0].remove_all_keys()
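
Note: write_syn_dataset is defined per test file and is not shown in this listing. For the uniform tests it is presumably along the lines of the sketch below (an assumption based on the call sites: it writes rowCount x colCount uniform values in [expectedMin, expectedMax] and returns the actual max/min generated, so the caller can overwrite expected[1] and expected[5]):

import random

def write_syn_dataset(csvPathname, rowCount, colCount, expectedMin, expectedMax, SEED):
    # sketch only: uniform random values, one csv row per dataset row
    r = random.Random(SEED)
    actualMin = float('inf')
    actualMax = float('-inf')
    dsf = open(csvPathname, 'w')
    for _ in range(rowCount):
        rowData = []
        for _ in range(colCount):
            value = r.uniform(expectedMin, expectedMax)
            actualMin = min(actualMin, value)
            actualMax = max(actualMax, value)
            rowData.append(value)
        dsf.write(','.join(map(str, rowData)) + '\n')
    dsf.close()
    return (actualMax, actualMin)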
Example No. 11
    def test_summary2_uniform_int_w_NA(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        M = 100
        tryList = [
            # colname, (min, 25th, 50th, 75th, max)
            (ROWS, 1, 'B.hex', 1, 1000 * M,
             ('C1', 1.0 * M, 250.0 * M, 500.0 * M, 750.0 * M, 1000.0 * M)),
            (ROWS, 1, 'B.hex', 1, 1000, ('C1', 1.0, 250.0, 500.0, 750.0,
                                         1000.0)),
            (ROWS, 1, 'x.hex', 1, 20000, ('C1', 1.0, 5000.0, 10000.0, 15000.0,
                                          20000.0)),
            (ROWS, 1, 'x.hex', -5000, 0, ('C1', -5000.00, -3750.0, -2500.0,
                                          -1250.0, 0)),
            (ROWS, 1, 'x.hex', -100000, 100000, ('C1', -100000.0, -50000.0, 0,
                                                 50000.0, 100000.0)),

            # (ROWS, 1, 'A.hex', 1, 101,             ('C1',   1.0, 26.00, 51.00, 76.00, 101.0)),
            # (ROWS, 1, 'A.hex', -99, 99,            ('C1',  -99, -49.0, 0, 49.00, 99)),
            (ROWS, 1, 'B.hex', 1, 10000, ('C1', 1.0, 2501.0, 5001.0, 7501.0,
                                          10000.0)),
            (ROWS, 1, 'B.hex', -100, 100, ('C1', -100.0, -50.0, 0.0, 50.0,
                                           100.0)),
            (ROWS, 1, 'C.hex', 1, 100000, ('C1', 1.0, 25001.0, 50001.0,
                                           75001.0, 100000.0)),
            # (ROWS, 1, 'C.hex', -101, 101,          ('C1',  -101, -51, -1, 49.0, 100.0)),
        ]
        if not DO_REAL:
            # only 3 integer values!
            tryList.append(
                (1000000, 1, 'x.hex', -1, 1, ('C1', -1.0, -1, 0.000, 1, 1.00)))

        timeoutSecs = 10
        trial = 1
        n = h2o.nodes[0]
        lenNodes = len(h2o.nodes)

        x = 0
        timeoutSecs = 60
        for (rowCount, colCount, hex_key, expectedMin, expectedMax,
             expected) in tryList:
            # max error = half the bin size?

            maxDelta = ((expectedMax - expectedMin) / (MAX_QBINS + 0.0))
            # add 5% for fp errors?
            maxDelta = 1.05 * maxDelta
            # also need to add some variance due to random distribution?
            # maybe a percentage of the mean
            distMean = (expectedMax - expectedMin) / 2
            maxShift = distMean * .01
            maxDelta = maxDelta + maxShift
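            # worked example for the 1..20000 row (assuming MAX_QBINS == 1000, defined
            # elsewhere in this file): maxDelta ~ 20 * 1.05 = 21, distMean ~ 10000,
            # maxShift ~ 100, so the final maxDelta ~ 121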

            SEEDPERFILE = random.randint(0, sys.maxint)
            x += 1

            csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(
                colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, expectedMin,
                              expectedMax, SEEDPERFILE)
            csvPathnameFull = h2i.find_folder_and_filename(None,
                                                           csvPathname,
                                                           returnFullPath=True)
            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='put',
                                           hex_key=hex_key,
                                           timeoutSecs=60,
                                           doSummary=False)
            print "Parse result['destination_key']:", parseResult[
                'destination_key']

            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvFilename

            numRows = inspect["numRows"]
            numCols = inspect["numCols"]

            summaryResult = h2o_cmd.runSummary(key=hex_key,
                                               max_qbins=MAX_QBINS)
            h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult))

            # only one column
            column = summaryResult['summaries'][0]
            colname = column['colname']
            self.assertEqual(colname, expected[0])

            coltype = column['type']
            nacnt = column['nacnt']

            stats = column['stats']
            stattype = stats['type']

            # FIX! we should compare mean and sd to expected?
            mean = stats['mean']
            sd = stats['sd']

            print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(
                mean)
            print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals(
                sd)

            zeros = stats['zeros']
            mins = stats['mins']
            maxs = stats['maxs']
            h2o_util.assertApproxEqual(mins[0],
                                       expected[1],
                                       tol=maxDelta,
                                       msg='min is not approx. expected')
            h2o_util.assertApproxEqual(maxs[0],
                                       expected[5],
                                       tol=maxDelta,
                                       msg='max is not approx. expected')

            pct = stats['pct']
            # the thresholds h2o used, should match what we expected
            expectedPct = [
                0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99
            ]

            pctile = stats['pctile']
            h2o_util.assertApproxEqual(
                pctile[3],
                expected[2],
                tol=maxDelta,
                msg='25th percentile is not approx. expected')
            h2o_util.assertApproxEqual(
                pctile[5],
                expected[3],
                tol=maxDelta,
                msg='50th percentile (median) is not approx. expected')
            h2o_util.assertApproxEqual(
                pctile[7],
                expected[4],
                tol=maxDelta,
                msg='75th percentile is not approx. expected')

            hstart = column['hstart']
            hstep = column['hstep']
            hbrk = column['hbrk']
            hcnt = column['hcnt']

            print "pct:", pct
            print "hcnt:", hcnt
            print "len(hcnt)", len(hcnt)

            # don't check the last bin
            for b in hcnt[1:-1]:
                # should we be able to check for a uniform distribution in the files?
                # expect 21 thresholds, so 20 bins. each 5% of rows (uniform distribution)
                # don't check the edge bins
                e = numRows / len(hcnt)
                self.assertAlmostEqual(b,
                                       rowCount / len(hcnt),
                                       delta=.01 * rowCount,
                                       msg="Bins not right. b: %s e: %s" %
                                       (b, e))

            pt = h2o_util.twoDecimals(pctile)
            mx = h2o_util.twoDecimals(maxs)
            mn = h2o_util.twoDecimals(mins)
            print "colname:", colname, "pctile (2 places):", pt
            print "colname:", colname, "maxs: (2 places):", mx
            print "colname:", colname, "mins: (2 places):", mn

            # FIX! we should do an exec and compare using the exec quantile too
            compareActual = mn[0], pt[3], pt[5], pt[7], mx[0]
            h2p.green_print("min/25/50/75/max colname:", colname,
                            "(2 places):", compareActual)
            print "maxs colname:", colname, "(2 places):", mx
            print "mins colname:", colname, "(2 places):", mn

            trial += 1

            scipyCol = 0
            # don't check if colname is empty..means it's a string and scipy doesn't parse right?
            if colname != '':
                # don't do for enums
                # also get the median with a sort (h2o_summ.percentileOnSortedlist()
                h2o_summ.quantile_comparisons(
                    csvPathnameFull,
                    col=0,  # what col to extract from the csv
                    datatype='float',
                    quantile=0.5 if DO_MEDIAN else 0.999,
                    h2oSummary2=pctile[5 if DO_MEDIAN else 10],
                    # h2oQuantilesApprox=qresult_single,
                    # h2oQuantilesExact=qresult,
                )

            h2o.nodes[0].remove_all_keys()
Example No. 12
    def test_summary2_percentile(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (100000, 1, 'cD', 300),
            (100000, 2, 'cE', 300),
        ]

        timeoutSecs = 10
        trial = 1
        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            print 'Trial:', trial
            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            legalValues = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10} # set. http://docs.python.org/2/library/stdtypes.html#set
            expectedMin = min(legalValues)
            expectedMax = max(legalValues)
            expectedUnique = (expectedMax - expectedMin) + 1
            mode = 0.5 # rounding to nearest int will shift us from this for expected mean
            expectedMean = 0.5
            expectedSigma = 0.5
            write_syn_dataset(csvPathname, rowCount, colCount, 
                low=expectedMin, high=expectedMax, mode=mode,
                SEED=SEEDPERFILE)

            csvPathnameFull = h2i.find_folder_and_filename('.', csvPathname, returnFullPath=True)
            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, 
                timeoutSecs=10, doSummary=False)
            print "Parse result['destination_key']:", parseResult['destination_key']

            # We should be able to see the parse result?
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvFilename

            summaryResult = h2o_cmd.runSummary(key=hex_key)
            if h2o.verbose:
                print "summaryResult:", h2o.dump_json(summaryResult)

            summaries = summaryResult['summaries']
            scipyCol = 0
            for column in summaries:
                colname = column['colname']
                coltype = column['type']
                nacnt = column['nacnt']

                stats = column['stats']
                stattype = stats['type']
                mean = stats['mean']
                sd = stats['sd']
                zeros = stats['zeros']
                mins = stats['mins']
                maxs = stats['maxs']
                pct = stats['pct']
                pctile = stats['pctile']

                hstart = column['hstart']
                hstep = column['hstep']
                hbrk = column['hbrk']
                hcnt = column['hcnt']
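                # hbrk holds the distinct bin break values for this integer-valued column;
                # the loop below checks that each one is among the 11 legal values written out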

                for b in hbrk:
                    self.assertIn(int(b), legalValues)
                self.assertEqual(len(hbrk), len(legalValues))

                # self.assertAlmostEqual(hcnt[0], 0.5 * rowCount, delta=.01*rowCount)
                # self.assertAlmostEqual(hcnt[1], 0.5 * rowCount, delta=.01*rowCount)

                print "pctile:", pctile
                print "maxs:", maxs
                # we round to int, so we may introduce up to 0.5 rounding error? compared to "mode" target
                self.assertAlmostEqual(maxs[0], expectedMax, delta=0.01)
                print "mins:", mins
                self.assertAlmostEqual(mins[0], expectedMin, delta=0.01)

                for v in pctile:
                    self.assertTrue(v >= expectedMin,
                        "Percentile value %s should all be >= the min dataset value %s" % (v, expectedMin))
                    self.assertTrue(v <= expectedMax,
                        "Percentile value %s should all be <= the max dataset value %s" % (v, expectedMax))

                eV1 = [1.0, 1.0, 1.0, 3.0, 4.0, 5.0, 7.0, 8.0, 9.0, 10.0, 10.0]
                if expectedMin==1:
                    eV = eV1
                elif expectedMin==0:
                    eV = [e-1 for e in eV1]
                elif expectedMin==2:
                    eV = [e+1 for e in eV1]
                else:
                    raise Exception("Test doesn't have the expected percentileValues for expectedMin: %s" % expectedMin)

                if colname!='':
                    # don't do for enums
                    # also get the median with a sort (h2o_summ.percentileOnSortedlist()
                    h2o_summ.quantile_comparisons(
                        csvPathnameFull,
                        skipHeader=True,
                        col=scipyCol,
                        datatype='float',
                        quantile=0.5 if DO_MEDIAN else 0.999,
                        h2oSummary2=pctile[5 if DO_MEDIAN else 10],
                        # h2oQuantilesApprox=qresult_single,
                        # h2oQuantilesExact=qresult,
                        )

                scipyCol += 1
Example No. 13
    def test_summary2_uniform(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            # colname, (min, 25th, 50th, 75th, max)
            (ROWS, 1, 'x.hex', 0.0, 20000.0,        ['C1',  0, 5000.0, 10000.0, 15000.0, 20000.0]),
            (ROWS, 1, 'x.hex', -5000.0, 0.0,        ['C1',  -5000.0, -3750.0, -2500.0, -1250.0, 0.0]),
            (ROWS, 1, 'x.hex', -100000.0, 100000.0, ['C1',  -100000.0, -50000.0, 0.0, 50000.0, 100000.0]),
            (ROWS, 1, 'x.hex', -1.0, 1.0,           ['C1',  -1.0, -0.50, 0.0, 0.50, 1.0]),

            (ROWS, 1, 'A.hex', 1.0, 100.0,          ['C1',   1.0, 26.0, 51.0, 76.0, 100.0]),
            (ROWS, 1, 'A.hex', -99.0, 99.0,         ['C1',  -99.0, -50.0, 0.0, 50.0, 99.0]),

            (ROWS, 1, 'B.hex', 1.0, 10000.0,        ['C1',   1.0, 2501.0, 5001.0, 7501.0, 10000.0]),
            (ROWS, 1, 'B.hex', -100.0, 100.0,       ['C1',  -100.0, -50.0, 0.0, 50.0, 100.0]),

            (ROWS, 1, 'C.hex', 1.0, 100000.0,       ['C1',   1.0, 25001.0, 50001.0, 75001.0, 100000.0]),
            (ROWS, 1, 'C.hex', -100.0, 100.0,       ['C1',  -100.0, -50.0, 0.0, 50.0, 100.0]),
        ]

        timeoutSecs = 10
        trial = 1
        n = h2o.nodes[0]
        lenNodes = len(h2o.nodes)

        x = 0
        timeoutSecs = 60
        for (rowCount, colCount, hex_key, expectedMin, expectedMax, expected) in tryList:

            SEEDPERFILE = random.randint(0, sys.maxint)
            x += 1

            csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            (actualMax, actualMin) = write_syn_dataset(csvPathname, rowCount, colCount, 
                expectedMin, expectedMax, SEEDPERFILE)
            # adjust the min/max depending on what the min/max actually was!
            # the expected 25%/50%/75% will still be off
            expected[1] = actualMin
            expected[5] = actualMax

            # max error = half the bin size?
            # use this for comparing to sklearn/sort
            expectedRange = expectedMax - expectedMin
            # because of floor and ceil effects, we potentially lose 2 bins (worst case)
            # the extra bin for the max value is ignored
            expectedBin = expectedRange/(MAX_QBINS-2)
            maxDelta = 0.5 * expectedBin

            # how much error do we get in the random distribution gen? pain. It's a probability issue
            # smaller error likely with larger # of values.
            # the maxDelta used for the scipy/sort compare can be tighter, since it's looking
            # at actual data
            # this is way too coarse. can't get the distribution tight? 
            maxDeltaPlusDistVariance = 10 * maxDelta
            # allow some fuzz in the comparison to scipy/sort
            maxDelta = 1.1 * maxDelta 

            csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True)
            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, 
                timeoutSecs=30, doSummary=False)
            print "Parse result['destination_key']:", parseResult['destination_key']

            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvFilename

            numRows = inspect["numRows"]
            numCols = inspect["numCols"]
            summaryResult = h2o_cmd.runSummary(key=hex_key, max_qbins=MAX_QBINS)
            h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult))

            # only one column
            column = summaryResult['summaries'][0]
            colname = column['colname']
            self.assertEqual(colname, expected[0])

            quantile = 0.5 if DO_MEDIAN else .999
            # get both answers since we feed both below for checking
            q = h2o.nodes[0].quantiles(source_key=hex_key, column=column['colname'],
                quantile=quantile, max_qbins=MAX_QBINS, multiple_pass=2, interpolation_type=7) # linear
            qresult = q['result']
            qresult_single = q['result_single']
            h2p.blue_print("h2o quantiles result:", qresult)
            h2p.blue_print("h2o quantiles result_single:", qresult_single)
            h2p.blue_print("h2o quantiles iterations:", q['iterations'])
            h2p.blue_print("h2o quantiles interpolated:", q['interpolated'])
            print h2o.dump_json(q)

            coltype = column['type']
            nacnt = column['nacnt']
            stats = column['stats']
            stattype = stats['type']

            # FIX! we should compare mean and sd to expected?
            mean = stats['mean']
            sd = stats['sd']
            print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(mean)
            print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals(sd)

            zeros = stats['zeros']
            mins = stats['mins']
            # these should match exactly except for fp compare error?
            h2o_util.assertApproxEqual(mins[0], expected[1], rel=.00001, msg='min is not expected')
            maxs = stats['maxs']
            h2o_util.assertApproxEqual(maxs[0], expected[5], rel=.00001, msg='max is not expected')

            pct = stats['pct']
            # the thresholds h2o used, should match what we expected
            expectedPct = [0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99]

            pctile = stats['pctile']
            h2o_util.assertApproxEqual(pctile[3], expected[2], tol=maxDeltaPlusDistVariance, 
                msg='25th percentile is not approx. expected for generated uniform range %s %s' %\
                    (expectedMin, expectedMax))
            h2o_util.assertApproxEqual(pctile[5], expected[3], tol=maxDeltaPlusDistVariance, 
                msg='50th percentile is not approx. expected for generated uniform range %s %s' %\
                    (expectedMin, expectedMax))
            h2o_util.assertApproxEqual(pctile[7], expected[4], tol=maxDeltaPlusDistVariance, 
                msg='75th percentile is not approx. expected for generated uniform range %s %s' %\
                    (expectedMin, expectedMax))

            hstart = column['hstart']
            hstep = column['hstep']
            hbrk = column['hbrk']
            hcnt = column['hcnt']

            print "pct:", pct
            print "hcnt:", hcnt
            print "len(hcnt)", len(hcnt)

            # don't check the last bin
            # too hard to estimate when there are ints now, due to floor/ceil int alignment?
            # don't check the last two bins
            for b in hcnt[1:(-2 if len(hcnt)>2 else -1)]:
                # should we be able to check for a uniform distribution in the files?
                e = numRows/len(hcnt)
                self.assertAlmostEqual(b, rowCount/len(hcnt), delta=.01*rowCount,
                    msg="Bins not right. b: %s e: %s" % (b, e))

            pt = h2o_util.twoDecimals(pctile)
            mx = h2o_util.twoDecimals(maxs)
            mn = h2o_util.twoDecimals(mins)
            print "colname:", colname, "pctile (2 places):", pt
            print "colname:", colname, "maxs: (2 places):", mx
            print "colname:", colname, "mins: (2 places):", mn

            # FIX! we should do an exec and compare using the exec quantile too
            compareActual = mn[0], pt[3], pt[5], pt[7], mx[0]
            h2p.green_print("min/25/50/75/max colname:", colname, "(2 places):", compareActual)
            print "maxs colname:", colname, "(2 places):", mx
            print "mins colname:", colname, "(2 places):", mn

            trial += 1


            # don't check if colname is empty..means it's a string and scipy doesn't parse right?
            if colname!='':
                # don't do for enums
                # also get the median with a sort (h2o_summ.percentileOnSortedlist()
                h2o_summ.quantile_comparisons(
                    csvPathnameFull, 
                    col=0, # what col to extract from the csv
                    datatype='float',
                    quantile=0.5 if DO_MEDIAN else 0.999,
                    h2oSummary2=pctile[5 if DO_MEDIAN else 10], 
                    h2oQuantilesApprox=qresult_single,
                    h2oQuantilesExact=qresult,
                    h2oSummary2MaxErr=maxDelta,
                    )

            h2o.nodes[0].remove_all_keys()
Example No. 14
    def test_summary_stepping(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            # co.label, (min, 25th, 50th, 75th, max)
            # parse setup error
            # (1,     1, 'x.hex', 1, 20000,        ['C1', None, None, None, None, None]),
            (5, 1, 'x.hex', 1, 20000,           ['C1', None, None, None, None, None]),
            (5, 1, 'x.hex', 1, 20000,           ['C1', None, None, None, None, None]),
            (5, 1, 'x.hex', 1, 20000,           ['C1', None, None, None, None, None]),
            (5, 1, 'x.hex', -5000, 0,           ['C1', None, None, None, None, None]),
            (5, 1, 'x.hex', .4900, .5000,       ['C1', None, None, None, None, None]),
            (5, 1, 'x.hex', -.5000, -.4900,     ['C1', None, None, None, None, None]),
            (5, 1, 'x.hex', 490, 500,           ['C1', None, None, None, None, None]),
            (5, 1, 'x.hex', -500, -490,         ['C1', None, None, None, None, None]),
            (5, 1, 'x.hex', 49000, 50000,       ['C1', None, None, None, None, None]),
            (5, 1, 'x.hex', -50000, -49000,     ['C1', None, None, None, None, None]),
            (5, 1, 'x.hex', 4900, 5000,         ['C1', None, None, None, None, None]),
            (5, 1, 'x.hex', -5000, -4900,       ['C1', None, None, None, None, None]),
            (5, 1, 'x.hex', -100000, 100000,    ['C1', None, None, None, None, None]),
            (5, 1, 'x.hex', -1, 1,              ['C1', None, None, None, None, None]),
            (5, 1, 'x.hex', 1, 100,             ['C1', None, None, None, None, None]),
        ]

        timeoutSecs = 10
        trial = 1
        n = h2o.nodes[0]
        lenNodes = len(h2o.nodes)

        x = 0
        timeoutSecs = 60

        for (rowCount, colCount, hex_key, rangeMin, rangeMax, expected) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            x += 1

            csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            (expectedMin, expectedMax) = write_syn_dataset(csvPathname, rowCount, colCount, rangeMin, rangeMax, SEED=SEEDPERFILE)
            print "expectedMin:", expectedMin, "expectedMax:", expectedMax
            # add 5% for fp errors?
            maxErr = ((expectedMax - expectedMin)/1000) * 1.05

            expected[1] = expectedMin
            expected[5] = expectedMax

            csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True)
            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30, doSummary=False)
            pA = h2o_cmd.ParseObj(parseResult, expectedNumRows=rowCount, expectedNumCols=colCount)
            print pA.numRows, pA.numCols, pA.parse_key

            iA = h2o_cmd.InspectObj(pA.parse_key,
                expectedNumRows=rowCount, expectedNumCols=colCount, expectedMissinglist=[])
            print iA.missingList, iA.labelList, iA.numRows, iA.numCols

            # column 0 not used here
            assert len(expected) == 6
            co = h2o_cmd.runSummary(key=hex_key, column=0, expected=expected[1:], maxDelta=maxErr)
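            # (assumption: this newer runSummary wrapper does the expected-value checking
            # itself, given expected[1:] and maxDelta, and returns a column object whose
            # .label and .percentiles are used below)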

            trial += 1
            h2o.nodes[0].remove_all_keys()

            scipyCol = 0
            print "maxErr", maxErr
            if co.label!='' and expected[scipyCol]:
                # don't do for enums
                # also get the median with a sort (h2o_summ.percentileOnSortedlist()
                h2o_summ.quantile_comparisons(
                    csvPathnameFull,
                    skipHeader=False,
                    col=scipyCol,
                    datatype='float',
                    quantile=0.5 if DO_MEDIAN else 0.99,
                    h2oSummary2=co.percentiles[5 if DO_MEDIAN else 9],

                    # h2oQuantilesApprox=qresult_single,
                    # h2oQuantilesExact=qresult,
                    h2oSummary2MaxErr=maxErr,
                    )
Example No. 15
    def test_summary2_exp(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        LAMBD = random.uniform(0.005, 0.5)
        tryList = [
            # co.label, (min, 25th, 50th, 75th, max)
            # parse setup error
            # (1,       1, 'x.hex', 1, 20000,         ['C1', None, None, None, None, None]),
            (5,       1, 'x.hex', 1, 20000,         ['C1', None, None, None, None, None]),
            # (10,      1, 'x.hex', 1, 20000,         ['C1', None, None, None, None, None]),
            # (100,     1, 'x.hex', 1, 20000,         ['C1', None, None, None, None, None]),
            # (1000,    1, 'x.hex', -5000, 0,         ['C1', None, None, None, None, None]),
            # (10000,   1, 'x.hex', -100000, 100000,  ['C1', None, None, None, None, None]),
            # (100000,  1, 'x.hex', -1, 1,            ['C1', None, None, None, None, None]),
            # (1000000, 1, 'A.hex', 1, 100,           ['C1', None, None, None, None, None]),
        ]

        timeoutSecs = 10
        trial = 1
        n = h2o.nodes[0]
        lenNodes = len(h2o.nodes)

        x = 0
        timeoutSecs = 60

        class Column(object):
            def __init__(self, column):
                assert isinstance(column, dict)
                for k,v in column.iteritems():
                    setattr(self, k, v) # achieves self.k = v
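                    # e.g. Column({'label': 'C1', 'mean': 0.5}).label == 'C1'
                    # (illustrative dict; the real input is one entry of
                    # summaryResult['frames'][0]['columns'])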

        for (rowCount, colCount, hex_key, rangeMin, rangeMax, expected) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            x += 1

            csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname, "lambd:", LAMBD
            (expectedMin, expectedMax) = write_syn_dataset(csvPathname, 
                rowCount, colCount, lambd=LAMBD, SEED=SEEDPERFILE)
            print "expectedMin:", expectedMin, "expectedMax:", expectedMax
            maxDelta = ((expectedMax - expectedMin)/20.0) / 2.0
            # add 5% for fp errors?
            maxDelta = 1.05 * maxDelta

            expected[1] = expectedMin
            expected[5] = expectedMax

            csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True)
            parseResult = h2i.import_parse(path=csvPathname, schema='put', 
                hex_key=hex_key, timeoutSecs=30, doSummary=False)
            numRows, numCols, parse_key = h2o_cmd.infoFromParse(parseResult)

            inspect = h2o_cmd.runInspect(key=parse_key)
            missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(inspect)
            print "\n" + csvFilename
            # column 0?
            summaryResult = h2o_cmd.runSummary(key=hex_key, column='C1')
            h2o.verboseprint("Summary2 summaryResult:", h2o.dump_json(summaryResult))

            # default_pctiles
            # isText
            # rows
            # off
            # key
            # checksum

            # only one column
            columns = summaryResult['frames'][0]['columns']
            default_pctiles = summaryResult['frames'][0]['default_pctiles']
            co = Column(columns[0])
            # how are enums binned. Stride of 1? (what about domain values)
            coList = [
                co.base,
                len(co.bins),
                len(co.data),
                co.domain,
                co.label,
                co.maxs,
                co.mean,
                co.mins,
                co.missing,
                co.ninfs,
                co.pctiles,
                co.pinfs,
                co.precision,
                co.sigma,
                co.str_data,
                co.stride,
                co.type,
                co.zeros,
                ]

            for c in coList:
                print c

            print "len(co.bins):", len(co.bins)

            print "co.label:", co.label, "mean (2 places):", h2o_util.twoDecimals(co.mean)
            # what is precision. -1?
            print "co.label:", co.label, "std dev. (2 places):", h2o_util.twoDecimals(co.sigma)

            print "FIX! hacking the co.pctiles because it's short by two"
            pctiles = [0] + co.pctiles + [0]
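            # the padding keeps the same index convention as the other tests:
            # pctiles[3]/[5]/[7] are the 25th/50th/75th percentiles checked below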
            
            # the thresholds h2o used, should match what we expected
            if expected[0]:
                self.assertEqual(co.label, expected[0])
            if expected[1]:
                h2o_util.assertApproxEqual(co.mins[0], expected[1], tol=maxDelta, msg='min is not approx. expected')
            if expected[2]:
                h2o_util.assertApproxEqual(pctiles[3], expected[2], tol=maxDelta, msg='25th percentile is not approx. expected')
            if expected[3]:
                h2o_util.assertApproxEqual(pctiles[5], expected[3], tol=maxDelta, msg='50th percentile (median) is not approx. expected')
            if expected[4]:
                h2o_util.assertApproxEqual(pctiles[7], expected[4], tol=maxDelta, msg='75th percentile is not approx. expected')
            if expected[5]:
                h2o_util.assertApproxEqual(co.maxs[0], expected[5], tol=maxDelta, msg='max is not approx. expected')

            # figure out the expected max error
            # use this for comparing to sklearn/sort
            if expected[1] and expected[5]:
                expectedRange = expected[5] - expected[1]
                # because of floor and ceil effects, we potentially lose 2 bins (worst case)
                # the extra bin for the max value is ignored
                expectedBin = expectedRange/(MAX_QBINS-2)
                maxErr = expectedBin # should we have some fuzz for fp?

            else:
                print "Test won't calculate max expected error"
                maxErr = 0

            pt = h2o_util.twoDecimals(pctiles)
            mx = h2o_util.twoDecimals(co.maxs)
            mn = h2o_util.twoDecimals(co.mins)

            print "co.label:", co.label, "co.pctiles (2 places):", pt
            print "default_pctiles:", default_pctiles
            print "co.label:", co.label, "co.maxs: (2 places):", mx
            print "co.label:", co.label, "co.mins: (2 places):", mn

            # FIX! we should do an exec and compare using the exec quantile too
            compareActual = mn[0], pt[3], pt[5], pt[7], mx[0]
            h2p.green_print("min/25/50/75/max co.label:", co.label, "(2 places):", compareActual)
            print "co.label:", co.label, "co.maxs (2 places):", mx
            print "co.label:", co.label, "co.mins (2 places):", mn

            trial += 1
            h2o.nodes[0].remove_all_keys()

            scipyCol = 0
            print "h2oSummary2MaxErr", maxErr
            if co.label!='' and expected[scipyCol]:
                # don't do for enums
                # also get the median with a sort (h2o_summ.percentileOnSortedlist()
                h2o_summ.quantile_comparisons(
                    csvPathnameFull,
                    skipHeader=False,
                    col=scipyCol,
                    datatype='float',
                    quantile=0.5 if DO_MEDIAN else 0.999,
                    h2oSummary2=pctiles[5 if DO_MEDIAN else 10],
                    # h2oQuantilesApprox=qresult_single,
                    # h2oQuantilesExact=qresult,
                    h2oSummary2MaxErr=maxErr,
                    )
Example No. 16
    def test_summary2_unifiles(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()

        # new with 1000 bins. copy expected from R
        tryList = [
            (
                'cars.csv',
                'c.hex',
                [
                    (None, None, None, None, None, None),
                    ('economy (mpg)', None, None, None, None, None),
                    ('cylinders', None, None, None, None, None),
                ],
            ),
            (
                'runifA.csv',
                'A.hex',
                [
                    (None, 1.00, 25.00, 50.00, 75.00, 100.0),
                    ('x', -99.9, -44.7, 8.26, 58.00, 91.7),
                ],
            ),
            # colname, (min, 25th, 50th, 75th, max)
            (
                'runif.csv',
                'x.hex',
                [
                    (None, 1.00, 5000.0, 10000.0, 15000.0, 20000.00),
                    ('D', -5000.00, -3735.0, -2443, -1187.0, 99.8),
                    ('E', -100000.0, -49208.0, 1783.8, 50621.9, 100000.0),
                    ('F', -1.00, -0.4886, 0.00868, 0.5048, 1.00),
                ],
            ),
            (
                'runifB.csv',
                'B.hex',
                [
                    (None, 1.00, 2501.00, 5001.00, 7501.00, 10000.00),
                    ('x', -100.00, -50.1, 0.974, 51.7, 100.00),
                ],
            ),
            (
                'runifC.csv',
                'C.hex',
                [
                    (None, 1.00, 25002.00, 50002.00, 75002.00, 100000.00),
                    ('x', -100.00, -50.45, -1.135, 49.28, 100.00),
                ],
            ),
        ]

        timeoutSecs = 15
        trial = 1
        n = h2o.nodes[0]
        lenNodes = len(h2o.nodes)

        timeoutSecs = 60
        for (csvFilename, hex_key, expectedCols) in tryList:

            csvPathname = csvFilename
            csvPathnameFull = h2i.find_folder_and_filename('smalldata',
                                                           csvPathname,
                                                           returnFullPath=True)
            parseResult = h2i.import_parse(bucket='smalldata',
                                           path=csvPathname,
                                           schema='put',
                                           hex_key=hex_key,
                                           timeoutSecs=10,
                                           doSummary=False)

            print "Parse result['destination_key']:", parseResult[
                'destination_key']

            # We should be able to see the parse result?
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvFilename

            numRows = inspect["numRows"]
            numCols = inspect["numCols"]

            # okay to get more cols than we want
            # okay to vary MAX_QBINS because we adjust the expected accuracy
            summaryResult = h2o_cmd.runSummary(key=hex_key,
                                               max_qbins=MAX_QBINS)
            h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult))
            summaries = summaryResult['summaries']

            scipyCol = 0
            for expected, column in zip(expectedCols, summaries):
                colname = column['colname']
                if expected[0]:
                    self.assertEqual(colname, expected[0],
                                     msg="%s %s" % (colname, expected[0]))
                else:
                    # if the colname is None, skip it (so we don't barf on strings on the h2o quantile page
                    scipyCol += 1
                    continue

                quantile = 0.5 if DO_MEDIAN else .999
                # h2o has problem if a list of columns (or dictionary) is passed to 'column' param
                q = h2o.nodes[0].quantiles(
                    source_key=hex_key,
                    column=column['colname'],
                    quantile=quantile,
                    max_qbins=MAX_QBINS,
                    multiple_pass=2,
                    interpolation_type=7)  # for comparing to summary2
                qresult = q['result']
                qresult_single = q['result_single']
                h2p.blue_print("h2o quantiles result:", qresult)
                h2p.blue_print("h2o quantiles result_single:", qresult_single)
                h2p.blue_print("h2o quantiles iterations:", q['iterations'])
                h2p.blue_print("h2o quantiles interpolated:",
                               q['interpolated'])
                print h2o.dump_json(q)

                # ('',  '1.00', '25002.00', '50002.00', '75002.00', '100000.00'),

                coltype = column['type']
                nacnt = column['nacnt']

                stats = column['stats']
                stattype = stats['type']
                print stattype

                # FIX! we should compare mean and sd to expected?
                # enums don't have mean or sd?
                if stattype != 'Enum':
                    mean = stats['mean']
                    sd = stats['sd']
                    zeros = stats['zeros']
                    mins = stats['mins']
                    maxs = stats['maxs']

                    print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(
                        mean)
                    print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals(
                        sd)

                    pct = stats['pct']
                    print "pct:", pct
                    print ""

                    # the thresholds h2o used, should match what we expected
                    expectedPct = [
                        0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9,
                        0.95, 0.99
                    ]
                    pctile = stats['pctile']

                # figure out the expected max error
                # use this for comparing to sklearn/sort
                if expected[1] and expected[5]:
                    expectedRange = expected[5] - expected[1]
                    # because of floor and ceil effects, we potentially lose 2 bins (worst case)
                    # the extra bin for the max value is ignored
                    expectedBin = expectedRange / (MAX_QBINS - 2)
                    maxErr = 0.5 * expectedBin  # should we have some fuzz for fp?

                else:
                    print "Test won't calculate max expected error"
                    maxErr = 0

                # hack..assume just one None is enough to ignore for cars.csv
                if expected[1]:
                    h2o_util.assertApproxEqual(
                        mins[0],
                        expected[1],
                        tol=maxErr,
                        msg='min is not approx. expected')
                if expected[2]:
                    h2o_util.assertApproxEqual(
                        pctile[3],
                        expected[2],
                        tol=maxErr,
                        msg='25th percentile is not approx. expected')
                if expected[3]:
                    h2o_util.assertApproxEqual(
                        pctile[5],
                        expected[3],
                        tol=maxErr,
                        msg='50th percentile (median) is not approx. expected')
                if expected[4]:
                    h2o_util.assertApproxEqual(
                        pctile[7],
                        expected[4],
                        tol=maxErr,
                        msg='75th percentile is not approx. expected')
                if expected[5]:
                    h2o_util.assertApproxEqual(
                        maxs[0],
                        expected[5],
                        tol=maxErr,
                        msg='max is not approx. expected')

                hstart = column['hstart']
                hstep = column['hstep']
                hbrk = column['hbrk']
                hcnt = column['hcnt']

                for b in hcnt:
                    # should we be able to check for a uniform distribution in the files?
                    e = .1 * numRows
                    # self.assertAlmostEqual(b, .1 * rowCount, delta=.01*rowCount,
                    #     msg="Bins not right. b: %s e: %s" % (b, e))

                if stattype != 'Enum':
                    pt = h2o_util.twoDecimals(pctile)
                    print "colname:", colname, "pctile (2 places):", pt
                    mx = h2o_util.twoDecimals(maxs)
                    mn = h2o_util.twoDecimals(mins)
                    print "colname:", colname, "maxs: (2 places):", mx
                    print "colname:", colname, "mins: (2 places):", mn

                    # FIX! we should do an exec and compare using the exec quantile too
                    actual = mn[0], pt[3], pt[5], pt[7], mx[0]
                    print "min/25/50/75/max colname:", colname, "(2 places):", actual
                    print "maxs colname:", colname, "(2 places):", mx
                    print "mins colname:", colname, "(2 places):", mn

                    # don't check if colname is empty..means it's a string and scipy doesn't parse right?
                    # need to ignore the car names
                    if colname != '' and expected[scipyCol]:
                        # don't do for enums
                        # also get the median with a sort (h2o_summ.percentileOnSortedlist()
                        h2o_summ.quantile_comparisons(
                            csvPathnameFull,
                            skipHeader=True,
                            col=scipyCol,
                            datatype='float',
                            quantile=0.5 if DO_MEDIAN else 0.999,
                            # FIX! ignore for now
                            h2oSummary2=pctile[5 if DO_MEDIAN else 10],
                            h2oQuantilesApprox=qresult_single,
                            h2oQuantilesExact=qresult,
                            h2oSummary2MaxErr=maxErr,
                        )

                        if False and h2o_util.approxEqual(pctile[5],
                                                          0.990238116744,
                                                          tol=0.002,
                                                          msg='stop here'):
                            raise Exception("stopping to look")

                scipyCol += 1

            trial += 1
Example No. 17
    def test_quant_cols(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()

        if getpass.getuser() == 'kevin':
            tryList = [
                (None, '/home/kevin/Downloads/t.csv', 15, 11, 'cE', 300),
                ('home-0xdiag-datasets', 'airlines/year2013.csv', None, None,
                 'cE', 300),
            ]
        else:
            tryList = [
                ('home-0xdiag-datasets', 'airlines/year2013.csv', None, None,
                 'cE', 300),
            ]

        # h2b.browseTheCloud()
        trial = 0
        for (bucket, csvPathname, iColCount, oColCount, hex_key,
             timeoutSecs) in tryList:
            xList = []
            eList = []
            fList = []

            # PARSE*******************************************************
            parseResult = h2i.import_parse(bucket=bucket,
                                           path=csvPathname,
                                           schema='put',
                                           hex_key=hex_key,
                                           timeoutSecs=200,
                                           doSummary=False)
            csvPathnameFull = h2i.find_folder_and_filename(bucket,
                                                           csvPathname,
                                                           returnFullPath=True)

            print "Parse result['destination_key']:", parseResult[
                'destination_key']
            inspect = h2o_cmd.runInspect(key=parseResult['destination_key'])
            h2o_cmd.infoFromInspect(inspect, csvPathname)
            numRows = inspect['numRows']
            numCols = inspect['numCols']

            if not oColCount:
                iColCount = 0

            if not oColCount:
                oColCount = numCols

            colCount = iColCount + oColCount
            for i in range(0, numCols):
                print "Column", i, "summary"
                h2o_cmd.runSummary(key=hex_key, max_qbins=1, cols=i)

            # print h2o.dump_json(inspect)
            levels = h2o.nodes[0].levels(source=hex_key)
            # print "levels result:", h2o.dump_json(levels)

            (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \
                h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=False)

            # error if any col has constant values
            if len(constantValuesDict) != 0:
                # raise Exception("Probably got a col NA'ed and constant values as a result %s" % constantValuesDict)
                print "Probably got a col NA'ed and constant values as a result %s" % constantValuesDict

            # start after the last input col
            levels = h2o.nodes[0].levels(source=hex_key)
            l = levels['levels']
            for column in range(iColCount, iColCount + oColCount):
                if l[column]:
                    print "Skipping", column, "because it's enum (says levels)"
                    continue

                # QUANTILE*******************************************************

                quantile = 0.5 if DO_MEDIAN else .999
                # first output col. always fed by an exec cut, so 0?
                start = time.time()
                # file has headers. use col index
                q = h2o.nodes[0].quantiles(source_key=hex_key,
                                           column=column,
                                           quantile=quantile,
                                           max_qbins=MAX_QBINS,
                                           multiple_pass=1)
                qresult = q['result']
                h2p.red_print("result:", q['result'], "quantile", quantile,
                              "interpolated:", q['interpolated'], "iterations",
                              q['iterations'])
                elapsed = time.time() - start
                print "quantile end on ", hex_key, 'took', elapsed, 'seconds.'
                quantileTime = elapsed

                # don't do for enums
                # also get the median with a sort (h2o_summ.percentileOnSortedlist())
                if 1 == 1:
                    h2o_summ.quantile_comparisons(
                        csvPathnameFull,
                        skipHeader=True,
                        col=column,  # what col to extract from the csv
                        datatype='float',
                        quantile=0.5 if DO_MEDIAN else 0.999,
                        # h2oSummary2=pctile[5 if DO_MEDIAN else 10],
                        # h2oQuantilesApprox=qresult_single,
                        h2oQuantilesExact=qresult,
                        use_genfromtxt=True,
                    )

                trial += 1
                execTime = 0
                xList.append(column)
                eList.append(execTime)
                fList.append(quantileTime)

                # remove all keys*******************************************************
                # what about hex_key?
                if 1 == 0:
                    start = time.time()
                    h2o.nodes[0].remove_all_keys()
                    elapsed = time.time() - start
                    print "remove all keys end on took", elapsed, 'seconds.'

        #****************************************************************
        # PLOTS. look for eplot.jpg and fplot.jpg in local dir?
        if DO_PLOT:
            xLabel = 'column (0 is first)'
            eLabel = 'exec cut time'
            fLabel = 'quantile time'
            eListTitle = ""
            fListTitle = ""
            h2o_gbm.plotLists(xList,
                              xLabel,
                              eListTitle,
                              eList,
                              eLabel,
                              fListTitle,
                              fList,
                              fLabel,
                              server=True)
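
The commented-out quantile_comparisons call above cross-checks H2O's quantile against a value computed locally from the raw csv. A minimal sketch of that kind of reference computation, assuming numpy is available; the file path, column index and quantile in the usage comment are illustrative, not taken from this listing:

    import numpy as np

    def local_quantile(csv_path, col=0, quantile=0.999, skip_header=True):
        # read one numeric column and drop NAs so they don't skew the percentile
        data = np.genfromtxt(csv_path, delimiter=',',
                             skip_header=1 if skip_header else 0, usecols=(col,))
        data = data[~np.isnan(data)]
        # numpy expects the quantile on a 0..100 scale
        return np.percentile(data, quantile * 100.0)

    # illustrative usage, mirroring "quantile=0.5 if DO_MEDIAN else 0.999" above:
    # print "local quantile:", local_quantile('some_file.csv', col=0, quantile=0.5)
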
Example No. 18
    def test_summary2_exp(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        LAMBD = random.uniform(0.005, 0.5)
        tryList = [
            # colname, (min, 25th, 50th, 75th, max)
            (10, 1, 'x.hex', 1, 20000, ('C1', None, None, None, None, None)),
            (100, 1, 'x.hex', 1, 20000, ('C1', None, None, None, None, None)),
            (1000, 1, 'x.hex', -5000, 0, ('C1', None, None, None, None, None)),
            (10000, 1, 'x.hex', -100000, 100000, ('C1', None, None, None, None,
                                                  None)),
            (100000, 1, 'x.hex', -1, 1, ('C1', None, None, None, None, None)),
            (1000000, 1, 'A.hex', 1, 100, ('C1', None, None, None, None,
                                           None)),
        ]

        timeoutSecs = 10
        trial = 1
        n = h2o.nodes[0]
        lenNodes = len(h2o.nodes)

        x = 0
        timeoutSecs = 60
        # rangeMin and rangeMax are not used right now
        for (rowCount, colCount, hex_key, rangeMin, rangeMax,
             expected) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            x += 1

            csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(
                colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname, "lambd:", LAMBD
            (expectedMin, expectedMax) = write_syn_dataset(csvPathname,
                                                           rowCount,
                                                           colCount,
                                                           lambd=LAMBD,
                                                           SEED=SEEDPERFILE)
            print "expectedMin:", expectedMin, "expectedMax:", expectedMax
            maxDelta = ((expectedMax - expectedMin) / 20.0) / 2.0
            # add 5% for fp errors?
            maxDelta = 1.05 * maxDelta

            csvPathnameFull = h2i.find_folder_and_filename(None,
                                                           csvPathname,
                                                           returnFullPath=True)
            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='put',
                                           hex_key=hex_key,
                                           timeoutSecs=30,
                                           doSummary=False)
            print "Parse result['destination_key']:", parseResult[
                'destination_key']

            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvFilename

            numRows = inspect["numRows"]
            numCols = inspect["numCols"]

            summaryResult = h2o_cmd.runSummary(key=hex_key,
                                               max_qbins=MAX_QBINS)
            h2o.verboseprint("Summary2 summaryResult:",
                             h2o.dump_json(summaryResult))

            # only one column
            column = summaryResult['summaries'][0]
            colname = column['colname']
            coltype = column['type']
            nacnt = column['nacnt']
            stats = column['stats']
            stattype = stats['type']

            # FIX! we should compare mean and sd to expected?
            mean = stats['mean']
            sd = stats['sd']

            print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(
                mean)
            print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals(
                sd)

            zeros = stats['zeros']
            mins = stats['mins']
            maxs = stats['maxs']
            pct = stats['pct']
            expectedPct = [
                0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99
            ]
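            # with these 11 thresholds, pctile[3]/pctile[5]/pctile[7] are the 25th/50th/75th
            # percentile values asserted below, and pctile[10] is the 0.99 value used in the
            # scipy comparison at the end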
            pctile = stats['pctile']
            # the thresholds h2o used, should match what we expected
            if expected[0]:
                self.assertEqual(colname, expected[0])
            if expected[1]:
                h2o_util.assertApproxEqual(mins[0],
                                           expected[1],
                                           tol=maxDelta,
                                           msg='min is not approx. expected')
            if expected[2]:
                h2o_util.assertApproxEqual(
                    pctile[3],
                    expected[2],
                    tol=maxDelta,
                    msg='25th percentile is not approx. expected')
            if expected[3]:
                h2o_util.assertApproxEqual(
                    pctile[5],
                    expected[3],
                    tol=maxDelta,
                    msg='50th percentile (median) is not approx. expected')
            if expected[4]:
                h2o_util.assertApproxEqual(
                    pctile[7],
                    expected[4],
                    tol=maxDelta,
                    msg='75th percentile is not approx. expected')
            if expected[5]:
                h2o_util.assertApproxEqual(maxs[0],
                                           expected[5],
                                           tol=maxDelta,
                                           msg='max is not approx. expected')

            hstart = column['hstart']
            hstep = column['hstep']
            hbrk = column['hbrk']
            hcnt = column['hcnt']

            print "pct:", pct
            print ""

            print "hcnt:", hcnt
            print "len(hcnt)", len(hcnt)

            print "Can't estimate the bin distribution"

            pt = h2o_util.twoDecimals(pctile)
            mx = h2o_util.twoDecimals(maxs)
            mn = h2o_util.twoDecimals(mins)
            print "colname:", colname, "pctile (2 places):", pt
            print "colname:", colname, "maxs: (2 places):", mx
            print "colname:", colname, "mins: (2 places):", mn

            # FIX! we should do an exec and compare using the exec quantile too
            compareActual = mn[0], pt[3], pt[5], pt[7], mx[0]
            h2p.green_print("min/25/50/75/max colname:", colname,
                            "(2 places):", compareActual)
            print "maxs colname:", colname, "(2 places):", mx
            print "mins colname:", colname, "(2 places):", mn

            trial += 1
            h2o.nodes[0].remove_all_keys()

            scipyCol = 0
            if colname != '' and expected[scipyCol]:
                # don't do for enums
                # also get the median with a sort (h2o_summ.percentileOnSortedlist())
                h2o_summ.quantile_comparisons(
                    csvPathnameFull,
                    skipHeader=True,
                    col=scipyCol,
                    datatype='float',
                    quantile=0.5 if DO_MEDIAN else 0.999,
                    h2oSummary2=pctile[5 if DO_MEDIAN else 10],
                    # h2oQuantilesApprox=qresult_single,
                    # h2oQuantilesExact=qresult,
                )
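
A note on the tolerance used in the assertions above: maxDelta is half of one bin width for a 20-way split of the expected range, padded by 5% for floating-point error. A stand-alone version of that calculation (the 20-bin split mirrors the /20.0 in the test and is not a documented H2O constant):

    def approx_tolerance(expected_min, expected_max, n_bins=20, fp_pad=1.05):
        # half a bin width, plus a small pad for fp error, as in the maxDelta lines above
        bin_width = (expected_max - expected_min) / float(n_bins)
        return fp_pad * (bin_width / 2.0)

    # e.g. for the (1, 20000) range in tryList:
    # print approx_tolerance(1, 20000)   # ~525.0
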
Example No. 19
    def test_quant_cmp_uniform(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (5 * ROWS, 1, 'x.hex', 1, 20000,
             ['C1', 1.10, 5000.0, 10000.0, 15000.0, 20000.00]),
            (5 * ROWS, 1, 'x.hex', -5000, 0,
             ['C1', -5001.00, -3750.0, -2445, -1200.0, 99]),
            (1 * ROWS, 1, 'x.hex', -100000, 100000,
             ['C1', -100001.0, -50000.0, 1613.0, 50000.0, 100000.0]),
            (1 * ROWS, 1, 'x.hex', -1, 1,
             ['C1', -1.05, -0.48, 0.0087, 0.50, 1.00]),
            (1 * ROWS, 1, 'A.hex', 1, 100,
             ['C1', 1.05, 26.00, 51.00, 76.00, 100.0]),
            (1 * ROWS, 1, 'A.hex', -99, 99, ['C1', -99, -50.0, 0, 50.00, 99]),
            (1 * ROWS, 1, 'B.hex', 1, 10000,
             ['C1', 1.05, 2501.00, 5001.00, 7501.00, 10000.00]),
            (1 * ROWS, 1, 'B.hex', -100, 100,
             ['C1', -100.10, -50.0, 0.85, 51.7, 100.00]),
            (1 * ROWS, 1, 'C.hex', 1, 100000,
             ['C1', 1.05, 25002.00, 50002.00, 75002.00, 100000.00]),
            (1 * ROWS, 1, 'C.hex', -101, 101,
             ['C1', -100.10, -50.45, -1.18, 49.28, 100.00]),
        ]

        timeoutSecs = 10
        trial = 1
        n = h2o.nodes[0]
        lenNodes = len(h2o.nodes)

        x = 0
        timeoutSecs = 60
        for (rowCount, colCount, hex_key, expectedMin, expectedMax,
             expected) in tryList:
            # max error = half the bin size?
            colname = expected[0]
            maxDelta = ((expectedMax - expectedMin) / 1000.0) / 2.0

            # add 5% for fp errors?
            maxDelta = 1.05 * maxDelta

            SEEDPERFILE = random.randint(0, sys.maxint)
            x += 1
            csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(
                colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, expectedMin,
                              expectedMax, SEEDPERFILE)
            # need the full pathname when python parses the csv for numpy/sort
            csvPathnameFull = h2i.find_folder_and_filename(None,
                                                           csvPathname,
                                                           returnFullPath=True)

            #***************************
            # Parse
            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='put',
                                           hex_key=hex_key,
                                           timeoutSecs=30,
                                           doSummary=False)
            pA = h2o_cmd.ParseObj(parseResult,
                                  expectedNumRows=rowCount,
                                  expectedNumCols=colCount)
            numRows = pA.numRows
            numCols = pA.numCols
            parse_key = pA.parse_key
            # this guy can take json object as first thing, or re-read with key
            iA = h2o_cmd.InspectObj(parse_key,
                                    expectedNumRows=rowCount,
                                    expectedNumCols=colCount,
                                    expectedMissinglist=[])

            #***************************
            # Summary
            co = h2o_cmd.runSummary(key=parse_key)
            default_pctiles = co.default_pctiles

            coList = [
                co.base,
                len(co.bins),
                len(co.data), co.domain, co.label, co.maxs, co.mean, co.mins,
                co.missing, co.ninfs, co.pctiles, co.pinfs, co.precision,
                co.sigma, co.str_data, co.stride, co.type, co.zeros
            ]
            for c in coList:
                print c

            print "len(co.bins):", len(co.bins)
            print "co.label:", co.label, "mean (2 places):", h2o_util.twoDecimals(
                co.mean)
            print "co.label:", co.label, "std dev. (2 places):", h2o_util.twoDecimals(
                co.sigma)

            print "FIX! hacking the co.pctiles because it's short by two"
            summ_pctiles = [0] + co.pctiles + [0]

            pt = h2o_util.twoDecimals(summ_pctiles)
            mx = h2o_util.twoDecimals(co.maxs)
            mn = h2o_util.twoDecimals(co.mins)
            exp = h2o_util.twoDecimals(expected[1:])

            print "co.label:", co.label, "co.pctiles (2 places):", pt
            print "default_pctiles:", default_pctiles
            print "co.label:", co.label, "co.maxs: (2 places):", mx
            print "co.label:", co.label, "co.mins: (2 places):", mn

            # FIX! we should do an exec and compare using the exec quantile too
            h2p.green_print("min/25/50/75/max co.label:", co.label, "(2 places):",\
                mn[0], pt[3], pt[5], pt[7], mx[0])
            h2p.green_print("min/25/50/75/max co.label:", co.label, "(2 places):",\
                exp[0], exp[1], exp[2], exp[3], exp[4])

            #***************************
            # Quantile
            # the thresholds h2o used, should match what we expected

            # using + here seems to result in an odd tuple that doesn't look right as an h2o param,
            # so went with this. Could add '[' and ']' to the list first, before the join.
            probsStr = "[%s]" % ",".join(map(str, probsList))
            parameters = {
                'model_id': "a.hex",
                'training_frame': parse_key,
                'validation_frame': parse_key,
                'ignored_columns': None,
                'probs': probsStr,
            }

            model_key = 'qhex'
            bmResult = h2o.n0.build_model(algo='quantile',
                                          model_id=model_key,
                                          training_frame=parse_key,
                                          parameters=parameters,
                                          timeoutSecs=10)
            bm = OutputObj(bmResult, 'bm')

            msec = bm.jobs[0]['msec']
            print "bm msec", msec

            # quantile result is just a job result to a key
            modelResult = h2o.n0.models(key=model_key)
            model = OutputObj(modelResult['models'][0], 'model')

            print "model.output:", model.output
            print "model.output:['quantiles']", model.output['quantiles']
            print "model.output:['iterations']", model.output['iterations']
            print "model.output:['names']", model.output['names']
            # why is this a double array?
            quantiles = model.output['quantiles'][0]
            iterations = model.output['iterations']
            assert iterations == 11, iterations
            print "quantiles: ", quantiles
            print "iterations: ", iterations

            # cmmResult = h2o.n0.compute_model_metrics(model=model_key, frame=parse_key, timeoutSecs=60)
            # cmm = OutputObj(cmmResult, 'cmm')

            # mmResult = h2o.n0.model_metrics(model=model_key, frame=parse_key, timeoutSecs=60)
            # mm = OutputObj(mmResult, 'mm')

            # prResult = h2o.n0.predict(model=model_key, frame=parse_key, timeoutSecs=60)
            # pr = OutputObj(prResult['model_metrics'][0]['predictions'], 'pr')
            h2o_cmd.runStoreView()

            trial += 1
            # compare the last threshold
            if colname != '':
                # don't do for enums
                # also get the median with a sort (h2o_summ.percentileOnSortedlist())
                h2o_summ.quantile_comparisons(
                    csvPathnameFull,
                    col=0,  # what col to extract from the csv
                    datatype='float',
                    quantile=CHECK_PCTILE,
                    # h2oSummary2=pctile[-1],
                    # h2oQuantilesApprox=result, # from exec
                    h2oExecQuantiles=quantiles[CHECK_PCTILE_INDEX],
                )
            h2o.nodes[0].remove_all_keys()
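
write_syn_dataset is defined elsewhere in the module and is not shown in this listing; for these uniform-distribution tests it plausibly writes rowCount rows of colCount values drawn uniformly from [expectedMin, expectedMax], seeded per file. A hypothetical stand-in, for illustration only:

    import random

    def write_uniform_csv(path, row_count, col_count, low, high, seed):
        # hypothetical stand-in for write_syn_dataset: uniform values in [low, high]
        r = random.Random(seed)
        with open(path, 'w') as f:
            for _ in xrange(row_count):
                f.write(','.join(str(r.uniform(low, high)) for _ in xrange(col_count)) + '\n')
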
Example No. 20
    def test_summary2_unifiles2(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        # new with 1000 bins. copy expected from R
        tryList = [
            # colname, (min, 25th, 50th, 75th, max)
            ('breadth.csv', 'b.hex', False, [ ('C1', None, None, None, None, None)], 'smalldata', 'quantiles'),
            # ('wonkysummary.csv', 'b.hex', False, [ ('X1', 7, 22, 876713, 100008, 1000046)], 'smalldata', None),
            ('wonkysummary.csv', 'b.hex', True, [ ('X1', None, None, None, None, None)], 'smalldata', None),
            ('covtype.data', 'c.hex', False, [ ('C1', None, None, None, None, None)], 'home-0xdiag-datasets', 'standard'),

        ]

        timeoutSecs = 10
        trial = 1
        n = h2o.nodes[0]
        lenNodes = len(h2o.nodes)

        x = 0
        timeoutSecs = 60
        for (csvFilename, hex_key, skipHeader, expectedCols, bucket, pathPrefix) in tryList:
            h2o.beta_features = False

            if pathPrefix:
                csvPathname = pathPrefix + "/" + csvFilename
            else:
                csvPathname = csvFilename

            csvPathnameFull = h2i.find_folder_and_filename(bucket, csvPathname, returnFullPath=True)
            if skipHeader:
                header = 1
            else:
                header = 0
            parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, 
                schema='put', header=header, hex_key=hex_key, timeoutSecs=10, doSummary=False)

            print csvFilename, 'parse time:', parseResult['response']['time']
            print "Parse result['destination_key']:", parseResult['destination_key']

            # We should be able to see the parse result?
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvFilename

            numRows = inspect["num_rows"]
            numCols = inspect["num_cols"]

            h2o.beta_features = True
            # okay to get more cols than we want
            summaryResult = h2o_cmd.runSummary(key=hex_key, max_qbins=MAX_QBINS)
            h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult))

            summaries = summaryResult['summaries']

            scipyCol = 0
            for expected, column in zip(expectedCols, summaries):
                colname = column['colname']
                if expected[0]:
                    self.assertEqual(colname, expected[0])

                quantile = 0.5 if DO_MEDIAN else OTHER_Q
                q = h2o.nodes[0].quantiles(source_key=hex_key, column=scipyCol,
                    quantile=quantile, max_qbins=MAX_QBINS, multiple_pass=2)
                qresult = q['result']
                qresult_single = q['result_single']
                qresult_iterations = q['iterations']
                qresult_interpolated = q['interpolated']
                h2p.blue_print("h2o quantiles result:", qresult)
                h2p.blue_print("h2o quantiles result_single:", qresult_single)
                h2p.blue_print("h2o quantiles iterations:", qresult_iterations)
                h2p.blue_print("h2o quantiles interpolated:", qresult_interpolated)
                print h2o.dump_json(q)

                self.assertLess(qresult_iterations, 16, 
                    msg="h2o does max of 16 iterations. likely no result_single if we hit max. is bins=1?")

                # ('',  '1.00', '25002.00', '50002.00', '75002.00', '100000.00'),
                coltype = column['type']
                nacnt = column['nacnt']

                stats = column['stats']
                stattype= stats['type']
                print stattype

                # FIX! we should compare mean and sd to expected?
                # enums don't have mean or sd?
                if stattype!='Enum':
                    mean = stats['mean']
                    sd = stats['sd']
                    zeros = stats['zeros']
                    mins = stats['mins']
                    maxs = stats['maxs']

                    print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(mean)
                    print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals(sd)

                    pct = stats['pct']
                    print "pct:", pct
                    print ""

                    # the thresholds h2o used, should match what we expected
                    pctile = stats['pctile']

                # hack..assume just one None is enough to ignore for cars.csv
                if expected[1]:
                    h2o_util.assertApproxEqual(mins[0], expected[1], rel=0.02, msg='min is not approx. expected')
                if expected[2]:
                    h2o_util.assertApproxEqual(pctile[3], expected[2], rel=0.02, msg='25th percentile is not approx. expected')
                if expected[3]:
                    h2o_util.assertApproxEqual(pctile[5], expected[3], rel=0.02, msg='50th percentile (median) is not approx. expected')
                if expected[4]:
                    h2o_util.assertApproxEqual(pctile[7], expected[4], rel=0.02, msg='75th percentile is not approx. expected')
                if expected[5]:
                    h2o_util.assertApproxEqual(maxs[0], expected[5], rel=0.02, msg='max is not approx. expected')

                hstart = column['hstart']
                hstep = column['hstep']
                hbrk = column['hbrk']
                hcnt = column['hcnt']

                for b in hcnt:
                    # should we be able to check for a uniform distribution in the files?
                    e = .1 * numRows
                    # self.assertAlmostEqual(b, .1 * rowCount, delta=.01*rowCount,
                    #     msg="Bins not right. b: %s e: %s" % (b, e))

                if stattype!='Enum':
                    pt = h2o_util.twoDecimals(pctile)
                    print "colname:", colname, "pctile (2 places):", pt
                    mx = h2o_util.twoDecimals(maxs)
                    mn = h2o_util.twoDecimals(mins)
                    print "colname:", colname, "maxs: (2 places):", mx
                    print "colname:", colname, "mins: (2 places):", mn

                    # FIX! we should do an exec and compare using the exec quantile too
                    actual = mn[0], pt[3], pt[5], pt[7], mx[0]
                    print "min/25/50/75/max colname:", colname, "(2 places):", actual
                    print "maxs colname:", colname, "(2 places):", mx
                    print "mins colname:", colname, "(2 places):", mn

                    ## ignore for blank colnames, issues with quoted numbers
                    # covtype is too big to do in scipy
                    if colname!='' and expected[scipyCol] and csvFilename!= 'covtype.data':
                        # don't do for enums
                        # also get the median with a sort (h2o_summ.percentileOnSortedlist())
                        h2o_summ.quantile_comparisons(
                            csvPathnameFull,
                            skipHeader=skipHeader, # important!!
                            col=scipyCol,
                            datatype='float',
                            quantile=0.5 if DO_MEDIAN else OTHER_Q,
                            h2oSummary2=pctile[5 if DO_MEDIAN else OTHER_Q_SUMM_INDEX],
                            h2oQuantilesApprox=qresult_single,
                            h2oQuantilesExact=qresult,
                            )


                scipyCol += 1

            trial += 1
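
The comments above mention getting the median "with a sort" via h2o_summ.percentileOnSortedlist(). A minimal sketch of that idea, using linear interpolation between the two neighbouring order statistics (illustrative only, not the helper's actual implementation):

    def percentile_on_sorted(sorted_vals, quantile):
        # linear interpolation between adjacent order statistics (R type-7 style)
        n = len(sorted_vals)
        pos = (n - 1) * quantile
        lo = int(pos)
        hi = min(lo + 1, n - 1)
        frac = pos - lo
        return sorted_vals[lo] * (1.0 - frac) + sorted_vals[hi] * frac

    # print percentile_on_sorted(sorted([3.0, 1.0, 2.0, 5.0, 4.0]), 0.5)   # -> 3.0
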
Example No. 21
    def test_summary2_small(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            # colname, (min, 25th, 50th, 75th, max)
            # if rowCount is None, we'll just use  the data values
            # None in expected values means no compare
            (None, 1, 'x.hex', [-1, 0, 1], ('C1', None, None, 0, None, None)),
            (None, 2, 'x.hex', [-1, 0, 1], ('C1', None, None, 0, None, None)),
            (None, 10, 'x.hex', [-1, 0, 1], ('C1', None, None, 0, None, None)),
            (None, 100, 'x.hex', [-1, 0,
                                  1], ('C1', None, None, 0, None, None)),
            (None, 1000, 'x.hex', [-1, 0,
                                   1], ('C1', None, None, 0, None, None)),
            # (None, 10000, 'x.hex', [-1,0,1],        ('C1',  None, None, 0, None, None)),
            # (COLS, 1, 'x.hex', [1,0,-1],        ('C1',  None, None, None, None, None)),
        ]

        timeoutSecs = 10
        trial = 1
        n = h2o.nodes[0]
        lenNodes = len(h2o.nodes)

        x = 0
        timeoutSecs = 60
        for (rowCount, colCount, hex_key, values, expected) in tryList:
            # max error = half the bin size?

            expectedMax = max(values)
            expectedMin = min(values)
            maxDelta = ((expectedMax - expectedMin) / 20.0) / 2.0
            # add 5% for fp errors?
            maxDelta = 1.05 * maxDelta

            # hmm...say we should be 100% accurate for these tests?
            maxDelta = 0

            SEEDPERFILE = random.randint(0, sys.maxint)
            x += 1

            if not rowCount:
                rowFile = len(values)
            else:
                rowFile = rowCount
            csvFilename = 'syn_' + "binary" + "_" + str(rowFile) + 'x' + str(
                colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, values,
                              SEEDPERFILE)

            csvPathnameFull = h2i.find_folder_and_filename(None,
                                                           csvPathname,
                                                           returnFullPath=True)
            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='put',
                                           hex_key=hex_key,
                                           timeoutSecs=30,
                                           doSummary=False)
            print "Parse result['destination_key']:", parseResult[
                'destination_key']

            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvFilename

            numRows = inspect["numRows"]
            numCols = inspect["numCols"]

            summaryResult = h2o_cmd.runSummary(key=hex_key,
                                               max_qbins=MAX_QBINS,
                                               timeoutSecs=45)
            h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult))

            quantile = 0.5 if DO_MEDIAN else .999
            q = h2o.nodes[0].quantiles(source_key=hex_key,
                                       column=0,
                                       interpolation_type=7,
                                       quantile=quantile,
                                       max_qbins=MAX_QBINS,
                                       multiple_pass=2)
            qresult = q['result']
            qresult_single = q['result_single']
            qresult_iterations = q['iterations']
            qresult_interpolated = q['interpolated']
            h2p.blue_print("h2o quantiles result:", qresult)
            h2p.blue_print("h2o quantiles result_single:", qresult_single)
            h2p.blue_print("h2o quantiles iterations:", qresult_iterations)
            h2p.blue_print("h2o quantiles interpolated:", qresult_interpolated)
            print h2o.dump_json(q)

            self.assertLess(
                qresult_iterations, 16,
                msg="h2o does max of 16 iterations. likely no result_single if we hit max. is bins=1?")

            # only one column
            column = summaryResult['summaries'][0]

            colname = column['colname']

            coltype = column['type']
            nacnt = column['nacnt']

            stats = column['stats']
            stattype = stats['type']

            # FIX! we should compare mean and sd to expected?
            mean = stats['mean']
            sd = stats['sd']

            print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(
                mean)
            print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals(
                sd)

            zeros = stats['zeros']
            mins = stats['mins']
            maxs = stats['maxs']
            pct = stats['pct']
            # the thresholds h2o used, should match what we expected
            expectedPct = [
                0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99
            ]

            pctile = stats['pctile']
            print "pctile:", pctile
            if expected[0]:
                self.assertEqual(colname, expected[0])
            if expected[1]:
                h2o_util.assertApproxEqual(mins[0],
                                           expected[1],
                                           tol=maxDelta,
                                           msg='min is not approx. expected')
            if expected[2]:
                h2o_util.assertApproxEqual(
                    pctile[3],
                    expected[2],
                    tol=maxDelta,
                    msg='25th percentile is not approx. expected')
            if expected[3]:
                h2o_util.assertApproxEqual(
                    pctile[5],
                    expected[3],
                    tol=maxDelta,
                    msg='50th percentile (median) is not approx. expected')
            if expected[4]:
                h2o_util.assertApproxEqual(
                    pctile[7],
                    expected[4],
                    tol=maxDelta,
                    msg='75th percentile is not approx. expected')
            if expected[5]:
                h2o_util.assertApproxEqual(maxs[0],
                                           expected[5],
                                           tol=maxDelta,
                                           msg='max is not approx. expected')

            hstart = column['hstart']
            hstep = column['hstep']
            hbrk = column['hbrk']
            hcnt = column['hcnt']

            print "pct:", pct
            print ""

            print "hcnt:", hcnt
            print "len(hcnt)", len(hcnt)

            # don't check the last bin
            for b in hcnt[1:-1]:
                # should we be able to check for a uniform distribution in the files?
                # expect 21 thresholds, so 20 bins; each ~5% of rows (uniform distribution)
                e = numRows / len(hcnt)
                # don't check the edge bins
                self.assertAlmostEqual(b,
                                       numRows / len(hcnt),
                                       delta=1 + .01 * numRows,
                                       msg="Bins not right. b: %s e: %s" %
                                       (b, e))

            pt = h2o_util.twoDecimals(pctile)
            mx = h2o_util.twoDecimals(maxs)
            mn = h2o_util.twoDecimals(mins)
            print "colname:", colname, "pctile (2 places):", pt
            print "colname:", colname, "maxs: (2 places):", mx
            print "colname:", colname, "mins: (2 places):", mn

            # FIX! we should do an exec and compare using the exec quantile too
            compareActual = mn[0], pt[3], pt[5], pt[7], mx[0]
            h2p.green_print("min/25/50/75/max colname:", colname,
                            "(2 places):", compareActual)
            print "maxs colname:", colname, "(2 places):", mx
            print "mins colname:", colname, "(2 places):", mn

            trial += 1
            h2o.nodes[0].remove_all_keys()

            scipyCol = 0

            # don't check if colname is empty..means it's a string and scipy doesn't parse right?
            if colname != '':
                # don't do for enums
                # also get the median with a sort (h2o_summ.percentileOnSortedlist())
                h2o_summ.quantile_comparisons(
                    csvPathnameFull,
                    col=scipyCol,  # what col to extract from the csv
                    datatype='float',
                    quantile=0.5 if DO_MEDIAN else 0.999,
                    h2oSummary2=pctile[5 if DO_MEDIAN else 10],
                    # h2oQuantilesApprox=qresult_single,
                    h2oQuantilesExact=qresult,
                )
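
The interior-bin check in this test expects roughly numRows/len(hcnt) rows per bin for a uniform column, with a slack of 1 plus 1% of the rows. A stand-alone version of that check; the bin counts in the usage comment are made up:

    def check_uniform_bins(hcnt, num_rows, rel_slack=0.01):
        expected = num_rows / float(len(hcnt))
        for b in hcnt[1:-1]:   # skip the edge bins, as the test does
            assert abs(b - expected) <= 1 + rel_slack * num_rows, (b, expected)

    # check_uniform_bins([9, 11, 10, 10, 10], 50)   # passes for a near-uniform histogram
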
Example No. 22
    def test_exec2_quant_cmp_uniform(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            # colname, (min, 25th, 50th, 75th, max)
            (500000, 1, 'x.hex', 1, 20000, ('C1', 1.10, 5000.0, 10000.0,
                                            15000.0, 20000.00)),
            (500000, 1, 'x.hex', -5000, 0, ('C1', -5001.00, -3750.0, -2445,
                                            -1200.0, 99)),
            (100000, 1, 'x.hex', -100000, 100000, ('C1', -100001.0, -50000.0,
                                                   1613.0, 50000.0, 100000.0)),
            (100000, 1, 'x.hex', -1, 1, ('C1', -1.05, -0.48, 0.0087, 0.50,
                                         1.00)),
            (100000, 1, 'A.hex', 1, 100, ('C1', 1.05, 26.00, 51.00, 76.00,
                                          100.0)),
            (100000, 1, 'A.hex', -99, 99, ('C1', -99, -50.0, 0, 50.00, 99)),
            (100000, 1, 'B.hex', 1, 10000, ('C1', 1.05, 2501.00, 5001.00,
                                            7501.00, 10000.00)),
            (100000, 1, 'B.hex', -100, 100, ('C1', -100.10, -50.0, 0.85, 51.7,
                                             100.00)),
            (100000, 1, 'C.hex', 1, 100000, ('C1', 1.05, 25002.00, 50002.00,
                                             75002.00, 100000.00)),
            (100000, 1, 'C.hex', -101, 101, ('C1', -100.10, -50.45, -1.18,
                                             49.28, 100.00)),
        ]

        timeoutSecs = 10
        trial = 1
        n = h2o.nodes[0]
        lenNodes = len(h2o.nodes)

        x = 0
        timeoutSecs = 60
        for (rowCount, colCount, hex_key, expectedMin, expectedMax,
             expected) in tryList:
            # max error = half the bin size?

            maxDelta = ((expectedMax - expectedMin) / 20.0) / 2.0
            # add 5% for fp errors?
            maxDelta = 1.05 * maxDelta

            SEEDPERFILE = random.randint(0, sys.maxint)
            x += 1
            csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(
                colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, expectedMin,
                              expectedMax, SEEDPERFILE)
            csvPathnameFull = h2i.find_folder_and_filename(None,
                                                           csvPathname,
                                                           returnFullPath=True)
            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='put',
                                           hex_key=hex_key,
                                           timeoutSecs=10,
                                           doSummary=False)
            print "Parse result['destination_key']:", parseResult[
                'destination_key']

            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvFilename
            numRows = inspect["numRows"]
            numCols = inspect["numCols"]

            summaryResult = h2o_cmd.runSummary(key=hex_key)
            h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult))

            # only one column
            column = summaryResult['summaries'][0]

            colname = column['colname']
            self.assertEqual(colname, expected[0])

            coltype = column['type']
            nacnt = column['nacnt']
            stats = column['stats']
            stattype = stats['type']

            # FIX! we should compare mean and sd to expected?
            mean = stats['mean']
            sd = stats['sd']
            print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(
                mean)
            print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals(
                sd)

            zeros = stats['zeros']
            mins = stats['mins']
            h2o_util.assertApproxEqual(mins[0],
                                       expected[1],
                                       tol=maxDelta,
                                       msg='min is not approx. expected')
            maxs = stats['maxs']
            h2o_util.assertApproxEqual(maxs[0],
                                       expected[5],
                                       tol=maxDelta,
                                       msg='max is not approx. expected')

            pct = stats['pct']
            # the thresholds h2o used, should match what we expected
            expectedPct = [
                0.001, 0.01, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.99, 0.999
            ]
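            # with these 11 thresholds, pctile[3]/pctile[5]/pctile[7] are the values asserted
            # below as the 25th/50th/75th percentiles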
            pctile = stats['pctile']
            h2o_util.assertApproxEqual(
                pctile[3],
                expected[2],
                tol=maxDelta,
                msg='25th percentile is not approx. expected')
            h2o_util.assertApproxEqual(
                pctile[5],
                expected[3],
                tol=maxDelta,
                msg='50th percentile (median) is not approx. expected')
            h2o_util.assertApproxEqual(
                pctile[7],
                expected[4],
                tol=maxDelta,
                msg='75th percentile is not approx. expected')

            hstart = column['hstart']
            hstep = column['hstep']
            hbrk = column['hbrk']
            hcnt = column['hcnt']

            print "pct:", pct
            print "hcnt:", hcnt
            print "len(hcnt)", len(hcnt)

            # don't check the last bin
            for b in hcnt[1:-1]:
                # should we be able to check for a uniform distribution in the files?
                e = numRows / len(hcnt)
                # apparently we're not able to estimate for these datasets
                # self.assertAlmostEqual(b, rowCount/len(hcnt), delta=.01*rowCount,
                #     msg="Bins not right. b: %s e: %s" % (b, e))

            pt = h2o_util.twoDecimals(pctile)
            mx = h2o_util.twoDecimals(maxs)
            mn = h2o_util.twoDecimals(mins)
            print "colname:", colname, "pctile (2 places):", pt
            print "colname:", colname, "maxs: (2 places):", mx
            print "colname:", colname, "mins: (2 places):", mn

            # FIX! we should do an exec and compare using the exec quantile too
            compareActual = mn[0], pt[3], pt[5], pt[7], mx[0]
            print "min/25/50/75/max colname:", colname, "(2 places):", compareActual
            print "maxs colname:", colname, "(2 places):", mx
            print "mins colname:", colname, "(2 places):", mn

            trial += 1
            h2p.blue_print("\nTrying exec quantile")
            # thresholds = "c(0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99)"
            # do the equivalent exec quantile?
            # execExpr = "quantile(%s[,1],%s);" % (hex_key, thresholds)

            print "Comparing (two places) each of the summary2 threshold quantile results, to single exec quantile"
            for i, threshold in enumerate(thresholds):
                # FIX! do two of the same?..use same one for the 2nd
                if i != 0:
                    # execExpr = "r2=c(1); r2=quantile(%s[,4],c(0,.05,0.3,0.55,0.7,0.95,0.99))" % hex_key
                    execExpr = "r2=c(1); r2=quantile(%s[,1], c(%s,%s));" % (
                        hex_key, threshold, threshold)
                    (resultExec, result) = h2e.exec_expr(execExpr=execExpr,
                                                         timeoutSecs=30)
                    h2p.green_print("\nresultExec: %s" %
                                    h2o.dump_json(resultExec))
                    h2p.blue_print(
                        "\nthreshold: %.2f Exec quantile: %s Summary2: %s" %
                        (threshold, result, pt[i]))
                    if not result:
                        raise Exception(
                            "exec result: %s for quantile: %s is bad" %
                            (result, threshold))
                    h2o_util.assertApproxEqual(
                        result, pctile[i], tol=maxDelta,
                        msg='exec percentile: %s too different from expected: %s' %
                            (result, pctile[i]))
                # for now, do one with all, but no checking
                else:
                    # This seemed to "work" but how do I get the key name for the list of values returned
                    # the browser result field seemed right, but nulls in the key
                    if 1 == 0:
                        execExpr = "r2=c(1); r2=quantile(%s[,1], c(%s));" % (
                            hex_key, ",".join(map(str, thresholds)))
                    else:
                        # does this way work? (column getting)
                        execExpr = "r2=c(1); r2=quantile(%s$C1, c(%s));" % (
                            hex_key, ",".join(map(str, thresholds)))
                    (resultExec, result) = h2e.exec_expr(execExpr=execExpr,
                                                         timeoutSecs=30)
                    inspect = h2o_cmd.runInspect(key='r2')
                    numCols = inspect['numCols']
                    numRows = inspect['numRows']

                    self.assertEqual(numCols, 1)
                    self.assertEqual(numRows, len(thresholds))
                    # FIX! should run thru the values in the col? how to get

            # compare the last one
            if colname != '':
                # don't do for enums
                # also get the median with a sort (h2o_summ.percentileOnSortedlist())
                h2o_summ.quantile_comparisons(
                    csvPathnameFull,
                    col=0,  # what col to extract from the csv
                    datatype='float',
                    quantile=thresholds[-1],
                    # h2oSummary2=pctile[-1],
                    # h2oQuantilesApprox=result, # from exec
                    h2oExecQuantiles=result,
                )

            h2o.nodes[0].remove_all_keys()
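
thresholds here is module-level and not shown in this listing; the commented-out string near "Trying exec quantile" suggests the usual 11 values. A small illustration of how the exec expression for the "all thresholds" branch is assembled (the values below are assumed):

    thresholds = [0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99]  # assumed
    hex_key = 'x.hex'
    execExpr = "r2=c(1); r2=quantile(%s$C1, c(%s));" % (hex_key, ",".join(map(str, thresholds)))
    print execExpr
    # r2=c(1); r2=quantile(x.hex$C1, c(0.01,0.05,0.1,0.25,0.33,0.5,0.66,0.75,0.9,0.95,0.99));
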
Example No. 23
    def test_quant_cols(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()

    
        if getpass.getuser()=='kevin':
            tryList = [
                ('home-0xdiag-datasets', 'airlines/year2013.csv', None, None, 'cE', 300), 
                (None, '/home/kevin/Downloads/t.csv', 15, 11, 'cE', 300), 
                ]
        else:
            tryList = [
                ('home-0xdiag-datasets', 'airlines/year2013.csv', None, None, 'cE', 300), 
                ]

        # h2b.browseTheCloud()
        trial = 0
        for (bucket, csvPathname, iColCount, oColCount, hex_key, timeoutSecs) in tryList:
            xList = []
            eList = []
            fList = []

            # PARSE*******************************************************
            parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=200, doSummary=False)
            csvPathnameFull = h2i.find_folder_and_filename(bucket, csvPathname, returnFullPath=True)

            print "Parse result['destination_key']:", parseResult['destination_key']
            inspect = h2o_cmd.runInspect(key=parseResult['destination_key'])
            h2o_cmd.infoFromInspect(inspect, csvPathname)
            numRows = inspect['numRows']
            numCols = inspect['numCols']

            if not oColCount:
                iColCount = 0
                oColCount = numCols

            colCount = iColCount + oColCount
            for i in range (0,numCols):
                print "Column", i, "summary"
                h2o_cmd.runSummary(key=hex_key, max_qbins=1, cols=i);

            # print h2o.dump_json(inspect)
            levels = h2o.nodes[0].levels(source=hex_key)
            print "levels result:", h2o.dump_json(levels)

            (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \
                h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=False)

            # error if any col has constant values
            if len(constantValuesDict) != 0:
                # raise Exception("Probably got a col NA'ed and constant values as a result %s" % constantValuesDict)
                print "Probably got a col NA'ed and constant values as a result %s" % constantValuesDict
            
            # start after the last input col
            levels = h2o.nodes[0].levels(source=hex_key);
            l = levels['levels']
            for column in range(iColCount, iColCount+oColCount):
                if l[column]:
                    print "Skipping", column, "because it's enum (says levels)"
                    continue

                # QUANTILE*******************************************************
                
                quantile = 0.5 if DO_MEDIAN else .999
                # first output col. always fed by an exec cut, so 0?
                start = time.time()
                # file has headers. use col index
                q = h2o.nodes[0].quantiles(source_key=hex_key, column=column,
                    quantile=quantile, max_qbins=MAX_QBINS, multiple_pass=1)
                qresult = q['result']
                h2p.red_print("result:", q['result'], "quantile", quantile, 
                    "interpolated:", q['interpolated'], "iterations", q['iterations'])
                elapsed = time.time() - start
                print "quantile end on ", hex_key, 'took', elapsed, 'seconds.'
                quantileTime = elapsed

                # don't do for enums
                # also get the median with a sort (h2o_summ.percentileOnSortedlist())
                if 1==0:
                    h2o_summ.quantile_comparisons(
                        csvPathnameFull,
                        skipHeader=True,
                        col=column, # what col to extract from the csv
                        datatype='float',
                        quantile=0.5 if DO_MEDIAN else 0.999,
                        # h2oSummary2=pctile[5 if DO_MEDIAN else 10],
                        # h2oQuantilesApprox=qresult_single,
                        h2oQuantilesExact=qresult,
                        use_genfromtxt=True,
                        )

                trial += 1
                execTime = 0
                xList.append(column)
                eList.append(execTime)
                fList.append(quantileTime)

                # remove all keys*******************************************************
                # what about hex_key?
                if 1==0:
                    start = time.time()
                    h2o.nodes[0].remove_all_keys()
                    elapsed = time.time() - start
                    print "remove all keys end on took", elapsed, 'seconds.'

        #****************************************************************
        # PLOTS. look for eplot.jpg and fplot.jpg in local dir?
        if DO_PLOT:
            xLabel = 'column (0 is first)'
            eLabel = 'exec cut time'
            fLabel = 'quantile time'
            eListTitle = ""
            fListTitle = ""
            h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel, server=True)
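
The levels() result above is used to skip enum (categorical) columns before requesting a quantile: a non-empty entry for a column marks it as an enum. A stand-alone filter over a levels-style list, for illustration:

    def numeric_columns(levels_list, first_col, last_col):
        # keep only columns whose levels entry is empty/None, i.e. non-enum columns
        return [c for c in range(first_col, last_col) if not levels_list[c]]

    # numeric_columns([None, ['A', 'B'], None], 0, 3)   # -> [0, 2]
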
Example No. 24
    def test_exec2_quant_cmp_uniform(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            # colname, (min, 25th, 50th, 75th, max)
            (500000, 1, 'x.hex', 1, 20000,        ('C1',  1.10, 5000.0, 10000.0, 15000.0, 20000.00)),
            (500000, 1, 'x.hex', -5000, 0,        ('C1', -5001.00, -3750.0, -2445, -1200.0, 99)),
            (100000, 1, 'x.hex', -100000, 100000, ('C1',  -100001.0, -50000.0, 1613.0, 50000.0, 100000.0)),
            (100000, 1, 'x.hex', -1, 1,           ('C1',  -1.05, -0.48, 0.0087, 0.50, 1.00)),

            (100000, 1, 'A.hex', 1, 100,          ('C1',   1.05, 26.00, 51.00, 76.00, 100.0)),
            (100000, 1, 'A.hex', -99, 99,         ('C1',  -99, -50.0, 0, 50.00, 99)),

            (100000, 1, 'B.hex', 1, 10000,        ('C1',   1.05, 2501.00, 5001.00, 7501.00, 10000.00)),
            (100000, 1, 'B.hex', -100, 100,       ('C1',  -100.10, -50.0, 0.85, 51.7, 100.00)),

            (100000, 1, 'C.hex', 1, 100000,       ('C1',   1.05, 25002.00, 50002.00, 75002.00, 100000.00)),
            (100000, 1, 'C.hex', -101, 101,       ('C1',  -100.10, -50.45, -1.18, 49.28, 100.00)),
        ]

        timeoutSecs = 10
        trial = 1
        n = h2o.nodes[0]
        lenNodes = len(h2o.nodes)

        x = 0
        timeoutSecs = 60
        for (rowCount, colCount, hex_key, expectedMin, expectedMax, expected) in tryList:
            # max error = half the bin size?
        
            maxDelta = ((expectedMax - expectedMin)/20.0) / 2.0
            # add 5% for fp errors?
            maxDelta = 1.05 * maxDelta

            SEEDPERFILE = random.randint(0, sys.maxint)
            x += 1
            csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE)
            csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True)
            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30, doSummary=False)
            print "Parse result['destination_key']:", parseResult['destination_key']

            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvFilename
            numRows = inspect["numRows"]
            numCols = inspect["numCols"]

            summaryResult = h2o_cmd.runSummary(key=hex_key)
            h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult))

            # only one column
            column = summaryResult['summaries'][0]

            colname = column['colname']
            self.assertEqual(colname, expected[0])

            coltype = column['type']
            nacnt = column['nacnt']
            stats = column['stats']
            stattype= stats['type']

            # FIX! we should compare mean and sd to expected?
            mean = stats['mean']
            sd = stats['sd']
            print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(mean)
            print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals(sd)

            zeros = stats['zeros']
            mins = stats['mins']
            h2o_util.assertApproxEqual(mins[0], expected[1], tol=maxDelta, msg='min is not approx. expected')
            maxs = stats['maxs']
            h2o_util.assertApproxEqual(maxs[0], expected[5], tol=maxDelta, msg='max is not approx. expected')

            pct = stats['pct']
            # the thresholds h2o used, should match what we expected
            expectedPct = [0.001, 0.01, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.99, 0.999]
            pctile = stats['pctile']
            h2o_util.assertApproxEqual(pctile[3], expected[2], tol=maxDelta, msg='25th percentile is not approx. expected')
            h2o_util.assertApproxEqual(pctile[5], expected[3], tol=maxDelta, msg='50th percentile (median) is not approx. expected')
            h2o_util.assertApproxEqual(pctile[7], expected[4], tol=maxDelta, msg='75th percentile is not approx. expected')

            hstart = column['hstart']
            hstep = column['hstep']
            hbrk = column['hbrk']
            hcnt = column['hcnt']

            print "pct:", pct
            print "hcnt:", hcnt
            print "len(hcnt)", len(hcnt)

            # don't check the last bin
            for b in hcnt[1:-1]:
                # should we be able to check for a uniform distribution in the files?
                e = numRows/len(hcnt) 
                # apparently we're not able to estimate for these datasets
                # self.assertAlmostEqual(b, rowCount/len(hcnt), delta=.01*rowCount, 
                #     msg="Bins not right. b: %s e: %s" % (b, e))

            pt = h2o_util.twoDecimals(pctile)
            mx = h2o_util.twoDecimals(maxs)
            mn = h2o_util.twoDecimals(mins)
            print "colname:", colname, "pctile (2 places):", pt
            print "colname:", colname, "maxs: (2 places):", mx
            print "colname:", colname, "mins: (2 places):", mn

            # FIX! we should do an exec and compare using the exec quantile too
            compareActual = mn[0], pt[3], pt[5], pt[7], mx[0]
            print "min/25/50/75/max colname:", colname, "(2 places):", compareActual
            print "maxs colname:", colname, "(2 places):", mx
            print "mins colname:", colname, "(2 places):", mn

            trial += 1
            h2p.blue_print("\nTrying exec quantile")
            # thresholds = "c(0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99)"
            # do the equivalent exec quantile?
            # execExpr = "quantile(%s[,1],%s);" % (hex_key, thresholds)

            print "Comparing (two places) each of the summary2 threshold quantile results, to single exec quantile"
            for i, threshold in enumerate(thresholds):
                # FIX! do two of the same?..use same one for the 2nd
                if i!=0:
                    # execExpr = "r2=c(1); r2=quantile(%s[,4],c(0,.05,0.3,0.55,0.7,0.95,0.99))" % hex_key
                    execExpr = "r2=c(1); r2=quantile(%s[,1], c(%s,%s));" % (hex_key, threshold, threshold)
                    (resultExec, result) = h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)
                    h2p.green_print("\nresultExec: %s" % h2o.dump_json(resultExec))
                    h2p.blue_print("\nthreshold: %.2f Exec quantile: %s Summary2: %s" % (threshold, result, pt[i]))
                    if not result:
                        raise Exception("exec result: %s for quantile: %s is bad" % (result, threshold))
                    h2o_util.assertApproxEqual(result, pctile[i], tol=maxDelta, 
                        msg='exec percentile: %s too different from expected: %s' % (result, pctile[i]))
                # for now, do one with all, but no checking
                else:
                    # This seemed to "work" but how do I get the key name for the list of values returned
                    # the browser result field seemed right, but nulls in the key
                    if 1==0:
                        execExpr = "r2=c(1); r2=quantile(%s[,1], c(%s));" % (hex_key, ",".join(map(str,thresholds)))
                    else:
                        # does this way work? (column getting)
                        execExpr = "r2=c(1); r2=quantile(%s$C1, c(%s));" % (hex_key, ",".join(map(str,thresholds)))
                    (resultExec, result) = h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)
                    inspect = h2o_cmd.runInspect(key='r2') 
                    numCols = inspect['numCols']
                    numRows = inspect['numRows']

                    self.assertEqual(numCols,1)
                    self.assertEqual(numRows,len(thresholds))
                    # FIX! should run thru the values in the col? how to get

            # compare the last one
            if colname!='':
                # don't do for enums
                # also get the median with a sort (see h2o_summ.percentileOnSortedlist())
                h2o_summ.quantile_comparisons(
                    csvPathnameFull,
                    col=0, # what col to extract from the csv
                    datatype='float',
                    quantile=thresholds[-1],
                    # h2oSummary2=pctile[-1],
                    # h2oQuantilesApprox=result, # from exec
                    h2oExecQuantiles=result,
                    )

            h2o.nodes[0].remove_all_keys()
Example No. 25
    def test_summary2_unifiles2(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        # new with 1000 bins. copy expected from R
        tryList = [
            # colname, (min, 25th, 50th, 75th, max)
            ('breadth.csv', 'b.hex', False, [ ('C1', None, None, None, None, None)], 'smalldata', 'quantiles'),
            # ('wonkysummary.csv', 'b.hex', False, [ ('X1', 7, 22, 876713, 100008, 1000046)], 'smalldata', None),
            ('wonkysummary.csv', 'b.hex', True, [ ('X1', 7.00, None, None, None, 1000046.0)], 'smalldata', None),
            ('covtype.data', 'c.hex', False, [ ('C1', None, None, None, None, None)], 'home-0xdiag-datasets', 'standard'),

        ]

        timeoutSecs = 10
        trial = 1
        n = h2o.nodes[0]
        lenNodes = len(h2o.nodes)

        x = 0
        timeoutSecs = 60
        for (csvFilename, hex_key, skipHeader, expectedCols, bucket, pathPrefix) in tryList:

            if pathPrefix:
                csvPathname = pathPrefix + "/" + csvFilename
            else:
                csvPathname = csvFilename

            csvPathnameFull = h2i.find_folder_and_filename(bucket, csvPathname, returnFullPath=True)
            if skipHeader:
                header = 1
            else:
                header = 0
            parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, 
                schema='put', header=header, hex_key=hex_key, timeoutSecs=10, doSummary=False)

            print "Parse result['destination_key']:", parseResult['destination_key']

            # We should be able to see the parse result?
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvFilename

            numRows = inspect["numRows"]
            numCols = inspect["numCols"]

            # okay to get more cols than we want
            summaryResult = h2o_cmd.runSummary(key=hex_key, max_qbins=MAX_QBINS)
            h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult))

            summaries = summaryResult['summaries']

            scipyCol = 0
            for expected, column in zip(expectedCols, summaries):
                colname = column['colname']
                if expected[0]:
                    self.assertEqual(colname, expected[0])

                quantile = 0.5 if DO_MEDIAN else OTHER_Q
                q = h2o.nodes[0].quantiles(source_key=hex_key, column=scipyCol,
                    quantile=quantile, max_qbins=MAX_QBINS, multiple_pass=2)
                qresult = q['result']
                qresult_single = q['result_single']
                qresult_iterations = q['iterations']
                qresult_interpolated = q['interpolated']
                h2p.blue_print("h2o quantiles result:", qresult)
                h2p.blue_print("h2o quantiles result_single:", qresult_single)
                h2p.blue_print("h2o quantiles iterations:", qresult_iterations)
                h2p.blue_print("h2o quantiles interpolated:", qresult_interpolated)
                print h2o.dump_json(q)

                self.assertLess(qresult_iterations, 16, 
                    msg="h2o does max of 16 iterations. likely no result_single if we hit max. is bins=1?")

                # ('',  '1.00', '25002.00', '50002.00', '75002.00', '100000.00'),
                coltype = column['type']
                nacnt = column['nacnt']

                stats = column['stats']
                stattype = stats['type']
                print stattype

                # FIX! we should compare mean and sd to expected?
                # enums don't have mean or sd?
                if stattype!='Enum':
                    mean = stats['mean']
                    sd = stats['sd']
                    zeros = stats['zeros']
                    mins = stats['mins']
                    maxs = stats['maxs']

                    print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(mean)
                    print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals(sd)

                    pct = stats['pct']
                    print "pct:", pct
                    print ""

                    # the thresholds h2o used, should match what we expected
                    pctile = stats['pctile']

                # hack: a single None in the expected values skips that check (e.g. for cars.csv);
                # note these stats are only populated above for non-Enum columns
                if expected[1]:
                    h2o_util.assertApproxEqual(mins[0], expected[1], rel=0.02, msg='min is not approx. expected')
                if expected[2]:
                    h2o_util.assertApproxEqual(pctile[3], expected[2], rel=0.02, msg='25th percentile is not approx. expected')
                if expected[3]:
                    h2o_util.assertApproxEqual(pctile[5], expected[3], rel=0.02, msg='50th percentile (median) is not approx. expected')
                if expected[4]:
                    h2o_util.assertApproxEqual(pctile[7], expected[4], rel=0.02, msg='75th percentile is not approx. expected')
                if expected[5]:
                    h2o_util.assertApproxEqual(maxs[0], expected[5], rel=0.02, msg='max is not approx. expected')


                # figure out the expected max error
                # use this for comparing to sklearn/sort
                if expected[1] and expected[5]:
                    expectedRange = expected[5] - expected[1]
                    # because of floor and ceil effects we potentially lose 2 bins (worst case)
                    # the extra bin for the max value, is an extra bin..ignore
                    expectedBin = expectedRange/(MAX_QBINS-2)
                    maxErr = 0.5 * expectedBin # should we have some fuzz for fp?
                    # hack?
                    maxErr = maxErr * 2
                    print "maxErr:", maxErr

                else:
                    print "Test won't calculate max expected error"
                    maxErr = 0
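                # rough sanity check of the arithmetic above, using the expected values for
                # wonkysummary.csv (min 7, max 1000046) and assuming MAX_QBINS is 1000:
                #   expectedRange = 1000046 - 7 = 1000039
                #   expectedBin   = 1000039 / 998 ~= 1002
                #   maxErr        = 0.5 * 1002 * 2 ~= 1002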

                hstart = column['hstart']
                hstep = column['hstep']
                hbrk = column['hbrk']
                hcnt = column['hcnt']

                for b in hcnt:
                    # should we be able to check for a uniform distribution in the files?
                    e = .1 * numRows
                    # self.assertAlmostEqual(b, .1 * rowCount, delta=.01*rowCount,
                    #     msg="Bins not right. b: %s e: %s" % (b, e))

                if stattype!='Enum':
                    pt = h2o_util.twoDecimals(pctile)
                    print "colname:", colname, "pctile (2 places):", pt
                    mx = h2o_util.twoDecimals(maxs)
                    mn = h2o_util.twoDecimals(mins)
                    print "colname:", colname, "maxs: (2 places):", mx
                    print "colname:", colname, "mins: (2 places):", mn

                    # FIX! we should do an exec and compare using the exec quantile too
                    actual = mn[0], pt[3], pt[5], pt[7], mx[0]
                    print "min/25/50/75/max colname:", colname, "(2 places):", actual
                    print "maxs colname:", colname, "(2 places):", mx
                    print "mins colname:", colname, "(2 places):", mn

                    ## ignore blank colnames; issues with quoted numbers
                    # covtype is too big to do in scipy
                    if colname!='' and expected[scipyCol] and csvFilename != 'covtype.data':
                        # don't do for enums
                        # also get the median with a sort (see h2o_summ.percentileOnSortedlist())
                        h2o_summ.quantile_comparisons(
                            csvPathnameFull,
                            skipHeader=skipHeader, # important!!
                            col=scipyCol,
                            datatype='float',
                            quantile=0.5 if DO_MEDIAN else OTHER_Q,
                            h2oSummary2=pctile[5 if DO_MEDIAN else OTHER_Q_SUMM_INDEX],
                            h2oQuantilesApprox=qresult_single,
                            h2oQuantilesExact=qresult,
                            h2oSummary2MaxErr=maxErr,
                            )
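                        # quantile_comparisons presumably recomputes the quantile from the raw csv
                        # (sort-based, per the note above) and checks each h2o* result against it,
                        # using h2oSummary2MaxErr as the allowed error for the summary2 percentile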

                scipyCol += 1

            trial += 1
Example No. 26
    def test_summary2_small(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            # colname, (min, 25th, 50th, 75th, max)
            # if rowCount is None, we'll just use  the data values
            # None in expected values means no compare
            (None, 1, "x.hex", [-1, 0, 1], ("C1", None, None, 0, None, None)),
            (None, 2, "x.hex", [-1, 0, 1], ("C1", None, None, 0, None, None)),
            (None, 10, "x.hex", [-1, 0, 1], ("C1", None, None, 0, None, None)),
            (None, 100, "x.hex", [-1, 0, 1], ("C1", None, None, 0, None, None)),
            (None, 1000, "x.hex", [-1, 0, 1], ("C1", None, None, 0, None, None)),
            # (None, 10000, 'x.hex', [-1,0,1],        ('C1',  None, None, 0, None, None)),
            # (COLS, 1, 'x.hex', [1,0,-1],        ('C1',  None, None, None, None, None)),
        ]

        timeoutSecs = 10
        trial = 1
        n = h2o.nodes[0]
        lenNodes = len(h2o.nodes)

        x = 0
        timeoutSecs = 60
        for (rowCount, colCount, hex_key, values, expected) in tryList:
            # max error = half the bin size?

            expectedMax = max(values)
            expectedMin = min(values)
            maxDelta = ((expectedMax - expectedMin) / 20.0) / 2.0
            # add 5% for fp errors?
            maxDelta = 1.05 * maxDelta

            # hmm...say we should be 100% accurate for these tests?
            maxDelta = 0

            SEEDPERFILE = random.randint(0, sys.maxint)
            x += 1

            if not rowCount:
                rowFile = len(values)
            else:
                rowFile = rowCount
            csvFilename = "syn_" + "binary" + "_" + str(rowFile) + "x" + str(colCount) + ".csv"
            csvPathname = SYNDATASETS_DIR + "/" + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, values, SEEDPERFILE)

            csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True)
            parseResult = h2i.import_parse(
                path=csvPathname, schema="put", hex_key=hex_key, timeoutSecs=30, doSummary=False
            )
            print "Parse result['destination_key']:", parseResult["destination_key"]

            inspect = h2o_cmd.runInspect(None, parseResult["destination_key"])
            print "\n" + csvFilename

            numRows = inspect["numRows"]
            numCols = inspect["numCols"]

            summaryResult = h2o_cmd.runSummary(key=hex_key, max_qbins=MAX_QBINS, timeoutSecs=45)
            h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult))

            quantile = 0.5 if DO_MEDIAN else 0.999
            q = h2o.nodes[0].quantiles(
                source_key=hex_key,
                column=0,
                interpolation_type=7,
                quantile=quantile,
                max_qbins=MAX_QBINS,
                multiple_pass=2,
            )
            qresult = q["result"]
            qresult_single = q["result_single"]
            qresult_iterations = q["iterations"]
            qresult_interpolated = q["interpolated"]
            h2p.blue_print("h2o quantiles result:", qresult)
            h2p.blue_print("h2o quantiles result_single:", qresult_single)
            h2p.blue_print("h2o quantiles iterations:", qresult_iterations)
            h2p.blue_print("h2o quantiles interpolated:", qresult_interpolated)
            print h2o.dump_json(q)

            self.assertLess(
                qresult_iterations,
                16,
                msg="h2o does max of 16 iterations. likely no result_single if we hit max. is bins=1?",
            )
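            # interpolation_type=7 presumably selects the same linear interpolation rule
            # as R's default quantile type 7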

            # only one column
            column = summaryResult["summaries"][0]

            colname = column["colname"]

            coltype = column["type"]
            nacnt = column["nacnt"]

            stats = column["stats"]
            stattype = stats["type"]

            # FIX! we should compare mean and sd to expected?
            mean = stats["mean"]
            sd = stats["sd"]

            print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(mean)
            print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals(sd)

            zeros = stats["zeros"]
            mins = stats["mins"]
            maxs = stats["maxs"]
            pct = stats["pct"]
            # the thresholds h2o used, should match what we expected
            expectedPct = [0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99]

            pctile = stats["pctile"]
            print "pctile:", pctile
            if expected[0]:
                self.assertEqual(colname, expected[0])
            if expected[1]:
                h2o_util.assertApproxEqual(mins[0], expected[1], tol=maxDelta, msg="min is not approx. expected")
            if expected[2]:
                h2o_util.assertApproxEqual(
                    pctile[3], expected[2], tol=maxDelta, msg="25th percentile is not approx. expected"
                )
            if expected[3]:
                h2o_util.assertApproxEqual(
                    pctile[5], expected[3], tol=maxDelta, msg="50th percentile (median) is not approx. expected"
                )
            if expected[4]:
                h2o_util.assertApproxEqual(
                    pctile[7], expected[4], tol=maxDelta, msg="75th percentile is not approx. expected"
                )
            if expected[5]:
                h2o_util.assertApproxEqual(maxs[0], expected[5], tol=maxDelta, msg="max is not approx. expected")

            hstart = column["hstart"]
            hstep = column["hstep"]
            hbrk = column["hbrk"]
            hcnt = column["hcnt"]

            print "pct:", pct
            print ""

            print "hcnt:", hcnt
            print "len(hcnt)", len(hcnt)

            # don't check the first or last bin
            for b in hcnt[1:-1]:
                # should we be able to check for a uniform distribution in the files?
                e = numRows / len(hcnt)  # expected count per bin if rows were spread uniformly across the bins
                # don't check the edge bins
                self.assertAlmostEqual(
                    b, numRows / len(hcnt), delta=1 + 0.01 * numRows, msg="Bins not right. b: %s e: %s" % (b, e)
                )
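            # the delta above allows 1% of the rows plus one row of slack; the +1 presumably
            # absorbs integer rounding when numRows isn't evenly divisible by the bin count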

            pt = h2o_util.twoDecimals(pctile)
            mx = h2o_util.twoDecimals(maxs)
            mn = h2o_util.twoDecimals(mins)
            print "colname:", colname, "pctile (2 places):", pt
            print "colname:", colname, "maxs: (2 places):", mx
            print "colname:", colname, "mins: (2 places):", mn

            # FIX! we should do an exec and compare using the exec quantile too
            compareActual = mn[0], pt[3], pt[5], pt[7], mx[0]
            h2p.green_print("min/25/50/75/max colname:", colname, "(2 places):", compareActual)
            print "maxs colname:", colname, "(2 places):", mx
            print "mins colname:", colname, "(2 places):", mn

            trial += 1
            h2o.nodes[0].remove_all_keys()

            scipyCol = 0

            # don't check if colname is empty.. that means it's a string column and scipy doesn't parse it right?
            if colname != "":
                # don't do for enums
                # also get the median with a sort (see h2o_summ.percentileOnSortedlist())
                h2o_summ.quantile_comparisons(
                    csvPathnameFull,
                    col=scipyCol,  # what col to extract from the csv
                    datatype="float",
                    quantile=0.5 if DO_MEDIAN else 0.999,
                    h2oSummary2=pctile[5 if DO_MEDIAN else 10],
                    # h2oQuantilesApprox=qresult_single,
                    h2oQuantilesExact=qresult,
                )
Example No. 27
    def test_summary2_exp(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        LAMBD = random.uniform(0.005, 0.5)
        tryList = [
            # co.label, (min, 25th, 50th, 75th, max)
# parse setup error
#            (1,     1, 'x.hex', 1, 20000,        ['C1', None, None, None, None, None]),
            (5,     1, 'x.hex', 1, 20000,        ['C1', None, None, None, None, None]),
#            (10,     1, 'x.hex', 1, 20000,        ['C1', None, None, None, None, None]),
#            (100,    1, 'x.hex', 1, 20000,        ['C1', None, None, None, None, None]),
#            (1000,   1, 'x.hex', -5000, 0,        ['C1', None, None, None, None, None]),
#            (10000,  1, 'x.hex', -100000, 100000, ['C1', None, None, None, None, None]),
#            (100000, 1, 'x.hex', -1, 1,           ['C1', None, None, None, None, None]),
#            (1000000, 1, 'A.hex', 1, 100,          ['C1', None, None, None, None, None]),
        ]

        timeoutSecs = 10
        trial = 1
        n = h2o.nodes[0]
        lenNodes = len(h2o.nodes)

        x = 0
        timeoutSecs = 60

        for (rowCount, colCount, hex_key, rangeMin, rangeMax, expected) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            x += 1

            csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname, "lambd:", LAMBD
            (expectedMin, expectedMax) = write_syn_dataset(csvPathname, rowCount, colCount, lambd=LAMBD, SEED=SEEDPERFILE)
            print "expectedMin:", expectedMin, "expectedMax:", expectedMax
            maxErr = ((expectedMax - expectedMin)/20.0) / 2.0
            # add 5% for fp errors?
            maxErr = 1.05 * maxErr

            expected[1] = expectedMin
            expected[5] = expectedMax

            csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True)
            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30, doSummary=False)
            pA = h2o_cmd.ParseObj(parseResult, expectedNumRows=rowCount, expectedNumCols=colCount)
            print pA.numRows, pA.numCols, pA.parse_key

            iA = h2o_cmd.InspectObj(pA.parse_key,
                expectedNumRows=rowCount, expectedNumCols=colCount, expectedMissinglist=[])
            print iA.missingList, iA.labelList, iA.numRows, iA.numCols

            # column 0 not used here
            assert len(expected) == 6
            co = h2o_cmd.runSummary(key=hex_key, column=0, expected=expected[1:], maxDelta=maxErr)
            print co.label, co.type, co.missing, co.domain, sum(co.bins)
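            # runSummary with column/expected/maxDelta presumably wraps the summary response in a
            # SummaryObj (co) and internally checks expected[1:] (min/25/50/75/max) within maxDelta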

            # default_pctiles
            # isText
            # rows
            # off
            # key
            # checksum

            # touch all that should be there
            coList = [co.base, len(co.bins), len(co.data), co.domain, co.label, co.maxs, co.mean, co.mins, co.missing,
                co.ninfs, co.pctiles, co.pinfs, co.precision, co.sigma, co.str_data, co.stride, co.type, co.zeros]

            for k,v in co:
                print k, v

            trial += 1
            h2o.nodes[0].remove_all_keys()

            scipyCol = 0
            print "maxErr", maxErr
            if co.label!='' and expected[scipyCol]:
                # don't do for enums
                    # also get the median with a sort (see h2o_summ.percentileOnSortedlist())
                h2o_summ.quantile_comparisons(
                    csvPathnameFull,
                    skipHeader=False,
                    col=scipyCol,
                    datatype='float',
                    quantile=0.5 if DO_MEDIAN else 0.999,
                    h2oSummary2=co.pctiles[5 if DO_MEDIAN else 10],
                    # h2oQuantilesApprox=qresult_single,
                    # h2oQuantilesExact=qresult,
                    h2oSummary2MaxErr=maxErr,
                    )